word2vec 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
+ // Copyright 2013 Google Inc. All Rights Reserved.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <math.h>
+ /* #include <malloc.h> */
+
+ const long long max_size = 2000; // max length of strings
+ const long long N = 40;          // number of closest words that will be shown
+ const long long max_w = 50;      // max length of vocabulary entries
+
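+ // The model file is expected in the binary format written by word2vec with
+ // -binary 1: a text header "<vocab count> <vector size>\n", then for each word
+ // its token terminated by a space, followed by <vector size> raw floats.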
+ int main(int argc, char **argv) {
+   FILE *f;
+   char st1[max_size];
+   char bestw[N][max_size];
+   char file_name[max_size], st[100][max_size];
+   float dist, len, bestd[N], vec[max_size];
+   long long words, size, a, b, c, d, cn, bi[100];
+   char ch;
+   float *M;
+   char *vocab;
+   if (argc < 2) {
+     printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
+     return 0;
+   }
+   strcpy(file_name, argv[1]);
+   f = fopen(file_name, "rb");
+   if (f == NULL) {
+     printf("Input file not found\n");
+     return -1;
+   }
+   fscanf(f, "%lld", &words);
+   fscanf(f, "%lld", &size);
+   vocab = (char *)malloc((long long)words * max_w * sizeof(char));
+   M = (float *)malloc((long long)words * (long long)size * sizeof(float));
+   if (M == NULL) {
+     printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
+     return -1;
+   }
+   for (b = 0; b < words; b++) {
+     a = 0;
+     while (1) {
+       vocab[b * max_w + a] = fgetc(f);
+       if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+       if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+     }
+     vocab[b * max_w + a] = 0;
+     for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
+     len = 0;
+     for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
+     len = sqrt(len);
+     for (a = 0; a < size; a++) M[a + b * size] /= len;
+   }
+   fclose(f);
+   while (1) {
+     for (a = 0; a < N; a++) bestd[a] = 0;
+     for (a = 0; a < N; a++) bestw[a][0] = 0;
+     printf("Enter three words (EXIT to break): ");
+     a = 0;
+     while (1) {
+       st1[a] = fgetc(stdin);
+       if ((st1[a] == '\n') || (a >= max_size - 1)) {
+         st1[a] = 0;
+         break;
+       }
+       a++;
+     }
+     if (!strcmp(st1, "EXIT")) break;
+     cn = 0;
+     b = 0;
+     c = 0;
+     while (1) {
+       st[cn][b] = st1[c];
+       b++;
+       c++;
+       st[cn][b] = 0;
+       if (st1[c] == 0) break;
+       if (st1[c] == ' ') {
+         cn++;
+         b = 0;
+         c++;
+       }
+     }
+     cn++;
+     if (cn < 3) {
+       printf("Only %lld words were entered... three words are needed to perform the calculation\n", cn);
+       continue;
+     }
+     for (a = 0; a < cn; a++) {
+       for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+       if (b == words) b = 0;
+       bi[a] = b;
+       printf("\nWord: %s  Position in vocabulary: %lld\n", st[a], bi[a]);
+       if (b == 0) {
+         printf("Out of dictionary word!\n");
+         break;
+       }
+     }
+     if (b == 0) continue;
+     printf("\n                                              Word              Distance\n------------------------------------------------------------------------\n");
+     for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
+     len = 0;
+     for (a = 0; a < size; a++) len += vec[a] * vec[a];
+     len = sqrt(len);
+     for (a = 0; a < size; a++) vec[a] /= len;
+     for (a = 0; a < N; a++) bestd[a] = 0;
+     for (a = 0; a < N; a++) bestw[a][0] = 0;
+     for (c = 0; c < words; c++) {
+       if (c == bi[0]) continue;
+       if (c == bi[1]) continue;
+       if (c == bi[2]) continue;
+       a = 0;
+       for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
+       if (a == 1) continue;
+       dist = 0;
+       for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
+       for (a = 0; a < N; a++) {
+         if (dist > bestd[a]) {
+           for (d = N - 1; d > a; d--) {
+             bestd[d] = bestd[d - 1];
+             strcpy(bestw[d], bestw[d - 1]);
+           }
+           bestd[a] = dist;
+           strcpy(bestw[a], &vocab[c * max_w]);
+           break;
+         }
+       }
+     }
+     for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
+   }
+   return 0;
+ }
@@ -0,0 +1,292 @@
+ // Copyright 2013 Google Inc. All Rights Reserved.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <math.h>
+ #include <pthread.h>
+
+ #define MAX_STRING 60
+
+ const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
+
+ typedef float real; // Precision of float numbers
+
+ struct vocab_word {
+   long long cn;
+   char *word;
+ };
+
+ char train_file[MAX_STRING], output_file[MAX_STRING];
+ struct vocab_word *vocab;
+ int debug_mode = 2, min_count = 5, *vocab_hash, min_reduce = 1;
+ long long vocab_max_size = 10000, vocab_size = 0;
+ long long train_words = 0;
+ real threshold = 100;
+
+ unsigned long long next_random = 1;
+
+ // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+ void ReadWord(char *word, FILE *fin) {
+   int a = 0, ch;
+   while (!feof(fin)) {
+     ch = fgetc(fin);
+     if (ch == 13) continue;
+     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+       if (a > 0) {
+         if (ch == '\n') ungetc(ch, fin);
+         break;
+       }
+       if (ch == '\n') {
+         strcpy(word, (char *)"</s>");
+         return;
+       } else continue;
+     }
+     word[a] = ch;
+     a++;
+     if (a >= MAX_STRING - 1) a--; // Truncate words that are too long
+   }
+   word[a] = 0;
+ }
+
+ // Returns hash value of a word
+ int GetWordHash(char *word) {
+   unsigned long long a, hash = 1;
+   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+   hash = hash % vocab_hash_size;
+   return hash;
+ }
+
+ // Returns position of a word in the vocabulary; if the word is not found, returns -1
+ int SearchVocab(char *word) {
+   unsigned int hash = GetWordHash(word);
+   while (1) {
+     if (vocab_hash[hash] == -1) return -1;
+     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+     hash = (hash + 1) % vocab_hash_size;
+   }
+   return -1;
+ }
+
+ // Reads a word and returns its index in the vocabulary
+ int ReadWordIndex(FILE *fin) {
+   char word[MAX_STRING];
+   ReadWord(word, fin);
+   if (feof(fin)) return -1;
+   return SearchVocab(word);
+ }
+
+ // Adds a word to the vocabulary
+ int AddWordToVocab(char *word) {
+   unsigned int hash, length = strlen(word) + 1;
+   if (length > MAX_STRING) length = MAX_STRING;
+   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+   strcpy(vocab[vocab_size].word, word);
+   vocab[vocab_size].cn = 0;
+   vocab_size++;
+   // Reallocate memory if needed
+   if (vocab_size + 2 >= vocab_max_size) {
+     vocab_max_size += 10000;
+     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+   }
+   hash = GetWordHash(word);
+   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+   vocab_hash[hash] = vocab_size - 1;
+   return vocab_size - 1;
+ }
+
+ // Used later for sorting by word counts
+ int VocabCompare(const void *a, const void *b) {
+   return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
+ }
+
+ // Sorts the vocabulary by frequency using word counts
+ void SortVocab() {
+   int a;
+   unsigned int hash;
+   // Sort the vocabulary and keep </s> at the first position
+   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   for (a = 0; a < vocab_size; a++) {
+     // Words occurring less than min_count times will be discarded from the vocab
+     if (vocab[a].cn < min_count) {
+       vocab_size--;
+       free(vocab[vocab_size].word);
+     } else {
+       // Hash will be re-computed, as it is no longer valid after the sorting
+       hash = GetWordHash(vocab[a].word);
+       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+       vocab_hash[hash] = a;
+     }
+   }
+   vocab = (struct vocab_word *)realloc(vocab, vocab_size * sizeof(struct vocab_word));
+ }
+
+ // Reduces the vocabulary by removing infrequent tokens
+ void ReduceVocab() {
+   int a, b = 0;
+   unsigned int hash;
+   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+     vocab[b].cn = vocab[a].cn;
+     vocab[b].word = vocab[a].word;
+     b++;
+   } else free(vocab[a].word);
+   vocab_size = b;
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   for (a = 0; a < vocab_size; a++) {
+     // Hash will be re-computed, as it is no longer valid
+     hash = GetWordHash(vocab[a].word);
+     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+     vocab_hash[hash] = a;
+   }
+   fflush(stdout);
+   min_reduce++;
+ }
+
+ void LearnVocabFromTrainFile() {
+   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
+   FILE *fin;
+   long long a, i, start = 1;
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   fin = fopen(train_file, "rb");
+   if (fin == NULL) {
+     printf("ERROR: training data file not found!\n");
+     exit(1);
+   }
+   vocab_size = 0;
+   AddWordToVocab((char *)"</s>");
+   while (1) {
+     ReadWord(word, fin);
+     if (feof(fin)) break;
+     if (!strcmp(word, "</s>")) {
+       start = 1;
+       continue;
+     } else start = 0;
+     train_words++;
+     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+       printf("Words processed: %lldK  Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
+       fflush(stdout);
+     }
+     i = SearchVocab(word);
+     if (i == -1) {
+       a = AddWordToVocab(word);
+       vocab[a].cn = 1;
+     } else vocab[i].cn++;
+     if (start) continue;
+     sprintf(bigram_word, "%s_%s", last_word, word);
+     bigram_word[MAX_STRING - 1] = 0;
+     strcpy(last_word, word);
+     i = SearchVocab(bigram_word);
+     if (i == -1) {
+       a = AddWordToVocab(bigram_word);
+       vocab[a].cn = 1;
+     } else vocab[i].cn++;
+     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+   }
+   SortVocab();
+   if (debug_mode > 0) {
+     printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
+     printf("Words in train file: %lld\n", train_words);
+   }
+   fclose(fin);
+ }
+
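+ // Second pass over the training text: stream it again and rewrite it to the
+ // output file, joining each adjacent word pair whose bigram score exceeds the
+ // threshold with '_' so that later training treats the pair as a single token.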
+ void TrainModel() {
+   long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
+   char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
+   real score;
+   FILE *fo, *fin;
+   printf("Starting training using file %s\n", train_file);
+   LearnVocabFromTrainFile();
+   fin = fopen(train_file, "rb");
+   fo = fopen(output_file, "wb");
+   word[0] = 0;
+   while (1) {
+     strcpy(last_word, word);
+     ReadWord(word, fin);
+     if (feof(fin)) break;
+     if (!strcmp(word, "</s>")) {
+       fprintf(fo, "\n");
+       continue;
+     }
+     cn++;
+     if ((debug_mode > 1) && (cn % 100000 == 0)) {
+       printf("Words written: %lldK%c", cn / 1000, 13);
+       fflush(stdout);
+     }
+     oov = 0;
+     i = SearchVocab(word);
+     if (i == -1) oov = 1; else pb = vocab[i].cn;
+     if (li == -1) oov = 1;
+     li = i;
+     sprintf(bigram_word, "%s_%s", last_word, word);
+     bigram_word[MAX_STRING - 1] = 0;
+     i = SearchVocab(bigram_word);
+     if (i == -1) oov = 1; else pab = vocab[i].cn;
+     if (pa < min_count) oov = 1;
+     if (pb < min_count) oov = 1;
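+     // Bigram score: (count(ab) - min_count) * train_words / (count(a) * count(b)),
+     // a scaled co-occurrence ratio; pairs scoring above the threshold are joined.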
+     if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
+     if (score > threshold) {
+       fprintf(fo, "_%s", word);
+       pb = 0;
+     } else fprintf(fo, " %s", word);
+     pa = pb;
+   }
+   fclose(fo);
+   fclose(fin);
+ }
+
+ int ArgPos(char *str, int argc, char **argv) {
+   int a;
+   for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
+     if (a == argc - 1) {
+       printf("Argument missing for %s\n", str);
+       exit(1);
+     }
+     return a;
+   }
+   return -1;
+ }
+
+ int main(int argc, char **argv) {
+   int i;
+   if (argc == 1) {
+     printf("WORD2PHRASE tool v0.1a\n\n");
+     printf("Options:\n");
+     printf("Parameters for training:\n");
+     printf("\t-train <file>\n");
+     printf("\t\tUse text data from <file> to train the model\n");
+     printf("\t-output <file>\n");
+     printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
+     printf("\t-min-count <int>\n");
+     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
+     printf("\t-threshold <float>\n");
+     printf("\t\tThe <float> value represents the threshold for forming the phrases (higher means fewer phrases); default 100\n");
+     printf("\t-debug <int>\n");
+     printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
+     printf("\nExamples:\n");
+     printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
+     return 0;
+   }
+   if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
+   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
+   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
+   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+   vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
+   TrainModel();
+   return 0;
+ }
@@ -0,0 +1,702 @@
+ // Copyright 2013 Google Inc. All Rights Reserved.
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <math.h>
+ #include <pthread.h>
+ #include <time.h> // for clock_t, clock() and CLOCKS_PER_SEC used below
+
+ #define MAX_STRING 100
+ #define EXP_TABLE_SIZE 1000
+ #define MAX_EXP 6
+ #define MAX_SENTENCE_LENGTH 1000
+ #define MAX_CODE_LENGTH 40
+
+ const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
+
+ typedef float real; // Precision of float numbers
+
+ struct vocab_word {
+   long long cn;
+   int *point;
+   char *word, *code, codelen;
+ };
+
+ char train_file[MAX_STRING], output_file[MAX_STRING];
+ char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
+ struct vocab_word *vocab;
+ int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
+ int *vocab_hash;
+ long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
+ long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
+ real alpha = 0.025, starting_alpha, sample = 1e-3;
+ real *syn0, *syn1, *syn1neg, *expTable;
+ clock_t start;
+
+ int hs = 0, negative = 5;
+ const int table_size = 1e8;
+ int *table;
+
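+ // Builds the table used to draw negative samples: each word occupies a share of
+ // the table proportional to count^power (0.75), the smoothed unigram distribution.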
+ void InitUnigramTable() {
+   int a, i;
+   double train_words_pow = 0;
+   double d1, power = 0.75;
+   table = (int *)malloc(table_size * sizeof(int));
+   for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
+   i = 0;
+   d1 = pow(vocab[i].cn, power) / train_words_pow;
+   for (a = 0; a < table_size; a++) {
+     table[a] = i;
+     if (a / (double)table_size > d1) {
+       i++;
+       d1 += pow(vocab[i].cn, power) / train_words_pow;
+     }
+     if (i >= vocab_size) i = vocab_size - 1;
+   }
+ }
+
+ // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+ void ReadWord(char *word, FILE *fin) {
+   int a = 0, ch;
+   while (!feof(fin)) {
+     ch = fgetc(fin);
+     if (ch == 13) continue;
+     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+       if (a > 0) {
+         if (ch == '\n') ungetc(ch, fin);
+         break;
+       }
+       if (ch == '\n') {
+         strcpy(word, (char *)"</s>");
+         return;
+       } else continue;
+     }
+     word[a] = ch;
+     a++;
+     if (a >= MAX_STRING - 1) a--; // Truncate words that are too long
+   }
+   word[a] = 0;
+ }
+
+ // Returns hash value of a word
+ int GetWordHash(char *word) {
+   unsigned long long a, hash = 0;
+   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+   hash = hash % vocab_hash_size;
+   return hash;
+ }
+
+ // Returns position of a word in the vocabulary; if the word is not found, returns -1
+ int SearchVocab(char *word) {
+   unsigned int hash = GetWordHash(word);
+   while (1) {
+     if (vocab_hash[hash] == -1) return -1;
+     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+     hash = (hash + 1) % vocab_hash_size;
+   }
+   return -1;
+ }
+
+ // Reads a word and returns its index in the vocabulary
+ int ReadWordIndex(FILE *fin) {
+   char word[MAX_STRING];
+   ReadWord(word, fin);
+   if (feof(fin)) return -1;
+   return SearchVocab(word);
+ }
+
+ // Adds a word to the vocabulary
+ int AddWordToVocab(char *word) {
+   unsigned int hash, length = strlen(word) + 1;
+   if (length > MAX_STRING) length = MAX_STRING;
+   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+   strcpy(vocab[vocab_size].word, word);
+   vocab[vocab_size].cn = 0;
+   vocab_size++;
+   // Reallocate memory if needed
+   if (vocab_size + 2 >= vocab_max_size) {
+     vocab_max_size += 1000;
+     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+   }
+   hash = GetWordHash(word);
+   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+   vocab_hash[hash] = vocab_size - 1;
+   return vocab_size - 1;
+ }
+
+ // Used later for sorting by word counts
+ int VocabCompare(const void *a, const void *b) {
+   return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
+ }
+
+ // Sorts the vocabulary by frequency using word counts
+ void SortVocab() {
+   int a, size;
+   unsigned int hash;
+   // Sort the vocabulary and keep </s> at the first position
+   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   size = vocab_size;
+   train_words = 0;
+   for (a = 0; a < size; a++) {
+     // Words occurring less than min_count times will be discarded from the vocab
+     if ((vocab[a].cn < min_count) && (a != 0)) {
+       vocab_size--;
+       free(vocab[a].word);
+     } else {
+       // Hash will be re-computed, as it is no longer valid after the sorting
+       hash = GetWordHash(vocab[a].word);
+       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+       vocab_hash[hash] = a;
+       train_words += vocab[a].cn;
+     }
+   }
+   vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+   // Allocate memory for the binary tree construction
+   for (a = 0; a < vocab_size; a++) {
+     vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+     vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+   }
+ }
+
+ // Reduces the vocabulary by removing infrequent tokens
+ void ReduceVocab() {
+   int a, b = 0;
+   unsigned int hash;
+   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+     vocab[b].cn = vocab[a].cn;
+     vocab[b].word = vocab[a].word;
+     b++;
+   } else free(vocab[a].word);
+   vocab_size = b;
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   for (a = 0; a < vocab_size; a++) {
+     // Hash will be re-computed, as it is no longer valid
+     hash = GetWordHash(vocab[a].word);
+     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+     vocab_hash[hash] = a;
+   }
+   fflush(stdout);
+   min_reduce++;
+ }
+
+ // Create binary Huffman tree using the word counts
+ // Frequent words will have short unique binary codes
+ void CreateBinaryTree() {
+   long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
+   char code[MAX_CODE_LENGTH];
+   long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+   long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+   long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
+   for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
+   for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
+   pos1 = vocab_size - 1;
+   pos2 = vocab_size;
+   // The following algorithm constructs the Huffman tree by adding one node at a time
+   for (a = 0; a < vocab_size - 1; a++) {
+     // First, find two smallest nodes 'min1, min2'
+     if (pos1 >= 0) {
+       if (count[pos1] < count[pos2]) {
+         min1i = pos1;
+         pos1--;
+       } else {
+         min1i = pos2;
+         pos2++;
+       }
+     } else {
+       min1i = pos2;
+       pos2++;
+     }
+     if (pos1 >= 0) {
+       if (count[pos1] < count[pos2]) {
+         min2i = pos1;
+         pos1--;
+       } else {
+         min2i = pos2;
+         pos2++;
+       }
+     } else {
+       min2i = pos2;
+       pos2++;
+     }
+     count[vocab_size + a] = count[min1i] + count[min2i];
+     parent_node[min1i] = vocab_size + a;
+     parent_node[min2i] = vocab_size + a;
+     binary[min2i] = 1;
+   }
+   // Now assign binary code to each vocabulary word
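+   // Walk from each leaf up to the root, collecting branch bits (code) and
+   // inner-node ids (point), then reverse so they run root -> leaf; the ids are
+   // offset by vocab_size so that they index rows of syn1.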
+   for (a = 0; a < vocab_size; a++) {
+     b = a;
+     i = 0;
+     while (1) {
+       code[i] = binary[b];
+       point[i] = b;
+       i++;
+       b = parent_node[b];
+       if (b == vocab_size * 2 - 2) break;
+     }
+     vocab[a].codelen = i;
+     vocab[a].point[0] = vocab_size - 2;
+     for (b = 0; b < i; b++) {
+       vocab[a].code[i - b - 1] = code[b];
+       vocab[a].point[i - b] = point[b] - vocab_size;
+     }
+   }
+   free(count);
+   free(binary);
+   free(parent_node);
+ }
+
+ void LearnVocabFromTrainFile() {
+   char word[MAX_STRING];
+   FILE *fin;
+   long long a, i;
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   fin = fopen(train_file, "rb");
+   if (fin == NULL) {
+     printf("ERROR: training data file not found!\n");
+     exit(1);
+   }
+   vocab_size = 0;
+   AddWordToVocab((char *)"</s>");
+   while (1) {
+     ReadWord(word, fin);
+     if (feof(fin)) break;
+     train_words++;
+     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+       printf("%lldK%c", train_words / 1000, 13);
+       fflush(stdout);
+     }
+     i = SearchVocab(word);
+     if (i == -1) {
+       a = AddWordToVocab(word);
+       vocab[a].cn = 1;
+     } else vocab[i].cn++;
+     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+   }
+   SortVocab();
+   if (debug_mode > 0) {
+     printf("Vocab size: %lld\n", vocab_size);
+     printf("Words in train file: %lld\n", train_words);
+   }
+   file_size = ftell(fin);
+   fclose(fin);
+ }
+
+ void SaveVocab() {
+   long long i;
+   FILE *fo = fopen(save_vocab_file, "wb");
+   for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
+   fclose(fo);
+ }
+
+ void ReadVocab() {
+   long long a, i = 0;
+   char c;
+   char word[MAX_STRING];
+   FILE *fin = fopen(read_vocab_file, "rb");
+   if (fin == NULL) {
+     printf("Vocabulary file not found\n");
+     exit(1);
+   }
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   vocab_size = 0;
+   while (1) {
+     ReadWord(word, fin);
+     if (feof(fin)) break;
+     a = AddWordToVocab(word);
+     fscanf(fin, "%lld%c", &vocab[a].cn, &c);
+     i++;
+   }
+   SortVocab();
+   if (debug_mode > 0) {
+     printf("Vocab size: %lld\n", vocab_size);
+     printf("Words in train file: %lld\n", train_words);
+   }
+   fin = fopen(train_file, "rb");
+   if (fin == NULL) {
+     printf("ERROR: training data file not found!\n");
+     exit(1);
+   }
+   fseek(fin, 0, SEEK_END);
+   file_size = ftell(fin);
+   fclose(fin);
+ }
+
+ void InitNet() {
+   long long a, b;
+   unsigned long long next_random = 1;
+   a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
+   if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
+   if (hs) {
+     a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
+     if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
+     for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
+      syn1[a * layer1_size + b] = 0;
+   }
+   if (negative > 0) {
+     a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
+     if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
+     for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
+      syn1neg[a * layer1_size + b] = 0;
+   }
+   for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
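+     // Linear congruential generator (same constants as java.util.Random);
+     // each input weight is initialized uniformly in (-0.5, 0.5) / layer1_size.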
+     next_random = next_random * (unsigned long long)25214903917 + 11;
+     syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
+   }
+   CreateBinaryTree();
+ }
+
+ void *TrainModelThread(void *id) {
+   long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
+   long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
+   long long l1, l2, c, target, label, local_iter = iter;
+   unsigned long long next_random = (long long)id;
+   real f, g;
+   clock_t now;
+   real *neu1 = (real *)calloc(layer1_size, sizeof(real));
+   real *neu1e = (real *)calloc(layer1_size, sizeof(real));
+   FILE *fi = fopen(train_file, "rb");
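+   // Each thread trains on its own slice of the training file, seeking to offset
+   // id * (file_size / num_threads); counters and weights are shared without locks.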
+   fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+   while (1) {
+     if (word_count - last_word_count > 10000) {
+       word_count_actual += word_count - last_word_count;
+       last_word_count = word_count;
+       if ((debug_mode > 1)) {
+         now = clock();
+         printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
+          word_count_actual / (real)(iter * train_words + 1) * 100,
+          word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
+         fflush(stdout);
+       }
+       alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
+       if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
+     }
+     if (sentence_length == 0) {
+       while (1) {
+         word = ReadWordIndex(fi);
+         if (feof(fi)) break;
+         if (word == -1) continue;
+         word_count++;
+         if (word == 0) break;
+         // The subsampling randomly discards frequent words while keeping the ranking same
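+         // A word with corpus frequency f = cn / train_words is kept with
+         // probability (sqrt(f / sample) + 1) * sample / f, capped at 1.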
+         if (sample > 0) {
+           real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
+           next_random = next_random * (unsigned long long)25214903917 + 11;
+           if (ran < (next_random & 0xFFFF) / (real)65536) continue;
+         }
+         sen[sentence_length] = word;
+         sentence_length++;
+         if (sentence_length >= MAX_SENTENCE_LENGTH) break;
+       }
+       sentence_position = 0;
+     }
+     if (feof(fi) || (word_count > train_words / num_threads)) {
+       word_count_actual += word_count - last_word_count;
+       local_iter--;
+       if (local_iter == 0) break;
+       word_count = 0;
+       last_word_count = 0;
+       sentence_length = 0;
+       fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+       continue;
+     }
+     word = sen[sentence_position];
+     if (word == -1) continue;
+     for (c = 0; c < layer1_size; c++) neu1[c] = 0;
+     for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
+     next_random = next_random * (unsigned long long)25214903917 + 11;
+     b = next_random % window;
+     if (cbow) { // train the cbow architecture
+       // in -> hidden
+       cw = 0;
+       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+         c = sentence_position - window + a;
+         if (c < 0) continue;
+         if (c >= sentence_length) continue;
+         last_word = sen[c];
+         if (last_word == -1) continue;
+         for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
+         cw++;
+       }
+       if (cw) {
+         for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
+         if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+           f = 0;
+           l2 = vocab[word].point[d] * layer1_size;
+           // Propagate hidden -> output
+           for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
+           if (f <= -MAX_EXP) continue;
+           else if (f >= MAX_EXP) continue;
+           else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+           // 'g' is the gradient multiplied by the learning rate
+           g = (1 - vocab[word].code[d] - f) * alpha;
+           // Propagate errors output -> hidden
+           for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+           // Learn weights hidden -> output
+           for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
+         }
+         // NEGATIVE SAMPLING
+         if (negative > 0) for (d = 0; d < negative + 1; d++) {
+           if (d == 0) {
+             target = word;
+             label = 1;
+           } else {
+             next_random = next_random * (unsigned long long)25214903917 + 11;
+             target = table[(next_random >> 16) % table_size];
+             if (target == 0) target = next_random % (vocab_size - 1) + 1;
+             if (target == word) continue;
+             label = 0;
+           }
+           l2 = target * layer1_size;
+           f = 0;
+           for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
+           if (f > MAX_EXP) g = (label - 1) * alpha;
+           else if (f < -MAX_EXP) g = (label - 0) * alpha;
+           else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+           for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+           for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
+         }
+         // hidden -> in
+         for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+           c = sentence_position - window + a;
+           if (c < 0) continue;
+           if (c >= sentence_length) continue;
+           last_word = sen[c];
+           if (last_word == -1) continue;
+           for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
+         }
+       }
+     } else { // train skip-gram
+       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+         c = sentence_position - window + a;
+         if (c < 0) continue;
+         if (c >= sentence_length) continue;
+         last_word = sen[c];
+         if (last_word == -1) continue;
+         l1 = last_word * layer1_size;
+         for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
+         // HIERARCHICAL SOFTMAX
+         if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+           f = 0;
+           l2 = vocab[word].point[d] * layer1_size;
+           // Propagate hidden -> output
+           for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
+           if (f <= -MAX_EXP) continue;
+           else if (f >= MAX_EXP) continue;
+           else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+           // 'g' is the gradient multiplied by the learning rate
+           g = (1 - vocab[word].code[d] - f) * alpha;
+           // Propagate errors output -> hidden
+           for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+           // Learn weights hidden -> output
+           for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
+         }
+         // NEGATIVE SAMPLING
+         if (negative > 0) for (d = 0; d < negative + 1; d++) {
+           if (d == 0) {
+             target = word;
+             label = 1;
+           } else {
+             next_random = next_random * (unsigned long long)25214903917 + 11;
+             target = table[(next_random >> 16) % table_size];
+             if (target == 0) target = next_random % (vocab_size - 1) + 1;
+             if (target == word) continue;
+             label = 0;
+           }
+           l2 = target * layer1_size;
+           f = 0;
+           for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
+           if (f > MAX_EXP) g = (label - 1) * alpha;
+           else if (f < -MAX_EXP) g = (label - 0) * alpha;
+           else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+           for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+           for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
+         }
+         // Learn weights input -> hidden
+         for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
+       }
+     }
+     sentence_position++;
+     if (sentence_position >= sentence_length) {
+       sentence_length = 0;
+       continue;
+     }
+   }
+   fclose(fi);
+   free(neu1);
+   free(neu1e);
+   pthread_exit(NULL);
+ }
+
+ void TrainModel() {
+   long a, b, c, d;
+   FILE *fo;
+   pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
+   printf("Starting training using file %s\n", train_file);
+   starting_alpha = alpha;
+   if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
+   if (save_vocab_file[0] != 0) SaveVocab();
+   if (output_file[0] == 0) return;
+   InitNet();
+   if (negative > 0) InitUnigramTable();
+   start = clock();
+   for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
+   for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
+   fo = fopen(output_file, "wb");
+   if (classes == 0) {
+     // Save the word vectors
+     fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
+     for (a = 0; a < vocab_size; a++) {
+       fprintf(fo, "%s ", vocab[a].word);
+       if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
+       else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
+       fprintf(fo, "\n");
+     }
+   } else {
+     // Run K-means on the word vectors
+     int clcn = classes, iter = 10, closeid;
+     int *centcn = (int *)malloc(classes * sizeof(int));
+     int *cl = (int *)calloc(vocab_size, sizeof(int));
+     real closev, x;
+     real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
+     for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
+     for (a = 0; a < iter; a++) {
+       for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
+       for (b = 0; b < clcn; b++) centcn[b] = 1;
+       for (c = 0; c < vocab_size; c++) {
+         for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
+         centcn[cl[c]]++;
+       }
+       for (b = 0; b < clcn; b++) {
+         closev = 0;
+         for (c = 0; c < layer1_size; c++) {
+           cent[layer1_size * b + c] /= centcn[b];
+           closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
+         }
+         closev = sqrt(closev);
+         for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
+       }
+       for (c = 0; c < vocab_size; c++) {
+         closev = -10;
+         closeid = 0;
+         for (d = 0; d < clcn; d++) {
+           x = 0;
+           for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
+           if (x > closev) {
+             closev = x;
+             closeid = d;
+           }
+         }
+         cl[c] = closeid;
+       }
+     }
+     // Save the K-means classes
+     for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
+     free(centcn);
+     free(cent);
+     free(cl);
+   }
+   fclose(fo);
+ }
+
+ int ArgPos(char *str, int argc, char **argv) {
+   int a;
+   for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
+     if (a == argc - 1) {
+       printf("Argument missing for %s\n", str);
+       exit(1);
+     }
+     return a;
+   }
+   return -1;
+ }
+
+ int main(int argc, char **argv) {
+   int i;
+   if (argc == 1) {
+     printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
+     printf("Options:\n");
+     printf("Parameters for training:\n");
+     printf("\t-train <file>\n");
+     printf("\t\tUse text data from <file> to train the model\n");
+     printf("\t-output <file>\n");
+     printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
+     printf("\t-size <int>\n");
+     printf("\t\tSet size of word vectors; default is 100\n");
+     printf("\t-window <int>\n");
+     printf("\t\tSet max skip length between words; default is 5\n");
+     printf("\t-sample <float>\n");
+     printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
+     printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
+     printf("\t-hs <int>\n");
+     printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
+     printf("\t-negative <int>\n");
+     printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
+     printf("\t-threads <int>\n");
+     printf("\t\tUse <int> threads (default 12)\n");
+     printf("\t-iter <int>\n");
+     printf("\t\tRun more training iterations (default 5)\n");
+     printf("\t-min-count <int>\n");
+     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
+     printf("\t-alpha <float>\n");
+     printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
+     printf("\t-classes <int>\n");
+     printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
+     printf("\t-debug <int>\n");
+     printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
+     printf("\t-binary <int>\n");
+     printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
+     printf("\t-save-vocab <file>\n");
+     printf("\t\tThe vocabulary will be saved to <file>\n");
+     printf("\t-read-vocab <file>\n");
+     printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
+     printf("\t-cbow <int>\n");
+     printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
+     printf("\nExamples:\n");
+     printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
+     return 0;
+   }
+   output_file[0] = 0;
+   save_vocab_file[0] = 0;
+   read_vocab_file[0] = 0;
+   if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
+   if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
+   if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
+   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
+   if (cbow) alpha = 0.05;
+   if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
+   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
+   if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
+   if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
+   if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
+   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+   vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
+   expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
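+   // expTable[i] caches sigmoid(x) = exp(x) / (exp(x) + 1) for x spanning
+   // [-MAX_EXP, MAX_EXP]; the training loops look values up instead of calling exp().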
+   for (i = 0; i < EXP_TABLE_SIZE; i++) {
+     expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
+     expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
+   }
+   TrainModel();
+   return 0;
+ }