word2vec-rb 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/.vscode/c_cpp_properties.json +20 -0
- data/.vscode/settings.json +6 -0
- data/Gemfile.lock +1 -1
- data/README.md +17 -0
- data/ext/word2vec/common.c +198 -1
- data/ext/word2vec/common.h +31 -2
- data/ext/word2vec/tokenizer.c +19 -0
- data/ext/word2vec/vocab.c +0 -172
- data/ext/word2vec/word2vec.c +17 -0
- data/lib/word2vec/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 192304d569b2fb573300a33ff7ed05fc37202c30cf55144fc043cc544e620f23
|
4
|
+
data.tar.gz: e5473521b82560242b19ae2efa9c4e07a7798bb3c0ea2e97d97e5f3a1f2601cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db2cc2396b4a7193a0cb4efd40ffefbefabd88861f257aea8ac90d032d9ebac8f0759cd952678174f764cdce2605a94c521e24d9433a8a1eaaf135e5dcc1cf89
|
7
|
+
data.tar.gz: 26949750236e3929f9794967570c3fb257c53474c9d79b857f12d4214ecca075afe4a32dbb5a6e5253b44ea1566a391aff70c9617a2914ee700779e3dd8a4af9
|
data/.gitignore
CHANGED
@@ -0,0 +1,20 @@
|
|
1
|
+
{
|
2
|
+
"configurations": [
|
3
|
+
{
|
4
|
+
"name": "Mac",
|
5
|
+
"includePath": [
|
6
|
+
"${workspaceFolder}/**",
|
7
|
+
"/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/include/ruby-2.6.0/**"
|
8
|
+
],
|
9
|
+
"defines": [],
|
10
|
+
"macFrameworkPath": [
|
11
|
+
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
|
12
|
+
],
|
13
|
+
"compilerPath": "/usr/bin/clang",
|
14
|
+
"cStandard": "c17",
|
15
|
+
"cppStandard": "c++98",
|
16
|
+
"intelliSenseMode": "macos-clang-x64"
|
17
|
+
}
|
18
|
+
],
|
19
|
+
"version": 4
|
20
|
+
}
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -78,6 +78,23 @@ Word2vec::Model.build\_vocab("./data/text7", "./data/vocab.txt")
|
|
78
78
|
|
79
79
|
The output file lists each word together with its number of occurrences, one entry per line.
|
80
80
|
|
81
|
+
### Tokenizer: create a binary file by tokenizing an input file
|
82
|
+
|
83
|
+
This method requires a previously created vocabulary file.
|
84
|
+
|
85
|
+
```ruby
|
86
|
+
require 'word2vec'
|
87
|
+
|
88
|
+
Word2vec::Model.tokenize("./data/text7", "./data/vocab.txt", "./data/tokenized.bin")
|
89
|
+
```
|
90
|
+
|
91
|
+
The output file will contain a sequence of binary identifiers, one for each word of the input file.
|
92
|
+
|
93
|
+
Read output file with:
|
94
|
+
|
95
|
+
long long id;
|
96
|
+
fread(&id, sizeof(id), 1, fi);
|
97
|
+
|
81
98
|
## Development
|
82
99
|
|
83
100
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/ext/word2vec/common.c
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#include "common.h"
|
2
2
|
|
3
|
+
char train_file[MAX_STRING];
|
4
|
+
char read_vocab_file[MAX_STRING];
|
3
5
|
// max length of strings
|
4
6
|
const unsigned long max_size = 2000;
|
5
7
|
// number of closest words that will be shown
|
@@ -44,4 +46,199 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
|
|
44
46
|
for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
|
45
47
|
}
|
46
48
|
fclose(f);
|
47
|
-
}
|
49
|
+
}
|
50
|
+
|
51
|
+
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
|
52
|
+
void ReadWord(char *word, FILE *fin) {
|
53
|
+
int a = 0, ch;
|
54
|
+
while (!feof(fin)) {
|
55
|
+
ch = fgetc(fin);
|
56
|
+
if (ch == 13) continue;
|
57
|
+
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
|
58
|
+
if (a > 0) {
|
59
|
+
if (ch == '\n') ungetc(ch, fin);
|
60
|
+
break;
|
61
|
+
}
|
62
|
+
if (ch == '\n') {
|
63
|
+
strcpy(word, (char *)"</s>");
|
64
|
+
return;
|
65
|
+
} else continue;
|
66
|
+
}
|
67
|
+
word[a] = ch;
|
68
|
+
a++;
|
69
|
+
if (a >= MAX_STRING - 1) a--; // Truncate too long words
|
70
|
+
}
|
71
|
+
word[a] = 0;
|
72
|
+
}
|
73
|
+
|
74
|
+
// Returns hash value of a word
|
75
|
+
unsigned long long GetWordHash(char *word) {
|
76
|
+
unsigned long long a, hash = 0;
|
77
|
+
for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
|
78
|
+
hash = hash % vocab_hash_size;
|
79
|
+
return hash;
|
80
|
+
}
|
81
|
+
|
82
|
+
// Returns position of a word in the vocabulary; if the word is not found, returns -1
|
83
|
+
unsigned long long SearchVocab(char *word) {
|
84
|
+
unsigned long long hash = GetWordHash(word);
|
85
|
+
while (1) {
|
86
|
+
if (vocab_hash[hash] == -1) return -1;
|
87
|
+
if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
|
88
|
+
hash = (hash + 1) % vocab_hash_size;
|
89
|
+
}
|
90
|
+
return -1;
|
91
|
+
}
|
92
|
+
|
93
|
+
// Reads a word and returns its index in the vocabulary
|
94
|
+
unsigned long long ReadWordIndex(FILE *fin) {
|
95
|
+
char word[MAX_STRING];
|
96
|
+
ReadWord(word, fin);
|
97
|
+
if (feof(fin)) return -1;
|
98
|
+
return SearchVocab(word);
|
99
|
+
}
|
100
|
+
|
101
|
+
// Adds a word to the vocabulary
|
102
|
+
unsigned long long AddWordToVocab(char *word) {
|
103
|
+
unsigned long long hash, length = strlen(word) + 1;
|
104
|
+
if (length > MAX_STRING) length = MAX_STRING;
|
105
|
+
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
|
106
|
+
strcpy(vocab[vocab_size].word, word);
|
107
|
+
vocab[vocab_size].cn = 0;
|
108
|
+
vocab_size++;
|
109
|
+
// Reallocate memory if needed
|
110
|
+
if (vocab_size + 2 >= vocab_max_size) {
|
111
|
+
vocab_max_size += 1000;
|
112
|
+
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
|
113
|
+
}
|
114
|
+
hash = GetWordHash(word);
|
115
|
+
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
116
|
+
vocab_hash[hash] = vocab_size - 1;
|
117
|
+
return vocab_size - 1;
|
118
|
+
}
|
119
|
+
|
120
|
+
// Reduces the vocabulary by removing infrequent tokens
|
121
|
+
void ReduceVocab() {
|
122
|
+
int a, b = 0;
|
123
|
+
unsigned long long hash;
|
124
|
+
for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
|
125
|
+
vocab[b].cn = vocab[a].cn;
|
126
|
+
vocab[b].word = vocab[a].word;
|
127
|
+
b++;
|
128
|
+
} else free(vocab[a].word);
|
129
|
+
vocab_size = b;
|
130
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
131
|
+
for (a = 0; a < vocab_size; a++) {
|
132
|
+
// Hash will be re-computed, as it is not actual
|
133
|
+
hash = GetWordHash(vocab[a].word);
|
134
|
+
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
135
|
+
vocab_hash[hash] = a;
|
136
|
+
}
|
137
|
+
fflush(stdout);
|
138
|
+
min_reduce++;
|
139
|
+
}
|
140
|
+
|
141
|
+
// Used later for sorting by word counts
|
142
|
+
int VocabCompare(const void *a, const void *b) {
|
143
|
+
return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
|
144
|
+
}
|
145
|
+
|
146
|
+
// Sorts the vocabulary by frequency using word counts
|
147
|
+
void SortVocab() {
|
148
|
+
long long a, size;
|
149
|
+
unsigned long long hash;
|
150
|
+
// Sort the vocabulary and keep </s> at the first position
|
151
|
+
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
|
152
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
153
|
+
size = vocab_size;
|
154
|
+
train_words = 0;
|
155
|
+
for (a = 0; a < size; a++) {
|
156
|
+
// Words occuring less than min_count times will be discarded from the vocab
|
157
|
+
if ((vocab[a].cn < min_count) && (a != 0)) {
|
158
|
+
vocab_size--;
|
159
|
+
free(vocab[a].word);
|
160
|
+
} else {
|
161
|
+
// Hash will be re-computed, as after the sorting it is not actual
|
162
|
+
hash=GetWordHash(vocab[a].word);
|
163
|
+
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
164
|
+
vocab_hash[hash] = a;
|
165
|
+
train_words += vocab[a].cn;
|
166
|
+
}
|
167
|
+
}
|
168
|
+
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
|
169
|
+
// Allocate memory for the binary tree construction
|
170
|
+
for (a = 0; a < vocab_size; a++) {
|
171
|
+
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
|
172
|
+
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
// Learn vocabulary data from file
|
177
|
+
void LearnVocabFromTrainFile() {
|
178
|
+
char word[MAX_STRING];
|
179
|
+
FILE *fin;
|
180
|
+
long long a, i;
|
181
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
182
|
+
fin = fopen(train_file, "rb");
|
183
|
+
if (fin == NULL) {
|
184
|
+
printf("ERROR: training data file not found!\n");
|
185
|
+
exit(1);
|
186
|
+
}
|
187
|
+
vocab_size = 0;
|
188
|
+
AddWordToVocab((char *)"</s>");
|
189
|
+
while (1) {
|
190
|
+
ReadWord(word, fin);
|
191
|
+
if (feof(fin)) break;
|
192
|
+
train_words++;
|
193
|
+
if ((debug_mode > 1) && (train_words % 100000 == 0)) {
|
194
|
+
printf("%lldK%c", train_words / 1000, 13);
|
195
|
+
fflush(stdout);
|
196
|
+
}
|
197
|
+
i = SearchVocab(word);
|
198
|
+
if (i == -1) {
|
199
|
+
a = AddWordToVocab(word);
|
200
|
+
vocab[a].cn = 1;
|
201
|
+
} else vocab[i].cn++;
|
202
|
+
if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
|
203
|
+
}
|
204
|
+
SortVocab();
|
205
|
+
if (debug_mode > 0) {
|
206
|
+
printf("Vocab size: %lld\n", vocab_size);
|
207
|
+
printf("Words in train file: %lld\n", train_words);
|
208
|
+
}
|
209
|
+
file_size = ftell(fin);
|
210
|
+
fclose(fin);
|
211
|
+
}
|
212
|
+
|
213
|
+
void ReadVocab() {
|
214
|
+
long long a, i = 0;
|
215
|
+
char c;
|
216
|
+
char word[MAX_STRING];
|
217
|
+
FILE *fin = fopen(read_vocab_file, "rb");
|
218
|
+
if (fin == NULL) {
|
219
|
+
printf("Vocabulary file not found\n");
|
220
|
+
exit(1);
|
221
|
+
}
|
222
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
223
|
+
vocab_size = 0;
|
224
|
+
while (1) {
|
225
|
+
ReadWord(word, fin);
|
226
|
+
if (feof(fin)) break;
|
227
|
+
a = AddWordToVocab(word);
|
228
|
+
fscanf(fin, "%lld%c", &vocab[a].cn, &c);
|
229
|
+
i++;
|
230
|
+
}
|
231
|
+
SortVocab();
|
232
|
+
if (debug_mode > 0) {
|
233
|
+
printf("Vocab size: %lld\n", vocab_size);
|
234
|
+
printf("Words in train file: %lld\n", train_words);
|
235
|
+
}
|
236
|
+
fin = fopen(train_file, "rb");
|
237
|
+
if (fin == NULL) {
|
238
|
+
printf("ERROR: training data file not found!\n");
|
239
|
+
exit(1);
|
240
|
+
}
|
241
|
+
fseek(fin, 0, SEEK_END);
|
242
|
+
file_size = ftell(fin);
|
243
|
+
fclose(fin);
|
244
|
+
}
|
data/ext/word2vec/common.h
CHANGED
@@ -10,13 +10,26 @@
|
|
10
10
|
#include <sys/types.h>
|
11
11
|
#include <ctype.h>
|
12
12
|
|
13
|
+
#define MAX_STRING 100
|
14
|
+
#define MAX_CODE_LENGTH 40
|
15
|
+
|
13
16
|
// max length of strings
|
14
17
|
extern const unsigned long max_size;
|
15
18
|
// number of closest words that will be shown
|
16
19
|
extern const long long N;
|
17
20
|
// max length of vocabulary entries
|
18
21
|
extern const long long max_w;
|
19
|
-
|
22
|
+
// Maximum 30 * 0.7 = 21M words in the vocabulary
|
23
|
+
extern const int vocab_hash_size;
|
24
|
+
extern long long *vocab_hash;
|
25
|
+
extern struct vocab_word *vocab;
|
26
|
+
|
27
|
+
extern char train_file[];
|
28
|
+
extern char read_vocab_file[];
|
29
|
+
extern long long vocab_max_size, vocab_size;
|
30
|
+
extern long long train_words, file_size ;
|
31
|
+
extern int debug_mode, min_count, min_reduce;
|
32
|
+
|
20
33
|
typedef struct word2vec_model_s {
|
21
34
|
long long word_count;
|
22
35
|
char *vocabulary; // char *[word_count]
|
@@ -29,10 +42,26 @@ typedef struct WordSimilarity_s {
|
|
29
42
|
float score;
|
30
43
|
} WordSimilarity;
|
31
44
|
|
45
|
+
// One vocabulary entry.
struct vocab_word {
  long long cn;  // number of occurrences of the word
  int *point;    // buffer of MAX_CODE_LENGTH ints, allocated by SortVocab
  char *word;    // heap-allocated, NUL-terminated word text
  char *code;    // buffer of MAX_CODE_LENGTH chars, allocated by SortVocab
  char codelen;  // presumably the used length of code; not set in common.c
};
|
50
|
+
|
51
|
+
void ReadWord(char *word, FILE *fin);
|
52
|
+
unsigned long long GetWordHash(char *word);
|
53
|
+
unsigned long long SearchVocab(char *word) ;
|
54
|
+
unsigned long long ReadWordIndex(FILE *fin);
|
55
|
+
|
56
|
+
void LearnVocabFromTrainFile();
|
57
|
+
void ReadVocab();
|
58
|
+
|
59
|
+
/// main functions
|
32
60
|
void word2vec_model_load(word2vec_model* model, char* file_name);
|
33
61
|
size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
|
34
62
|
size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
|
35
63
|
void word2vec_model_accuracy(word2vec_model* model, char* file_name);
|
36
|
-
void word2vec_build_vocab(char*
|
64
|
+
void word2vec_build_vocab(char* train_file, char* save_vocab_file);
|
65
|
+
void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file);
|
37
66
|
|
38
67
|
#endif /* _WORD2VEC_COMMON_H */
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file) {
|
4
|
+
long long word;
|
5
|
+
vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
|
6
|
+
vocab_hash = (long long *)calloc(vocab_hash_size, sizeof(long long));
|
7
|
+
strcpy(train_file, train_file);
|
8
|
+
strcpy(read_vocab_file, vocab_file);
|
9
|
+
ReadVocab();
|
10
|
+
FILE *fi = fopen(train_file, "rb");
|
11
|
+
FILE *fo = fopen(output_file, "wb");
|
12
|
+
while (1) {
|
13
|
+
word = ReadWordIndex(fi);
|
14
|
+
if (feof(fi)) break;
|
15
|
+
fwrite(&word, sizeof(word), 1, fo);
|
16
|
+
}
|
17
|
+
fclose(fi);
|
18
|
+
fclose(fo);
|
19
|
+
}
|
data/ext/word2vec/vocab.c
CHANGED
@@ -1,185 +1,13 @@
|
|
1
1
|
#include "common.h"
|
2
2
|
|
3
|
-
#define MAX_STRING 100
|
4
|
-
#define MAX_CODE_LENGTH 40
|
5
|
-
|
6
3
|
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
|
7
4
|
|
8
|
-
struct vocab_word {
|
9
|
-
long long cn;
|
10
|
-
int *point;
|
11
|
-
char *word, *code, codelen;
|
12
|
-
};
|
13
|
-
|
14
|
-
char train_file[MAX_STRING];
|
15
5
|
struct vocab_word *vocab;
|
16
6
|
int debug_mode = 2, min_count = 5, min_reduce = 1;
|
17
7
|
long long vocab_max_size = 1000, vocab_size = 0;
|
18
8
|
long long *vocab_hash;
|
19
9
|
long long train_words = 0, file_size = 0;
|
20
10
|
|
21
|
-
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
|
22
|
-
void ReadWord(char *word, FILE *fin) {
|
23
|
-
int a = 0, ch;
|
24
|
-
while (!feof(fin)) {
|
25
|
-
ch = fgetc(fin);
|
26
|
-
if (ch == 13) continue;
|
27
|
-
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
|
28
|
-
if (a > 0) {
|
29
|
-
if (ch == '\n') ungetc(ch, fin);
|
30
|
-
break;
|
31
|
-
}
|
32
|
-
if (ch == '\n') {
|
33
|
-
strcpy(word, (char *)"</s>");
|
34
|
-
return;
|
35
|
-
} else continue;
|
36
|
-
}
|
37
|
-
word[a] = ch;
|
38
|
-
a++;
|
39
|
-
if (a >= MAX_STRING - 1) a--; // Truncate too long words
|
40
|
-
}
|
41
|
-
word[a] = 0;
|
42
|
-
}
|
43
|
-
|
44
|
-
// Returns hash value of a word
|
45
|
-
unsigned long long GetWordHash(char *word) {
|
46
|
-
unsigned long long a, hash = 0;
|
47
|
-
for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
|
48
|
-
hash = hash % vocab_hash_size;
|
49
|
-
return hash;
|
50
|
-
}
|
51
|
-
|
52
|
-
// Returns position of a word in the vocabulary; if the word is not found, returns -1
|
53
|
-
unsigned long long SearchVocab(char *word) {
|
54
|
-
unsigned long long hash = GetWordHash(word);
|
55
|
-
while (1) {
|
56
|
-
if (vocab_hash[hash] == -1) return -1;
|
57
|
-
if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
|
58
|
-
hash = (hash + 1) % vocab_hash_size;
|
59
|
-
}
|
60
|
-
return -1;
|
61
|
-
}
|
62
|
-
|
63
|
-
// Reads a word and returns its index in the vocabulary
|
64
|
-
unsigned long long ReadWordIndex(FILE *fin) {
|
65
|
-
char word[MAX_STRING];
|
66
|
-
ReadWord(word, fin);
|
67
|
-
if (feof(fin)) return -1;
|
68
|
-
return SearchVocab(word);
|
69
|
-
}
|
70
|
-
|
71
|
-
// Adds a word to the vocabulary
|
72
|
-
unsigned long long AddWordToVocab(char *word) {
|
73
|
-
unsigned long long hash, length = strlen(word) + 1;
|
74
|
-
if (length > MAX_STRING) length = MAX_STRING;
|
75
|
-
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
|
76
|
-
strcpy(vocab[vocab_size].word, word);
|
77
|
-
vocab[vocab_size].cn = 0;
|
78
|
-
vocab_size++;
|
79
|
-
// Reallocate memory if needed
|
80
|
-
if (vocab_size + 2 >= vocab_max_size) {
|
81
|
-
vocab_max_size += 1000;
|
82
|
-
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
|
83
|
-
}
|
84
|
-
hash = GetWordHash(word);
|
85
|
-
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
86
|
-
vocab_hash[hash] = vocab_size - 1;
|
87
|
-
return vocab_size - 1;
|
88
|
-
}
|
89
|
-
|
90
|
-
// Used later for sorting by word counts
|
91
|
-
int VocabCompare(const void *a, const void *b) {
|
92
|
-
return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
|
93
|
-
}
|
94
|
-
|
95
|
-
// Sorts the vocabulary by frequency using word counts
|
96
|
-
void SortVocab() {
|
97
|
-
long long a, size;
|
98
|
-
unsigned long long hash;
|
99
|
-
// Sort the vocabulary and keep </s> at the first position
|
100
|
-
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
|
101
|
-
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
102
|
-
size = vocab_size;
|
103
|
-
train_words = 0;
|
104
|
-
for (a = 0; a < size; a++) {
|
105
|
-
// Words occuring less than min_count times will be discarded from the vocab
|
106
|
-
if ((vocab[a].cn < min_count) && (a != 0)) {
|
107
|
-
vocab_size--;
|
108
|
-
free(vocab[a].word);
|
109
|
-
} else {
|
110
|
-
// Hash will be re-computed, as after the sorting it is not actual
|
111
|
-
hash=GetWordHash(vocab[a].word);
|
112
|
-
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
113
|
-
vocab_hash[hash] = a;
|
114
|
-
train_words += vocab[a].cn;
|
115
|
-
}
|
116
|
-
}
|
117
|
-
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
|
118
|
-
// Allocate memory for the binary tree construction
|
119
|
-
for (a = 0; a < vocab_size; a++) {
|
120
|
-
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
|
121
|
-
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
|
122
|
-
}
|
123
|
-
}
|
124
|
-
|
125
|
-
// Reduces the vocabulary by removing infrequent tokens
|
126
|
-
void ReduceVocab() {
|
127
|
-
int a, b = 0;
|
128
|
-
unsigned long long hash;
|
129
|
-
for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
|
130
|
-
vocab[b].cn = vocab[a].cn;
|
131
|
-
vocab[b].word = vocab[a].word;
|
132
|
-
b++;
|
133
|
-
} else free(vocab[a].word);
|
134
|
-
vocab_size = b;
|
135
|
-
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
136
|
-
for (a = 0; a < vocab_size; a++) {
|
137
|
-
// Hash will be re-computed, as it is not actual
|
138
|
-
hash = GetWordHash(vocab[a].word);
|
139
|
-
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
140
|
-
vocab_hash[hash] = a;
|
141
|
-
}
|
142
|
-
fflush(stdout);
|
143
|
-
min_reduce++;
|
144
|
-
}
|
145
|
-
|
146
|
-
// Learn vocabulary data from file
|
147
|
-
void LearnVocabFromTrainFile() {
|
148
|
-
char word[MAX_STRING];
|
149
|
-
FILE *fin;
|
150
|
-
long long a, i;
|
151
|
-
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
152
|
-
fin = fopen(train_file, "rb");
|
153
|
-
if (fin == NULL) {
|
154
|
-
printf("ERROR: training data file not found!\n");
|
155
|
-
exit(1);
|
156
|
-
}
|
157
|
-
vocab_size = 0;
|
158
|
-
AddWordToVocab((char *)"</s>");
|
159
|
-
while (1) {
|
160
|
-
ReadWord(word, fin);
|
161
|
-
if (feof(fin)) break;
|
162
|
-
train_words++;
|
163
|
-
if ((debug_mode > 1) && (train_words % 100000 == 0)) {
|
164
|
-
printf("%lldK%c", train_words / 1000, 13);
|
165
|
-
fflush(stdout);
|
166
|
-
}
|
167
|
-
i = SearchVocab(word);
|
168
|
-
if (i == -1) {
|
169
|
-
a = AddWordToVocab(word);
|
170
|
-
vocab[a].cn = 1;
|
171
|
-
} else vocab[i].cn++;
|
172
|
-
if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
|
173
|
-
}
|
174
|
-
SortVocab();
|
175
|
-
if (debug_mode > 0) {
|
176
|
-
printf("Vocab size: %lld\n", vocab_size);
|
177
|
-
printf("Words in train file: %lld\n", train_words);
|
178
|
-
}
|
179
|
-
file_size = ftell(fin);
|
180
|
-
fclose(fin);
|
181
|
-
}
|
182
|
-
|
183
11
|
void SaveVocab(char* save_vocab_file) {
|
184
12
|
long long i;
|
185
13
|
FILE *fo = fopen(save_vocab_file, "wb");
|
data/ext/word2vec/word2vec.c
CHANGED
@@ -141,6 +141,22 @@ static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_fil
|
|
141
141
|
return Qtrue;
|
142
142
|
}
|
143
143
|
|
144
|
+
/*
|
145
|
+
* tokenize a file
|
146
|
+
* @param [String] rb_train_file_name
|
147
|
+
* @param [String] rb_vocab_file_name
|
148
|
+
* @param [String] rb_output_file_name
|
149
|
+
*/
|
150
|
+
static VALUE tokenize(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name, VALUE rb_output_file_name) {
|
151
|
+
char* train_filename = StringValueCStr(rb_train_file_name);
|
152
|
+
char* vocab_filename = StringValueCStr(rb_vocab_file_name);
|
153
|
+
char* output_filename = StringValueCStr(rb_output_file_name);
|
154
|
+
|
155
|
+
word2vec_tokenize(train_filename, vocab_filename, output_filename);
|
156
|
+
|
157
|
+
return Qtrue;
|
158
|
+
}
|
159
|
+
|
144
160
|
void Init_word2vec(void) {
|
145
161
|
VALUE mWord2vec = rb_define_module("Word2vec");
|
146
162
|
VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
|
@@ -151,4 +167,5 @@ void Init_word2vec(void) {
|
|
151
167
|
rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
|
152
168
|
rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
|
153
169
|
rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
|
170
|
+
rb_define_singleton_method(mWord2vecModel, "tokenize", tokenize, 3);
|
154
171
|
}
|
data/lib/word2vec/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word2vec-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dani Vela
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-06-
|
11
|
+
date: 2021-06-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,8 @@ files:
|
|
80
80
|
- ".gitignore"
|
81
81
|
- ".rspec"
|
82
82
|
- ".travis.yml"
|
83
|
+
- ".vscode/c_cpp_properties.json"
|
84
|
+
- ".vscode/settings.json"
|
83
85
|
- CHANGELOG
|
84
86
|
- Gemfile
|
85
87
|
- Gemfile.lock
|
@@ -96,6 +98,7 @@ files:
|
|
96
98
|
- ext/word2vec/common.h
|
97
99
|
- ext/word2vec/distance.c
|
98
100
|
- ext/word2vec/extconf.rb
|
101
|
+
- ext/word2vec/tokenizer.c
|
99
102
|
- ext/word2vec/vocab.c
|
100
103
|
- ext/word2vec/word2vec.c
|
101
104
|
- lib/word2vec.rb
|