word2vec-rb 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/.vscode/c_cpp_properties.json +20 -0
- data/.vscode/settings.json +6 -0
- data/Gemfile.lock +1 -1
- data/README.md +17 -0
- data/ext/word2vec/common.c +198 -1
- data/ext/word2vec/common.h +31 -2
- data/ext/word2vec/tokenizer.c +19 -0
- data/ext/word2vec/vocab.c +0 -172
- data/ext/word2vec/word2vec.c +17 -0
- data/lib/word2vec/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 192304d569b2fb573300a33ff7ed05fc37202c30cf55144fc043cc544e620f23
|
4
|
+
data.tar.gz: e5473521b82560242b19ae2efa9c4e07a7798bb3c0ea2e97d97e5f3a1f2601cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db2cc2396b4a7193a0cb4efd40ffefbefabd88861f257aea8ac90d032d9ebac8f0759cd952678174f764cdce2605a94c521e24d9433a8a1eaaf135e5dcc1cf89
|
7
|
+
data.tar.gz: 26949750236e3929f9794967570c3fb257c53474c9d79b857f12d4214ecca075afe4a32dbb5a6e5253b44ea1566a391aff70c9617a2914ee700779e3dd8a4af9
|
data/.gitignore
CHANGED
@@ -0,0 +1,20 @@
|
|
1
|
+
{
|
2
|
+
"configurations": [
|
3
|
+
{
|
4
|
+
"name": "Mac",
|
5
|
+
"includePath": [
|
6
|
+
"${workspaceFolder}/**",
|
7
|
+
"/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/include/ruby-2.6.0/**"
|
8
|
+
],
|
9
|
+
"defines": [],
|
10
|
+
"macFrameworkPath": [
|
11
|
+
"/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
|
12
|
+
],
|
13
|
+
"compilerPath": "/usr/bin/clang",
|
14
|
+
"cStandard": "c17",
|
15
|
+
"cppStandard": "c++98",
|
16
|
+
"intelliSenseMode": "macos-clang-x64"
|
17
|
+
}
|
18
|
+
],
|
19
|
+
"version": 4
|
20
|
+
}
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -78,6 +78,23 @@ Word2vec::Model.build\_vocab("./data/text7", "./data/vocab.txt")
|
|
78
78
|
|
79
79
|
The output file will contain a list of words with their number of appearances, separated by line breaks.
|
80
80
|
|
81
|
+
### Tokenizer: create a binary file by tokenizing an input file
|
82
|
+
|
83
|
+
This method requires a previously created vocabulary file.
|
84
|
+
|
85
|
+
```ruby
|
86
|
+
require 'word2vec'
|
87
|
+
|
88
|
+
Word2vec::Model.tokenize("./data/text7", "./data/vocab.txt", "./data/tokenized.bin")
|
89
|
+
```
|
90
|
+
|
91
|
+
The output file will contain a sequence of binary identifiers (vocabulary indices), one for each word of the input file.
|
92
|
+
|
93
|
+
Read the output file with the following C code:
|
94
|
+
|
95
|
+
long long id;
|
96
|
+
fread(&id, sizeof(id), 1, fi);
|
97
|
+
|
81
98
|
## Development
|
82
99
|
|
83
100
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/ext/word2vec/common.c
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#include "common.h"
|
2
2
|
|
3
|
+
char train_file[MAX_STRING];
|
4
|
+
char read_vocab_file[MAX_STRING];
|
3
5
|
// max length of strings
|
4
6
|
const unsigned long max_size = 2000;
|
5
7
|
// number of closest words that will be shown
|
@@ -44,4 +46,199 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
|
|
44
46
|
for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
|
45
47
|
}
|
46
48
|
fclose(f);
|
47
|
-
}
|
49
|
+
}
|
50
|
+
|
51
|
+
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
|
52
|
+
void ReadWord(char *word, FILE *fin) {
|
53
|
+
int a = 0, ch;
|
54
|
+
while (!feof(fin)) {
|
55
|
+
ch = fgetc(fin);
|
56
|
+
if (ch == 13) continue;
|
57
|
+
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
|
58
|
+
if (a > 0) {
|
59
|
+
if (ch == '\n') ungetc(ch, fin);
|
60
|
+
break;
|
61
|
+
}
|
62
|
+
if (ch == '\n') {
|
63
|
+
strcpy(word, (char *)"</s>");
|
64
|
+
return;
|
65
|
+
} else continue;
|
66
|
+
}
|
67
|
+
word[a] = ch;
|
68
|
+
a++;
|
69
|
+
if (a >= MAX_STRING - 1) a--; // Truncate too long words
|
70
|
+
}
|
71
|
+
word[a] = 0;
|
72
|
+
}
|
73
|
+
|
74
|
+
// Returns hash value of a word
|
75
|
+
unsigned long long GetWordHash(char *word) {
|
76
|
+
unsigned long long a, hash = 0;
|
77
|
+
for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
|
78
|
+
hash = hash % vocab_hash_size;
|
79
|
+
return hash;
|
80
|
+
}
|
81
|
+
|
82
|
+
// Returns position of a word in the vocabulary; if the word is not found, returns -1
|
83
|
+
unsigned long long SearchVocab(char *word) {
|
84
|
+
unsigned long long hash = GetWordHash(word);
|
85
|
+
while (1) {
|
86
|
+
if (vocab_hash[hash] == -1) return -1;
|
87
|
+
if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
|
88
|
+
hash = (hash + 1) % vocab_hash_size;
|
89
|
+
}
|
90
|
+
return -1;
|
91
|
+
}
|
92
|
+
|
93
|
+
// Reads a word and returns its index in the vocabulary
|
94
|
+
unsigned long long ReadWordIndex(FILE *fin) {
|
95
|
+
char word[MAX_STRING];
|
96
|
+
ReadWord(word, fin);
|
97
|
+
if (feof(fin)) return -1;
|
98
|
+
return SearchVocab(word);
|
99
|
+
}
|
100
|
+
|
101
|
+
// Adds a word to the vocabulary
|
102
|
+
unsigned long long AddWordToVocab(char *word) {
|
103
|
+
unsigned long long hash, length = strlen(word) + 1;
|
104
|
+
if (length > MAX_STRING) length = MAX_STRING;
|
105
|
+
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
|
106
|
+
strcpy(vocab[vocab_size].word, word);
|
107
|
+
vocab[vocab_size].cn = 0;
|
108
|
+
vocab_size++;
|
109
|
+
// Reallocate memory if needed
|
110
|
+
if (vocab_size + 2 >= vocab_max_size) {
|
111
|
+
vocab_max_size += 1000;
|
112
|
+
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
|
113
|
+
}
|
114
|
+
hash = GetWordHash(word);
|
115
|
+
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
116
|
+
vocab_hash[hash] = vocab_size - 1;
|
117
|
+
return vocab_size - 1;
|
118
|
+
}
|
119
|
+
|
120
|
+
// Reduces the vocabulary by removing infrequent tokens
|
121
|
+
void ReduceVocab() {
|
122
|
+
int a, b = 0;
|
123
|
+
unsigned long long hash;
|
124
|
+
for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
|
125
|
+
vocab[b].cn = vocab[a].cn;
|
126
|
+
vocab[b].word = vocab[a].word;
|
127
|
+
b++;
|
128
|
+
} else free(vocab[a].word);
|
129
|
+
vocab_size = b;
|
130
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
131
|
+
for (a = 0; a < vocab_size; a++) {
|
132
|
+
// Hash will be re-computed, as it is not actual
|
133
|
+
hash = GetWordHash(vocab[a].word);
|
134
|
+
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
135
|
+
vocab_hash[hash] = a;
|
136
|
+
}
|
137
|
+
fflush(stdout);
|
138
|
+
min_reduce++;
|
139
|
+
}
|
140
|
+
|
141
|
+
// Used later for sorting by word counts
|
142
|
+
int VocabCompare(const void *a, const void *b) {
|
143
|
+
return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
|
144
|
+
}
|
145
|
+
|
146
|
+
// Sorts the vocabulary by frequency using word counts
|
147
|
+
void SortVocab() {
|
148
|
+
long long a, size;
|
149
|
+
unsigned long long hash;
|
150
|
+
// Sort the vocabulary and keep </s> at the first position
|
151
|
+
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
|
152
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
153
|
+
size = vocab_size;
|
154
|
+
train_words = 0;
|
155
|
+
for (a = 0; a < size; a++) {
|
156
|
+
// Words occuring less than min_count times will be discarded from the vocab
|
157
|
+
if ((vocab[a].cn < min_count) && (a != 0)) {
|
158
|
+
vocab_size--;
|
159
|
+
free(vocab[a].word);
|
160
|
+
} else {
|
161
|
+
// Hash will be re-computed, as after the sorting it is not actual
|
162
|
+
hash=GetWordHash(vocab[a].word);
|
163
|
+
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
164
|
+
vocab_hash[hash] = a;
|
165
|
+
train_words += vocab[a].cn;
|
166
|
+
}
|
167
|
+
}
|
168
|
+
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
|
169
|
+
// Allocate memory for the binary tree construction
|
170
|
+
for (a = 0; a < vocab_size; a++) {
|
171
|
+
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
|
172
|
+
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
// Learn vocabulary data from file
|
177
|
+
void LearnVocabFromTrainFile() {
|
178
|
+
char word[MAX_STRING];
|
179
|
+
FILE *fin;
|
180
|
+
long long a, i;
|
181
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
182
|
+
fin = fopen(train_file, "rb");
|
183
|
+
if (fin == NULL) {
|
184
|
+
printf("ERROR: training data file not found!\n");
|
185
|
+
exit(1);
|
186
|
+
}
|
187
|
+
vocab_size = 0;
|
188
|
+
AddWordToVocab((char *)"</s>");
|
189
|
+
while (1) {
|
190
|
+
ReadWord(word, fin);
|
191
|
+
if (feof(fin)) break;
|
192
|
+
train_words++;
|
193
|
+
if ((debug_mode > 1) && (train_words % 100000 == 0)) {
|
194
|
+
printf("%lldK%c", train_words / 1000, 13);
|
195
|
+
fflush(stdout);
|
196
|
+
}
|
197
|
+
i = SearchVocab(word);
|
198
|
+
if (i == -1) {
|
199
|
+
a = AddWordToVocab(word);
|
200
|
+
vocab[a].cn = 1;
|
201
|
+
} else vocab[i].cn++;
|
202
|
+
if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
|
203
|
+
}
|
204
|
+
SortVocab();
|
205
|
+
if (debug_mode > 0) {
|
206
|
+
printf("Vocab size: %lld\n", vocab_size);
|
207
|
+
printf("Words in train file: %lld\n", train_words);
|
208
|
+
}
|
209
|
+
file_size = ftell(fin);
|
210
|
+
fclose(fin);
|
211
|
+
}
|
212
|
+
|
213
|
+
void ReadVocab() {
|
214
|
+
long long a, i = 0;
|
215
|
+
char c;
|
216
|
+
char word[MAX_STRING];
|
217
|
+
FILE *fin = fopen(read_vocab_file, "rb");
|
218
|
+
if (fin == NULL) {
|
219
|
+
printf("Vocabulary file not found\n");
|
220
|
+
exit(1);
|
221
|
+
}
|
222
|
+
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
223
|
+
vocab_size = 0;
|
224
|
+
while (1) {
|
225
|
+
ReadWord(word, fin);
|
226
|
+
if (feof(fin)) break;
|
227
|
+
a = AddWordToVocab(word);
|
228
|
+
fscanf(fin, "%lld%c", &vocab[a].cn, &c);
|
229
|
+
i++;
|
230
|
+
}
|
231
|
+
SortVocab();
|
232
|
+
if (debug_mode > 0) {
|
233
|
+
printf("Vocab size: %lld\n", vocab_size);
|
234
|
+
printf("Words in train file: %lld\n", train_words);
|
235
|
+
}
|
236
|
+
fin = fopen(train_file, "rb");
|
237
|
+
if (fin == NULL) {
|
238
|
+
printf("ERROR: training data file not found!\n");
|
239
|
+
exit(1);
|
240
|
+
}
|
241
|
+
fseek(fin, 0, SEEK_END);
|
242
|
+
file_size = ftell(fin);
|
243
|
+
fclose(fin);
|
244
|
+
}
|
data/ext/word2vec/common.h
CHANGED
@@ -10,13 +10,26 @@
|
|
10
10
|
#include <sys/types.h>
|
11
11
|
#include <ctype.h>
|
12
12
|
|
13
|
+
#define MAX_STRING 100
|
14
|
+
#define MAX_CODE_LENGTH 40
|
15
|
+
|
13
16
|
// max length of strings
|
14
17
|
extern const unsigned long max_size;
|
15
18
|
// number of closest words that will be shown
|
16
19
|
extern const long long N;
|
17
20
|
// max length of vocabulary entries
|
18
21
|
extern const long long max_w;
|
19
|
-
|
22
|
+
// Maximum 30 * 0.7 = 21M words in the vocabulary
|
23
|
+
extern const int vocab_hash_size;
|
24
|
+
extern long long *vocab_hash;
|
25
|
+
extern struct vocab_word *vocab;
|
26
|
+
|
27
|
+
extern char train_file[];
|
28
|
+
extern char read_vocab_file[];
|
29
|
+
extern long long vocab_max_size, vocab_size;
|
30
|
+
extern long long train_words, file_size ;
|
31
|
+
extern int debug_mode, min_count, min_reduce;
|
32
|
+
|
20
33
|
typedef struct word2vec_model_s {
|
21
34
|
long long word_count;
|
22
35
|
char *vocabulary; // char *[word_count]
|
@@ -29,10 +42,26 @@ typedef struct WordSimilarity_s {
|
|
29
42
|
float score;
|
30
43
|
} WordSimilarity;
|
31
44
|
|
45
|
+
struct vocab_word {
|
46
|
+
long long cn;
|
47
|
+
int *point;
|
48
|
+
char *word, *code, codelen;
|
49
|
+
};
|
50
|
+
|
51
|
+
void ReadWord(char *word, FILE *fin);
|
52
|
+
unsigned long long GetWordHash(char *word);
|
53
|
+
unsigned long long SearchVocab(char *word) ;
|
54
|
+
unsigned long long ReadWordIndex(FILE *fin);
|
55
|
+
|
56
|
+
void LearnVocabFromTrainFile();
|
57
|
+
void ReadVocab();
|
58
|
+
|
59
|
+
/// main functions
|
32
60
|
void word2vec_model_load(word2vec_model* model, char* file_name);
|
33
61
|
size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
|
34
62
|
size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
|
35
63
|
void word2vec_model_accuracy(word2vec_model* model, char* file_name);
|
36
|
-
void word2vec_build_vocab(char*
|
64
|
+
void word2vec_build_vocab(char* train_file, char* save_vocab_file);
|
65
|
+
void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file);
|
37
66
|
|
38
67
|
#endif /* _WORD2VEC_COMMON_H */
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file) {
|
4
|
+
long long word;
|
5
|
+
vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
|
6
|
+
vocab_hash = (long long *)calloc(vocab_hash_size, sizeof(long long));
|
7
|
+
strcpy(train_file, train_file);
|
8
|
+
strcpy(read_vocab_file, vocab_file);
|
9
|
+
ReadVocab();
|
10
|
+
FILE *fi = fopen(train_file, "rb");
|
11
|
+
FILE *fo = fopen(output_file, "wb");
|
12
|
+
while (1) {
|
13
|
+
word = ReadWordIndex(fi);
|
14
|
+
if (feof(fi)) break;
|
15
|
+
fwrite(&word, sizeof(word), 1, fo);
|
16
|
+
}
|
17
|
+
fclose(fi);
|
18
|
+
fclose(fo);
|
19
|
+
}
|
data/ext/word2vec/vocab.c
CHANGED
@@ -1,185 +1,13 @@
|
|
1
1
|
#include "common.h"
|
2
2
|
|
3
|
-
#define MAX_STRING 100
|
4
|
-
#define MAX_CODE_LENGTH 40
|
5
|
-
|
6
3
|
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
|
7
4
|
|
8
|
-
struct vocab_word {
|
9
|
-
long long cn;
|
10
|
-
int *point;
|
11
|
-
char *word, *code, codelen;
|
12
|
-
};
|
13
|
-
|
14
|
-
char train_file[MAX_STRING];
|
15
5
|
struct vocab_word *vocab;
|
16
6
|
int debug_mode = 2, min_count = 5, min_reduce = 1;
|
17
7
|
long long vocab_max_size = 1000, vocab_size = 0;
|
18
8
|
long long *vocab_hash;
|
19
9
|
long long train_words = 0, file_size = 0;
|
20
10
|
|
21
|
-
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
|
22
|
-
void ReadWord(char *word, FILE *fin) {
|
23
|
-
int a = 0, ch;
|
24
|
-
while (!feof(fin)) {
|
25
|
-
ch = fgetc(fin);
|
26
|
-
if (ch == 13) continue;
|
27
|
-
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
|
28
|
-
if (a > 0) {
|
29
|
-
if (ch == '\n') ungetc(ch, fin);
|
30
|
-
break;
|
31
|
-
}
|
32
|
-
if (ch == '\n') {
|
33
|
-
strcpy(word, (char *)"</s>");
|
34
|
-
return;
|
35
|
-
} else continue;
|
36
|
-
}
|
37
|
-
word[a] = ch;
|
38
|
-
a++;
|
39
|
-
if (a >= MAX_STRING - 1) a--; // Truncate too long words
|
40
|
-
}
|
41
|
-
word[a] = 0;
|
42
|
-
}
|
43
|
-
|
44
|
-
// Returns hash value of a word
|
45
|
-
unsigned long long GetWordHash(char *word) {
|
46
|
-
unsigned long long a, hash = 0;
|
47
|
-
for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
|
48
|
-
hash = hash % vocab_hash_size;
|
49
|
-
return hash;
|
50
|
-
}
|
51
|
-
|
52
|
-
// Returns position of a word in the vocabulary; if the word is not found, returns -1
|
53
|
-
unsigned long long SearchVocab(char *word) {
|
54
|
-
unsigned long long hash = GetWordHash(word);
|
55
|
-
while (1) {
|
56
|
-
if (vocab_hash[hash] == -1) return -1;
|
57
|
-
if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
|
58
|
-
hash = (hash + 1) % vocab_hash_size;
|
59
|
-
}
|
60
|
-
return -1;
|
61
|
-
}
|
62
|
-
|
63
|
-
// Reads a word and returns its index in the vocabulary
|
64
|
-
unsigned long long ReadWordIndex(FILE *fin) {
|
65
|
-
char word[MAX_STRING];
|
66
|
-
ReadWord(word, fin);
|
67
|
-
if (feof(fin)) return -1;
|
68
|
-
return SearchVocab(word);
|
69
|
-
}
|
70
|
-
|
71
|
-
// Adds a word to the vocabulary
|
72
|
-
unsigned long long AddWordToVocab(char *word) {
|
73
|
-
unsigned long long hash, length = strlen(word) + 1;
|
74
|
-
if (length > MAX_STRING) length = MAX_STRING;
|
75
|
-
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
|
76
|
-
strcpy(vocab[vocab_size].word, word);
|
77
|
-
vocab[vocab_size].cn = 0;
|
78
|
-
vocab_size++;
|
79
|
-
// Reallocate memory if needed
|
80
|
-
if (vocab_size + 2 >= vocab_max_size) {
|
81
|
-
vocab_max_size += 1000;
|
82
|
-
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
|
83
|
-
}
|
84
|
-
hash = GetWordHash(word);
|
85
|
-
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
86
|
-
vocab_hash[hash] = vocab_size - 1;
|
87
|
-
return vocab_size - 1;
|
88
|
-
}
|
89
|
-
|
90
|
-
// Used later for sorting by word counts
|
91
|
-
int VocabCompare(const void *a, const void *b) {
|
92
|
-
return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
|
93
|
-
}
|
94
|
-
|
95
|
-
// Sorts the vocabulary by frequency using word counts
|
96
|
-
void SortVocab() {
|
97
|
-
long long a, size;
|
98
|
-
unsigned long long hash;
|
99
|
-
// Sort the vocabulary and keep </s> at the first position
|
100
|
-
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
|
101
|
-
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
102
|
-
size = vocab_size;
|
103
|
-
train_words = 0;
|
104
|
-
for (a = 0; a < size; a++) {
|
105
|
-
// Words occuring less than min_count times will be discarded from the vocab
|
106
|
-
if ((vocab[a].cn < min_count) && (a != 0)) {
|
107
|
-
vocab_size--;
|
108
|
-
free(vocab[a].word);
|
109
|
-
} else {
|
110
|
-
// Hash will be re-computed, as after the sorting it is not actual
|
111
|
-
hash=GetWordHash(vocab[a].word);
|
112
|
-
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
113
|
-
vocab_hash[hash] = a;
|
114
|
-
train_words += vocab[a].cn;
|
115
|
-
}
|
116
|
-
}
|
117
|
-
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
|
118
|
-
// Allocate memory for the binary tree construction
|
119
|
-
for (a = 0; a < vocab_size; a++) {
|
120
|
-
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
|
121
|
-
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
|
122
|
-
}
|
123
|
-
}
|
124
|
-
|
125
|
-
// Reduces the vocabulary by removing infrequent tokens
|
126
|
-
void ReduceVocab() {
|
127
|
-
int a, b = 0;
|
128
|
-
unsigned long long hash;
|
129
|
-
for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
|
130
|
-
vocab[b].cn = vocab[a].cn;
|
131
|
-
vocab[b].word = vocab[a].word;
|
132
|
-
b++;
|
133
|
-
} else free(vocab[a].word);
|
134
|
-
vocab_size = b;
|
135
|
-
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
136
|
-
for (a = 0; a < vocab_size; a++) {
|
137
|
-
// Hash will be re-computed, as it is not actual
|
138
|
-
hash = GetWordHash(vocab[a].word);
|
139
|
-
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
|
140
|
-
vocab_hash[hash] = a;
|
141
|
-
}
|
142
|
-
fflush(stdout);
|
143
|
-
min_reduce++;
|
144
|
-
}
|
145
|
-
|
146
|
-
// Learn vocabulary data from file
|
147
|
-
void LearnVocabFromTrainFile() {
|
148
|
-
char word[MAX_STRING];
|
149
|
-
FILE *fin;
|
150
|
-
long long a, i;
|
151
|
-
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
|
152
|
-
fin = fopen(train_file, "rb");
|
153
|
-
if (fin == NULL) {
|
154
|
-
printf("ERROR: training data file not found!\n");
|
155
|
-
exit(1);
|
156
|
-
}
|
157
|
-
vocab_size = 0;
|
158
|
-
AddWordToVocab((char *)"</s>");
|
159
|
-
while (1) {
|
160
|
-
ReadWord(word, fin);
|
161
|
-
if (feof(fin)) break;
|
162
|
-
train_words++;
|
163
|
-
if ((debug_mode > 1) && (train_words % 100000 == 0)) {
|
164
|
-
printf("%lldK%c", train_words / 1000, 13);
|
165
|
-
fflush(stdout);
|
166
|
-
}
|
167
|
-
i = SearchVocab(word);
|
168
|
-
if (i == -1) {
|
169
|
-
a = AddWordToVocab(word);
|
170
|
-
vocab[a].cn = 1;
|
171
|
-
} else vocab[i].cn++;
|
172
|
-
if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
|
173
|
-
}
|
174
|
-
SortVocab();
|
175
|
-
if (debug_mode > 0) {
|
176
|
-
printf("Vocab size: %lld\n", vocab_size);
|
177
|
-
printf("Words in train file: %lld\n", train_words);
|
178
|
-
}
|
179
|
-
file_size = ftell(fin);
|
180
|
-
fclose(fin);
|
181
|
-
}
|
182
|
-
|
183
11
|
void SaveVocab(char* save_vocab_file) {
|
184
12
|
long long i;
|
185
13
|
FILE *fo = fopen(save_vocab_file, "wb");
|
data/ext/word2vec/word2vec.c
CHANGED
@@ -141,6 +141,22 @@ static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_fil
|
|
141
141
|
return Qtrue;
|
142
142
|
}
|
143
143
|
|
144
|
+
/*
|
145
|
+
* tokenize a file
|
146
|
+
* @param [String] rb_train_file_name
|
147
|
+
* @param [String] rb_vocab_file_name
|
148
|
+
* @param [String] rb_output_file_name
|
149
|
+
*/
|
150
|
+
static VALUE tokenize(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name, VALUE rb_output_file_name) {
|
151
|
+
char* train_filename = StringValueCStr(rb_train_file_name);
|
152
|
+
char* vocab_filename = StringValueCStr(rb_vocab_file_name);
|
153
|
+
char* output_filename = StringValueCStr(rb_output_file_name);
|
154
|
+
|
155
|
+
word2vec_tokenize(train_filename, vocab_filename, output_filename);
|
156
|
+
|
157
|
+
return Qtrue;
|
158
|
+
}
|
159
|
+
|
144
160
|
void Init_word2vec(void) {
|
145
161
|
VALUE mWord2vec = rb_define_module("Word2vec");
|
146
162
|
VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
|
@@ -151,4 +167,5 @@ void Init_word2vec(void) {
|
|
151
167
|
rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
|
152
168
|
rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
|
153
169
|
rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
|
170
|
+
rb_define_singleton_method(mWord2vecModel, "tokenize", tokenize, 3);
|
154
171
|
}
|
data/lib/word2vec/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: word2vec-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dani Vela
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-06-
|
11
|
+
date: 2021-06-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,8 @@ files:
|
|
80
80
|
- ".gitignore"
|
81
81
|
- ".rspec"
|
82
82
|
- ".travis.yml"
|
83
|
+
- ".vscode/c_cpp_properties.json"
|
84
|
+
- ".vscode/settings.json"
|
83
85
|
- CHANGELOG
|
84
86
|
- Gemfile
|
85
87
|
- Gemfile.lock
|
@@ -96,6 +98,7 @@ files:
|
|
96
98
|
- ext/word2vec/common.h
|
97
99
|
- ext/word2vec/distance.c
|
98
100
|
- ext/word2vec/extconf.rb
|
101
|
+
- ext/word2vec/tokenizer.c
|
99
102
|
- ext/word2vec/vocab.c
|
100
103
|
- ext/word2vec/word2vec.c
|
101
104
|
- lib/word2vec.rb
|