word2vec-rb 0.4.0 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: f1cc8aad12e17e08da6428945fa3fcdab514739a5ff166588796b0de4535ba10
- data.tar.gz: 3deaf3e6d6f1bff3e06b6353d8c29591fa00c8f5e8c8f3079f0d9fdf029786b7
+ metadata.gz: 192304d569b2fb573300a33ff7ed05fc37202c30cf55144fc043cc544e620f23
+ data.tar.gz: e5473521b82560242b19ae2efa9c4e07a7798bb3c0ea2e97d97e5f3a1f2601cf
  SHA512:
- metadata.gz: 916361e4543fadac744e4502f53241b105789bbf928be51d2e17df93b3ac318e2b9125d7146ec67146a591ca87ee7d2928ca15aadb1fabc0b2c31b82e058497b
- data.tar.gz: 9daa3535f9c23254b5fa45e9759fab036a297ffa491e3fc03287c3c9c072c7b314925843d376e678f1b8f61915839f5e0689e1fa27268704eaf5bd086fdaecc3
+ metadata.gz: db2cc2396b4a7193a0cb4efd40ffefbefabd88861f257aea8ac90d032d9ebac8f0759cd952678174f764cdce2605a94c521e24d9433a8a1eaaf135e5dcc1cf89
+ data.tar.gz: 26949750236e3929f9794967570c3fb257c53474c9d79b857f12d4214ecca075afe4a32dbb5a6e5253b44ea1566a391aff70c9617a2914ee700779e3dd8a4af9
data/.gitignore CHANGED
@@ -13,4 +13,5 @@
  lib/word2vec/word2vec.bundle
 
  data/text7
- data/vocab.txt
+ data/vocab.txt
+ data/tokenized.bin
data/.vscode/c_cpp_properties.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "configurations": [
+     {
+       "name": "Mac",
+       "includePath": [
+         "${workspaceFolder}/**",
+         "/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/include/ruby-2.6.0/**"
+       ],
+       "defines": [],
+       "macFrameworkPath": [
+         "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
+       ],
+       "compilerPath": "/usr/bin/clang",
+       "cStandard": "c17",
+       "cppStandard": "c++98",
+       "intelliSenseMode": "macos-clang-x64"
+     }
+   ],
+   "version": 4
+ }
data/.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "files.associations": {
+     "common.h": "c",
+     "ruby.h": "c"
+   }
+ }
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     word2vec-rb (0.4.0)
+     word2vec-rb (0.5.0)
 
  GEM
    remote: https://rubygems.org/
data/README.md CHANGED
@@ -78,6 +78,23 @@ Word2vec::Model.build\_vocab("./data/text7", "./data/vocab.txt")
 
  The output file will have a list of words and its number of appearances separated by line break.
 
+ ### Tokenizer: create a binary file by tokenizing an input file
+
+ This method requires a pre-created vocabulary file (see `build_vocab` above).
+
+ ```ruby
+ require 'word2vec'
+
+ Word2vec::Model.tokenize("./data/text7", "./data/vocab.txt", "./data/tokenized.bin")
+ ```
+
+ The output file will contain a sequence of binary word identifiers, one for each word of the input file.
+
+ Read the output file from C with:
+
+     long long id;
+     fread(&id, sizeof(id), 1, fi);
+
  ## Development
 
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
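To consume the tokenized output from Ruby rather than C, a minimal sketch (not part of the gem; it assumes the file was produced on the same machine, so each id is a native-endian 64-bit integer):

```ruby
# Illustrative only: decode the ids written by Word2vec::Model.tokenize.
# Each id is a native-endian signed 64-bit integer ("q" in String#unpack);
# per the C sources below, -1 marks a word missing from the vocabulary.
ids = File.binread("./data/tokenized.bin").unpack("q*")
puts ids.size
puts ids.first(10).inspect
```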
@@ -1,5 +1,7 @@
  #include "common.h"
 
+ char train_file[MAX_STRING];
+ char read_vocab_file[MAX_STRING];
  // max length of strings
  const unsigned long max_size = 2000;
  // number of closest words that will be shown
@@ -44,4 +46,199 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
      for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
    }
    fclose(f);
- }
+ }
+
+ // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+ void ReadWord(char *word, FILE *fin) {
+   int a = 0, ch;
+   while (!feof(fin)) {
+     ch = fgetc(fin);
+     if (ch == 13) continue;
+     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+       if (a > 0) {
+         if (ch == '\n') ungetc(ch, fin);
+         break;
+       }
+       if (ch == '\n') {
+         strcpy(word, (char *)"</s>");
+         return;
+       } else continue;
+     }
+     word[a] = ch;
+     a++;
+     if (a >= MAX_STRING - 1) a--; // Truncate too long words
+   }
+   word[a] = 0;
+ }
+
+ // Returns hash value of a word
+ unsigned long long GetWordHash(char *word) {
+   unsigned long long a, hash = 0;
+   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+   hash = hash % vocab_hash_size;
+   return hash;
+ }
+
+ // Returns position of a word in the vocabulary; if the word is not found, returns -1
+ unsigned long long SearchVocab(char *word) {
+   unsigned long long hash = GetWordHash(word);
+   while (1) {
+     if (vocab_hash[hash] == -1) return -1;
+     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+     hash = (hash + 1) % vocab_hash_size;
+   }
+   return -1;
+ }
+
+ // Reads a word and returns its index in the vocabulary
+ unsigned long long ReadWordIndex(FILE *fin) {
+   char word[MAX_STRING];
+   ReadWord(word, fin);
+   if (feof(fin)) return -1;
+   return SearchVocab(word);
+ }
+
+ // Adds a word to the vocabulary
+ unsigned long long AddWordToVocab(char *word) {
+   unsigned long long hash, length = strlen(word) + 1;
+   if (length > MAX_STRING) length = MAX_STRING;
+   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+   strcpy(vocab[vocab_size].word, word);
+   vocab[vocab_size].cn = 0;
+   vocab_size++;
+   // Reallocate memory if needed
+   if (vocab_size + 2 >= vocab_max_size) {
+     vocab_max_size += 1000;
+     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+   }
+   hash = GetWordHash(word);
+   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+   vocab_hash[hash] = vocab_size - 1;
+   return vocab_size - 1;
+ }
+
+ // Reduces the vocabulary by removing infrequent tokens
+ void ReduceVocab() {
+   int a, b = 0;
+   unsigned long long hash;
+   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+     vocab[b].cn = vocab[a].cn;
+     vocab[b].word = vocab[a].word;
+     b++;
+   } else free(vocab[a].word);
+   vocab_size = b;
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   for (a = 0; a < vocab_size; a++) {
+     // Hash will be re-computed, as it is not actual
+     hash = GetWordHash(vocab[a].word);
+     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+     vocab_hash[hash] = a;
+   }
+   fflush(stdout);
+   min_reduce++;
+ }
+
+ // Used later for sorting by word counts
+ int VocabCompare(const void *a, const void *b) {
+   return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
+ }
+
+ // Sorts the vocabulary by frequency using word counts
+ void SortVocab() {
+   long long a, size;
+   unsigned long long hash;
+   // Sort the vocabulary and keep </s> at the first position
+   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   size = vocab_size;
+   train_words = 0;
+   for (a = 0; a < size; a++) {
+     // Words occuring less than min_count times will be discarded from the vocab
+     if ((vocab[a].cn < min_count) && (a != 0)) {
+       vocab_size--;
+       free(vocab[a].word);
+     } else {
+       // Hash will be re-computed, as after the sorting it is not actual
+       hash=GetWordHash(vocab[a].word);
+       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+       vocab_hash[hash] = a;
+       train_words += vocab[a].cn;
+     }
+   }
+   vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+   // Allocate memory for the binary tree construction
+   for (a = 0; a < vocab_size; a++) {
+     vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+     vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+   }
+ }
+
+ // Learn vocabulary data from file
+ void LearnVocabFromTrainFile() {
+   char word[MAX_STRING];
+   FILE *fin;
+   long long a, i;
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   fin = fopen(train_file, "rb");
+   if (fin == NULL) {
+     printf("ERROR: training data file not found!\n");
+     exit(1);
+   }
+   vocab_size = 0;
+   AddWordToVocab((char *)"</s>");
+   while (1) {
+     ReadWord(word, fin);
+     if (feof(fin)) break;
+     train_words++;
+     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+       printf("%lldK%c", train_words / 1000, 13);
+       fflush(stdout);
+     }
+     i = SearchVocab(word);
+     if (i == -1) {
+       a = AddWordToVocab(word);
+       vocab[a].cn = 1;
+     } else vocab[i].cn++;
+     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+   }
+   SortVocab();
+   if (debug_mode > 0) {
+     printf("Vocab size: %lld\n", vocab_size);
+     printf("Words in train file: %lld\n", train_words);
+   }
+   file_size = ftell(fin);
+   fclose(fin);
+ }
+
+ void ReadVocab() {
+   long long a, i = 0;
+   char c;
+   char word[MAX_STRING];
+   FILE *fin = fopen(read_vocab_file, "rb");
+   if (fin == NULL) {
+     printf("Vocabulary file not found\n");
+     exit(1);
+   }
+   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+   vocab_size = 0;
+   while (1) {
+     ReadWord(word, fin);
+     if (feof(fin)) break;
+     a = AddWordToVocab(word);
+     fscanf(fin, "%lld%c", &vocab[a].cn, &c);
+     i++;
+   }
+   SortVocab();
+   if (debug_mode > 0) {
+     printf("Vocab size: %lld\n", vocab_size);
+     printf("Words in train file: %lld\n", train_words);
+   }
+   fin = fopen(train_file, "rb");
+   if (fin == NULL) {
+     printf("ERROR: training data file not found!\n");
+     exit(1);
+   }
+   fseek(fin, 0, SEEK_END);
+   file_size = ftell(fin);
+   fclose(fin);
+ }
data/ext/word2vec/common.h CHANGED
@@ -10,13 +10,26 @@
  #include <sys/types.h>
  #include <ctype.h>
 
+ #define MAX_STRING 100
+ #define MAX_CODE_LENGTH 40
+
  // max length of strings
  extern const unsigned long max_size;
  // number of closest words that will be shown
  extern const long long N;
  // max length of vocabulary entries
  extern const long long max_w;
-
+ // Maximum 30 * 0.7 = 21M words in the vocabulary
+ extern const int vocab_hash_size;
+ extern long long *vocab_hash;
+ extern struct vocab_word *vocab;
+
+ extern char train_file[];
+ extern char read_vocab_file[];
+ extern long long vocab_max_size, vocab_size;
+ extern long long train_words, file_size;
+ extern int debug_mode, min_count, min_reduce;
+
  typedef struct word2vec_model_s {
    long long word_count;
    char *vocabulary; // char *[word_count]
@@ -29,10 +42,26 @@ typedef struct WordSimilarity_s {
    float score;
  } WordSimilarity;
 
+ struct vocab_word {
+   long long cn;
+   int *point;
+   char *word, *code, codelen;
+ };
+
+ void ReadWord(char *word, FILE *fin);
+ unsigned long long GetWordHash(char *word);
+ unsigned long long SearchVocab(char *word);
+ unsigned long long ReadWordIndex(FILE *fin);
+
+ void LearnVocabFromTrainFile();
+ void ReadVocab();
+
+ /// main functions
  void word2vec_model_load(word2vec_model* model, char* file_name);
  size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
  size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
  void word2vec_model_accuracy(word2vec_model* model, char* file_name);
- void word2vec_build_vocab(char* train, char* save_vocab_file);
+ void word2vec_build_vocab(char* train_file, char* save_vocab_file);
+ void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file);
 
  #endif /* _WORD2VEC_COMMON_H */
data/ext/word2vec/tokenizer.c ADDED
@@ -0,0 +1,19 @@
+ #include "common.h"
+
+ void word2vec_tokenize(char* train_file_name, char* vocab_file, char* output_file) {
+   long long word;
+   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+   vocab_hash = (long long *)calloc(vocab_hash_size, sizeof(long long));
+   strcpy(train_file, train_file_name); // copy the paths into the shared globals used by ReadVocab()
+   strcpy(read_vocab_file, vocab_file);
+   ReadVocab();
+   FILE *fi = fopen(train_file_name, "rb");
+   FILE *fo = fopen(output_file, "wb");
+   while (1) {
+     word = ReadWordIndex(fi);
+     if (feof(fi)) break;
+     fwrite(&word, sizeof(word), 1, fo);
+   }
+   fclose(fi);
+   fclose(fo);
+ }
data/ext/word2vec/vocab.c CHANGED
@@ -1,185 +1,13 @@
  #include "common.h"
 
- #define MAX_STRING 100
- #define MAX_CODE_LENGTH 40
-
  const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
 
- struct vocab_word {
-   long long cn;
-   int *point;
-   char *word, *code, codelen;
- };
-
- char train_file[MAX_STRING];
  struct vocab_word *vocab;
  int debug_mode = 2, min_count = 5, min_reduce = 1;
  long long vocab_max_size = 1000, vocab_size = 0;
  long long *vocab_hash;
  long long train_words = 0, file_size = 0;
 
- // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
- void ReadWord(char *word, FILE *fin) {
-   int a = 0, ch;
-   while (!feof(fin)) {
-     ch = fgetc(fin);
-     if (ch == 13) continue;
-     if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
-       if (a > 0) {
-         if (ch == '\n') ungetc(ch, fin);
-         break;
-       }
-       if (ch == '\n') {
-         strcpy(word, (char *)"</s>");
-         return;
-       } else continue;
-     }
-     word[a] = ch;
-     a++;
-     if (a >= MAX_STRING - 1) a--; // Truncate too long words
-   }
-   word[a] = 0;
- }
-
- // Returns hash value of a word
- unsigned long long GetWordHash(char *word) {
-   unsigned long long a, hash = 0;
-   for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
-   hash = hash % vocab_hash_size;
-   return hash;
- }
-
- // Returns position of a word in the vocabulary; if the word is not found, returns -1
- unsigned long long SearchVocab(char *word) {
-   unsigned long long hash = GetWordHash(word);
-   while (1) {
-     if (vocab_hash[hash] == -1) return -1;
-     if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
-     hash = (hash + 1) % vocab_hash_size;
-   }
-   return -1;
- }
-
- // Reads a word and returns its index in the vocabulary
- unsigned long long ReadWordIndex(FILE *fin) {
-   char word[MAX_STRING];
-   ReadWord(word, fin);
-   if (feof(fin)) return -1;
-   return SearchVocab(word);
- }
-
- // Adds a word to the vocabulary
- unsigned long long AddWordToVocab(char *word) {
-   unsigned long long hash, length = strlen(word) + 1;
-   if (length > MAX_STRING) length = MAX_STRING;
-   vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
-   strcpy(vocab[vocab_size].word, word);
-   vocab[vocab_size].cn = 0;
-   vocab_size++;
-   // Reallocate memory if needed
-   if (vocab_size + 2 >= vocab_max_size) {
-     vocab_max_size += 1000;
-     vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
-   }
-   hash = GetWordHash(word);
-   while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
-   vocab_hash[hash] = vocab_size - 1;
-   return vocab_size - 1;
- }
-
- // Used later for sorting by word counts
- int VocabCompare(const void *a, const void *b) {
-   return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
- }
-
- // Sorts the vocabulary by frequency using word counts
- void SortVocab() {
-   long long a, size;
-   unsigned long long hash;
-   // Sort the vocabulary and keep </s> at the first position
-   qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
-   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-   size = vocab_size;
-   train_words = 0;
-   for (a = 0; a < size; a++) {
-     // Words occuring less than min_count times will be discarded from the vocab
-     if ((vocab[a].cn < min_count) && (a != 0)) {
-       vocab_size--;
-       free(vocab[a].word);
-     } else {
-       // Hash will be re-computed, as after the sorting it is not actual
-       hash=GetWordHash(vocab[a].word);
-       while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
-       vocab_hash[hash] = a;
-       train_words += vocab[a].cn;
-     }
-   }
-   vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
-   // Allocate memory for the binary tree construction
-   for (a = 0; a < vocab_size; a++) {
-     vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
-     vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
-   }
- }
-
- // Reduces the vocabulary by removing infrequent tokens
- void ReduceVocab() {
-   int a, b = 0;
-   unsigned long long hash;
-   for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
-     vocab[b].cn = vocab[a].cn;
-     vocab[b].word = vocab[a].word;
-     b++;
-   } else free(vocab[a].word);
-   vocab_size = b;
-   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-   for (a = 0; a < vocab_size; a++) {
-     // Hash will be re-computed, as it is not actual
-     hash = GetWordHash(vocab[a].word);
-     while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
-     vocab_hash[hash] = a;
-   }
-   fflush(stdout);
-   min_reduce++;
- }
-
- // Learn vocabulary data from file
- void LearnVocabFromTrainFile() {
-   char word[MAX_STRING];
-   FILE *fin;
-   long long a, i;
-   for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-   fin = fopen(train_file, "rb");
-   if (fin == NULL) {
-     printf("ERROR: training data file not found!\n");
-     exit(1);
-   }
-   vocab_size = 0;
-   AddWordToVocab((char *)"</s>");
-   while (1) {
-     ReadWord(word, fin);
-     if (feof(fin)) break;
-     train_words++;
-     if ((debug_mode > 1) && (train_words % 100000 == 0)) {
-       printf("%lldK%c", train_words / 1000, 13);
-       fflush(stdout);
-     }
-     i = SearchVocab(word);
-     if (i == -1) {
-       a = AddWordToVocab(word);
-       vocab[a].cn = 1;
-     } else vocab[i].cn++;
-     if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
-   }
-   SortVocab();
-   if (debug_mode > 0) {
-     printf("Vocab size: %lld\n", vocab_size);
-     printf("Words in train file: %lld\n", train_words);
-   }
-   file_size = ftell(fin);
-   fclose(fin);
- }
-
  void SaveVocab(char* save_vocab_file) {
    long long i;
    FILE *fo = fopen(save_vocab_file, "wb");
data/ext/word2vec/word2vec.c CHANGED
@@ -141,6 +141,22 @@ static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_fil
    return Qtrue;
  }
 
+ /*
+  * tokenize a file
+  * @param [String] rb_train_file_name
+  * @param [String] rb_vocab_file_name
+  * @param [String] rb_output_file_name
+  */
+ static VALUE tokenize(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name, VALUE rb_output_file_name) {
+   char* train_filename = StringValueCStr(rb_train_file_name);
+   char* vocab_filename = StringValueCStr(rb_vocab_file_name);
+   char* output_filename = StringValueCStr(rb_output_file_name);
+
+   word2vec_tokenize(train_filename, vocab_filename, output_filename);
+
+   return Qtrue;
+ }
+
  void Init_word2vec(void) {
    VALUE mWord2vec = rb_define_module("Word2vec");
    VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -151,4 +167,5 @@ void Init_word2vec(void) {
    rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
    rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
    rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
+   rb_define_singleton_method(mWord2vecModel, "tokenize", tokenize, 3);
  }
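Since `tokenize` is registered as a singleton method with arity 3 and the C binding returns `Qtrue`, it is called the same way as `build_vocab`; a brief, illustrative sketch (paths are placeholders):

```ruby
require 'word2vec'

# Build the vocabulary first, then tokenize the same corpus against it.
Word2vec::Model.build_vocab("./data/text7", "./data/vocab.txt")
ok = Word2vec::Model.tokenize("./data/text7", "./data/vocab.txt", "./data/tokenized.bin")
puts ok # => true
```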
@@ -1,3 +1,3 @@
  module Word2vec
-   VERSION = "0.4.0"
+   VERSION = "0.5.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: word2vec-rb
  version: !ruby/object:Gem::Version
-   version: 0.4.0
+   version: 0.5.0
  platform: ruby
  authors:
  - Dani Vela
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2021-06-14 00:00:00.000000000 Z
+ date: 2021-06-30 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: bundler
@@ -80,6 +80,8 @@ files:
  - ".gitignore"
  - ".rspec"
  - ".travis.yml"
+ - ".vscode/c_cpp_properties.json"
+ - ".vscode/settings.json"
  - CHANGELOG
  - Gemfile
  - Gemfile.lock
@@ -96,6 +98,7 @@ files:
  - ext/word2vec/common.h
  - ext/word2vec/distance.c
  - ext/word2vec/extconf.rb
+ - ext/word2vec/tokenizer.c
  - ext/word2vec/vocab.c
  - ext/word2vec/word2vec.c
  - lib/word2vec.rb