word2vec-rb 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f1cc8aad12e17e08da6428945fa3fcdab514739a5ff166588796b0de4535ba10
4
- data.tar.gz: 3deaf3e6d6f1bff3e06b6353d8c29591fa00c8f5e8c8f3079f0d9fdf029786b7
3
+ metadata.gz: 192304d569b2fb573300a33ff7ed05fc37202c30cf55144fc043cc544e620f23
4
+ data.tar.gz: e5473521b82560242b19ae2efa9c4e07a7798bb3c0ea2e97d97e5f3a1f2601cf
5
5
  SHA512:
6
- metadata.gz: 916361e4543fadac744e4502f53241b105789bbf928be51d2e17df93b3ac318e2b9125d7146ec67146a591ca87ee7d2928ca15aadb1fabc0b2c31b82e058497b
7
- data.tar.gz: 9daa3535f9c23254b5fa45e9759fab036a297ffa491e3fc03287c3c9c072c7b314925843d376e678f1b8f61915839f5e0689e1fa27268704eaf5bd086fdaecc3
6
+ metadata.gz: db2cc2396b4a7193a0cb4efd40ffefbefabd88861f257aea8ac90d032d9ebac8f0759cd952678174f764cdce2605a94c521e24d9433a8a1eaaf135e5dcc1cf89
7
+ data.tar.gz: 26949750236e3929f9794967570c3fb257c53474c9d79b857f12d4214ecca075afe4a32dbb5a6e5253b44ea1566a391aff70c9617a2914ee700779e3dd8a4af9
data/.gitignore CHANGED
@@ -13,4 +13,5 @@
13
13
  lib/word2vec/word2vec.bundle
14
14
 
15
15
  data/text7
16
- data/vocab.txt
16
+ data/vocab.txt
17
+ data/tokenized.bin
@@ -0,0 +1,20 @@
1
+ {
2
+ "configurations": [
3
+ {
4
+ "name": "Mac",
5
+ "includePath": [
6
+ "${workspaceFolder}/**",
7
+ "/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/include/ruby-2.6.0/**"
8
+ ],
9
+ "defines": [],
10
+ "macFrameworkPath": [
11
+ "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
12
+ ],
13
+ "compilerPath": "/usr/bin/clang",
14
+ "cStandard": "c17",
15
+ "cppStandard": "c++98",
16
+ "intelliSenseMode": "macos-clang-x64"
17
+ }
18
+ ],
19
+ "version": 4
20
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ "files.associations": {
3
+ "common.h": "c",
4
+ "ruby.h": "c"
5
+ }
6
+ }
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- word2vec-rb (0.4.0)
4
+ word2vec-rb (0.5.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -78,6 +78,23 @@ Word2vec::Model.build_vocab("./data/text7", "./data/vocab.txt")
78
78
 
79
79
  The output file will contain a list of words and their number of appearances, one word per line.
80
80
 
81
+ ### Tokenizer: create a binary file by tokenizing an input file
82
+
83
+ This method requires a previously created vocabulary file.
84
+
85
+ ```ruby
86
+ require 'word2vec'
87
+
88
+ Word2vec::Model.tokenize("./data/text7", "./data/vocab.txt", "./data/tokenized.bin")
89
+ ```
90
+
91
+ The output file will contain a sequence of binary identifiers, one for each word of the input file.
92
+
93
+ Read the output file with:
94
+
95
+ long long id;
96
+ fread(&id, sizeof(id), 1, fi);
97
+
81
98
  ## Development
82
99
 
83
100
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -1,5 +1,7 @@
1
1
  #include "common.h"
2
2
 
3
+ char train_file[MAX_STRING];
4
+ char read_vocab_file[MAX_STRING];
3
5
  // max length of strings
4
6
  const unsigned long max_size = 2000;
5
7
  // number of closest words that will be shown
@@ -44,4 +46,199 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
44
46
  for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
45
47
  }
46
48
  fclose(f);
47
- }
49
+ }
50
+
51
+ // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
52
+ void ReadWord(char *word, FILE *fin) {
53
+ int a = 0, ch;
54
+ while (!feof(fin)) {
55
+ ch = fgetc(fin);
56
+ if (ch == 13) continue;
57
+ if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
58
+ if (a > 0) {
59
+ if (ch == '\n') ungetc(ch, fin);
60
+ break;
61
+ }
62
+ if (ch == '\n') {
63
+ strcpy(word, (char *)"</s>");
64
+ return;
65
+ } else continue;
66
+ }
67
+ word[a] = ch;
68
+ a++;
69
+ if (a >= MAX_STRING - 1) a--; // Truncate too long words
70
+ }
71
+ word[a] = 0;
72
+ }
73
+
74
+ // Returns hash value of a word
75
+ unsigned long long GetWordHash(char *word) {
76
+ unsigned long long a, hash = 0;
77
+ for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
78
+ hash = hash % vocab_hash_size;
79
+ return hash;
80
+ }
81
+
82
+ // Returns position of a word in the vocabulary; if the word is not found, returns -1
83
+ unsigned long long SearchVocab(char *word) {
84
+ unsigned long long hash = GetWordHash(word);
85
+ while (1) {
86
+ if (vocab_hash[hash] == -1) return -1;
87
+ if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
88
+ hash = (hash + 1) % vocab_hash_size;
89
+ }
90
+ return -1;
91
+ }
92
+
93
+ // Reads a word and returns its index in the vocabulary
94
+ unsigned long long ReadWordIndex(FILE *fin) {
95
+ char word[MAX_STRING];
96
+ ReadWord(word, fin);
97
+ if (feof(fin)) return -1;
98
+ return SearchVocab(word);
99
+ }
100
+
101
+ // Adds a word to the vocabulary
102
+ unsigned long long AddWordToVocab(char *word) {
103
+ unsigned long long hash, length = strlen(word) + 1;
104
+ if (length > MAX_STRING) length = MAX_STRING;
105
+ vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
106
+ strcpy(vocab[vocab_size].word, word);
107
+ vocab[vocab_size].cn = 0;
108
+ vocab_size++;
109
+ // Reallocate memory if needed
110
+ if (vocab_size + 2 >= vocab_max_size) {
111
+ vocab_max_size += 1000;
112
+ vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
113
+ }
114
+ hash = GetWordHash(word);
115
+ while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
116
+ vocab_hash[hash] = vocab_size - 1;
117
+ return vocab_size - 1;
118
+ }
119
+
120
+ // Reduces the vocabulary by removing infrequent tokens
121
+ void ReduceVocab() {
122
+ int a, b = 0;
123
+ unsigned long long hash;
124
+ for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
125
+ vocab[b].cn = vocab[a].cn;
126
+ vocab[b].word = vocab[a].word;
127
+ b++;
128
+ } else free(vocab[a].word);
129
+ vocab_size = b;
130
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
131
+ for (a = 0; a < vocab_size; a++) {
132
+ // Hash will be re-computed, as it is not actual
133
+ hash = GetWordHash(vocab[a].word);
134
+ while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
135
+ vocab_hash[hash] = a;
136
+ }
137
+ fflush(stdout);
138
+ min_reduce++;
139
+ }
140
+
141
+ // Used later for sorting by word counts
142
+ int VocabCompare(const void *a, const void *b) {
143
+ return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
144
+ }
145
+
146
+ // Sorts the vocabulary by frequency using word counts
147
+ void SortVocab() {
148
+ long long a, size;
149
+ unsigned long long hash;
150
+ // Sort the vocabulary and keep </s> at the first position
151
+ qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
152
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
153
+ size = vocab_size;
154
+ train_words = 0;
155
+ for (a = 0; a < size; a++) {
156
+ // Words occuring less than min_count times will be discarded from the vocab
157
+ if ((vocab[a].cn < min_count) && (a != 0)) {
158
+ vocab_size--;
159
+ free(vocab[a].word);
160
+ } else {
161
+ // Hash will be re-computed, as after the sorting it is not actual
162
+ hash=GetWordHash(vocab[a].word);
163
+ while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
164
+ vocab_hash[hash] = a;
165
+ train_words += vocab[a].cn;
166
+ }
167
+ }
168
+ vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
169
+ // Allocate memory for the binary tree construction
170
+ for (a = 0; a < vocab_size; a++) {
171
+ vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
172
+ vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
173
+ }
174
+ }
175
+
176
+ // Learn vocabulary data from file
177
+ void LearnVocabFromTrainFile() {
178
+ char word[MAX_STRING];
179
+ FILE *fin;
180
+ long long a, i;
181
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
182
+ fin = fopen(train_file, "rb");
183
+ if (fin == NULL) {
184
+ printf("ERROR: training data file not found!\n");
185
+ exit(1);
186
+ }
187
+ vocab_size = 0;
188
+ AddWordToVocab((char *)"</s>");
189
+ while (1) {
190
+ ReadWord(word, fin);
191
+ if (feof(fin)) break;
192
+ train_words++;
193
+ if ((debug_mode > 1) && (train_words % 100000 == 0)) {
194
+ printf("%lldK%c", train_words / 1000, 13);
195
+ fflush(stdout);
196
+ }
197
+ i = SearchVocab(word);
198
+ if (i == -1) {
199
+ a = AddWordToVocab(word);
200
+ vocab[a].cn = 1;
201
+ } else vocab[i].cn++;
202
+ if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
203
+ }
204
+ SortVocab();
205
+ if (debug_mode > 0) {
206
+ printf("Vocab size: %lld\n", vocab_size);
207
+ printf("Words in train file: %lld\n", train_words);
208
+ }
209
+ file_size = ftell(fin);
210
+ fclose(fin);
211
+ }
212
+
213
+ void ReadVocab() {
214
+ long long a, i = 0;
215
+ char c;
216
+ char word[MAX_STRING];
217
+ FILE *fin = fopen(read_vocab_file, "rb");
218
+ if (fin == NULL) {
219
+ printf("Vocabulary file not found\n");
220
+ exit(1);
221
+ }
222
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
223
+ vocab_size = 0;
224
+ while (1) {
225
+ ReadWord(word, fin);
226
+ if (feof(fin)) break;
227
+ a = AddWordToVocab(word);
228
+ fscanf(fin, "%lld%c", &vocab[a].cn, &c);
229
+ i++;
230
+ }
231
+ SortVocab();
232
+ if (debug_mode > 0) {
233
+ printf("Vocab size: %lld\n", vocab_size);
234
+ printf("Words in train file: %lld\n", train_words);
235
+ }
236
+ fin = fopen(train_file, "rb");
237
+ if (fin == NULL) {
238
+ printf("ERROR: training data file not found!\n");
239
+ exit(1);
240
+ }
241
+ fseek(fin, 0, SEEK_END);
242
+ file_size = ftell(fin);
243
+ fclose(fin);
244
+ }
@@ -10,13 +10,26 @@
10
10
  #include <sys/types.h>
11
11
  #include <ctype.h>
12
12
 
13
+ #define MAX_STRING 100
14
+ #define MAX_CODE_LENGTH 40
15
+
13
16
  // max length of strings
14
17
  extern const unsigned long max_size;
15
18
  // number of closest words that will be shown
16
19
  extern const long long N;
17
20
  // max length of vocabulary entries
18
21
  extern const long long max_w;
19
-
22
+ // Maximum 30 * 0.7 = 21M words in the vocabulary
23
+ extern const int vocab_hash_size;
24
+ extern long long *vocab_hash;
25
+ extern struct vocab_word *vocab;
26
+
27
+ extern char train_file[];
28
+ extern char read_vocab_file[];
29
+ extern long long vocab_max_size, vocab_size;
30
+ extern long long train_words, file_size ;
31
+ extern int debug_mode, min_count, min_reduce;
32
+
20
33
  typedef struct word2vec_model_s {
21
34
  long long word_count;
22
35
  char *vocabulary; // char *[word_count]
@@ -29,10 +42,26 @@ typedef struct WordSimilarity_s {
29
42
  float score;
30
43
  } WordSimilarity;
31
44
 
45
+ struct vocab_word {
46
+ long long cn;
47
+ int *point;
48
+ char *word, *code, codelen;
49
+ };
50
+
51
+ void ReadWord(char *word, FILE *fin);
52
+ unsigned long long GetWordHash(char *word);
53
+ unsigned long long SearchVocab(char *word) ;
54
+ unsigned long long ReadWordIndex(FILE *fin);
55
+
56
+ void LearnVocabFromTrainFile();
57
+ void ReadVocab();
58
+
59
+ /// main functions
32
60
  void word2vec_model_load(word2vec_model* model, char* file_name);
33
61
  size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
34
62
  size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
35
63
  void word2vec_model_accuracy(word2vec_model* model, char* file_name);
36
- void word2vec_build_vocab(char* train, char* save_vocab_file);
64
+ void word2vec_build_vocab(char* train_file, char* save_vocab_file);
65
+ void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file);
37
66
 
38
67
  #endif /* _WORD2VEC_COMMON_H */
@@ -0,0 +1,19 @@
1
+ #include "common.h"
2
+
3
+ void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file) {
4
+ long long word;
5
+ vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
6
+ vocab_hash = (long long *)calloc(vocab_hash_size, sizeof(long long));
7
+ strcpy(train_file, train_file);
8
+ strcpy(read_vocab_file, vocab_file);
9
+ ReadVocab();
10
+ FILE *fi = fopen(train_file, "rb");
11
+ FILE *fo = fopen(output_file, "wb");
12
+ while (1) {
13
+ word = ReadWordIndex(fi);
14
+ if (feof(fi)) break;
15
+ fwrite(&word, sizeof(word), 1, fo);
16
+ }
17
+ fclose(fi);
18
+ fclose(fo);
19
+ }
data/ext/word2vec/vocab.c CHANGED
@@ -1,185 +1,13 @@
1
1
  #include "common.h"
2
2
 
3
- #define MAX_STRING 100
4
- #define MAX_CODE_LENGTH 40
5
-
6
3
  const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
7
4
 
8
- struct vocab_word {
9
- long long cn;
10
- int *point;
11
- char *word, *code, codelen;
12
- };
13
-
14
- char train_file[MAX_STRING];
15
5
  struct vocab_word *vocab;
16
6
  int debug_mode = 2, min_count = 5, min_reduce = 1;
17
7
  long long vocab_max_size = 1000, vocab_size = 0;
18
8
  long long *vocab_hash;
19
9
  long long train_words = 0, file_size = 0;
20
10
 
21
- // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
22
- void ReadWord(char *word, FILE *fin) {
23
- int a = 0, ch;
24
- while (!feof(fin)) {
25
- ch = fgetc(fin);
26
- if (ch == 13) continue;
27
- if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
28
- if (a > 0) {
29
- if (ch == '\n') ungetc(ch, fin);
30
- break;
31
- }
32
- if (ch == '\n') {
33
- strcpy(word, (char *)"</s>");
34
- return;
35
- } else continue;
36
- }
37
- word[a] = ch;
38
- a++;
39
- if (a >= MAX_STRING - 1) a--; // Truncate too long words
40
- }
41
- word[a] = 0;
42
- }
43
-
44
- // Returns hash value of a word
45
- unsigned long long GetWordHash(char *word) {
46
- unsigned long long a, hash = 0;
47
- for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
48
- hash = hash % vocab_hash_size;
49
- return hash;
50
- }
51
-
52
- // Returns position of a word in the vocabulary; if the word is not found, returns -1
53
- unsigned long long SearchVocab(char *word) {
54
- unsigned long long hash = GetWordHash(word);
55
- while (1) {
56
- if (vocab_hash[hash] == -1) return -1;
57
- if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
58
- hash = (hash + 1) % vocab_hash_size;
59
- }
60
- return -1;
61
- }
62
-
63
- // Reads a word and returns its index in the vocabulary
64
- unsigned long long ReadWordIndex(FILE *fin) {
65
- char word[MAX_STRING];
66
- ReadWord(word, fin);
67
- if (feof(fin)) return -1;
68
- return SearchVocab(word);
69
- }
70
-
71
- // Adds a word to the vocabulary
72
- unsigned long long AddWordToVocab(char *word) {
73
- unsigned long long hash, length = strlen(word) + 1;
74
- if (length > MAX_STRING) length = MAX_STRING;
75
- vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
76
- strcpy(vocab[vocab_size].word, word);
77
- vocab[vocab_size].cn = 0;
78
- vocab_size++;
79
- // Reallocate memory if needed
80
- if (vocab_size + 2 >= vocab_max_size) {
81
- vocab_max_size += 1000;
82
- vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
83
- }
84
- hash = GetWordHash(word);
85
- while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
86
- vocab_hash[hash] = vocab_size - 1;
87
- return vocab_size - 1;
88
- }
89
-
90
- // Used later for sorting by word counts
91
- int VocabCompare(const void *a, const void *b) {
92
- return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
93
- }
94
-
95
- // Sorts the vocabulary by frequency using word counts
96
- void SortVocab() {
97
- long long a, size;
98
- unsigned long long hash;
99
- // Sort the vocabulary and keep </s> at the first position
100
- qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
101
- for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
102
- size = vocab_size;
103
- train_words = 0;
104
- for (a = 0; a < size; a++) {
105
- // Words occuring less than min_count times will be discarded from the vocab
106
- if ((vocab[a].cn < min_count) && (a != 0)) {
107
- vocab_size--;
108
- free(vocab[a].word);
109
- } else {
110
- // Hash will be re-computed, as after the sorting it is not actual
111
- hash=GetWordHash(vocab[a].word);
112
- while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
113
- vocab_hash[hash] = a;
114
- train_words += vocab[a].cn;
115
- }
116
- }
117
- vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
118
- // Allocate memory for the binary tree construction
119
- for (a = 0; a < vocab_size; a++) {
120
- vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
121
- vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
122
- }
123
- }
124
-
125
- // Reduces the vocabulary by removing infrequent tokens
126
- void ReduceVocab() {
127
- int a, b = 0;
128
- unsigned long long hash;
129
- for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
130
- vocab[b].cn = vocab[a].cn;
131
- vocab[b].word = vocab[a].word;
132
- b++;
133
- } else free(vocab[a].word);
134
- vocab_size = b;
135
- for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
136
- for (a = 0; a < vocab_size; a++) {
137
- // Hash will be re-computed, as it is not actual
138
- hash = GetWordHash(vocab[a].word);
139
- while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
140
- vocab_hash[hash] = a;
141
- }
142
- fflush(stdout);
143
- min_reduce++;
144
- }
145
-
146
- // Learn vocabulary data from file
147
- void LearnVocabFromTrainFile() {
148
- char word[MAX_STRING];
149
- FILE *fin;
150
- long long a, i;
151
- for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
152
- fin = fopen(train_file, "rb");
153
- if (fin == NULL) {
154
- printf("ERROR: training data file not found!\n");
155
- exit(1);
156
- }
157
- vocab_size = 0;
158
- AddWordToVocab((char *)"</s>");
159
- while (1) {
160
- ReadWord(word, fin);
161
- if (feof(fin)) break;
162
- train_words++;
163
- if ((debug_mode > 1) && (train_words % 100000 == 0)) {
164
- printf("%lldK%c", train_words / 1000, 13);
165
- fflush(stdout);
166
- }
167
- i = SearchVocab(word);
168
- if (i == -1) {
169
- a = AddWordToVocab(word);
170
- vocab[a].cn = 1;
171
- } else vocab[i].cn++;
172
- if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
173
- }
174
- SortVocab();
175
- if (debug_mode > 0) {
176
- printf("Vocab size: %lld\n", vocab_size);
177
- printf("Words in train file: %lld\n", train_words);
178
- }
179
- file_size = ftell(fin);
180
- fclose(fin);
181
- }
182
-
183
11
  void SaveVocab(char* save_vocab_file) {
184
12
  long long i;
185
13
  FILE *fo = fopen(save_vocab_file, "wb");
@@ -141,6 +141,22 @@ static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_fil
141
141
  return Qtrue;
142
142
  }
143
143
 
144
+ /*
145
+ * tokenize a file
146
+ * @param [String] rb_train_file_name
147
+ * @param [String] rb_vocab_file_name
148
+ * @param [String] rb_output_file_name
149
+ */
150
+ static VALUE tokenize(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name, VALUE rb_output_file_name) {
151
+ char* train_filename = StringValueCStr(rb_train_file_name);
152
+ char* vocab_filename = StringValueCStr(rb_vocab_file_name);
153
+ char* output_filename = StringValueCStr(rb_output_file_name);
154
+
155
+ word2vec_tokenize(train_filename, vocab_filename, output_filename);
156
+
157
+ return Qtrue;
158
+ }
159
+
144
160
  void Init_word2vec(void) {
145
161
  VALUE mWord2vec = rb_define_module("Word2vec");
146
162
  VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -151,4 +167,5 @@ void Init_word2vec(void) {
151
167
  rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
152
168
  rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
153
169
  rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
170
+ rb_define_singleton_method(mWord2vecModel, "tokenize", tokenize, 3);
154
171
  }
@@ -1,3 +1,3 @@
1
1
  module Word2vec
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word2vec-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dani Vela
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-06-14 00:00:00.000000000 Z
11
+ date: 2021-06-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,6 +80,8 @@ files:
80
80
  - ".gitignore"
81
81
  - ".rspec"
82
82
  - ".travis.yml"
83
+ - ".vscode/c_cpp_properties.json"
84
+ - ".vscode/settings.json"
83
85
  - CHANGELOG
84
86
  - Gemfile
85
87
  - Gemfile.lock
@@ -96,6 +98,7 @@ files:
96
98
  - ext/word2vec/common.h
97
99
  - ext/word2vec/distance.c
98
100
  - ext/word2vec/extconf.rb
101
+ - ext/word2vec/tokenizer.c
99
102
  - ext/word2vec/vocab.c
100
103
  - ext/word2vec/word2vec.c
101
104
  - lib/word2vec.rb