word2vec-rb 0.3.0 → 0.4.0
- checksums.yaml +4 -4
- data/.gitignore +4 -1
- data/CHANGELOG +4 -0
- data/Gemfile.lock +1 -1
- data/README.md +10 -0
- data/data/readme.md +1 -1
- data/ext/word2vec/common.h +1 -0
- data/ext/word2vec/vocab.c +197 -0
- data/ext/word2vec/word2vec.c +15 -0
- data/lib/word2vec/version.rb +1 -1
- data/word2vec-rb.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f1cc8aad12e17e08da6428945fa3fcdab514739a5ff166588796b0de4535ba10
+  data.tar.gz: 3deaf3e6d6f1bff3e06b6353d8c29591fa00c8f5e8c8f3079f0d9fdf029786b7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 916361e4543fadac744e4502f53241b105789bbf928be51d2e17df93b3ac318e2b9125d7146ec67146a591ca87ee7d2928ca15aadb1fabc0b2c31b82e058497b
+  data.tar.gz: 9daa3535f9c23254b5fa45e9759fab036a297ffa491e3fc03287c3c9c072c7b314925843d376e678f1b8f61915839f5e0689e1fa27268704eaf5bd086fdaecc3
data/.gitignore
CHANGED
data/CHANGELOG
CHANGED
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.0] - 2021-06-14
+### Added
+- Vocabulary. Create a vocabulary file with its number of appearances.
+
 ## [0.3.0] - 2021-04-26
 ### Added
 - Accuracy. Meassure the quality of the vectors.
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -68,6 +68,16 @@ model.accuracy("./data/questions-words.txt")
 # Outputs the results on terminal
 ```
 
+### Vocabulary: create a vocabulary file from a train file:
+
+```ruby
+require 'word2vec'
+
+Word2vec::Model.build\_vocab("./data/text7", "./data/vocab.txt")
+```
+
+The output file will have a list of words and its number of appearances separated by line break.
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
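The vocabulary file introduced in this release is plain text: one `word count` pair per line, written by `SaveVocab` in `ext/word2vec/vocab.c` (shown below), with the sentence marker `</s>` on the first line and the remaining words in descending frequency order. A minimal sketch of reading that file back in Ruby; the path and variable names are illustrative, not part of the gem's API:

```ruby
# Parse the "word count" lines produced by Word2vec::Model.build_vocab.
# "./data/vocab.txt" is just an example path.
counts = {}
File.foreach("./data/vocab.txt") do |line|
  word, count = line.split(" ", 2)
  counts[word] = Integer(count) if word && count
end

# Insertion order is preserved: "</s>" first, then words by descending frequency.
p counts.first(5)
```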
data/data/readme.md
CHANGED
@@ -1 +1 @@
-
+Files in this directory are only for testing purposes.
data/ext/word2vec/common.h
CHANGED
@@ -33,5 +33,6 @@ void word2vec_model_load(word2vec_model* model, char* file_name);
 size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
 size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
 void word2vec_model_accuracy(word2vec_model* model, char* file_name);
+void word2vec_build_vocab(char* train, char* save_vocab_file);
 
 #endif /* _WORD2VEC_COMMON_H */
data/ext/word2vec/vocab.c
ADDED
@@ -0,0 +1,197 @@
+#include "common.h"
+
+#define MAX_STRING 100
+#define MAX_CODE_LENGTH 40
+
+const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary
+
+struct vocab_word {
+  long long cn;
+  int *point;
+  char *word, *code, codelen;
+};
+
+char train_file[MAX_STRING];
+struct vocab_word *vocab;
+int debug_mode = 2, min_count = 5, min_reduce = 1;
+long long vocab_max_size = 1000, vocab_size = 0;
+long long *vocab_hash;
+long long train_words = 0, file_size = 0;
+
+// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+
+// Returns hash value of a word
+unsigned long long GetWordHash(char *word) {
+  unsigned long long a, hash = 0;
+  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+  hash = hash % vocab_hash_size;
+  return hash;
+}
+
+// Returns position of a word in the vocabulary; if the word is not found, returns -1
+unsigned long long SearchVocab(char *word) {
+  unsigned long long hash = GetWordHash(word);
+  while (1) {
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+    hash = (hash + 1) % vocab_hash_size;
+  }
+  return -1;
+}
+
+// Reads a word and returns its index in the vocabulary
+unsigned long long ReadWordIndex(FILE *fin) {
+  char word[MAX_STRING];
+  ReadWord(word, fin);
+  if (feof(fin)) return -1;
+  return SearchVocab(word);
+}
+
+// Adds a word to the vocabulary
+unsigned long long AddWordToVocab(char *word) {
+  unsigned long long hash, length = strlen(word) + 1;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    vocab_max_size += 1000;
+    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+  hash = GetWordHash(word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash] = vocab_size - 1;
+  return vocab_size - 1;
+}
+
+// Used later for sorting by word counts
+int VocabCompare(const void *a, const void *b) {
+    return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
+}
+
+// Sorts the vocabulary by frequency using word counts
+void SortVocab() {
+  long long a, size;
+  unsigned long long hash;
+  // Sort the vocabulary and keep </s> at the first position
+  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  size = vocab_size;
+  train_words = 0;
+  for (a = 0; a < size; a++) {
+    // Words occuring less than min_count times will be discarded from the vocab
+    if ((vocab[a].cn < min_count) && (a != 0)) {
+      vocab_size--;
+      free(vocab[a].word);
+    } else {
+      // Hash will be re-computed, as after the sorting it is not actual
+      hash=GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+      train_words += vocab[a].cn;
+    }
+  }
+  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+  // Allocate memory for the binary tree construction
+  for (a = 0; a < vocab_size; a++) {
+    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+  }
+}
+
+// Reduces the vocabulary by removing infrequent tokens
+void ReduceVocab() {
+  int a, b = 0;
+  unsigned long long hash;
+  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+    vocab[b].cn = vocab[a].cn;
+    vocab[b].word = vocab[a].word;
+    b++;
+  } else free(vocab[a].word);
+  vocab_size = b;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Hash will be re-computed, as it is not actual
+    hash = GetWordHash(vocab[a].word);
+    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+    vocab_hash[hash] = a;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+
+// Learn vocabulary data from file
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING];
+  FILE *fin;
+  long long a, i;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    train_words++;
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("%lldK%c", train_words / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  file_size = ftell(fin);
+  fclose(fin);
+}
+
+void SaveVocab(char* save_vocab_file) {
+  long long i;
+  FILE *fo = fopen(save_vocab_file, "wb");
+  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
+  fclose(fo);
+}
+
+// Build vocabulary file
+void word2vec_build_vocab(char* train, char* save_vocab_file) {
+  strcpy(train_file, train);
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = calloc(vocab_hash_size, sizeof(long long));
+  LearnVocabFromTrainFile();
+  SaveVocab(save_vocab_file);
+}
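This new file is adapted from the vocabulary-building part of the original word2vec tool: it counts whitespace-delimited tokens in an open-addressing hash table, maps each newline to the `</s>` marker, prunes words seen fewer than `min_count` (5) times, and sorts the rest by descending frequency with `</s>` kept at position 0 before `SaveVocab` writes one `word count` pair per line. As a rough mental model only, and not code the gem ships, the same counting and filtering can be sketched in Ruby:

```ruby
# Illustrative Ruby equivalent of what LearnVocabFromTrainFile, SortVocab and
# SaveVocab do together. It is a simplification (no open-addressing hash table,
# no ReduceVocab pruning pass) meant only to show what ends up in the vocab file.
MIN_COUNT = 5 # mirrors `min_count` in vocab.c

def build_vocab_sketch(train_path, vocab_path)
  counts = Hash.new(0)
  File.foreach(train_path) do |line|
    line.split.each { |word| counts[word] += 1 } # whitespace-delimited tokens
    counts["</s>"] += 1                          # each newline counts as </s>
  end
  kept = counts.select { |word, c| word == "</s>" || c >= MIN_COUNT }
  words = kept.keys - ["</s>"]
  words.sort_by! { |word| -kept[word] }          # most frequent first
  File.open(vocab_path, "w") do |f|
    f.puts("</s> #{kept['</s>']}")               # </s> stays at the first position
    words.each { |word| f.puts("#{word} #{kept[word]}") }
  end
end
```

The C version additionally calls `ReduceVocab` whenever the table grows past 70% of `vocab_hash_size` (raising `min_reduce` each time), a memory safeguard for very large corpora that the sketch leaves out.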
data/ext/word2vec/word2vec.c
CHANGED
@@ -127,6 +127,20 @@ static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
   return Qtrue;
 }
 
+/*
+ * build the vocabubaly file from train file
+ * @param [String] rb_train_file_name
+ * @param [String] rb_vocab_file_name
+ */
+static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name) {
+  char* train_filename = StringValueCStr(rb_train_file_name);
+  char* vocab_filename = StringValueCStr(rb_vocab_file_name);
+
+  word2vec_build_vocab(train_filename, vocab_filename);
+
+  return Qtrue;
+}
+
 void Init_word2vec(void) {
   VALUE mWord2vec = rb_define_module("Word2vec");
   VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -136,4 +150,5 @@ void Init_word2vec(void) {
   rb_define_method(mWord2vecModel, "distance", model_distance, 1);
   rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
   rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
+  rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
 }
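Because `build_vocab` is registered with `rb_define_singleton_method`, it is called on the `Word2vec::Model` class itself (no model needs to be loaded first), takes two string arguments, and returns `true` once `word2vec_build_vocab` has written the file. A short usage sketch with illustrative paths:

```ruby
require 'word2vec'

# Both paths are examples only: any readable training corpus and any writable
# destination for the vocabulary file will do.
ok = Word2vec::Model.build_vocab("./data/text7", "./data/vocab.txt")
puts "vocabulary file written" if ok
```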
data/lib/word2vec/version.rb
CHANGED
data/word2vec-rb.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
   spec.summary = %q{Ruby interface to use word2vec arithmetic.}
   spec.description = %q{To use this gem is required the file`vectors.bin` where is stored the output of the Google algorithm called `word2vec`. This gem doesn't produce this file. Once produced, this can can load it and use it to calculate some arithmetic operations like distance between words or to calculate the relations between them.'}
   spec.homepage = "https://github.com/madcato/word2vec-rb"
-  spec.required_ruby_version = Gem::Requirement.new(">= 2.
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.1")
 
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = "https://github.com/madcato/word2vec-rb"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: word2vec-rb
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Dani Vela
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-
+date: 2021-06-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -96,6 +96,7 @@ files:
 - ext/word2vec/common.h
 - ext/word2vec/distance.c
 - ext/word2vec/extconf.rb
+- ext/word2vec/vocab.c
 - ext/word2vec/word2vec.c
 - lib/word2vec.rb
 - lib/word2vec/version.rb
@@ -115,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.
+      version: 2.7.1
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="