word2vec-rb 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52dcba70c88f9dcdc15a0238b8ec6c7fa6d3c7988cc6dab9e2003cc546d0b2ba
4
- data.tar.gz: f870194dac4c8577ab031d87728d9fda7cfd909ce169763de9ecde2c7bbd1446
3
+ metadata.gz: f1cc8aad12e17e08da6428945fa3fcdab514739a5ff166588796b0de4535ba10
4
+ data.tar.gz: 3deaf3e6d6f1bff3e06b6353d8c29591fa00c8f5e8c8f3079f0d9fdf029786b7
5
5
  SHA512:
6
- metadata.gz: 55722fb57402098a512d7442302001558229ed293dbc8f60b1820830fecce154c5b5c6067b9c37800f0edbe4ac29d2670e4706575d905f9fe329c0c87544c0b1
7
- data.tar.gz: 33950ed6cda995920856f2d3ba41baaa74d70befa2fe0241954fc88b51f897bdd0238b8a468866ce0524f154e65243413df7759fb0291e3b407a55cace66c6e8
6
+ metadata.gz: 916361e4543fadac744e4502f53241b105789bbf928be51d2e17df93b3ac318e2b9125d7146ec67146a591ca87ee7d2928ca15aadb1fabc0b2c31b82e058497b
7
+ data.tar.gz: 9daa3535f9c23254b5fa45e9759fab036a297ffa491e3fc03287c3c9c072c7b314925843d376e678f1b8f61915839f5e0689e1fa27268704eaf5bd086fdaecc3
data/.gitignore CHANGED
@@ -10,4 +10,7 @@
10
10
  # rspec failure tracking
11
11
  .rspec_status
12
12
 
13
- lib/word2vec/word2vec.bundle
13
+ lib/word2vec/word2vec.bundle
14
+
15
+ data/text7
16
+ data/vocab.txt
data/CHANGELOG CHANGED
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.4.0] - 2021-06-14
10
+ ### Added
11
+ - Vocabulary. Create a vocabulary file listing each word with its number of appearances.
12
+
9
13
  ## [0.3.0] - 2021-04-26
10
14
  ### Added
11
15
  - Accuracy. Measure the quality of the vectors.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- word2vec-rb (0.3.0)
4
+ word2vec-rb (0.4.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -68,6 +68,16 @@ model.accuracy("./data/questions-words.txt")
68
68
  # Outputs the results on terminal
69
69
  ```
70
70
 
71
+ ### Vocabulary: create a vocabulary file from a training file
72
+
73
+ ```ruby
74
+ require 'word2vec'
75
+
76
+ Word2vec::Model.build_vocab("./data/text7", "./data/vocab.txt")
77
+ ```
78
+
79
+ The output file lists each word followed by its number of appearances, one word per line.
80
+
71
81
  ## Development
72
82
 
73
83
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
data/data/readme.md CHANGED
@@ -1 +1 @@
1
- File `minimal.bin` is only for testing purposes.
1
+ Files in this directory are only for testing purposes.
@@ -33,5 +33,6 @@ void word2vec_model_load(word2vec_model* model, char* file_name);
33
33
  size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
34
34
  size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
35
35
  void word2vec_model_accuracy(word2vec_model* model, char* file_name);
36
+ void word2vec_build_vocab(char* train, char* save_vocab_file);
36
37
 
37
38
  #endif /* _WORD2VEC_COMMON_H */
@@ -0,0 +1,197 @@
1
+ #include "common.h"
2
+
3
#define MAX_STRING 100      // Size of the word and file-name buffers, terminator included
#define MAX_CODE_LENGTH 40  // Elements allocated per word for the code / point arrays

// Number of slots in the vocabulary hash table.
// Maximum 30 * 0.7 = 21M words in the vocabulary
const int vocab_hash_size = 30000000;

// One vocabulary entry.
struct vocab_word {
  long long cn;   // Number of appearances of the word
  int *point;     // Allocated in SortVocab (MAX_CODE_LENGTH ints)
  char *word;     // Heap-allocated, NUL-terminated text of the word
  char *code;     // Allocated in SortVocab (MAX_CODE_LENGTH chars)
  char codelen;   // Not used in this file
};

// Path of the training file opened by LearnVocabFromTrainFile
char train_file[MAX_STRING];
// Vocabulary array; grows in steps of 1000 entries
struct vocab_word *vocab;
// Verbosity: > 0 prints a summary, > 1 also prints progress
int debug_mode = 2;
// Words appearing fewer than min_count times are dropped by SortVocab
int min_count = 5;
// Words appearing at most min_reduce times are dropped by ReduceVocab
int min_reduce = 1;
// Allocated capacity of vocab, in entries
long long vocab_max_size = 1000;
// Number of entries currently stored in vocab
long long vocab_size = 0;
// Hash table mapping a slot to an index in vocab; -1 marks an empty slot
long long *vocab_hash;
// Running count of words read from the training file
long long train_words = 0;
// Position reported by ftell once the training file has been read
long long file_size = 0;
20
+
21
+ // Reads a single word from a file, assuming space + tab + EOL to be word boundaries
22
+ void ReadWord(char *word, FILE *fin) {
23
+ int a = 0, ch;
24
+ while (!feof(fin)) {
25
+ ch = fgetc(fin);
26
+ if (ch == 13) continue;
27
+ if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
28
+ if (a > 0) {
29
+ if (ch == '\n') ungetc(ch, fin);
30
+ break;
31
+ }
32
+ if (ch == '\n') {
33
+ strcpy(word, (char *)"</s>");
34
+ return;
35
+ } else continue;
36
+ }
37
+ word[a] = ch;
38
+ a++;
39
+ if (a >= MAX_STRING - 1) a--; // Truncate too long words
40
+ }
41
+ word[a] = 0;
42
+ }
43
+
44
+ // Returns hash value of a word
45
+ unsigned long long GetWordHash(char *word) {
46
+ unsigned long long a, hash = 0;
47
+ for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
48
+ hash = hash % vocab_hash_size;
49
+ return hash;
50
+ }
51
+
52
+ // Returns position of a word in the vocabulary; if the word is not found, returns -1
53
+ unsigned long long SearchVocab(char *word) {
54
+ unsigned long long hash = GetWordHash(word);
55
+ while (1) {
56
+ if (vocab_hash[hash] == -1) return -1;
57
+ if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
58
+ hash = (hash + 1) % vocab_hash_size;
59
+ }
60
+ return -1;
61
+ }
62
+
63
+ // Reads a word and returns its index in the vocabulary
64
+ unsigned long long ReadWordIndex(FILE *fin) {
65
+ char word[MAX_STRING];
66
+ ReadWord(word, fin);
67
+ if (feof(fin)) return -1;
68
+ return SearchVocab(word);
69
+ }
70
+
71
+ // Adds a word to the vocabulary
72
+ unsigned long long AddWordToVocab(char *word) {
73
+ unsigned long long hash, length = strlen(word) + 1;
74
+ if (length > MAX_STRING) length = MAX_STRING;
75
+ vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
76
+ strcpy(vocab[vocab_size].word, word);
77
+ vocab[vocab_size].cn = 0;
78
+ vocab_size++;
79
+ // Reallocate memory if needed
80
+ if (vocab_size + 2 >= vocab_max_size) {
81
+ vocab_max_size += 1000;
82
+ vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
83
+ }
84
+ hash = GetWordHash(word);
85
+ while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
86
+ vocab_hash[hash] = vocab_size - 1;
87
+ return vocab_size - 1;
88
+ }
89
+
90
+ // Used later for sorting by word counts
91
+ int VocabCompare(const void *a, const void *b) {
92
+ return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
93
+ }
94
+
95
+ // Sorts the vocabulary by frequency using word counts
96
+ void SortVocab() {
97
+ long long a, size;
98
+ unsigned long long hash;
99
+ // Sort the vocabulary and keep </s> at the first position
100
+ qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
101
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
102
+ size = vocab_size;
103
+ train_words = 0;
104
+ for (a = 0; a < size; a++) {
105
+ // Words occuring less than min_count times will be discarded from the vocab
106
+ if ((vocab[a].cn < min_count) && (a != 0)) {
107
+ vocab_size--;
108
+ free(vocab[a].word);
109
+ } else {
110
+ // Hash will be re-computed, as after the sorting it is not actual
111
+ hash=GetWordHash(vocab[a].word);
112
+ while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
113
+ vocab_hash[hash] = a;
114
+ train_words += vocab[a].cn;
115
+ }
116
+ }
117
+ vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
118
+ // Allocate memory for the binary tree construction
119
+ for (a = 0; a < vocab_size; a++) {
120
+ vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
121
+ vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
122
+ }
123
+ }
124
+
125
+ // Reduces the vocabulary by removing infrequent tokens
126
+ void ReduceVocab() {
127
+ int a, b = 0;
128
+ unsigned long long hash;
129
+ for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
130
+ vocab[b].cn = vocab[a].cn;
131
+ vocab[b].word = vocab[a].word;
132
+ b++;
133
+ } else free(vocab[a].word);
134
+ vocab_size = b;
135
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
136
+ for (a = 0; a < vocab_size; a++) {
137
+ // Hash will be re-computed, as it is not actual
138
+ hash = GetWordHash(vocab[a].word);
139
+ while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
140
+ vocab_hash[hash] = a;
141
+ }
142
+ fflush(stdout);
143
+ min_reduce++;
144
+ }
145
+
146
+ // Learn vocabulary data from file
147
+ void LearnVocabFromTrainFile() {
148
+ char word[MAX_STRING];
149
+ FILE *fin;
150
+ long long a, i;
151
+ for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
152
+ fin = fopen(train_file, "rb");
153
+ if (fin == NULL) {
154
+ printf("ERROR: training data file not found!\n");
155
+ exit(1);
156
+ }
157
+ vocab_size = 0;
158
+ AddWordToVocab((char *)"</s>");
159
+ while (1) {
160
+ ReadWord(word, fin);
161
+ if (feof(fin)) break;
162
+ train_words++;
163
+ if ((debug_mode > 1) && (train_words % 100000 == 0)) {
164
+ printf("%lldK%c", train_words / 1000, 13);
165
+ fflush(stdout);
166
+ }
167
+ i = SearchVocab(word);
168
+ if (i == -1) {
169
+ a = AddWordToVocab(word);
170
+ vocab[a].cn = 1;
171
+ } else vocab[i].cn++;
172
+ if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
173
+ }
174
+ SortVocab();
175
+ if (debug_mode > 0) {
176
+ printf("Vocab size: %lld\n", vocab_size);
177
+ printf("Words in train file: %lld\n", train_words);
178
+ }
179
+ file_size = ftell(fin);
180
+ fclose(fin);
181
+ }
182
+
183
+ void SaveVocab(char* save_vocab_file) {
184
+ long long i;
185
+ FILE *fo = fopen(save_vocab_file, "wb");
186
+ for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
187
+ fclose(fo);
188
+ }
189
+
190
+ // Build vocabulary file
191
+ void word2vec_build_vocab(char* train, char* save_vocab_file) {
192
+ strcpy(train_file, train);
193
+ vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
194
+ vocab_hash = calloc(vocab_hash_size, sizeof(long long));
195
+ LearnVocabFromTrainFile();
196
+ SaveVocab(save_vocab_file);
197
+ }
@@ -127,6 +127,20 @@ static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
127
127
  return Qtrue;
128
128
  }
129
129
 
130
+ /*
131
+ * build the vocabubaly file from train file
132
+ * @param [String] rb_train_file_name
133
+ * @param [String] rb_vocab_file_name
134
+ */
135
+ static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name) {
136
+ char* train_filename = StringValueCStr(rb_train_file_name);
137
+ char* vocab_filename = StringValueCStr(rb_vocab_file_name);
138
+
139
+ word2vec_build_vocab(train_filename, vocab_filename);
140
+
141
+ return Qtrue;
142
+ }
143
+
130
144
  void Init_word2vec(void) {
131
145
  VALUE mWord2vec = rb_define_module("Word2vec");
132
146
  VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -136,4 +150,5 @@ void Init_word2vec(void) {
136
150
  rb_define_method(mWord2vecModel, "distance", model_distance, 1);
137
151
  rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
138
152
  rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
153
+ rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
139
154
  }
@@ -1,3 +1,3 @@
1
1
  module Word2vec
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/word2vec-rb.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.summary = %q{Ruby interface to use word2vec arithmetic.}
10
10
  spec.description = %q{To use this gem is required the file`vectors.bin` where is stored the output of the Google algorithm called `word2vec`. This gem doesn't produce this file. Once produced, this can can load it and use it to calculate some arithmetic operations like distance between words or to calculate the relations between them.'}
11
11
  spec.homepage = "https://github.com/madcato/word2vec-rb"
12
- spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
12
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.7.1")
13
13
 
14
14
  spec.metadata["homepage_uri"] = spec.homepage
15
15
  spec.metadata["source_code_uri"] = "https://github.com/madcato/word2vec-rb"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: word2vec-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dani Vela
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-04-26 00:00:00.000000000 Z
11
+ date: 2021-06-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -96,6 +96,7 @@ files:
96
96
  - ext/word2vec/common.h
97
97
  - ext/word2vec/distance.c
98
98
  - ext/word2vec/extconf.rb
99
+ - ext/word2vec/vocab.c
99
100
  - ext/word2vec/word2vec.c
100
101
  - lib/word2vec.rb
101
102
  - lib/word2vec/version.rb
@@ -115,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
115
116
  requirements:
116
117
  - - ">="
117
118
  - !ruby/object:Gem::Version
118
- version: 2.3.0
119
+ version: 2.7.1
119
120
  required_rubygems_version: !ruby/object:Gem::Requirement
120
121
  requirements:
121
122
  - - ">="