RubyGems - word2vec-rb - Versions diffs - 0.3.0 → 0.4.0 - Mend

word2vec-rb 0.3.0 → 0.4.0

Files changed (12) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 52dcba70c88f9dcdc15a0238b8ec6c7fa6d3c7988cc6dab9e2003cc546d0b2ba
-  data.tar.gz: f870194dac4c8577ab031d87728d9fda7cfd909ce169763de9ecde2c7bbd1446
+  metadata.gz: f1cc8aad12e17e08da6428945fa3fcdab514739a5ff166588796b0de4535ba10
+  data.tar.gz: 3deaf3e6d6f1bff3e06b6353d8c29591fa00c8f5e8c8f3079f0d9fdf029786b7
 SHA512:
-  metadata.gz: 55722fb57402098a512d7442302001558229ed293dbc8f60b1820830fecce154c5b5c6067b9c37800f0edbe4ac29d2670e4706575d905f9fe329c0c87544c0b1
-  data.tar.gz: 33950ed6cda995920856f2d3ba41baaa74d70befa2fe0241954fc88b51f897bdd0238b8a468866ce0524f154e65243413df7759fb0291e3b407a55cace66c6e8
+  metadata.gz: 916361e4543fadac744e4502f53241b105789bbf928be51d2e17df93b3ac318e2b9125d7146ec67146a591ca87ee7d2928ca15aadb1fabc0b2c31b82e058497b
+  data.tar.gz: 9daa3535f9c23254b5fa45e9759fab036a297ffa491e3fc03287c3c9c072c7b314925843d376e678f1b8f61915839f5e0689e1fa27268704eaf5bd086fdaecc3

data/.gitignore CHANGED Viewed

@@ -10,4 +10,7 @@
 # rspec failure tracking
 .rspec_status
-lib/word2vec/word2vec.bundle
+lib/word2vec/word2vec.bundle
+data/text7
+data/vocab.txt

data/CHANGELOG CHANGED Viewed

@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.4.0] - 2021-06-14
+### Added
+- Vocabulary. Create a vocabulary file with its number of appearances.
 ## [0.3.0] - 2021-04-26
 ### Added
 - Accuracy. Measure the quality of the vectors.

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    word2vec-rb (0.3.0)
+    word2vec-rb (0.4.0)
 GEM
   remote: https://rubygems.org/

data/README.md CHANGED Viewed

@@ -68,6 +68,16 @@ model.accuracy("./data/questions-words.txt")
 # Outputs the results on terminal
 ```
+### Vocabulary: create a vocabulary file from a train file:
+```ruby
+require 'word2vec'
+Word2vec::Model.build_vocab("./data/text7", "./data/vocab.txt")
+```
+The output file lists each word followed by its number of appearances, one word per line.
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/data/readme.md CHANGED Viewed

@@ -1 +1 @@
-File `minimal.bin` is only for testing purposes.
+Files in this directory are only for testing purposes.

data/ext/word2vec/common.h CHANGED Viewed

@@ -33,5 +33,6 @@ void word2vec_model_load(word2vec_model* model, char* file_name);
 size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
 size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
 void word2vec_model_accuracy(word2vec_model* model, char* file_name);
+void word2vec_build_vocab(char* train, char* save_vocab_file);
 #endif /* _WORD2VEC_COMMON_H */

data/ext/word2vec/vocab.c ADDED Viewed

@@ -0,0 +1,197 @@
+#include "common.h"
+#define MAX_STRING 100
+#define MAX_CODE_LENGTH 40
+const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary
+struct vocab_word {
+  long long cn;
+  int *point;
+  char *word, *code, codelen;
+};
+char train_file[MAX_STRING];
+struct vocab_word *vocab;
+int debug_mode = 2, min_count = 5, min_reduce = 1;
+long long vocab_max_size = 1000, vocab_size = 0;
+long long *vocab_hash;
+long long train_words = 0, file_size = 0;
+// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+// Returns hash value of a word
+unsigned long long GetWordHash(char *word) {
+  unsigned long long a, hash = 0;
+  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
+  hash = hash % vocab_hash_size;
+  return hash;
+}
+// Returns position of a word in the vocabulary; if the word is not found, returns -1
+unsigned long long SearchVocab(char *word) {
+  unsigned long long hash = GetWordHash(word);
+  while (1) {
+    if (vocab_hash[hash] == -1) return -1;
+    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+    hash = (hash + 1) % vocab_hash_size;
+  }
+  return -1;
+}
+// Reads a word and returns its index in the vocabulary
+unsigned long long ReadWordIndex(FILE *fin) {
+  char word[MAX_STRING];
+  ReadWord(word, fin);
+  if (feof(fin)) return -1;
+  return SearchVocab(word);
+}
+// Adds a word to the vocabulary
+unsigned long long AddWordToVocab(char *word) {
+  unsigned long long hash, length = strlen(word) + 1;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    vocab_max_size += 1000;
+    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+  hash = GetWordHash(word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash] = vocab_size - 1;
+  return vocab_size - 1;
+}
+// Used later for sorting by word counts
+int VocabCompare(const void *a, const void *b) {
+    return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
+}
+// Sorts the vocabulary by frequency using word counts
+void SortVocab() {
+  long long a, size;
+  unsigned long long hash;
+  // Sort the vocabulary and keep </s> at the first position
+  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  size = vocab_size;
+  train_words = 0;
+  for (a = 0; a < size; a++) {
+    // Words occurring fewer than min_count times will be discarded from the vocab
+    if ((vocab[a].cn < min_count) && (a != 0)) {
+      vocab_size--;
+      free(vocab[a].word);
+    } else {
+      // Recompute the hash slot: sorting moved the entries, so the old table is stale
+      hash=GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+      train_words += vocab[a].cn;
+    }
+  }
+  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+  // Allocate memory for the binary tree construction
+  for (a = 0; a < vocab_size; a++) {
+    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+  }
+}
+// Reduces the vocabulary by removing infrequent tokens
+void ReduceVocab() {
+  int a, b = 0;
+  unsigned long long hash;
+  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
+    vocab[b].cn = vocab[a].cn;
+    vocab[b].word = vocab[a].word;
+    b++;
+  } else free(vocab[a].word);
+  vocab_size = b;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  for (a = 0; a < vocab_size; a++) {
+    // Recompute the hash slot: entries were compacted, so the old table is stale
+    hash = GetWordHash(vocab[a].word);
+    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+    vocab_hash[hash] = a;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+// Learn vocabulary data from file
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING];
+  FILE *fin;
+  long long a, i;
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    train_words++;
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("%lldK%c", train_words / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  file_size = ftell(fin);
+  fclose(fin);
+}
+void SaveVocab(char* save_vocab_file) {
+  long long i;
+  FILE *fo = fopen(save_vocab_file, "wb");
+  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
+  fclose(fo);
+}
+// Build vocabulary file
+void word2vec_build_vocab(char* train, char* save_vocab_file) {
+  strcpy(train_file, train);
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = calloc(vocab_hash_size, sizeof(long long));
+  LearnVocabFromTrainFile();
+  SaveVocab(save_vocab_file);
+}

data/ext/word2vec/word2vec.c CHANGED Viewed

@@ -127,6 +127,20 @@ static VALUE model_accuracy(VALUE mod, VALUE rb_file_name) {
   return Qtrue;
 }
+/*
+ * Build the vocabulary file from the train file
+ * @param [String] rb_train_file_name
+ * @param [String] rb_vocab_file_name
+ */
+static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name) {
+  char* train_filename = StringValueCStr(rb_train_file_name);
+  char* vocab_filename = StringValueCStr(rb_vocab_file_name);
+  word2vec_build_vocab(train_filename, vocab_filename);
+  return Qtrue;
+}
 void Init_word2vec(void) {
   VALUE mWord2vec = rb_define_module("Word2vec");
   VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -136,4 +150,5 @@ void Init_word2vec(void) {
   rb_define_method(mWord2vecModel, "distance", model_distance, 1);
   rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
   rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
+  rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
 }

data/lib/word2vec/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Word2vec
-    VERSION = "0.3.0"
+    VERSION = "0.4.0"
 end

data/word2vec-rb.gemspec CHANGED Viewed

@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
   spec.summary       = %q{Ruby interface to use word2vec arithmetic.}
   spec.description   = %q{To use this gem is required the file`vectors.bin` where is stored the output of the Google algorithm called `word2vec`. This gem doesn't produce this file. Once produced, this can can load it and use it to calculate some arithmetic operations like distance between words or to calculate the relations between them.'}
   spec.homepage      = "https://github.com/madcato/word2vec-rb"
-  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.7.1")
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = "https://github.com/madcato/word2vec-rb"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: word2vec-rb
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Dani Vela
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-04-26 00:00:00.000000000 Z
+date: 2021-06-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -96,6 +96,7 @@ files:
 - ext/word2vec/common.h
 - ext/word2vec/distance.c
 - ext/word2vec/extconf.rb
+- ext/word2vec/vocab.c
 - ext/word2vec/word2vec.c
 - lib/word2vec.rb
 - lib/word2vec/version.rb
@@ -115,7 +116,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.3.0
+      version: 2.7.1
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="