RubyGems - word2vec-rb - Versions diffs - 0.4.0 → 0.5.0 - Mend

word2vec-rb 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/.gitignore +2 -1
data/.vscode/c_cpp_properties.json +20 -0
data/.vscode/settings.json +6 -0
data/Gemfile.lock +1 -1
data/README.md +17 -0
data/ext/word2vec/common.c +198 -1
data/ext/word2vec/common.h +31 -2
data/ext/word2vec/tokenizer.c +19 -0
data/ext/word2vec/vocab.c +0 -172
data/ext/word2vec/word2vec.c +17 -0
data/lib/word2vec/version.rb +1 -1
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f1cc8aad12e17e08da6428945fa3fcdab514739a5ff166588796b0de4535ba10
-  data.tar.gz: 3deaf3e6d6f1bff3e06b6353d8c29591fa00c8f5e8c8f3079f0d9fdf029786b7
+  metadata.gz: 192304d569b2fb573300a33ff7ed05fc37202c30cf55144fc043cc544e620f23
+  data.tar.gz: e5473521b82560242b19ae2efa9c4e07a7798bb3c0ea2e97d97e5f3a1f2601cf
 SHA512:
-  metadata.gz: 916361e4543fadac744e4502f53241b105789bbf928be51d2e17df93b3ac318e2b9125d7146ec67146a591ca87ee7d2928ca15aadb1fabc0b2c31b82e058497b
-  data.tar.gz: 9daa3535f9c23254b5fa45e9759fab036a297ffa491e3fc03287c3c9c072c7b314925843d376e678f1b8f61915839f5e0689e1fa27268704eaf5bd086fdaecc3
+  metadata.gz: db2cc2396b4a7193a0cb4efd40ffefbefabd88861f257aea8ac90d032d9ebac8f0759cd952678174f764cdce2605a94c521e24d9433a8a1eaaf135e5dcc1cf89
+  data.tar.gz: 26949750236e3929f9794967570c3fb257c53474c9d79b857f12d4214ecca075afe4a32dbb5a6e5253b44ea1566a391aff70c9617a2914ee700779e3dd8a4af9

data/.gitignore CHANGED Viewed

@@ -13,4 +13,5 @@
 lib/word2vec/word2vec.bundle
 data/text7
-data/vocab.txt
+data/vocab.txt
+data/tokenized.bin

data/.vscode/c_cpp_properties.json ADDED Viewed

@@ -0,0 +1,20 @@
+{
+    "configurations": [
+        {
+            "name": "Mac",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk/System/Library/Frameworks/Ruby.framework/Versions/2.6/usr/include/ruby-2.6.0/**"
+            ],
+            "defines": [],
+            "macFrameworkPath": [
+                "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
+            ],
+            "compilerPath": "/usr/bin/clang",
+            "cStandard": "c17",
+            "cppStandard": "c++98",
+            "intelliSenseMode": "macos-clang-x64"
+        }
+    ],
+    "version": 4
+}

data/.vscode/settings.json ADDED Viewed

@@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "common.h": "c",
+        "ruby.h": "c"
+    }
+}

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    word2vec-rb (0.4.0)
+    word2vec-rb (0.5.0)
 GEM
   remote: https://rubygems.org/

data/README.md CHANGED Viewed

@@ -78,6 +78,23 @@ Word2vec::Model.build\_vocab("./data/text7", "./data/vocab.txt")
 The output file will have a list of words and its number of appearances separated by line break.
+### Tokenizer: create a binary file by tokenizing an input file
+This method requires a previously created vocabulary file.
+```ruby
+require 'word2vec'
+Word2vec::Model.tokenize("./data/text7", "./data/vocab.txt", "./data/tokenized.bin")
+```
+The output file will contain a sequence of binary identifiers, one for each word of the input file.
+Read output file with:
+    long long id;
+    fread(&id, sizeof(id), 1, fi);
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/ext/word2vec/common.c CHANGED Viewed

@@ -1,5 +1,7 @@
 #include "common.h"
+char train_file[MAX_STRING];
+char read_vocab_file[MAX_STRING];
 // max length of strings
 const unsigned long max_size = 2000;
 // number of closest words that will be shown
@@ -44,4 +46,199 @@ void word2vec_model_load(word2vec_model* model, char* file_name) {
     for (a = 0; a < size; a++) model->vectors[a + b * size] /= len;
   }
   fclose(f);
-}
+}
+// Reads one word from fin into word. Space, tab and newline are word
+// boundaries and carriage returns (13) are skipped. A newline met before any
+// word character yields the marker "</s>"; a newline that ends a word is pushed
+// back so the next call returns "</s>". Over-long words are truncated so they
+// fit a MAX_STRING-byte buffer. At end of input the buffer holds whatever was
+// read so far, NUL-terminated.
+void ReadWord(char *word, FILE *fin) {
+  int a = 0, ch;
+  // Stop on the EOF returned by fgetc itself: testing feof() before the read
+  // lets the EOF value be stored as a spurious last character, and never ends
+  // the loop when fgetc fails with a read error.
+  while ((ch = fgetc(fin)) != EOF) {
+    if (ch == 13) continue;
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+      if (ch == '\n') {
+        strcpy(word, (char *)"</s>");
+        return;
+      } else continue;
+    }
+    word[a] = ch;
+    a++;
+    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
+  }
+  word[a] = 0;
+}
+// Returns the hash slot of a word: a base-257 polynomial over its characters,
+// reduced modulo vocab_hash_size.
+unsigned long long GetWordHash(char *word) {
+  unsigned long long hash = 0;
+  const char *p;
+  // Walk to the terminating NUL instead of calling strlen() on every iteration
+  for (p = word; *p != 0; p++) hash = hash * 257 + *p;
+  return hash % vocab_hash_size;
+}
+// Looks a word up in the vocabulary hash table using linear probing.
+// Returns its index in vocab, or -1 (converted to the unsigned return type)
+// when an empty slot is reached before a match.
+unsigned long long SearchVocab(char *word) {
+  unsigned long long slot;
+  for (slot = GetWordHash(word); vocab_hash[slot] != -1; slot = (slot + 1) % vocab_hash_size) {
+    if (strcmp(word, vocab[vocab_hash[slot]].word) == 0) return vocab_hash[slot];
+  }
+  return -1;
+}
+// Reads the next word from fin and returns its vocabulary index.
+// Returns -1 when the read hit end of file or the word is not in the vocabulary.
+unsigned long long ReadWordIndex(FILE *fin) {
+  char token[MAX_STRING];
+  ReadWord(token, fin);
+  return feof(fin) ? -1 : SearchVocab(token);
+}
+// Appends a word to the vocabulary with a count of 0, registers it in the
+// hash table and returns its index. Words that do not fit in MAX_STRING bytes
+// are stored truncated. Exits the process if memory cannot be allocated.
+unsigned long long AddWordToVocab(char *word) {
+  unsigned long long hash, length = strlen(word) + 1;
+  struct vocab_word *grown;
+  if (length > MAX_STRING) length = MAX_STRING;
+  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
+  if (vocab[vocab_size].word == NULL) {
+    printf("ERROR: out of memory while adding a word to the vocabulary\n");
+    exit(1);
+  }
+  // calloc zero-filled the buffer, so copying length - 1 bytes keeps it
+  // NUL-terminated and never writes past the (possibly clamped) allocation
+  memcpy(vocab[vocab_size].word, word, length - 1);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+  // Reallocate memory if needed
+  if (vocab_size + 2 >= vocab_max_size) {
+    grown = (struct vocab_word *)realloc(vocab, (vocab_max_size + 1000) * sizeof(struct vocab_word));
+    if (grown == NULL) {
+      printf("ERROR: out of memory while growing the vocabulary\n");
+      exit(1);
+    }
+    vocab = grown;
+    vocab_max_size += 1000;
+  }
+  // Hash the stored copy so a truncated word stays consistent with later re-hashing
+  hash = GetWordHash(vocab[vocab_size - 1].word);
+  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+  vocab_hash[hash] = vocab_size - 1;
+  return vocab_size - 1;
+}
+// Shrinks the vocabulary in place: entries whose count is not above min_reduce
+// are freed, the survivors are packed to the front, the hash table is rebuilt
+// and min_reduce is raised by one for the next call.
+void ReduceVocab() {
+  int src, kept = 0;
+  unsigned long long slot;
+  for (src = 0; src < vocab_size; src++) {
+    if (vocab[src].cn <= min_reduce) {
+      free(vocab[src].word);
+      continue;
+    }
+    vocab[kept].cn = vocab[src].cn;
+    vocab[kept].word = vocab[src].word;
+    kept++;
+  }
+  vocab_size = kept;
+  // Indices changed, so clear every slot and insert the survivors again
+  for (src = 0; src < vocab_hash_size; src++) vocab_hash[src] = -1;
+  for (src = 0; src < vocab_size; src++) {
+    slot = GetWordHash(vocab[src].word);
+    while (vocab_hash[slot] != -1) slot = (slot + 1) % vocab_hash_size;
+    vocab_hash[slot] = src;
+  }
+  fflush(stdout);
+  min_reduce++;
+}
+// qsort comparator that orders vocabulary entries by descending count (cn).
+// The counts are compared directly: casting their long long difference to int
+// can wrap and report the wrong order for large counts.
+int VocabCompare(const void *a, const void *b) {
+  long long count_a = ((const struct vocab_word *)a)->cn;
+  long long count_b = ((const struct vocab_word *)b)->cn;
+  return (count_b > count_a) - (count_b < count_a);
+}
+// Sorts the vocabulary by descending count (entry 0 stays in place), discards
+// entries whose count is below min_count, rebuilds the hash table, recomputes
+// train_words as the sum of the retained counts and allocates the code/point
+// buffers of every retained entry.
+void SortVocab() {
+  long long a, size;
+  unsigned long long hash;
+  // Sort the vocabulary and keep </s> at the first position. The guard stops an
+  // empty vocabulary from turning vocab_size - 1 into a huge element count.
+  if (vocab_size > 1) qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  size = vocab_size;
+  train_words = 0;
+  for (a = 0; a < size; a++) {
+    // Words occuring less than min_count times will be discarded from the vocab
+    if ((vocab[a].cn < min_count) && (a != 0)) {
+      vocab_size--;
+      free(vocab[a].word);
+    } else {
+      // Hash will be re-computed, as after the sorting it is not actual
+      hash=GetWordHash(vocab[a].word);
+      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
+      vocab_hash[hash] = a;
+      train_words += vocab[a].cn;
+    }
+  }
+  // Discarded entries sit at the tail after the descending sort, so shrinking
+  // the array drops exactly those
+  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
+  // Allocate memory for the binary tree construction
+  for (a = 0; a < vocab_size; a++) {
+    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
+    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
+  }
+}
+// Learn vocabulary data from file
+// Builds the vocabulary by counting every word of the file named by the global
+// train_file, then sorts and prunes it with SortVocab(). Exits the process if
+// the file cannot be opened.
+void LearnVocabFromTrainFile() {
+  char word[MAX_STRING];
+  FILE *fin;
+  long long a, i;
+  // Mark every hash slot as empty
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  vocab_size = 0;
+  // The sentence marker is added first so that it occupies index 0
+  AddWordToVocab((char *)"</s>");
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    train_words++;
+    // Progress output: word count in thousands followed by a carriage return (13)
+    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
+      printf("%lldK%c", train_words / 1000, 13);
+      fflush(stdout);
+    }
+    i = SearchVocab(word);
+    if (i == -1) {
+      a = AddWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+    // Prune rare words once the hash table is more than 70% full
+    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
+  }
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  // The stream has been read up to end of file here, so ftell reports the
+  // position reached at that point
+  file_size = ftell(fin);
+  fclose(fin);
+}
+// Loads the vocabulary from the file named by the global read_vocab_file, where
+// each entry is a word followed by its count, then sorts and prunes it with
+// SortVocab(). Finally stores the size of the file named by the global
+// train_file in file_size. Exits the process if either file cannot be opened.
+void ReadVocab() {
+  long long a;
+  char c;
+  char word[MAX_STRING];
+  FILE *fin = fopen(read_vocab_file, "rb");
+  if (fin == NULL) {
+    printf("Vocabulary file not found\n");
+    exit(1);
+  }
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+  vocab_size = 0;
+  while (1) {
+    ReadWord(word, fin);
+    if (feof(fin)) break;
+    a = AddWordToVocab(word);
+    // Read the count and consume the single character that follows it
+    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
+  }
+  // Close the vocabulary file before fin is reused for the training file;
+  // otherwise its handle is leaked on every call
+  fclose(fin);
+  SortVocab();
+  if (debug_mode > 0) {
+    printf("Vocab size: %lld\n", vocab_size);
+    printf("Words in train file: %lld\n", train_words);
+  }
+  fin = fopen(train_file, "rb");
+  if (fin == NULL) {
+    printf("ERROR: training data file not found!\n");
+    exit(1);
+  }
+  fseek(fin, 0, SEEK_END);
+  file_size = ftell(fin);
+  fclose(fin);
+}

data/ext/word2vec/common.h CHANGED Viewed

@@ -10,13 +10,26 @@
 #include <sys/types.h>
 #include <ctype.h>
+#define MAX_STRING 100
+#define MAX_CODE_LENGTH 40
 // max length of strings
 extern const unsigned long max_size;
 // number of closest words that will be shown
 extern const long long N;
 // max length of vocabulary entries
 extern const long long max_w;
+// Maximum 30 * 0.7 = 21M words in the vocabulary
+// Number of slots in the vocab_hash table.
+extern const int vocab_hash_size;
+// Open-addressing hash table: each slot holds an index into vocab, or -1 when empty.
+extern long long *vocab_hash;
+// Dynamically allocated array of vocabulary entries.
+extern struct vocab_word *vocab;
+// Path of the training text opened by LearnVocabFromTrainFile() and ReadVocab().
+extern char train_file[];
+// Path of the vocabulary file opened by ReadVocab().
+extern char read_vocab_file[];
+// vocab_max_size: allocated capacity of vocab; vocab_size: number of entries in use.
+extern long long vocab_max_size, vocab_size;
+// train_words: running/summed word count; file_size: value of ftell on the training file.
+extern long long train_words, file_size ;
+// debug_mode: verbosity of progress output; min_count: pruning threshold used by
+// SortVocab(); min_reduce: pruning threshold used (and raised) by ReduceVocab().
+extern int debug_mode, min_count, min_reduce;
 typedef struct word2vec_model_s {
   long long word_count;
   char *vocabulary;  // char *[word_count]
@@ -29,10 +42,26 @@ typedef struct WordSimilarity_s {
   float score;
 } WordSimilarity;
+// One vocabulary entry.
+struct vocab_word {
+  // Number of occurrences counted for the word
+  long long cn;
+  // Buffer of MAX_CODE_LENGTH ints allocated by SortVocab() for the binary tree construction
+  int *point;
+  // word: heap-allocated NUL-terminated text; code: buffer of MAX_CODE_LENGTH chars
+  // allocated by SortVocab(); codelen: presumably the used length of code — TODO confirm
+  char *word, *code, codelen;
+};
+void ReadWord(char *word, FILE *fin);
+unsigned long long GetWordHash(char *word);
+unsigned long long SearchVocab(char *word) ;
+unsigned long long ReadWordIndex(FILE *fin);
+void LearnVocabFromTrainFile();
+void ReadVocab();
+/// main functions
 void word2vec_model_load(word2vec_model* model, char* file_name);
 size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
 size_t word2vec_model_analogy(word2vec_model* model, char* wordx1, char* wordy1, char* wordx2, WordSimilarity word_list[]);
 void word2vec_model_accuracy(word2vec_model* model, char* file_name);
-void word2vec_build_vocab(char* train, char* save_vocab_file);
+void word2vec_build_vocab(char* train_file, char* save_vocab_file);
+void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file);
 #endif /* _WORD2VEC_COMMON_H */

data/ext/word2vec/tokenizer.c ADDED Viewed

@@ -0,0 +1,19 @@
+#include "common.h"
+// Stores the two input paths in the file-scope train_file / read_vocab_file
+// buffers read by ReadVocab(). This lives in its own function because the
+// train_file parameter of word2vec_tokenize() hides the global of the same name.
+// Returns 0 on success, -1 if either path does not fit in MAX_STRING bytes.
+static int set_tokenizer_paths(const char *train_path, const char *vocab_path) {
+  if (strlen(train_path) >= MAX_STRING || strlen(vocab_path) >= MAX_STRING) return -1;
+  strcpy(train_file, train_path);
+  strcpy(read_vocab_file, vocab_path);
+  return 0;
+}
+// Tokenizes a text file into a binary file of vocabulary indices.
+// train_file:  path of the text to tokenize
+// vocab_file:  path of a vocabulary file previously written by build_vocab
+// output_file: path of the binary output; one long long is written per word,
+//              with -1 for words that are not in the vocabulary
+// Errors are reported on stdout; ReadVocab() exits the process when one of the
+// input files is missing.
+void word2vec_tokenize(char* train_file, char* vocab_file, char* output_file) {
+  long long word, a;
+  FILE *fi, *fo;
+  // The original copied the parameter onto itself here, so the global path
+  // used by ReadVocab() was never set
+  if (set_tokenizer_paths(train_file, vocab_file) != 0) {
+    printf("ERROR: file names must be shorter than %d characters\n", MAX_STRING);
+    return;
+  }
+  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash = (long long *)calloc(vocab_hash_size, sizeof(long long));
+  if (vocab == NULL || vocab_hash == NULL) {
+    printf("ERROR: not enough memory for the vocabulary\n");
+    free(vocab);
+    free(vocab_hash);
+    vocab = NULL;
+    vocab_hash = NULL;
+    return;
+  }
+  ReadVocab();
+  fi = fopen(train_file, "rb");
+  fo = fopen(output_file, "wb");
+  if (fi == NULL || fo == NULL) {
+    printf("ERROR: cannot open the input or output file\n");
+  } else {
+    while (1) {
+      word = ReadWordIndex(fi);
+      if (feof(fi)) break;
+      if (fwrite(&word, sizeof(word), 1, fo) != 1) {
+        printf("ERROR: cannot write to the output file\n");
+        break;
+      }
+    }
+  }
+  if (fi != NULL) fclose(fi);
+  if (fo != NULL) fclose(fo);
+  // Release the tables allocated above so repeated calls do not leak them
+  for (a = 0; a < vocab_size; a++) {
+    free(vocab[a].word);
+    free(vocab[a].code);
+    free(vocab[a].point);
+  }
+  free(vocab);
+  free(vocab_hash);
+  vocab = NULL;
+  vocab_hash = NULL;
+  vocab_size = 0;
+}

data/ext/word2vec/vocab.c CHANGED Viewed

@@ -1,185 +1,13 @@
 #include "common.h"
-#define MAX_STRING 100
-#define MAX_CODE_LENGTH 40
 const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary
-struct vocab_word {
-  long long cn;
-  int *point;
-  char *word, *code, codelen;
-};
-char train_file[MAX_STRING];
 struct vocab_word *vocab;
 int debug_mode = 2, min_count = 5, min_reduce = 1;
 long long vocab_max_size = 1000, vocab_size = 0;
 long long *vocab_hash;
 long long train_words = 0, file_size = 0;
-// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
-void ReadWord(char *word, FILE *fin) {
-  int a = 0, ch;
-  while (!feof(fin)) {
-    ch = fgetc(fin);
-    if (ch == 13) continue;
-    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
-      if (a > 0) {
-        if (ch == '\n') ungetc(ch, fin);
-        break;
-      }
-      if (ch == '\n') {
-        strcpy(word, (char *)"</s>");
-        return;
-      } else continue;
-    }
-    word[a] = ch;
-    a++;
-    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
-  }
-  word[a] = 0;
-}
-// Returns hash value of a word
-unsigned long long GetWordHash(char *word) {
-  unsigned long long a, hash = 0;
-  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
-  hash = hash % vocab_hash_size;
-  return hash;
-}
-// Returns position of a word in the vocabulary; if the word is not found, returns -1
-unsigned long long SearchVocab(char *word) {
-  unsigned long long hash = GetWordHash(word);
-  while (1) {
-    if (vocab_hash[hash] == -1) return -1;
-    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
-    hash = (hash + 1) % vocab_hash_size;
-  }
-  return -1;
-}
-// Reads a word and returns its index in the vocabulary
-unsigned long long ReadWordIndex(FILE *fin) {
-  char word[MAX_STRING];
-  ReadWord(word, fin);
-  if (feof(fin)) return -1;
-  return SearchVocab(word);
-}
-// Adds a word to the vocabulary
-unsigned long long AddWordToVocab(char *word) {
-  unsigned long long hash, length = strlen(word) + 1;
-  if (length > MAX_STRING) length = MAX_STRING;
-  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
-  strcpy(vocab[vocab_size].word, word);
-  vocab[vocab_size].cn = 0;
-  vocab_size++;
-  // Reallocate memory if needed
-  if (vocab_size + 2 >= vocab_max_size) {
-    vocab_max_size += 1000;
-    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
-  }
-  hash = GetWordHash(word);
-  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
-  vocab_hash[hash] = vocab_size - 1;
-  return vocab_size - 1;
-}
-// Used later for sorting by word counts
-int VocabCompare(const void *a, const void *b) {
-    return (int)(((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn);
-}
-// Sorts the vocabulary by frequency using word counts
-void SortVocab() {
-  long long a, size;
-  unsigned long long hash;
-  // Sort the vocabulary and keep </s> at the first position
-  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
-  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-  size = vocab_size;
-  train_words = 0;
-  for (a = 0; a < size; a++) {
-    // Words occuring less than min_count times will be discarded from the vocab
-    if ((vocab[a].cn < min_count) && (a != 0)) {
-      vocab_size--;
-      free(vocab[a].word);
-    } else {
-      // Hash will be re-computed, as after the sorting it is not actual
-      hash=GetWordHash(vocab[a].word);
-      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
-      vocab_hash[hash] = a;
-      train_words += vocab[a].cn;
-    }
-  }
-  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
-  // Allocate memory for the binary tree construction
-  for (a = 0; a < vocab_size; a++) {
-    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
-    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
-  }
-}
-// Reduces the vocabulary by removing infrequent tokens
-void ReduceVocab() {
-  int a, b = 0;
-  unsigned long long hash;
-  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
-    vocab[b].cn = vocab[a].cn;
-    vocab[b].word = vocab[a].word;
-    b++;
-  } else free(vocab[a].word);
-  vocab_size = b;
-  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-  for (a = 0; a < vocab_size; a++) {
-    // Hash will be re-computed, as it is not actual
-    hash = GetWordHash(vocab[a].word);
-    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
-    vocab_hash[hash] = a;
-  }
-  fflush(stdout);
-  min_reduce++;
-}
-// Learn vocabulary data from file
-void LearnVocabFromTrainFile() {
-  char word[MAX_STRING];
-  FILE *fin;
-  long long a, i;
-  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-  fin = fopen(train_file, "rb");
-  if (fin == NULL) {
-    printf("ERROR: training data file not found!\n");
-    exit(1);
-  }
-  vocab_size = 0;
-  AddWordToVocab((char *)"</s>");
-  while (1) {
-    ReadWord(word, fin);
-    if (feof(fin)) break;
-    train_words++;
-    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
-      printf("%lldK%c", train_words / 1000, 13);
-      fflush(stdout);
-    }
-    i = SearchVocab(word);
-    if (i == -1) {
-      a = AddWordToVocab(word);
-      vocab[a].cn = 1;
-    } else vocab[i].cn++;
-    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
-  }
-  SortVocab();
-  if (debug_mode > 0) {
-    printf("Vocab size: %lld\n", vocab_size);
-    printf("Words in train file: %lld\n", train_words);
-  }
-  file_size = ftell(fin);
-  fclose(fin);
-}
 void SaveVocab(char* save_vocab_file) {
   long long i;
   FILE *fo = fopen(save_vocab_file, "wb");

data/ext/word2vec/word2vec.c CHANGED Viewed

@@ -141,6 +141,22 @@ static VALUE build_vocab(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_fil
   return Qtrue;
 }
+/*
+ * Ruby binding for word2vec_tokenize: converts the three Ruby strings to C
+ * strings and writes the tokenized output file.
+ * @param [String] rb_train_file_name path of the text to tokenize
+ * @param [String] rb_vocab_file_name path of the vocabulary file
+ * @param [String] rb_output_file_name path of the binary file to write
+ * @return [true]
+ */
+static VALUE tokenize(VALUE mod, VALUE rb_train_file_name, VALUE rb_vocab_file_name, VALUE rb_output_file_name) {
+  char *train_path;
+  char *vocab_path;
+  char *output_path;
+  train_path = StringValueCStr(rb_train_file_name);
+  vocab_path = StringValueCStr(rb_vocab_file_name);
+  output_path = StringValueCStr(rb_output_file_name);
+  word2vec_tokenize(train_path, vocab_path, output_path);
+  return Qtrue;
+}
 void Init_word2vec(void) {
   VALUE mWord2vec = rb_define_module("Word2vec");
   VALUE mWord2vecModel = rb_define_class_under(mWord2vec, "Model", rb_cObject);
@@ -151,4 +167,5 @@ void Init_word2vec(void) {
   rb_define_method(mWord2vecModel, "analogy", model_analogy, 3);
   rb_define_method(mWord2vecModel, "accuracy", model_accuracy, 1);
   rb_define_singleton_method(mWord2vecModel, "build_vocab", build_vocab, 2);
+  rb_define_singleton_method(mWord2vecModel, "tokenize", tokenize, 3);
 }

data/lib/word2vec/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Word2vec
-    VERSION = "0.4.0"
+    VERSION = "0.5.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: word2vec-rb
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Dani Vela
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-06-14 00:00:00.000000000 Z
+date: 2021-06-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -80,6 +80,8 @@ files:
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
+- ".vscode/c_cpp_properties.json"
+- ".vscode/settings.json"
 - CHANGELOG
 - Gemfile
 - Gemfile.lock
@@ -96,6 +98,7 @@ files:
 - ext/word2vec/common.h
 - ext/word2vec/distance.c
 - ext/word2vec/extconf.rb
+- ext/word2vec/tokenizer.c
 - ext/word2vec/vocab.c
 - ext/word2vec/word2vec.c
 - lib/word2vec.rb