RubyGems - word2vec-rb - Versions diffs - 0.1.0 - Mend

word2vec-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 99c301d5325561032f1d8b71b39387708b957b4e656ed449b943f0debb3c015b
+  data.tar.gz: b4dfc40380ed5b7c505aef4c403b6234202732ffc89e6593d0d1f274846addae
+SHA512:
+  metadata.gz: 40a8267dff46b2db00b8734f3a4c6bb2c245955047de3971253af6222b2a41ed1dfde25eb436707ba409a67bdb6e4e580f641277d34cfc5b9be8c6d649a8485d
+  data.tar.gz: 64306bf16b1e6b2f60fa7560e96e370e912d7d01498c6b464c90220bc0b534760ffc5002d8ac3c73ccbaa1a6ce0de67b9a1c4b3afb12b019b0433541e9964276

data/.gitignore ADDED Viewed

@@ -0,0 +1,13 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+# rspec failure tracking
+.rspec_status
+lib/word2vec/word2vec.bundle

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.travis.yml ADDED Viewed

@@ -0,0 +1,6 @@
+---
+language: ruby
+cache: bundler
+rvm:
+  - 2.7.1
+before_install: gem install bundler -v 2.1.4

data/CHANGELOG ADDED Viewed

@@ -0,0 +1,16 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [0.1.0] - 2021-04-25
+### Added
+- Load word2vec model from bin file.
+- Find the nearest words to a given one.
+### Changed
+### Fixed
+### Removed

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source "https://rubygems.org"
+# Specify your gem's dependencies in word2vec-rb.gemspec
+gemspec

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,38 @@
+PATH
+  remote: .
+  specs:
+    word2vec-rb (0.1.0)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.4.4)
+    rake (12.3.3)
+    rake-compiler (1.1.1)
+      rake
+    rspec (3.10.0)
+      rspec-core (~> 3.10.0)
+      rspec-expectations (~> 3.10.0)
+      rspec-mocks (~> 3.10.0)
+    rspec-core (3.10.1)
+      rspec-support (~> 3.10.0)
+    rspec-expectations (3.10.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.10.0)
+    rspec-mocks (3.10.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.10.0)
+    rspec-support (3.10.2)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 2.1.0)
+  rake (~> 12.0)
+  rake-compiler (~> 1.0)
+  rspec (~> 3.0)
+  word2vec-rb!
+BUNDLED WITH
+   2.1.4

data/README.md ADDED Viewed

@@ -0,0 +1,55 @@
+# word2vec-rb
+Gem using word2vec functionality from https://code.google.com/archive/p/word2vec/
+This gem was developed using the `.c` files of the Google word2vec as base. Mostly by applying copy-and-paste.
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'word2vec-rb'
+```
+And then execute:
+    $ bundle install
+Or install it yourself as:
+    $ gem install word2vec-rb
+## Usage
+### To find the nearest words, try:
+    require 'word2vec'
+    model = Word2vec::Model.load("./data/minimal.bin")
+    words = model.distance("from")
+    words.each do |w|
+      puts "#{w.first} #{w.last}"
+    end
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+### Build gem
+    $ rake build
+### Launch tests
+    $ rake spec
+### Build extension
+    $ rake compile
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/madcato/word2vec-rb.

data/Rakefile ADDED Viewed

@@ -0,0 +1,12 @@
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec
+require "rake/extensiontask"
+Rake::ExtensionTask.new "word2vec" do |ext|
+  ext.lib_dir = "lib/word2vec"
+end

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "word2vec"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start(__FILE__)

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/data/minimal.bin ADDED Viewed

Binary file

data/data/readme.md ADDED Viewed

	@@ -0,0 +1 @@
1	+ File `minimal.bin` is only for testing purposes.

data/ext/word2vec/common.c ADDED Viewed

@@ -0,0 +1,117 @@
+#include "common.h"
+// max length of strings
+const long long max_size = 2000;
+// number of closest words that will be shown
+const long long N = 40;
+// max length of vocabulary entries
+const long long max_w = 50;
+// Load the binary vector file generated by the word2vec project.
+//
+// Layout read here: a text header "<word_count> <vector_dim>", then one record
+// per word made of the word, a single space and vector_dim raw `float` values.
+// Every vector is scaled to unit length while loading.
+//
+// @param model      destination; its vocabulary/vectors buffers are allocated here
+// @param file_name  path of the model file
+// Raises ArgumentError when the file cannot be opened, the header is invalid or
+// the file ends before all vectors were read. On those errors the file is closed
+// and any buffer allocated here is released before raising.
+void word2vec_model_load(word2vec_model* model, char* file_name) {
+  FILE *f = fopen(file_name, "rb");
+  if (f == NULL) {
+    rb_raise(rb_eArgError, "file not found");
+  }
+  long long words = 0;
+  long long size = 0;
+  bool header_ok = fscanf(f, "%lld", &words) == 1 && fscanf(f, "%lld", &size) == 1;
+  // Negative counts, or counts whose products overflow long long, would
+  // corrupt the allocation sizes computed below.
+  if (header_ok && (words < 0 || size < 0 || words > LLONG_MAX / max_w ||
+                    (size > 0 && words > LLONG_MAX / size))) {
+    header_ok = false;
+  }
+  if (!header_ok) {
+    fclose(f);
+    rb_raise(rb_eArgError, "invalid word2vec file header");
+  }
+  model->word_count = words;
+  model->vector_dim = size;
+  // ALLOC_N/ZALLOC_N raise NoMemError themselves instead of returning NULL,
+  // so no NULL check is needed (or reachable) here.
+  // NOTE(review): if they do raise, `f` stays open; wrapping the body in
+  // rb_ensure would close it.
+  model->vocabulary = ZALLOC_N(char, words * max_w);
+  model->vectors = ALLOC_N(float, words * size);
+  for (long long b = 0; b < words; b++) {
+    char *entry = &model->vocabulary[b * max_w];
+    long long a = 0;
+    while (true) {
+      // Keep the result as int so a 0xFF byte is not mistaken for EOF.
+      int ch = fgetc(f);
+      if (ch == EOF || ch == ' ') break;
+      // Drop newlines (record separators) and truncate overlong words so the
+      // terminator below always stays inside this word's slot.
+      if (ch != '\n' && a < max_w - 1) entry[a++] = (char)ch;
+    }
+    entry[a] = 0;
+    float *row = &model->vectors[b * size];
+    if (fread(row, sizeof(float), (size_t)size, f) != (size_t)size) {
+      fclose(f);
+      xfree(model->vocabulary);
+      xfree(model->vectors);
+      model->vocabulary = NULL;
+      model->vectors = NULL;
+      model->word_count = 0;
+      model->vector_dim = 0;
+      rb_raise(rb_eArgError, "unexpected end of file while reading vectors");
+    }
+    float len = 0;
+    for (a = 0; a < size; a++) len += row[a] * row[a];
+    len = sqrt(len);
+    for (a = 0; a < size; a++) row[a] /= len;
+  }
+  fclose(f);
+}
+// Find the words nearest to `word` in the model.
+//
+// Scores every other vocabulary entry by the dot product between its vector
+// and the normalized vector of `word`, and keeps the N best, best first.
+//
+// @param model      loaded model
+// @param word       word to look up; must be present in the vocabulary
+// @param word_list  caller-provided array with room for N entries; each filled
+//                   entry gets the byte offset of the word inside
+//                   model->vocabulary and its score
+// @return number of entries written to word_list (at most N; fewer when the
+//         vocabulary holds fewer candidates)
+// Raises ArgumentError when `word` is too long or not in the vocabulary.
+size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]) {
+  if (strlen(word) >= max_size) {
+    rb_raise(rb_eArgError, "word must be %lld character max size", max_size);
+  }
+  const long long size = model->vector_dim;
+  const long long words = model->word_count;
+  long long bi = -1;
+  for (long long b = 0; b < words; b++) {
+    if (strcmp(&model->vocabulary[b * max_w], word) == 0) {
+      bi = b;
+      break;
+    }
+  }
+  // All validation happens before the allocation below, so nothing is held
+  // when rb_raise unwinds.
+  if (bi == -1) {
+    rb_raise(rb_eArgError, "Out of dictionary word!");
+  }
+  // Sized from the model instead of a fixed stack array, so vector_dim is not
+  // limited to max_size.
+  float *vec = ALLOC_N(float, size);
+  float len = 0;
+  for (long long a = 0; a < size; a++) {
+    vec[a] = model->vectors[a + bi * size];
+    len += vec[a] * vec[a];
+  }
+  len = sqrt(len);
+  for (long long a = 0; a < size; a++) vec[a] /= len;
+  // word_list[0..count) is kept sorted by descending score.
+  size_t count = 0;
+  for (long long c = 0; c < words; c++) {
+    if (c == bi) continue;
+    const float *row = &model->vectors[c * size];
+    float dist = 0;
+    for (long long a = 0; a < size; a++) dist += vec[a] * row[a];
+    // Only scores above -1 are ranked; this also discards NaN scores.
+    if (!(dist > -1)) continue;
+    size_t pos = 0;
+    while (pos < count && !(dist > word_list[pos].score)) pos++;
+    if (pos == (size_t)N) continue;
+    if (count < (size_t)N) count++;
+    // Shift whole entries (index and score together) to make room at pos.
+    for (size_t d = count - 1; d > pos; d--) word_list[d] = word_list[d - 1];
+    word_list[pos].index = (size_t)(c * max_w);
+    word_list[pos].score = dist;
+  }
+  xfree(vec);
+  return count;
+}

data/ext/word2vec/common.h ADDED Viewed

@@ -0,0 +1,29 @@
+#ifndef _WORD2VEC_COMMON_H
+#define _WORD2VEC_COMMON_H
+#include <ruby.h>
+#include <ruby/io.h>
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+extern const long long N;  // number of closest words that will be shown
+typedef struct word2vec_model_s {  // in-memory word2vec model filled by word2vec_model_load
+  long long word_count;  // number of vocabulary entries, read from the file header
+  char *vocabulary;  // one flat buffer: a fixed-width, NUL-terminated slot per word (slot width is max_w in common.c)
+  long long vector_dim;  // number of floats per word vector, read from the file header
+  float *vectors;  // float[word_count][vector_dim], row-major; each row is scaled to unit length at load time
+} word2vec_model;
+typedef struct WordSimilarity_s {  // one result entry produced by word2vec_model_distance
+  size_t index;  // used by callers as a byte offset into word2vec_model.vocabulary
+  float score;  // dot product between the query vector and this word's vector
+} WordSimilarity;
+void word2vec_model_load(word2vec_model* model, char* file_name);
+size_t word2vec_model_distance(word2vec_model* model, char* word, WordSimilarity word_list[]);
+#endif /* _WORD2VEC_COMMON_H */

data/ext/word2vec/extconf.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require "mkmf"
+additional_prefixed_cflags = %w(-std=gnu99)
+additional_suffixed_cflags = %w(-Wno-declaration-after-statement)
+additional_prefixed_ldflags = %w()
+#
+# Add some additional development-oriented warning flags. Enable by compiling with:
+#
+#     rake compile -- --enable-development
+#
+# or, if using with the actual gem (for whatever reason):
+#
+#     gem install word2vec-rb -- --enable-development
+#
+if enable_config("development")
+  additional_prefixed_cflags = [*additional_prefixed_cflags, *%w(-Wall -Wextra -Werror)]
+end
+#
+# Use `clang`'s [AddressSanitizer](http://clang.llvm.org/docs/AddressSanitizer.html). Enable by compiling with:
+#
+#     rake compile -- --enable-address-sanitizer
+#
+if enable_config("address-sanitizer")
+  additional_prefixed_cflags = [*additional_prefixed_cflags, "-fsanitize=address"]
+  additional_prefixed_ldflags = [*additional_prefixed_ldflags, "-fsanitize=address"]
+end
+unless (new_prefixed_cflags = additional_prefixed_cflags - $CFLAGS.split(/\s+/)).empty?
+  $CFLAGS.prepend(new_prefixed_cflags.join(" ") << " ")
+end
+unless (new_suffixed_cflags = additional_suffixed_cflags - $CFLAGS.split(/\s+/)).empty?
+  $CFLAGS << " " << new_suffixed_cflags.join(" ")
+end
+unless (new_prefixed_ldflags = additional_prefixed_ldflags - $LDFLAGS.split(/\s+/)).empty?
+  $LDFLAGS.prepend(new_prefixed_ldflags.join(" ") << " ")
+end
+## Check existence of functions before build
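+# Check for the POSIX [`getdelim`](http://pubs.opengroup.org/onlinepubs/9699919799/functions/getdelim.html) function. NOTE(review): none of the C sources listed for this extension call getdelim; confirm this check is still needed.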
+abort "missing getdelim()" unless have_func("getdelim")
+create_makefile "word2vec/word2vec"

data/ext/word2vec/word2vec.c ADDED Viewed

@@ -0,0 +1,93 @@
+#include "common.h"
+/*
+ * model_deallocate
+ * free callback registered with Data_Wrap_Struct: releases both model
+ * buffers and then the model struct itself
+ */
+static void model_deallocate(word2vec_model *model) {
+  if (model == NULL) {
+    return;
+  }
+  char *vocabulary = model->vocabulary;
+  float *vectors = model->vectors;
+  if (vocabulary != NULL) xfree(vocabulary);
+  if (vectors != NULL) xfree(vectors);
+  xfree(model);
+}
+/*
+ * model_load
+ * load the vectors.bin file from disc
+ * @param [String] rb_filename
+ * @return the wrapped model object
+ * Exceptions raised while converting the filename or reading the file
+ * propagate to the caller.
+ */
+static VALUE model_load(VALUE mod, VALUE rb_filename) {
+  // Convert first: StringValueCStr may raise, and nothing is allocated yet.
+  char* filename = StringValueCStr(rb_filename);
+  word2vec_model* model = ZALLOC(word2vec_model);
+  // Wrap before loading so the GC owns the struct (and whatever buffers the
+  // loader attached to it) if word2vec_model_load raises.
+  VALUE rb_model = Data_Wrap_Struct(mod, NULL, model_deallocate, model);
+  word2vec_model_load(model, filename);
+  return rb_model;
+}
+/*
+ * model vocabulary length
+ * @return [Integer]
+ */
+static VALUE model_word_count(VALUE mod) {
+  word2vec_model *model;
+  Data_Get_Struct(mod, word2vec_model, model);
+  // word_count is a long long, so convert with LL2NUM rather than SIZET2NUM.
+  return LL2NUM(model->word_count);
+}
+/*
+ * model vector dimensionality
+ * @return [Integer]
+ */
+static VALUE model_vector_dim(VALUE mod) {
+  word2vec_model *model;
+  Data_Get_Struct(mod, word2vec_model, model);
+  // vector_dim is a long long, so convert with LL2NUM rather than SIZET2NUM.
+  return LL2NUM(model->vector_dim);
+}
+/*
+ * model find the nearest distance words
+ * @param [String] rb_word
+ * @return [Hash<String, Float>] neighbour word => similarity score
+ * Raises ArgumentError (from word2vec_model_distance) when the word is too
+ * long or not in the vocabulary.
+ */
+static VALUE model_distance(VALUE mod, VALUE rb_word) {
+  word2vec_model *model;
+  Data_Get_Struct(mod, word2vec_model, model);
+  char* word = StringValueCStr(rb_word);
+  WordSimilarity word_list[N];
+  size_t word_count = word2vec_model_distance(model, word, word_list);
+  VALUE rb_ret = rb_hash_new();
+  for (size_t i = 0 ; i < word_count ; i++) {
+    // Ranked entries always score above -1; a slot at or below that was never
+    // filled and its index must not be dereferenced. (The former
+    // `index >= 0` test was always true for a size_t.)
+    if (word_list[i].score <= -1.0f) continue;
+    size_t index = word_list[i].index;
+    VALUE rb_neighbor = rb_str_freeze(rb_utf8_str_new_cstr(&model->vocabulary[index]));
+    VALUE rb_score = DBL2NUM(word_list[i].score);
+    rb_hash_aset(rb_ret, rb_neighbor, rb_score);
+  }
+  return rb_ret;
+}
+/* Defines the Word2vec module, its Model class and the Model methods. */
+void Init_word2vec(void) {
+  VALUE rb_module = rb_define_module("Word2vec");
+  VALUE rb_model_class = rb_define_class_under(rb_module, "Model", rb_cObject);
+  /* Class-level loader: Word2vec::Model.load(filename) */
+  rb_define_singleton_method(rb_model_class, "load", model_load, 1);
+  /* Instance methods */
+  rb_define_method(rb_model_class, "word_count", model_word_count, 0);
+  rb_define_method(rb_model_class, "vector_dim", model_vector_dim, 0);
+  rb_define_method(rb_model_class, "distance", model_distance, 1);
+}

data/lib/word2vec.rb ADDED Viewed

@@ -0,0 +1,23 @@
+require "word2vec/version"
+module Word2vec
+  class Error < StandardError; end
+  # NOTE(review): Hola below looks like leftover tutorial code; `Translator` is not defined in any file visible here, so Hola.hi presumably raises NameError - confirm and remove.
+  class Hola
+    # Say hi to the world!
+    #
+    # Example:
+    #   >> Hola.hi("spanish")
+    #   => hola mundo
+    #
+    # Arguments:
+    #   language: (String)
+    def self.hi(language = "english")
+      translator = Translator.new(language)
+      puts translator.hi
+    end
+  end
+end
+require "word2vec/word2vec"

data/lib/word2vec/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Word2vec
+    VERSION = "0.1.0"
+end

data/word2vec-rb.gemspec ADDED Viewed

@@ -0,0 +1,34 @@
+require_relative 'lib/word2vec/version'
+Gem::Specification.new do |spec|
+  spec.name          = "word2vec-rb"
+  spec.version       = Word2vec::VERSION
+  spec.authors       = ["Dani Vela"]
+  spec.email         = ["veladan@me.com"]
+  spec.summary       = %q{Ruby interface to use word2vec arithmetic.}
+  spec.description   = %q{To use this gem is required the file`vectors.bin` where is stored the output of the Google algorithm called `word2vec`. This gem doesn't produce this file. Once produced, this can can load it and use it to calculate some arithmetic operations like distance between words or to calculate the relations between them.'}
+  spec.homepage      = "https://github.com/madcato/word2vec-rb"
+  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = "https://github.com/madcato/word2vec-rb"
+  spec.metadata["changelog_uri"] = "http://github.com/macato/word2vec-rb/CHANGELOG"
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  end
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.license       = 'MIT'
+  spec.extensions = %w[ext/word2vec/extconf.rb]
+  spec.add_development_dependency "bundler", "~> 2.1.0"
+  spec.add_development_dependency "rake", "~> 12.0"
+  spec.add_development_dependency "rake-compiler", "~> 1.0"
+  spec.add_development_dependency "rspec", "~> 3.0"
+end

metadata ADDED Viewed

@@ -0,0 +1,125 @@
+--- !ruby/object:Gem::Specification
+name: word2vec-rb
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Dani Vela
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2021-04-26 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.1.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.1.0
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '12.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '12.0'
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+description: To use this gem is required the file`vectors.bin` where is stored the
+  output of the Google algorithm called `word2vec`. This gem doesn't produce this
+  file. Once produced, this can can load it and use it to calculate some arithmetic
+  operations like distance between words or to calculate the relations between them.'
+email:
+- veladan@me.com
+executables: []
+extensions:
+- ext/word2vec/extconf.rb
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- CHANGELOG
+- Gemfile
+- Gemfile.lock
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- data/minimal.bin
+- data/readme.md
+- ext/word2vec/common.c
+- ext/word2vec/common.h
+- ext/word2vec/extconf.rb
+- ext/word2vec/word2vec.c
+- lib/word2vec.rb
+- lib/word2vec/version.rb
+- word2vec-rb.gemspec
+homepage: https://github.com/madcato/word2vec-rb
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/madcato/word2vec-rb
+  source_code_uri: https://github.com/madcato/word2vec-rb
+  changelog_uri: http://github.com/macato/word2vec-rb/CHANGELOG
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.3.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.1.2
+signing_key:
+specification_version: 4
+summary: Ruby interface to use word2vec arithmetic.
+test_files: []