RubyGems - ffi-fasttext - Versions diffs - 0.1.0 - Mend

ffi-fasttext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +7 -0
data/.gitignore +44 -0
data/.travis.yml +5 -0
data/Gemfile +6 -0
data/LICENSE.txt +21 -0
data/README.md +59 -0
data/Rakefile +19 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/ext/ffi/fasttext/Rakefile +71 -0
data/ffi-fasttext.gemspec +40 -0
data/lib/ffi/fasttext.rb +108 -0
data/lib/ffi/fasttext/version.rb +5 -0
data/vendor/fasttext/LICENSE +30 -0
data/vendor/fasttext/PATENTS +33 -0
data/vendor/fasttext/args.cc +250 -0
data/vendor/fasttext/args.h +71 -0
data/vendor/fasttext/dictionary.cc +475 -0
data/vendor/fasttext/dictionary.h +112 -0
data/vendor/fasttext/fasttext.cc +693 -0
data/vendor/fasttext/fasttext.h +97 -0
data/vendor/fasttext/ffi_fasttext.cc +66 -0
data/vendor/fasttext/main.cc +270 -0
data/vendor/fasttext/matrix.cc +144 -0
data/vendor/fasttext/matrix.h +57 -0
data/vendor/fasttext/model.cc +341 -0
data/vendor/fasttext/model.h +110 -0
data/vendor/fasttext/productquantizer.cc +211 -0
data/vendor/fasttext/productquantizer.h +67 -0
data/vendor/fasttext/qmatrix.cc +121 -0
data/vendor/fasttext/qmatrix.h +65 -0
data/vendor/fasttext/real.h +19 -0
data/vendor/fasttext/utils.cc +29 -0
data/vendor/fasttext/utils.h +25 -0
data/vendor/fasttext/vector.cc +137 -0
data/vendor/fasttext/vector.h +53 -0
metadata +151 -0

data/vendor/fasttext/fasttext.h ADDED

@@ -0,0 +1,97 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#ifndef FASTTEXT_FASTTEXT_H
+#define FASTTEXT_FASTTEXT_H
+#define FASTTEXT_VERSION 12 /* Version 1b */
+#define FASTTEXT_FILEFORMAT_MAGIC_INT32 793712314
+#include <time.h>
+#include <atomic>
+#include <memory>
+#include <set>
+#include "args.h"
+#include "dictionary.h"
+#include "matrix.h"
+#include "qmatrix.h"
+#include "model.h"
+#include "real.h"
+#include "utils.h"
+#include "vector.h"
+namespace fasttext {
+// Declares the FastText class. Only declarations are visible in this header;
+// the behaviour of every member function is defined in the implementation
+// file, so the notes below describe signatures, not semantics.
+class FastText {
+  private:
+    // Shared handles to the components declared in args.h, dictionary.h,
+    // matrix.h, qmatrix.h and model.h.
+    std::shared_ptr<Args> args_;
+    std::shared_ptr<Dictionary> dict_;
+    std::shared_ptr<Matrix> input_;
+    std::shared_ptr<Matrix> output_;
+    std::shared_ptr<QMatrix> qinput_;
+    std::shared_ptr<QMatrix> qoutput_;
+    std::shared_ptr<Model> model_;
+    // Atomic counter and a clock_t timestamp.
+    // NOTE(review): presumably progress bookkeeping for training — confirm
+    // against the implementation.
+    std::atomic<int64_t> tokenCount;
+    clock_t start;
+    // NOTE(review): presumably write / verify the file header built from
+    // FASTTEXT_FILEFORMAT_MAGIC_INT32 and FASTTEXT_VERSION — confirm.
+    void signModel(std::ostream&);
+    bool checkModel(std::istream&);
+    bool quant_;
+    int32_t version;
+  public:
+    FastText();
+    void getVector(Vector&, const std::string&) const;
+    std::shared_ptr<const Dictionary> getDictionary() const;
+    void saveVectors();
+    void saveOutput();
+    void saveModel();
+    // Two overloads: one takes an already-open stream, one takes a string
+    // (used with a model file name by the callers in main.cc).
+    void loadModel(std::istream&);
+    void loadModel(const std::string&);
+    void printInfo(real, real);
+    void supervised(Model&, real, const std::vector<int32_t>&,
+                    const std::vector<int32_t>&);
+    void cbow(Model&, real, const std::vector<int32_t>&);
+    void skipgram(Model&, real, const std::vector<int32_t>&);
+    std::vector<int32_t> selectEmbeddings(int32_t) const;
+    void quantize(std::shared_ptr<Args>);
+    void test(std::istream&, int32_t);
+    // Two overloads: the first takes a bool flag (main.cc passes whether the
+    // command was "predict-prob"); the second is const and fills a
+    // caller-supplied vector of (real, string) pairs.
+    void predict(std::istream&, int32_t, bool);
+    void predict(
+        std::istream&,
+        int32_t,
+        std::vector<std::pair<real, std::string>>&) const;
+    void wordVectors();
+    void sentenceVectors();
+    void ngramVectors(std::string);
+    void textVectors();
+    void printWordVectors();
+    void printSentenceVectors();
+    void precomputeWordVectors(Matrix&);
+    void findNN(const Matrix&, const Vector&, int32_t,
+                const std::set<std::string>&);
+    void nn(int32_t);
+    void analogies(int32_t);
+    void trainThread(int32_t);
+    void train(std::shared_ptr<Args>);
+    void loadVectors(std::string);
+    int getDimension() const;
+};
+}
+#endif

data/vendor/fasttext/ffi_fasttext.cc ADDED

@@ -0,0 +1,66 @@
+#include <algorithm>
+#include <iostream>
+#include <cstring>
+#include <math.h>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "real.h"
+#include "fasttext.h"
+// Linkage helpers: under a C++ compiler they expand to extern "C" (single
+// declaration or block form); otherwise they expand to nothing.
+// Only EXTERN_C_BEGIN / EXTERN_C_END are used further down in this file.
+#ifdef __cplusplus
+#define EXTERN_C       extern "C"
+#define EXTERN_C_BEGIN extern "C" {
+#define EXTERN_C_END   }
+#else
+#define EXTERN_C       /* Nothing */
+#define EXTERN_C_BEGIN /* Nothing */
+#define EXTERN_C_END   /* Nothing */
+#endif
+EXTERN_C_BEGIN
+/**
+ * Allocates a FastText instance and calls loadModel() with `model_name`.
+ *
+ * Returns the new instance, or a null pointer when `model_name` is null or
+ * when construction/loading throws. Exceptions are stopped here because this
+ * function has C linkage and must not let a C++ exception reach its caller.
+ * A non-null result is released with destroy().
+ */
+fasttext::FastText* create(const char* model_name) {
+  if (model_name == nullptr) {
+    return nullptr;
+  }
+  try {
+    // Held by a smart pointer until loading has finished, so a throwing
+    // loadModel() cannot leak the instance.
+    std::unique_ptr<fasttext::FastText> instance(new fasttext::FastText());
+    instance->loadModel(std::string(model_name));
+    return instance.release();
+  } catch (...) {
+    return nullptr;
+  }
+}
+/** Deletes an instance obtained from create(); a null pointer is ignored. */
+void destroy(fasttext::FastText* destroy_fasttext) {
+  if (destroy_fasttext == nullptr) {
+    return;
+  }
+  delete destroy_fasttext;
+}
+/** Releases a character array allocated with new[]; a null pointer is accepted. */
+void predict_string_free(const char* match) {
+  // delete[] on a null pointer is a no-op, so no explicit check is needed.
+  delete[] match;
+}
+/**
+ * Runs FastText::predict on the text in `key`, asking for
+ * `number_of_predictions` results.
+ *
+ * Returns a new[]-allocated, NUL-terminated string in which every prediction
+ * is written as "<label> <exp(score)> " (each field followed by one space),
+ * or a null pointer when there are no predictions, when an argument is null,
+ * or when an exception is thrown. Exceptions are stopped here because this
+ * function has C linkage. Release the result with predict_string_free().
+ */
+const char* predict(fasttext::FastText* fasttext_pointer, const char* key, int32_t number_of_predictions) {
+  if (fasttext_pointer == nullptr || key == nullptr) {
+    return nullptr;
+  }
+  try {
+    // Build the stream from the complete line. Assigning with str() and then
+    // inserting starts writing at offset 0, which overwrote the first
+    // character of the key with the newline.
+    std::stringstream key_stream(std::string(key) + '\n');
+    std::vector<std::pair<fasttext::real, std::string>> predictions;
+    fasttext_pointer->predict(key_stream, number_of_predictions, predictions);
+    std::ostringstream output_stream;
+    for (const auto& prediction : predictions) {
+      // NOTE(review): the score is passed through exp() before printing,
+      // presumably because it is a log-probability — confirm.
+      output_stream << prediction.second << " " << std::exp(prediction.first) << " ";
+    }
+    const std::string formatted = output_stream.str();
+    if (formatted.empty()) {
+      return nullptr;
+    }
+    char* val = new char[formatted.size() + 1];
+    // c_str() is NUL-terminated, so size() + 1 bytes copies the terminator too.
+    memcpy(val, formatted.c_str(), formatted.size() + 1);
+    return val;
+  } catch (...) {
+    return nullptr;
+  }
+}
+EXTERN_C_END

data/vendor/fasttext/main.cc ADDED

@@ -0,0 +1,270 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+#include "fasttext.h"
+#include "args.h"
+using namespace fasttext;
+void printUsage() {
+  std::cerr
+    << "usage: fasttext <command> <args>\n\n"
+    << "The commands supported by fasttext are:\n\n"
+    << "  supervised              train a supervised classifier\n"
+    << "  quantize                quantize a model to reduce the memory usage\n"
+    << "  test                    evaluate a supervised classifier\n"
+    << "  predict                 predict most likely labels\n"
+    << "  predict-prob            predict most likely labels with probabilities\n"
+    << "  skipgram                train a skipgram model\n"
+    << "  cbow                    train a cbow model\n"
+    << "  print-word-vectors      print word vectors given a trained model\n"
+    << "  print-sentence-vectors  print sentence vectors given a trained model\n"
+    << "  nn                      query for nearest neighbors\n"
+    << "  analogies               query for analogies\n"
+    << std::endl;
+}
+void printQuantizeUsage() {
+  std::cerr
+    << "usage: fasttext quantize <args>"
+    << std::endl;
+}
+void printTestUsage() {
+  std::cerr
+    << "usage: fasttext test <model> <test-data> [<k>]\n\n"
+    << "  <model>      model filename\n"
+    << "  <test-data>  test data filename (if -, read from stdin)\n"
+    << "  <k>          (optional; 1 by default) predict top k labels\n"
+    << std::endl;
+}
+void printPredictUsage() {
+  std::cerr
+    << "usage: fasttext predict[-prob] <model> <test-data> [<k>]\n\n"
+    << "  <model>      model filename\n"
+    << "  <test-data>  test data filename (if -, read from stdin)\n"
+    << "  <k>          (optional; 1 by default) predict top k labels\n"
+    << std::endl;
+}
+void printPrintWordVectorsUsage() {
+  std::cerr
+    << "usage: fasttext print-word-vectors <model>\n\n"
+    << "  <model>      model filename\n"
+    << std::endl;
+}
+void printPrintSentenceVectorsUsage() {
+  std::cerr
+    << "usage: fasttext print-sentence-vectors <model>\n\n"
+    << "  <model>      model filename\n"
+    << std::endl;
+}
+void printPrintNgramsUsage() {
+  std::cerr
+    << "usage: fasttext print-ngrams <model> <word>\n\n"
+    << "  <model>      model filename\n"
+    << "  <word>       word to print\n"
+    << std::endl;
+}
+// Handler for the `quantize` command: parses `args` into an Args object and
+// passes it to FastText::quantize. Never returns; the process exits with
+// EXIT_FAILURE when fewer than three arguments are given and with 0 otherwise.
+void quantize(const std::vector<std::string>& args) {
+  auto parsed = std::make_shared<Args>();
+  const bool enough_arguments = args.size() >= 3;
+  if (!enough_arguments) {
+    printQuantizeUsage();
+    parsed->printHelp();
+    exit(EXIT_FAILURE);
+  }
+  parsed->parseArgs(args);
+  FastText model;
+  model.quantize(parsed);
+  exit(0);
+}
+void printNNUsage() {
+  std::cout
+    << "usage: fasttext nn <model> <k>\n\n"
+    << "  <model>      model filename\n"
+    << "  <k>          (optional; 10 by default) predict top k labels\n"
+    << std::endl;
+}
+void printAnalogiesUsage() {
+  std::cout
+    << "usage: fasttext analogies <model> <k>\n\n"
+    << "  <model>      model filename\n"
+    << "  <k>          (optional; 10 by default) predict top k labels\n"
+    << std::endl;
+}
+// Handler for the `test` command.
+// args[2] is passed to loadModel(), args[3] names the input ("-" selects
+// std::cin) and the optional args[4] is k (1 when absent). Never returns:
+// exits with EXIT_FAILURE on a usage error or an unopenable input file, and
+// with 0 after FastText::test has run.
+void test(const std::vector<std::string>& args) {
+  if (args.size() < 4 || args.size() > 5) {
+    printTestUsage();
+    exit(EXIT_FAILURE);
+  }
+  int32_t k = 1;
+  if (args.size() >= 5) {
+    try {
+      k = std::stoi(args[4]);
+    } catch (const std::exception&) {
+      // std::stoi throws on non-numeric or out-of-range text; treat that as
+      // a usage error instead of dying on an uncaught exception.
+      printTestUsage();
+      exit(EXIT_FAILURE);
+    }
+  }
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  const std::string& infile = args[3];
+  if (infile == "-") {
+    fasttext.test(std::cin, k);
+  } else {
+    std::ifstream ifs(infile);
+    if (!ifs.is_open()) {
+      std::cerr << "Test file cannot be opened!" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    fasttext.test(ifs, k);
+    ifs.close();
+  }
+  exit(0);
+}
+// Handler for the `predict` and `predict-prob` commands.
+// args[2] is passed to loadModel(), args[3] names the input ("-" selects
+// std::cin) and the optional args[4] is k (1 when absent). The bool handed to
+// FastText::predict is true when the command is "predict-prob". Never
+// returns: exits with EXIT_FAILURE on a usage error or an unopenable input
+// file, and with 0 after prediction has run.
+void predict(const std::vector<std::string>& args) {
+  if (args.size() < 4 || args.size() > 5) {
+    printPredictUsage();
+    exit(EXIT_FAILURE);
+  }
+  int32_t k = 1;
+  if (args.size() >= 5) {
+    try {
+      k = std::stoi(args[4]);
+    } catch (const std::exception&) {
+      // std::stoi throws on non-numeric or out-of-range text; treat that as
+      // a usage error instead of dying on an uncaught exception.
+      printPredictUsage();
+      exit(EXIT_FAILURE);
+    }
+  }
+  const bool print_prob = args[1] == "predict-prob";
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  const std::string& infile = args[3];
+  if (infile == "-") {
+    fasttext.predict(std::cin, k, print_prob);
+  } else {
+    std::ifstream ifs(infile);
+    if (!ifs.is_open()) {
+      std::cerr << "Input file cannot be opened!" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    fasttext.predict(ifs, k, print_prob);
+    ifs.close();
+  }
+  exit(0);
+}
+// Handler for `print-word-vectors`: loads the model named by args[2] and
+// calls FastText::printWordVectors. Never returns (EXIT_FAILURE on a usage
+// error, 0 otherwise). The argument vector is taken by reference to avoid
+// copying it, matching test(), predict() and quantize().
+void printWordVectors(const std::vector<std::string>& args) {
+  if (args.size() != 3) {
+    printPrintWordVectorsUsage();
+    exit(EXIT_FAILURE);
+  }
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  fasttext.printWordVectors();
+  exit(0);
+}
+// Handler for `print-sentence-vectors`: loads the model named by args[2] and
+// calls FastText::printSentenceVectors. Never returns (EXIT_FAILURE on a
+// usage error, 0 otherwise). The argument vector is taken by reference to
+// avoid copying it, matching test(), predict() and quantize().
+void printSentenceVectors(const std::vector<std::string>& args) {
+  if (args.size() != 3) {
+    printPrintSentenceVectorsUsage();
+    exit(EXIT_FAILURE);
+  }
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  fasttext.printSentenceVectors();
+  exit(0);
+}
+// Handler for `print-ngrams`: loads the model named by args[2] and passes
+// args[3] to FastText::ngramVectors. Never returns (EXIT_FAILURE on a usage
+// error, 0 otherwise). The argument vector is taken by reference to avoid
+// copying it, matching test(), predict() and quantize().
+void printNgrams(const std::vector<std::string>& args) {
+  if (args.size() != 4) {
+    printPrintNgramsUsage();
+    exit(EXIT_FAILURE);
+  }
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  fasttext.ngramVectors(args[3]);
+  exit(0);
+}
+// Handler for the `nn` command: loads the model named by args[2] and calls
+// FastText::nn(k), where k is args[3] when present and 10 otherwise. Never
+// returns (EXIT_FAILURE on a usage error, 0 otherwise). The argument vector
+// is taken by reference to avoid copying it.
+void nn(const std::vector<std::string>& args) {
+  int32_t k = 10;
+  if (args.size() == 4) {
+    try {
+      k = std::stoi(args[3]);
+    } catch (const std::exception&) {
+      // std::stoi throws on non-numeric or out-of-range text; treat that as
+      // a usage error instead of dying on an uncaught exception.
+      printNNUsage();
+      exit(EXIT_FAILURE);
+    }
+  } else if (args.size() != 3) {
+    printNNUsage();
+    exit(EXIT_FAILURE);
+  }
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  fasttext.nn(k);
+  exit(0);
+}
+// Handler for the `analogies` command: loads the model named by args[2] and
+// calls FastText::analogies(k), where k is args[3] when present and 10
+// otherwise. Never returns (EXIT_FAILURE on a usage error, 0 otherwise). The
+// argument vector is taken by reference to avoid copying it.
+void analogies(const std::vector<std::string>& args) {
+  int32_t k = 10;
+  if (args.size() == 4) {
+    try {
+      k = std::stoi(args[3]);
+    } catch (const std::exception&) {
+      // std::stoi throws on non-numeric or out-of-range text; treat that as
+      // a usage error instead of dying on an uncaught exception.
+      printAnalogiesUsage();
+      exit(EXIT_FAILURE);
+    }
+  } else if (args.size() != 3) {
+    printAnalogiesUsage();
+    exit(EXIT_FAILURE);
+  }
+  FastText fasttext;
+  fasttext.loadModel(args[2]);
+  fasttext.analogies(k);
+  exit(0);
+}
+// Handler for the training commands: parses `args` into an Args object and
+// passes it to FastText::train. Unlike the other handlers it returns to the
+// caller. The argument vector is taken by reference to avoid copying it.
+void train(const std::vector<std::string>& args) {
+  std::shared_ptr<Args> a = std::make_shared<Args>();
+  a->parseArgs(args);
+  FastText fasttext;
+  fasttext.train(a);
+}
+// Program entry point: dispatches on the command name in argv[1].
+// Prints the general usage and exits with EXIT_FAILURE when no command is
+// given or the command is not recognised.
+int main(int argc, char** argv) {
+  const std::vector<std::string> args(argv, argv + argc);
+  if (args.size() < 2) {
+    printUsage();
+    exit(EXIT_FAILURE);
+  }
+  const std::string& command = args[1];
+  const bool is_training =
+      command == "skipgram" || command == "cbow" || command == "supervised";
+  const bool is_prediction =
+      command == "predict" || command == "predict-prob";
+  if (is_training) {
+    train(args);
+  } else if (command == "test") {
+    test(args);
+  } else if (command == "quantize") {
+    quantize(args);
+  } else if (command == "print-word-vectors") {
+    printWordVectors(args);
+  } else if (command == "print-sentence-vectors") {
+    printSentenceVectors(args);
+  } else if (command == "print-ngrams") {
+    printNgrams(args);
+  } else if (command == "nn") {
+    nn(args);
+  } else if (command == "analogies") {
+    analogies(args);
+  } else if (is_prediction) {
+    predict(args);
+  } else {
+    printUsage();
+    exit(EXIT_FAILURE);
+  }
+  return 0;
+}

data/vendor/fasttext/matrix.cc ADDED

@@ -0,0 +1,144 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#include "matrix.h"
+#include <assert.h>
+#include <random>
+#include "utils.h"
+#include "vector.h"
+namespace fasttext {
+// Creates an empty 0x0 matrix with no storage attached.
+Matrix::Matrix() : data_(nullptr), m_(0), n_(0) {}
+// Creates an m-by-n matrix backed by a freshly allocated array of m * n
+// elements; no values are written to the elements here.
+Matrix::Matrix(int64_t m, int64_t n) : data_(new real[m * n]), m_(m), n_(n) {}
+// Copy constructor: allocates separate storage and copies every element.
+Matrix::Matrix(const Matrix& other)
+    : data_(new real[other.m_ * other.n_]), m_(other.m_), n_(other.n_) {
+  const int64_t count = m_ * n_;
+  for (int64_t idx = 0; idx < count; ++idx) {
+    data_[idx] = other.data_[idx];
+  }
+}
+// Copy assignment: copies `other` into a temporary, then takes over the
+// temporary's buffer. The previous buffer is released when the temporary is
+// destroyed; self-assignment is safe because the copy is made first.
+Matrix& Matrix::operator=(const Matrix& other) {
+  Matrix copy(other);
+  std::swap(data_, copy.data_);
+  n_ = copy.n_;
+  m_ = copy.m_;
+  return *this;
+}
+// Releases the element array; delete[] on a null data_ is a no-op.
+Matrix::~Matrix() {
+  delete[] data_;
+}
+// Sets every element to zero.
+void Matrix::zero() {
+  const int64_t count = m_ * n_;
+  int64_t idx = 0;
+  while (idx < count) {
+    data_[idx] = 0.0;
+    ++idx;
+  }
+}
+// Fills the matrix with values drawn from a uniform distribution over
+// [-a, a). The generator is seeded with the constant 1, so every call
+// produces the same sequence.
+void Matrix::uniform(real a) {
+  std::minstd_rand generator(1);
+  std::uniform_real_distribution<> draw(-a, a);
+  const int64_t count = m_ * n_;
+  for (int64_t idx = 0; idx < count; ++idx) {
+    data_[idx] = draw(generator);
+  }
+}
+// Returns the dot product of row `i` with `vec`.
+// Preconditions (checked with assert): 0 <= i < m_ and vec.size() == n_.
+real Matrix::dotRow(const Vector& vec, int64_t i) const {
+  assert(i >= 0);
+  assert(i < m_);
+  assert(vec.size() == n_);
+  real sum = 0.0;
+  int64_t col = 0;
+  while (col < n_) {
+    sum += at(i, col) * vec.data_[col];
+    ++col;
+  }
+  return sum;
+}
+// Adds `a * vec` to row `i` in place.
+// Preconditions (checked with assert): 0 <= i < m_ and vec.size() == n_.
+void Matrix::addRow(const Vector& vec, int64_t i, real a) {
+  assert(i >= 0);
+  assert(i < m_);
+  assert(vec.size() == n_);
+  real* const row = data_ + i * n_;
+  for (int64_t col = 0; col < n_; ++col) {
+    row[col] += a * vec.data_[col];
+  }
+}
+// Multiplies each row i in [ib, ie) by nums[i - ib]; rows whose factor is 0
+// are left untouched. Passing ie == -1 means "up to the last row" (m_).
+// NOTE(review): the assertion compares ie with nums.size() although the
+// lookup index is i - ib — confirm which bound is intended when ib > 0.
+void Matrix::multiplyRow(const Vector& nums, int64_t ib, int64_t ie) {
+  if (ie == -1) {ie = m_;}
+  assert(ie <= nums.size());
+  for (int64_t i = ib; i < ie; i++) {
+    real n = nums[i-ib];
+    if (n != 0) {
+      // 64-bit column counter: `auto j = 0` deduced int, which is narrower
+      // than the int64_t bound n_.
+      for (int64_t j = 0; j < n_; j++) {
+        at(i, j) *= n;
+      }
+    }
+  }
+}
+// Divides each row i in [ib, ie) by denoms[i - ib]; rows whose divisor is 0
+// are skipped, so no division by zero takes place. Passing ie == -1 means
+// "up to the last row" (m_).
+// NOTE(review): the assertion compares ie with denoms.size() although the
+// lookup index is i - ib — confirm which bound is intended when ib > 0.
+void Matrix::divideRow(const Vector& denoms, int64_t ib, int64_t ie) {
+  if (ie == -1) {ie = m_;}
+  assert(ie <= denoms.size());
+  for (int64_t i = ib; i < ie; i++) {
+    real n = denoms[i-ib];
+    if (n != 0) {
+      // 64-bit column counter: `auto j = 0` deduced int, which is narrower
+      // than the int64_t bound n_.
+      for (int64_t j = 0; j < n_; j++) {
+        at(i, j) /= n;
+      }
+    }
+  }
+}
+// Returns the Euclidean (L2) norm of row `i`. The sum of squares is
+// accumulated in double precision and converted to `real` on return.
+real Matrix::l2NormRow(int64_t i) const {
+  auto norm = 0.0;
+  // 64-bit column counter: `auto j = 0` deduced int, which is narrower than
+  // the int64_t bound n_.
+  for (int64_t j = 0; j < n_; j++) {
+    const real v = at(i,j);
+    norm += v * v;
+  }
+  return std::sqrt(norm);
+}
+// Stores the L2 norm of every row in `norms`, one entry per row.
+// Precondition (checked with assert): norms.size() == m_.
+void Matrix::l2NormRow(Vector& norms) const {
+  assert(norms.size() == m_);
+  // 64-bit row counter: `auto i = 0` deduced int, which is narrower than the
+  // int64_t bound m_.
+  for (int64_t i = 0; i < m_; i++) {
+    norms[i] = l2NormRow(i);
+  }
+}
+// Writes the matrix to `out` in raw binary form: the row count, the column
+// count (sizeof(int64_t) bytes each), then the m_ * n_ elements, all in the
+// in-memory representation of the running process.
+void Matrix::save(std::ostream& out) {
+  out.write(reinterpret_cast<char*>(&m_), sizeof(int64_t));
+  out.write(reinterpret_cast<char*>(&n_), sizeof(int64_t));
+  out.write(reinterpret_cast<char*>(data_), m_ * n_ * sizeof(real));
+}
+// Replaces the matrix contents with data read from `in`, using the layout
+// written by save(): row count, column count, then the elements.
+// The stream state is not checked here; callers that need to detect a
+// truncated or unreadable stream should test `in` afterwards.
+void Matrix::load(std::istream& in) {
+  // Read the dimensions into locals so the object is only updated once the
+  // new buffer exists. They stay 0 if a read fails.
+  int64_t rows = 0;
+  int64_t cols = 0;
+  in.read(reinterpret_cast<char*>(&rows), sizeof(int64_t));
+  in.read(reinterpret_cast<char*>(&cols), sizeof(int64_t));
+  // Release the old buffer first (keeps peak memory unchanged) and leave the
+  // object in a valid empty state, so a throwing allocation below cannot
+  // cause a double delete in the destructor.
+  delete[] data_;
+  data_ = nullptr;
+  m_ = 0;
+  n_ = 0;
+  data_ = new real[rows * cols];
+  m_ = rows;
+  n_ = cols;
+  in.read(reinterpret_cast<char*>(data_), m_ * n_ * sizeof(real));
+}
+}