ffi-fasttext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Copyright (c) 2016-present, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under the BSD-style license found in the
6
+ * LICENSE file in the root directory of this source tree. An additional grant
7
+ * of patent rights can be found in the PATENTS file in the same directory.
8
+ */
9
+
10
+ #ifndef FASTTEXT_MATRIX_H
11
+ #define FASTTEXT_MATRIX_H
12
+
13
+ #include <cstdint>
14
+ #include <istream>
15
+ #include <ostream>
16
+
17
+ #include "real.h"
18
+
19
+ namespace fasttext {
20
+
21
+ class Vector;
22
+
23
+ class Matrix {
24
+
25
+ public:
26
+ real* data_;
27
+ int64_t m_;
28
+ int64_t n_;
29
+
30
+ Matrix();
31
+ Matrix(int64_t, int64_t);
32
+ Matrix(const Matrix&);
33
+ Matrix& operator=(const Matrix&);
34
+ ~Matrix();
35
+
36
+ inline const real& at(int64_t i, int64_t j) const {return data_[i * n_ + j];};
37
+ inline real& at(int64_t i, int64_t j) {return data_[i * n_ + j];};
38
+
39
+
40
+ void zero();
41
+ void uniform(real);
42
+ real dotRow(const Vector&, int64_t) const;
43
+ void addRow(const Vector&, int64_t, real);
44
+
45
+ void multiplyRow(const Vector& nums, int64_t ib = 0, int64_t ie = -1);
46
+ void divideRow(const Vector& denoms, int64_t ib = 0, int64_t ie = -1);
47
+
48
+ real l2NormRow(int64_t i) const;
49
+ void l2NormRow(Vector& norms) const;
50
+
51
+ void save(std::ostream&);
52
+ void load(std::istream&);
53
+ };
54
+
55
+ }
56
+
57
+ #endif
@@ -0,0 +1,341 @@
1
+ /**
2
+ * Copyright (c) 2016-present, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under the BSD-style license found in the
6
+ * LICENSE file in the root directory of this source tree. An additional grant
7
+ * of patent rights can be found in the PATENTS file in the same directory.
8
+ */
9
+
10
+ #include "model.h"
11
+
12
+ #include <iostream>
13
+ #include <assert.h>
14
+ #include <algorithm>
15
+
16
+ namespace fasttext {
17
+
18
+ Model::Model(std::shared_ptr<Matrix> wi,
19
+ std::shared_ptr<Matrix> wo,
20
+ std::shared_ptr<Args> args,
21
+ int32_t seed)
22
+ : hidden_(args->dim), output_(wo->m_),
23
+ grad_(args->dim), rng(seed), quant_(false)
24
+ {
25
+ wi_ = wi;
26
+ wo_ = wo;
27
+ args_ = args;
28
+ osz_ = wo->m_;
29
+ hsz_ = args->dim;
30
+ negpos = 0;
31
+ loss_ = 0.0;
32
+ nexamples_ = 1;
33
+ initSigmoid();
34
+ initLog();
35
+ }
36
+
37
+ Model::~Model() {
38
+ delete[] t_sigmoid;
39
+ delete[] t_log;
40
+ }
41
+
42
+ void Model::setQuantizePointer(std::shared_ptr<QMatrix> qwi,
43
+ std::shared_ptr<QMatrix> qwo, bool qout) {
44
+ qwi_ = qwi;
45
+ qwo_ = qwo;
46
+ if (qout) {
47
+ osz_ = qwo_->getM();
48
+ }
49
+ }
50
+
51
+ real Model::binaryLogistic(int32_t target, bool label, real lr) {
52
+ real score = sigmoid(wo_->dotRow(hidden_, target));
53
+ real alpha = lr * (real(label) - score);
54
+ grad_.addRow(*wo_, target, alpha);
55
+ wo_->addRow(hidden_, target, alpha);
56
+ if (label) {
57
+ return -log(score);
58
+ } else {
59
+ return -log(1.0 - score);
60
+ }
61
+ }
62
+
63
+ real Model::negativeSampling(int32_t target, real lr) {
64
+ real loss = 0.0;
65
+ grad_.zero();
66
+ for (int32_t n = 0; n <= args_->neg; n++) {
67
+ if (n == 0) {
68
+ loss += binaryLogistic(target, true, lr);
69
+ } else {
70
+ loss += binaryLogistic(getNegative(target), false, lr);
71
+ }
72
+ }
73
+ return loss;
74
+ }
75
+
76
+ real Model::hierarchicalSoftmax(int32_t target, real lr) {
77
+ real loss = 0.0;
78
+ grad_.zero();
79
+ const std::vector<bool>& binaryCode = codes[target];
80
+ const std::vector<int32_t>& pathToRoot = paths[target];
81
+ for (int32_t i = 0; i < pathToRoot.size(); i++) {
82
+ loss += binaryLogistic(pathToRoot[i], binaryCode[i], lr);
83
+ }
84
+ return loss;
85
+ }
86
+
87
+ void Model::computeOutputSoftmax(Vector& hidden, Vector& output) const {
88
+ if (quant_ && args_->qout) {
89
+ output.mul(*qwo_, hidden);
90
+ } else {
91
+ output.mul(*wo_, hidden);
92
+ }
93
+ real max = output[0], z = 0.0;
94
+ for (int32_t i = 0; i < osz_; i++) {
95
+ max = std::max(output[i], max);
96
+ }
97
+ for (int32_t i = 0; i < osz_; i++) {
98
+ output[i] = exp(output[i] - max);
99
+ z += output[i];
100
+ }
101
+ for (int32_t i = 0; i < osz_; i++) {
102
+ output[i] /= z;
103
+ }
104
+ }
105
+
106
+ void Model::computeOutputSoftmax() {
107
+ computeOutputSoftmax(hidden_, output_);
108
+ }
109
+
110
+ real Model::softmax(int32_t target, real lr) {
111
+ grad_.zero();
112
+ computeOutputSoftmax();
113
+ for (int32_t i = 0; i < osz_; i++) {
114
+ real label = (i == target) ? 1.0 : 0.0;
115
+ real alpha = lr * (label - output_[i]);
116
+ grad_.addRow(*wo_, i, alpha);
117
+ wo_->addRow(hidden_, i, alpha);
118
+ }
119
+ return -log(output_[target]);
120
+ }
121
+
122
+ void Model::computeHidden(const std::vector<int32_t>& input, Vector& hidden) const {
123
+ assert(hidden.size() == hsz_);
124
+ hidden.zero();
125
+ for (auto it = input.cbegin(); it != input.cend(); ++it) {
126
+ if(quant_) {
127
+ hidden.addRow(*qwi_, *it);
128
+ } else {
129
+ hidden.addRow(*wi_, *it);
130
+ }
131
+ }
132
+ hidden.mul(1.0 / input.size());
133
+ }
134
+
135
+ bool Model::comparePairs(const std::pair<real, int32_t> &l,
136
+ const std::pair<real, int32_t> &r) {
137
+ return l.first > r.first;
138
+ }
139
+
140
+ void Model::predict(const std::vector<int32_t>& input, int32_t k,
141
+ std::vector<std::pair<real, int32_t>>& heap,
142
+ Vector& hidden, Vector& output) const {
143
+ assert(k > 0);
144
+ heap.reserve(k + 1);
145
+ computeHidden(input, hidden);
146
+ if (args_->loss == loss_name::hs) {
147
+ dfs(k, 2 * osz_ - 2, 0.0, heap, hidden);
148
+ } else {
149
+ findKBest(k, heap, hidden, output);
150
+ }
151
+ std::sort_heap(heap.begin(), heap.end(), comparePairs);
152
+ }
153
+
154
+ void Model::predict(const std::vector<int32_t>& input, int32_t k,
155
+ std::vector<std::pair<real, int32_t>>& heap) {
156
+ predict(input, k, heap, hidden_, output_);
157
+ }
158
+
159
+ void Model::findKBest(int32_t k, std::vector<std::pair<real, int32_t>>& heap,
160
+ Vector& hidden, Vector& output) const {
161
+ computeOutputSoftmax(hidden, output);
162
+ for (int32_t i = 0; i < osz_; i++) {
163
+ if (heap.size() == k && log(output[i]) < heap.front().first) {
164
+ continue;
165
+ }
166
+ heap.push_back(std::make_pair(log(output[i]), i));
167
+ std::push_heap(heap.begin(), heap.end(), comparePairs);
168
+ if (heap.size() > k) {
169
+ std::pop_heap(heap.begin(), heap.end(), comparePairs);
170
+ heap.pop_back();
171
+ }
172
+ }
173
+ }
174
+
175
+ void Model::dfs(int32_t k, int32_t node, real score,
176
+ std::vector<std::pair<real, int32_t>>& heap,
177
+ Vector& hidden) const {
178
+ if (heap.size() == k && score < heap.front().first) {
179
+ return;
180
+ }
181
+
182
+ if (tree[node].left == -1 && tree[node].right == -1) {
183
+ heap.push_back(std::make_pair(score, node));
184
+ std::push_heap(heap.begin(), heap.end(), comparePairs);
185
+ if (heap.size() > k) {
186
+ std::pop_heap(heap.begin(), heap.end(), comparePairs);
187
+ heap.pop_back();
188
+ }
189
+ return;
190
+ }
191
+
192
+ real f;
193
+ if (quant_ && args_->qout) {
194
+ f= sigmoid(qwo_->dotRow(hidden, node - osz_));
195
+ } else {
196
+ f= sigmoid(wo_->dotRow(hidden, node - osz_));
197
+ }
198
+
199
+ dfs(k, tree[node].left, score + log(1.0 - f), heap, hidden);
200
+ dfs(k, tree[node].right, score + log(f), heap, hidden);
201
+ }
202
+
203
+ void Model::update(const std::vector<int32_t>& input, int32_t target, real lr) {
204
+ assert(target >= 0);
205
+ assert(target < osz_);
206
+ if (input.size() == 0) return;
207
+ computeHidden(input, hidden_);
208
+ if (args_->loss == loss_name::ns) {
209
+ loss_ += negativeSampling(target, lr);
210
+ } else if (args_->loss == loss_name::hs) {
211
+ loss_ += hierarchicalSoftmax(target, lr);
212
+ } else {
213
+ loss_ += softmax(target, lr);
214
+ }
215
+ nexamples_ += 1;
216
+
217
+ if (args_->model == model_name::sup) {
218
+ grad_.mul(1.0 / input.size());
219
+ }
220
+ for (auto it = input.cbegin(); it != input.cend(); ++it) {
221
+ wi_->addRow(grad_, *it, 1.0);
222
+ }
223
+ }
224
+
225
+ void Model::setTargetCounts(const std::vector<int64_t>& counts) {
226
+ assert(counts.size() == osz_);
227
+ if (args_->loss == loss_name::ns) {
228
+ initTableNegatives(counts);
229
+ }
230
+ if (args_->loss == loss_name::hs) {
231
+ buildTree(counts);
232
+ }
233
+ }
234
+
235
+ void Model::initTableNegatives(const std::vector<int64_t>& counts) {
236
+ real z = 0.0;
237
+ for (size_t i = 0; i < counts.size(); i++) {
238
+ z += pow(counts[i], 0.5);
239
+ }
240
+ for (size_t i = 0; i < counts.size(); i++) {
241
+ real c = pow(counts[i], 0.5);
242
+ for (size_t j = 0; j < c * NEGATIVE_TABLE_SIZE / z; j++) {
243
+ negatives.push_back(i);
244
+ }
245
+ }
246
+ std::shuffle(negatives.begin(), negatives.end(), rng);
247
+ }
248
+
249
+ int32_t Model::getNegative(int32_t target) {
250
+ int32_t negative;
251
+ do {
252
+ negative = negatives[negpos];
253
+ negpos = (negpos + 1) % negatives.size();
254
+ } while (target == negative);
255
+ return negative;
256
+ }
257
+
258
+ void Model::buildTree(const std::vector<int64_t>& counts) {
259
+ tree.resize(2 * osz_ - 1);
260
+ for (int32_t i = 0; i < 2 * osz_ - 1; i++) {
261
+ tree[i].parent = -1;
262
+ tree[i].left = -1;
263
+ tree[i].right = -1;
264
+ tree[i].count = 1e15;
265
+ tree[i].binary = false;
266
+ }
267
+ for (int32_t i = 0; i < osz_; i++) {
268
+ tree[i].count = counts[i];
269
+ }
270
+ int32_t leaf = osz_ - 1;
271
+ int32_t node = osz_;
272
+ for (int32_t i = osz_; i < 2 * osz_ - 1; i++) {
273
+ int32_t mini[2];
274
+ for (int32_t j = 0; j < 2; j++) {
275
+ if (leaf >= 0 && tree[leaf].count < tree[node].count) {
276
+ mini[j] = leaf--;
277
+ } else {
278
+ mini[j] = node++;
279
+ }
280
+ }
281
+ tree[i].left = mini[0];
282
+ tree[i].right = mini[1];
283
+ tree[i].count = tree[mini[0]].count + tree[mini[1]].count;
284
+ tree[mini[0]].parent = i;
285
+ tree[mini[1]].parent = i;
286
+ tree[mini[1]].binary = true;
287
+ }
288
+ for (int32_t i = 0; i < osz_; i++) {
289
+ std::vector<int32_t> path;
290
+ std::vector<bool> code;
291
+ int32_t j = i;
292
+ while (tree[j].parent != -1) {
293
+ path.push_back(tree[j].parent - osz_);
294
+ code.push_back(tree[j].binary);
295
+ j = tree[j].parent;
296
+ }
297
+ paths.push_back(path);
298
+ codes.push_back(code);
299
+ }
300
+ }
301
+
302
+ real Model::getLoss() const {
303
+ return loss_ / nexamples_;
304
+ }
305
+
306
+ void Model::initSigmoid() {
307
+ t_sigmoid = new real[SIGMOID_TABLE_SIZE + 1];
308
+ for (int i = 0; i < SIGMOID_TABLE_SIZE + 1; i++) {
309
+ real x = real(i * 2 * MAX_SIGMOID) / SIGMOID_TABLE_SIZE - MAX_SIGMOID;
310
+ t_sigmoid[i] = 1.0 / (1.0 + std::exp(-x));
311
+ }
312
+ }
313
+
314
+ void Model::initLog() {
315
+ t_log = new real[LOG_TABLE_SIZE + 1];
316
+ for (int i = 0; i < LOG_TABLE_SIZE + 1; i++) {
317
+ real x = (real(i) + 1e-5) / LOG_TABLE_SIZE;
318
+ t_log[i] = std::log(x);
319
+ }
320
+ }
321
+
322
+ real Model::log(real x) const {
323
+ if (x > 1.0) {
324
+ return 0.0;
325
+ }
326
+ int i = int(x * LOG_TABLE_SIZE);
327
+ return t_log[i];
328
+ }
329
+
330
+ real Model::sigmoid(real x) const {
331
+ if (x < -MAX_SIGMOID) {
332
+ return 0.0;
333
+ } else if (x > MAX_SIGMOID) {
334
+ return 1.0;
335
+ } else {
336
+ int i = int((x + MAX_SIGMOID) * SIGMOID_TABLE_SIZE / MAX_SIGMOID / 2);
337
+ return t_sigmoid[i];
338
+ }
339
+ }
340
+
341
+ }
@@ -0,0 +1,110 @@
1
+ /**
2
+ * Copyright (c) 2016-present, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under the BSD-style license found in the
6
+ * LICENSE file in the root directory of this source tree. An additional grant
7
+ * of patent rights can be found in the PATENTS file in the same directory.
8
+ */
9
+
10
+ #ifndef FASTTEXT_MODEL_H
11
+ #define FASTTEXT_MODEL_H
12
+
13
+ #include <vector>
14
+ #include <random>
15
+ #include <utility>
16
+ #include <memory>
17
+
18
+ #include "args.h"
19
+ #include "matrix.h"
20
+ #include "vector.h"
21
+ #include "qmatrix.h"
22
+ #include "real.h"
23
+
24
+ #define SIGMOID_TABLE_SIZE 512
25
+ #define MAX_SIGMOID 8
26
+ #define LOG_TABLE_SIZE 512
27
+
28
+ namespace fasttext {
29
+
30
// One node of the Huffman tree used by hierarchical softmax.
struct Node {
  int32_t parent;  // parent index; -1 for the root (set in buildTree)
  int32_t left;    // child indices; -1 on both marks a leaf
  int32_t right;
  int64_t count;   // subtree frequency used when merging nodes
  bool binary;     // true if this node is its parent's right child
};
37
+
38
+ class Model {
39
+ private:
40
+ std::shared_ptr<Matrix> wi_;
41
+ std::shared_ptr<Matrix> wo_;
42
+ std::shared_ptr<QMatrix> qwi_;
43
+ std::shared_ptr<QMatrix> qwo_;
44
+ std::shared_ptr<Args> args_;
45
+ Vector hidden_;
46
+ Vector output_;
47
+ Vector grad_;
48
+ int32_t hsz_;
49
+ int32_t osz_;
50
+ real loss_;
51
+ int64_t nexamples_;
52
+ real* t_sigmoid;
53
+ real* t_log;
54
+ // used for negative sampling:
55
+ std::vector<int32_t> negatives;
56
+ size_t negpos;
57
+ // used for hierarchical softmax:
58
+ std::vector< std::vector<int32_t> > paths;
59
+ std::vector< std::vector<bool> > codes;
60
+ std::vector<Node> tree;
61
+
62
+ static bool comparePairs(const std::pair<real, int32_t>&,
63
+ const std::pair<real, int32_t>&);
64
+
65
+ int32_t getNegative(int32_t target);
66
+ void initSigmoid();
67
+ void initLog();
68
+
69
+ static const int32_t NEGATIVE_TABLE_SIZE = 10000000;
70
+
71
+ public:
72
+ Model(std::shared_ptr<Matrix>, std::shared_ptr<Matrix>,
73
+ std::shared_ptr<Args>, int32_t);
74
+ ~Model();
75
+
76
+ real binaryLogistic(int32_t, bool, real);
77
+ real negativeSampling(int32_t, real);
78
+ real hierarchicalSoftmax(int32_t, real);
79
+ real softmax(int32_t, real);
80
+
81
+ void predict(const std::vector<int32_t>&, int32_t,
82
+ std::vector<std::pair<real, int32_t>>&,
83
+ Vector&, Vector&) const;
84
+ void predict(const std::vector<int32_t>&, int32_t,
85
+ std::vector<std::pair<real, int32_t>>&);
86
+ void dfs(int32_t, int32_t, real,
87
+ std::vector<std::pair<real, int32_t>>&,
88
+ Vector&) const;
89
+ void findKBest(int32_t, std::vector<std::pair<real, int32_t>>&,
90
+ Vector&, Vector&) const;
91
+ void update(const std::vector<int32_t>&, int32_t, real);
92
+ void computeHidden(const std::vector<int32_t>&, Vector&) const;
93
+ void computeOutputSoftmax(Vector&, Vector&) const;
94
+ void computeOutputSoftmax();
95
+
96
+ void setTargetCounts(const std::vector<int64_t>&);
97
+ void initTableNegatives(const std::vector<int64_t>&);
98
+ void buildTree(const std::vector<int64_t>&);
99
+ real getLoss() const;
100
+ real sigmoid(real) const;
101
+ real log(real) const;
102
+
103
+ std::minstd_rand rng;
104
+ bool quant_;
105
+ void setQuantizePointer(std::shared_ptr<QMatrix>, std::shared_ptr<QMatrix>, bool);
106
+ };
107
+
108
+ }
109
+
110
+ #endif