RubyGems - ffi-fasttext - Versions diffs - 0.1.0 - Mend

ffi-fasttext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +7 -0
data/.gitignore +44 -0
data/.travis.yml +5 -0
data/Gemfile +6 -0
data/LICENSE.txt +21 -0
data/README.md +59 -0
data/Rakefile +19 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/ext/ffi/fasttext/Rakefile +71 -0
data/ffi-fasttext.gemspec +40 -0
data/lib/ffi/fasttext.rb +108 -0
data/lib/ffi/fasttext/version.rb +5 -0
data/vendor/fasttext/LICENSE +30 -0
data/vendor/fasttext/PATENTS +33 -0
data/vendor/fasttext/args.cc +250 -0
data/vendor/fasttext/args.h +71 -0
data/vendor/fasttext/dictionary.cc +475 -0
data/vendor/fasttext/dictionary.h +112 -0
data/vendor/fasttext/fasttext.cc +693 -0
data/vendor/fasttext/fasttext.h +97 -0
data/vendor/fasttext/ffi_fasttext.cc +66 -0
data/vendor/fasttext/main.cc +270 -0
data/vendor/fasttext/matrix.cc +144 -0
data/vendor/fasttext/matrix.h +57 -0
data/vendor/fasttext/model.cc +341 -0
data/vendor/fasttext/model.h +110 -0
data/vendor/fasttext/productquantizer.cc +211 -0
data/vendor/fasttext/productquantizer.h +67 -0
data/vendor/fasttext/qmatrix.cc +121 -0
data/vendor/fasttext/qmatrix.h +65 -0
data/vendor/fasttext/real.h +19 -0
data/vendor/fasttext/utils.cc +29 -0
data/vendor/fasttext/utils.h +25 -0
data/vendor/fasttext/vector.cc +137 -0
data/vendor/fasttext/vector.h +53 -0
metadata +151 -0

data/vendor/fasttext/productquantizer.cc ADDED

@@ -0,0 +1,211 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#include "productquantizer.h"
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <numeric>
+#include <vector>
+namespace fasttext {
+/**
+ * Squared Euclidean distance between two arrays of d reals.
+ * The sum of squared coordinate differences is returned as-is; no square
+ * root is taken.
+ */
+real distL2(const real* x, const real* y, int32_t d) {
+  real sum = 0;
+  for (int32_t k = 0; k < d; ++k) {
+    const auto diff = x[k] - y[k];
+    sum += diff * diff;
+  }
+  return sum;
+}
+/**
+ * Builds a quantizer for vectors of `dim` reals split into sub-vectors of
+ * `dsub` reals. When `dim` is not a multiple of `dsub`, one extra, shorter
+ * sub-quantizer of size `dim % dsub` is appended. Storage for
+ * dim * ksub_ centroid values is allocated and rng is seeded with seed_.
+ */
+ProductQuantizer::ProductQuantizer(int32_t dim, int32_t dsub)
+    : dim_(dim),
+      nsubq_(dim / dsub),
+      dsub_(dsub),
+      centroids_(dim * ksub_),
+      rng(seed_) {
+  const int32_t remainder = dim_ % dsub;
+  if (remainder != 0) {
+    // A trailing partial sub-vector needs its own sub-quantizer.
+    lastdsub_ = remainder;
+    ++nsubq_;
+  } else {
+    lastdsub_ = dsub_;
+  }
+}
+/**
+ * Read-only pointer to centroid `i` of sub-quantizer `m`.
+ * Every sub-quantizer before the last occupies ksub_ * dsub_ reals; inside
+ * the last one the centroids are lastdsub_ reals apart.
+ */
+const real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) const {
+  const bool is_last = (m == nsubq_ - 1);
+  const int32_t offset = is_last ? m * ksub_ * dsub_ + i * lastdsub_
+                                 : (m * ksub_ + i) * dsub_;
+  return &centroids_[offset];
+}
+/**
+ * Mutable pointer to centroid `i` of sub-quantizer `m`.
+ * The address computation is shared with the const overload.
+ */
+real* ProductQuantizer::get_centroids(int32_t m, uint8_t i) {
+  const ProductQuantizer* self = this;
+  return const_cast<real*>(self->get_centroids(m, i));
+}
+/**
+ * Finds the centroid closest to `x` among the ksub_ centroids stored
+ * contiguously from `c0` (each `d` reals long).
+ * The winning index is written to code[0]; ties keep the lowest index.
+ * Returns the squared distance to that centroid.
+ */
+real ProductQuantizer::assign_centroid(const real * x, const real* c0,
+                                       uint8_t* code, int32_t d) const {
+  real best = distL2(x, c0, d);
+  code[0] = 0;
+  for (int32_t j = 1; j < ksub_; ++j) {
+    const real candidate = distL2(x, c0 + j * d, d);
+    if (candidate < best) {
+      best = candidate;
+      code[0] = static_cast<uint8_t>(j);
+    }
+  }
+  return best;
+}
+/**
+ * k-means assignment step: for each of the `n` points of `d` reals stored
+ * contiguously in `x`, stores the index of its nearest centroid in the
+ * matching entry of `codes`.
+ */
+void ProductQuantizer::Estep(const real* x, const real* centroids,
+                             uint8_t* codes, int32_t d,
+                             int32_t n) const {
+  const real* point = x;
+  uint8_t* slot = codes;
+  for (int32_t remaining = n; remaining > 0; --remaining) {
+    assign_centroid(point, centroids, slot, d);
+    point += d;
+    ++slot;
+  }
+}
+void ProductQuantizer::MStep(const real* x0, real* centroids,
+                             const uint8_t* codes,
+                             int32_t d, int32_t n) {  /*
+   * k-means update step.
+   *   x0        : n points of d reals each, stored contiguously (read only).
+   *   centroids : ksub_ centroids of d reals each; overwritten.
+   *   codes     : codes[i] is the centroid index assigned to point i.
+   * Each centroid becomes the mean of the points assigned to it. A centroid
+   * with no points is then re-seeded from another centroid (see below).
+   */
+  std::vector<int32_t> nelts(ksub_, 0);  // nelts[k]: number of points assigned to centroid k
+  memset(centroids, 0, sizeof(real) * d * ksub_);  // centroids double as sum accumulators
+  const real* x = x0;
+  for (auto i = 0; i < n; i++) {  // pass 1: add every point into its assigned centroid
+    auto k = codes[i];
+    real* c = centroids + k * d;
+    for (auto j = 0; j < d; j++) {
+      c[j] += x[j];
+    }
+    nelts[k]++;
+    x += d;
+  }
+  real* c = centroids;
+  for (auto k = 0; k < ksub_; k++) {  // pass 2: turn sums into means
+    real z = (real) nelts[k];
+    if (z != 0) {  // empty centroids stay all-zero here and are handled in pass 3
+      for (auto j = 0; j < d; j++) {
+        c[j] /= z;
+      }
+    }
+    c += d;
+  }
+  std::uniform_real_distribution<> runiform(0,1);
+  for (auto k = 0; k < ksub_; k++) {  // pass 3: re-seed every empty centroid
+    if (nelts[k] == 0) {
+      int32_t m = 0;
+      while (runiform(rng) * (n - ksub_) >= nelts[m] - 1) {  // cycle through candidates; one is accepted when a fresh draw scaled by (n - ksub_) is below nelts[m] - 1, so larger clusters are accepted more readily
+        m = (m + 1) % ksub_;
+      }
+      memcpy(centroids + k * d, centroids + m * d, sizeof(real) * d);  // start the empty centroid as a copy of donor m
+      for (auto j = 0; j < d; j++) {
+        int32_t sign = (j % 2) * 2 - 1;  // -1 for even j, +1 for odd j
+        centroids[k * d + j] += sign * eps_;  // nudge the two copies in opposite directions
+        centroids[m * d + j] -= sign * eps_;
+      }
+      nelts[k] = nelts[m] / 2;  // split the donor's count between the two centroids
+      nelts[m] -= nelts[k];
+    }
+  }
+}
+/**
+ * Runs niter_ rounds of k-means on `n` points of `d` reals stored
+ * contiguously in `x`, writing the ksub_ resulting centroids to `c`.
+ * The centroids start as ksub_ distinct points picked through a random
+ * permutation, so the caller must pass n >= ksub_ (train() checks this).
+ */
+void ProductQuantizer::kmeans(const real *x, real* c, int32_t n, int32_t d) {
+  std::vector<int32_t> perm(n,0);
+  std::iota(perm.begin(), perm.end(), 0);
+  std::shuffle(perm.begin(), perm.end(), rng);
+  for (auto i = 0; i < ksub_; i++) {
+    memcpy (&c[i * d], x + perm[i] * d, d * sizeof(real));
+  }
+  // Per-point assignments. A vector releases the buffer even if Estep/MStep
+  // throw (MStep allocates), which the former new[]/delete[] pair did not.
+  std::vector<uint8_t> codes(n);
+  for (auto i = 0; i < niter_; i++) {
+    Estep(x, c, codes.data(), d, n);
+    MStep(x, c, codes.data(), d, n);
+  }
+}
+/**
+ * Trains every sub-quantizer on the `n` rows of dim_ reals stored
+ * contiguously in `x`.
+ * For each sub-quantizer the matching column slice of at most max_points_
+ * rows is copied into a scratch buffer and clustered with kmeans(). When
+ * n exceeds max_points_ the rows are re-shuffled before each slice, so a
+ * different random subset is used every time.
+ * Prints a message to stderr and terminates the process with exit(1) when
+ * n < ksub_, because k-means needs at least ksub_ distinct starting points.
+ */
+void ProductQuantizer::train(int32_t n, const real * x) {
+  if (n < ksub_) {
+    // n == ksub_ is accepted, so the requirement is "at least", not "more than".
+    std::cerr<<"Matrix too small for quantization, must have at least "<<ksub_<<" rows"<<std::endl;
+    exit(1);
+  }
+  std::vector<int32_t> perm(n, 0);
+  std::iota(perm.begin(), perm.end(), 0);
+  auto d = dsub_;
+  auto np = std::min(n, max_points_);
+  // Scratch slice sized for the widest sub-vector; freed automatically.
+  std::vector<real> xslice(static_cast<size_t>(np) * dsub_);
+  for (auto m = 0; m < nsubq_; m++) {
+    if (m == nsubq_-1) {d = lastdsub_;}
+    if (np != n) {std::shuffle(perm.begin(), perm.end(), rng);}
+    for (auto j = 0; j < np; j++) {
+      // Row offsets are computed in size_t: perm[j] * dim_ can exceed the
+      // int32_t range for large matrices.
+      memcpy (xslice.data() + static_cast<size_t>(j) * d,
+              x + static_cast<size_t>(perm[j]) * dim_ + m * dsub_,
+              d * sizeof(real));
+    }
+    kmeans(xslice.data(), get_centroids(m, 0), np, d);
+  }
+}
+/**
+ * Dot product between `x` and the vector decoded from code row `t`,
+ * multiplied by `alpha`. Row `t` starts at codes[nsubq_ * t] and holds one
+ * centroid index per sub-quantizer.
+ */
+real ProductQuantizer::mulcode(const Vector& x, const uint8_t* codes,
+                               int32_t t, real alpha) const {
+  const uint8_t* row = codes + nsubq_ * t;
+  real acc = 0.0;
+  for (int32_t m = 0; m < nsubq_; ++m) {
+    const real* centroid = get_centroids(m, row[m]);
+    // Only the final sub-vector may be shorter than dsub_.
+    const int32_t width = (m == nsubq_ - 1) ? lastdsub_ : dsub_;
+    for (int32_t k = 0; k < width; ++k) {
+      acc += x[m * dsub_ + k] * centroid[k];
+    }
+  }
+  return acc * alpha;
+}
+/**
+ * Adds `alpha` times the vector decoded from code row `t` to `x`, in place.
+ * Row `t` starts at codes[nsubq_ * t] and holds one centroid index per
+ * sub-quantizer.
+ */
+void ProductQuantizer::addcode(Vector& x, const uint8_t* codes,
+                               int32_t t, real alpha) const {
+  const uint8_t* row = codes + nsubq_ * t;
+  for (int32_t m = 0; m < nsubq_; ++m) {
+    const real* centroid = get_centroids(m, row[m]);
+    // Only the final sub-vector may be shorter than dsub_.
+    const int32_t width = (m == nsubq_ - 1) ? lastdsub_ : dsub_;
+    for (int32_t k = 0; k < width; ++k) {
+      x[m * dsub_ + k] += alpha * centroid[k];
+    }
+  }
+}
+/**
+ * Encodes one vector of dim_ reals: for every sub-quantizer m, code[m]
+ * receives the index of the centroid nearest to the m-th sub-vector of `x`.
+ */
+void ProductQuantizer::compute_code(const real* x, uint8_t* code) const {
+  for (int32_t m = 0; m < nsubq_; ++m) {
+    // Only the final sub-vector may be shorter than dsub_.
+    const int32_t width = (m == nsubq_ - 1) ? lastdsub_ : dsub_;
+    const real* subvector = x + m * dsub_;
+    assign_centroid(subvector, get_centroids(m, 0), code + m, width);
+  }
+}
+/**
+ * Encodes `n` vectors of dim_ reals stored contiguously in `x`.
+ * Vector i is written to the nsubq_ bytes starting at codes[i * nsubq_].
+ */
+void ProductQuantizer::compute_codes(const real* x, uint8_t* codes,
+                                     int32_t n) const {
+  int32_t row = 0;
+  while (row < n) {
+    const real* vec = x + row * dim_;
+    uint8_t* dest = codes + row * nsubq_;
+    compute_code(vec, dest);
+    ++row;
+  }
+}
+/**
+ * Writes the quantizer to `out` as raw bytes: dim_, nsubq_, dsub_ and
+ * lastdsub_ exactly as held in memory, followed by all centroid values.
+ */
+void ProductQuantizer::save(std::ostream& out) {
+  const int32_t* const header[] = {&dim_, &nsubq_, &dsub_, &lastdsub_};
+  for (const int32_t* field : header) {
+    out.write(reinterpret_cast<const char*>(field), sizeof(int32_t));
+  }
+  out.write(reinterpret_cast<const char*>(centroids_.data()),
+            centroids_.size() * sizeof(real));
+}
+/**
+ * Restores a quantizer from the byte layout produced by save(): dim_,
+ * nsubq_, dsub_, lastdsub_, then dim_ * ksub_ centroid values.
+ * Stream errors are not checked here; the caller has to inspect `in`.
+ */
+void ProductQuantizer::load(std::istream& in) {
+  in.read((char*) &dim_, sizeof(dim_));
+  in.read((char*) &nsubq_, sizeof(nsubq_));
+  in.read((char*) &dsub_, sizeof(dsub_));
+  in.read((char*) &lastdsub_, sizeof(lastdsub_));
+  centroids_.resize(dim_ * ksub_);
+  // One bulk read mirrors the single write in save(). It replaces a loop
+  // that compared a signed index with size() and read one value at a time.
+  in.read((char*) centroids_.data(), centroids_.size() * sizeof(real));
+}
+}

data/vendor/fasttext/productquantizer.h ADDED

@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#ifndef FASTTEXT_PRODUCT_QUANTIZER_H
+#define FASTTEXT_PRODUCT_QUANTIZER_H
+#include <cstring>
+#include <istream>
+#include <ostream>
+#include <vector>
+#include <random>
+#include "real.h"
+#include "vector.h"
+namespace fasttext {
+class ProductQuantizer {  // Splits dim_-dimensional vectors into sub-vectors and encodes each sub-vector as the one-byte index of its nearest k-means centroid.
+  private:
+    const int32_t nbits_ = 8;  // ksub_ is derived as 1 << nbits_; codes are stored as uint8_t
+    const int32_t ksub_ = 1 << nbits_;  // number of centroids per sub-quantizer
+    const int32_t max_points_per_cluster_ = 256;  // only used to derive max_points_
+    const int32_t max_points_ = max_points_per_cluster_ * ksub_;  // cap on the rows train() feeds to kmeans()
+    const int32_t seed_ = 1234;  // seed given to rng by the two-argument constructor
+    const int32_t niter_ = 25;  // Estep/MStep rounds run by kmeans()
+    const real eps_ = 1e-7;  // perturbation MStep() applies when re-seeding an empty centroid
+    int32_t dim_;  // dimension of the full vectors
+    int32_t nsubq_;  // number of sub-quantizers (dim_ / dsub_, plus one if there is a remainder)
+    int32_t dsub_;  // dimension of every sub-vector except possibly the last
+    int32_t lastdsub_;  // dimension of the last sub-vector (dim_ % dsub_, or dsub_ when that is 0)
+    std::vector<real> centroids_;  // dim_ * ksub_ reals, laid out sub-quantizer by sub-quantizer
+    std::minstd_rand rng;  // random source for shuffling and for re-seeding empty centroids
+  public:
+    ProductQuantizer() {}  // leaves dim_/nsubq_/dsub_/lastdsub_ unset and rng default-constructed; QMatrix::load() calls load() right after using this
+    ProductQuantizer(int32_t, int32_t);  // (dim, dsub)
+    real* get_centroids (int32_t, uint8_t);  // (m, i): pointer to centroid i of sub-quantizer m
+    const real* get_centroids(int32_t, uint8_t) const;
+    real assign_centroid(const real*, const real*, uint8_t*, int32_t) const;  // (x, c0, code, d): writes the nearest centroid index to code[0], returns the squared distance
+    void Estep(const real*, const real*, uint8_t*, int32_t, int32_t) const;  // (x, centroids, codes, d, n): assigns each of n points to its nearest centroid
+    void MStep(const real*, real*, const uint8_t*, int32_t, int32_t);  // (x0, centroids, codes, d, n): recomputes centroids from the assignments
+    void kmeans(const real*, real*, int32_t, int32_t);  // (x, c, n, d): niter_ rounds of Estep + MStep
+    void train(int, const real*);  // (n, x): trains all sub-quantizers; calls exit(1) when n < ksub_
+    real mulcode(const Vector&, const uint8_t*, int32_t, real) const;  // (x, codes, t, alpha): alpha * dot(x, decoded row t)
+    void addcode(Vector&, const uint8_t*, int32_t, real) const;  // (x, codes, t, alpha): x += alpha * decoded row t
+    void compute_code(const real*, uint8_t*)  const;  // encodes one vector into nsubq_ bytes
+    void compute_codes(const real*, uint8_t*, int32_t)  const;  // encodes n consecutive vectors
+    void save(std::ostream&);  // raw binary dump of dim_, nsubq_, dsub_, lastdsub_ and the centroids
+    void load(std::istream&);  // inverse of save()
+};
+}
+#endif

data/vendor/fasttext/qmatrix.cc ADDED

@@ -0,0 +1,121 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#include "qmatrix.h"
+#include <assert.h>
+#include <iostream>
+namespace fasttext {
+/**
+ * Creates an empty quantized matrix (0 x 0, no norm quantization), meant
+ * to be filled by load(). The code buffers start as null pointers so they
+ * never hold an indeterminate value.
+ */
+QMatrix::QMatrix() : codes_(nullptr), norm_codes_(nullptr), qnorm_(false),
+  m_(0), n_(0), codesize_(0) {}
+/**
+ * Quantizes `mat` with sub-vectors of `dsub` columns.
+ * Each row is encoded with ceil(n_ / dsub) one-byte codes. When `qnorm` is
+ * true the row norms are quantized separately by a one-dimensional
+ * quantizer and one extra byte per row is stored.
+ * Both code buffers start as null pointers, so a buffer that is not
+ * allocated (codesize_ == 0, or qnorm false) never holds an indeterminate
+ * value.
+ */
+QMatrix::QMatrix(const Matrix& mat, int32_t dsub, bool qnorm)
+      : codes_(nullptr), norm_codes_(nullptr),
+        qnorm_(qnorm), m_(mat.m_), n_(mat.n_),
+        codesize_(m_ * ((n_ + dsub - 1) / dsub)) {
+  if (codesize_ > 0) {
+    codes_ = new uint8_t[codesize_];
+  }
+  pq_.reset(new ProductQuantizer(n_, dsub));
+  if (qnorm_) {
+    norm_codes_ = new uint8_t[m_];
+    npq_.reset(new ProductQuantizer(1, 1));
+  }
+  quantize(mat);
+}
+/**
+ * Frees the code buffers. Each one is released only under the condition
+ * that the constructor used to allocate it.
+ */
+QMatrix::~QMatrix() {
+  if (qnorm_) {
+    delete[] norm_codes_;
+  }
+  if (codesize_ > 0) {
+    delete[] codes_;
+  }
+}
+/**
+ * Trains the norm quantizer on the m_ values held by `norms` and stores one
+ * code per value in norm_codes_. Only valid when norm quantization is on.
+ */
+void QMatrix::quantizeNorm(const Vector& norms) {
+  assert(qnorm_);
+  assert(norms.m_ == m_);
+  auto values = norms.data_;
+  ProductQuantizer& norm_quantizer = *npq_;
+  norm_quantizer.train(m_, values);
+  norm_quantizer.compute_codes(values, norm_codes_, m_);
+}
+/**
+ * Quantizes `matrix`, which must have the dimensions this object was built
+ * with. All work is done on a private copy. With norm quantization on, the
+ * rows of the copy are divided by their L2 norms and the norms are
+ * quantized on their own; the (possibly normalised) rows are then used to
+ * train the main quantizer and are encoded into codes_.
+ */
+void QMatrix::quantize(const Matrix& matrix) {
+  assert(n_ == matrix.n_);
+  assert(m_ == matrix.m_);
+  Matrix work(matrix);
+  if (qnorm_) {
+    Vector row_norms(work.m_);
+    work.l2NormRow(row_norms);
+    work.divideRow(row_norms);
+    quantizeNorm(row_norms);
+  }
+  auto rows = work.data_;
+  ProductQuantizer& quantizer = *pq_;
+  quantizer.train(m_, rows);
+  quantizer.compute_codes(rows, codes_, m_);
+}
+/**
+ * Adds the decoded row `t` to `x`. With norm quantization on, the row is
+ * first scaled by its decoded norm; otherwise the scale is 1.
+ */
+void QMatrix::addToVector(Vector& x, int32_t t) const {
+  const real scale = qnorm_ ? npq_->get_centroids(0, norm_codes_[t])[0]
+                            : static_cast<real>(1);
+  pq_->addcode(x, codes_, t, scale);
+}
+/**
+ * Dot product between `vec` and the decoded row `i`. With norm quantization
+ * on, the result is scaled by the row's decoded norm; otherwise by 1.
+ * `i` must be in [0, m_) and `vec` must have n_ entries (checked with
+ * assert).
+ */
+real QMatrix::dotRow(const Vector& vec, int64_t i) const {
+  assert(i >= 0);
+  assert(i < m_);
+  assert(vec.size() == n_);
+  const real scale = qnorm_ ? npq_->get_centroids(0, norm_codes_[i])[0]
+                            : static_cast<real>(1);
+  return pq_->mulcode(vec, codes_, i, scale);
+}
+/** Returns m_, the number of rows of the quantized matrix. */
+int64_t QMatrix::getM() const {
+  return this->m_;
+}
+/** Returns n_, the number of columns of the original matrix. */
+int64_t QMatrix::getN() const {
+  return this->n_;
+}
+/**
+ * Writes the matrix to `out` as raw bytes: qnorm_, m_, n_, codesize_, the
+ * code buffer and the main quantizer. With norm quantization on, the norm
+ * codes and the norm quantizer follow.
+ */
+void QMatrix::save(std::ostream& out) {
+    out.write(reinterpret_cast<const char*>(&qnorm_), sizeof(qnorm_));
+    out.write(reinterpret_cast<const char*>(&m_), sizeof(m_));
+    out.write(reinterpret_cast<const char*>(&n_), sizeof(n_));
+    out.write(reinterpret_cast<const char*>(&codesize_), sizeof(codesize_));
+    out.write(reinterpret_cast<const char*>(codes_),
+              codesize_ * sizeof(uint8_t));
+    pq_->save(out);
+    if (!qnorm_) {
+      return;
+    }
+    out.write(reinterpret_cast<const char*>(norm_codes_),
+              m_ * sizeof(uint8_t));
+    npq_->save(out);
+}
+/**
+ * Restores the matrix from the byte layout produced by save().
+ * Buffers already owned by this object are released first, under the same
+ * conditions the destructor uses, so loading into a populated object does
+ * not leak. The code buffer is allocated only when codesize_ > 0, which is
+ * the only case in which the destructor frees it.
+ * Stream errors are not checked here; the caller has to inspect `in`.
+ */
+void QMatrix::load(std::istream& in) {
+    if (codesize_ > 0) {
+      delete[] codes_;
+    }
+    if (qnorm_) {
+      delete[] norm_codes_;
+    }
+    codes_ = nullptr;
+    norm_codes_ = nullptr;
+    in.read((char*) &qnorm_, sizeof(qnorm_));
+    in.read((char*) &m_, sizeof(m_));
+    in.read((char*) &n_, sizeof(n_));
+    in.read((char*) &codesize_, sizeof(codesize_));
+    if (codesize_ > 0) {
+      codes_ = new uint8_t[codesize_];
+      in.read((char*) codes_, codesize_ * sizeof(uint8_t));
+    }
+    pq_ = std::unique_ptr<ProductQuantizer>( new ProductQuantizer());
+    pq_->load(in);
+    if (qnorm_) {
+      norm_codes_ = new uint8_t[m_];
+      in.read((char*) norm_codes_, m_ * sizeof(uint8_t));
+      npq_ = std::unique_ptr<ProductQuantizer>( new ProductQuantizer());
+      npq_->load(in);
+    }
+}
+}

data/vendor/fasttext/qmatrix.h ADDED

@@ -0,0 +1,65 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#ifndef FASTTEXT_QMATRIX_H
+#define FASTTEXT_QMATRIX_H
+#include <cstdint>
+#include <istream>
+#include <ostream>
+#include <vector>
+#include <memory>
+#include "real.h"
+#include "matrix.h"
+#include "vector.h"
+#include "productquantizer.h"
+namespace fasttext {
+/**
+ * A matrix compressed with product quantization.
+ * Each of the m_ rows is stored as a run of one-byte centroid indices in
+ * codes_. With qnorm_ set, rows are normalised before quantization and
+ * their norms are quantized separately into norm_codes_.
+ * The object owns both raw buffers, so it must not be copied.
+ */
+class QMatrix {
+  private:
+    std::unique_ptr<ProductQuantizer> pq_;   // quantizer for the rows
+    std::unique_ptr<ProductQuantizer> npq_;  // quantizer for the row norms (qnorm_ only)
+    // Owned buffers. They default to null so that they never hold an
+    // indeterminate value when a constructor does not allocate them.
+    uint8_t* codes_ = nullptr;       // codesize_ bytes of row codes
+    uint8_t* norm_codes_ = nullptr;  // m_ bytes of norm codes (qnorm_ only)
+    bool qnorm_;        // whether row norms are quantized separately
+    int64_t m_;         // number of rows
+    int64_t n_;         // number of columns of the original matrix
+    int32_t codesize_;  // total number of bytes in codes_
+  public:
+    QMatrix();
+    QMatrix(const Matrix&, int32_t, bool);
+    // Copying would make two objects free the same buffers. The unique_ptr
+    // members already prevent copying; this states it explicitly.
+    QMatrix(const QMatrix&) = delete;
+    QMatrix& operator=(const QMatrix&) = delete;
+    ~QMatrix();
+    int64_t getM() const;
+    int64_t getN() const;
+    void quantizeNorm(const Vector&);
+    void quantize(const Matrix&);
+    void addToVector(Vector& x, int32_t t) const;
+    real dotRow(const Vector&, int64_t) const;
+    void save(std::ostream&);
+    void load(std::istream&);
+};
+}
+#endif