RubyGems - fasttext - Versions diffs - 0.1.2 → 0.2.2 - Mend

fasttext 0.1.2 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

checksums.yaml +4 -4
data/CHANGELOG.md +19 -0
data/LICENSE.txt +18 -18
data/README.md +26 -19
data/ext/fasttext/ext.cpp +131 -134
data/ext/fasttext/extconf.rb +2 -4
data/lib/fasttext/classifier.rb +23 -10
data/lib/fasttext/model.rb +10 -0
data/lib/fasttext/vectorizer.rb +11 -5
data/lib/fasttext/version.rb +1 -1
data/vendor/fastText/README.md +3 -3
data/vendor/fastText/src/args.cc +179 -6
data/vendor/fastText/src/args.h +29 -1
data/vendor/fastText/src/autotune.cc +477 -0
data/vendor/fastText/src/autotune.h +89 -0
data/vendor/fastText/src/densematrix.cc +27 -7
data/vendor/fastText/src/densematrix.h +10 -2
data/vendor/fastText/src/fasttext.cc +125 -114
data/vendor/fastText/src/fasttext.h +31 -52
data/vendor/fastText/src/main.cc +32 -13
data/vendor/fastText/src/meter.cc +148 -2
data/vendor/fastText/src/meter.h +24 -2
data/vendor/fastText/src/model.cc +0 -1
data/vendor/fastText/src/real.h +0 -1
data/vendor/fastText/src/utils.cc +25 -0
data/vendor/fastText/src/utils.h +29 -0
data/vendor/fastText/src/vector.cc +0 -1
metadata +14 -69
data/lib/fasttext/ext.bundle +0 -0

data/lib/fasttext/vectorizer.rb CHANGED

@@ -20,13 +20,19 @@ module FastText
       verbose: 2,
       pretrained_vectors: "",
       save_output: false,
-      # seed: 0
+      seed: 0,
+      autotune_validation_file: "",
+      autotune_metric: "f1",
+      autotune_predictions: 1,
+      autotune_duration: 60 * 5,
+      autotune_model_size: ""
     }
     def fit(x)
-      input = input_path(x)
       @m ||= Ext::Model.new
-      m.train(DEFAULT_OPTIONS.merge(@options).merge(input: input))
+      a = build_args(DEFAULT_OPTIONS)
+      a.input, _ref = input_path(x)
+      m.train(a)
     end
     def nearest_neighbors(word, k: 10)
@@ -43,7 +49,7 @@ module FastText
     # https://github.com/facebookresearch/fastText/issues/518
     def input_path(x)
       if x.is_a?(String)
-        x
+        [x, nil]
       else
         tempfile = Tempfile.new("fasttext")
         x.each do |xi|
@@ -51,7 +57,7 @@ module FastText
           tempfile.write("\n")
         end
         tempfile.close
-        tempfile.path
+        [tempfile.path, tempfile]
       end
     end
   end

data/lib/fasttext/version.rb CHANGED

@@ -1,3 +1,3 @@
 module FastText
-  VERSION = "0.1.2"
+  VERSION = "0.2.2"
 end

data/vendor/fastText/README.md CHANGED

@@ -89,9 +89,9 @@ There is also the master branch that contains all of our most recent work, but c
 ### Building fastText using make (preferred)
 ```
-$ wget https://github.com/facebookresearch/fastText/archive/v0.9.1.zip
-$ unzip v0.9.1.zip
-$ cd fastText-0.9.1
+$ wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
+$ unzip v0.9.2.zip
+$ cd fastText-0.9.2
 $ make
 ```

data/vendor/fastText/src/args.cc CHANGED

@@ -12,6 +12,8 @@
 #include <iostream>
 #include <stdexcept>
+#include <string>
+#include <unordered_map>
 namespace fasttext {
@@ -36,12 +38,19 @@ Args::Args() {
   verbose = 2;
   pretrainedVectors = "";
   saveOutput = false;
+  seed = 0;
   qout = false;
   retrain = false;
   qnorm = false;
   cutoff = 0;
   dsub = 2;
+  autotuneValidationFile = "";
+  autotuneMetric = "f1";
+  autotunePredictions = 1;
+  autotuneDuration = 60 * 5; // 5 minutes
+  autotuneModelSize = "";
 }
 std::string Args::lossToString(loss_name ln) const {
@@ -78,6 +87,24 @@ std::string Args::modelToString(model_name mn) const {
   return "Unknown model name!"; // should never happen
 }
+std::string Args::metricToString(metric_name mn) const {
+  switch (mn) {
+    case metric_name::f1score:
+      return "f1score";
+    case metric_name::f1scoreLabel:
+      return "f1scoreLabel";
+    case metric_name::precisionAtRecall:
+      return "precisionAtRecall";
+    case metric_name::precisionAtRecallLabel:
+      return "precisionAtRecallLabel";
+    case metric_name::recallAtPrecision:
+      return "recallAtPrecision";
+    case metric_name::recallAtPrecisionLabel:
+      return "recallAtPrecisionLabel";
+  }
+  return "Unknown metric name!"; // should never happen
+}
 void Args::parseArgs(const std::vector<std::string>& args) {
   std::string command(args[1]);
   if (command == "supervised") {
@@ -97,6 +124,8 @@ void Args::parseArgs(const std::vector<std::string>& args) {
       exit(EXIT_FAILURE);
     }
     try {
+      setManual(args[ai].substr(1));
       if (args[ai] == "-h") {
         std::cerr << "Here is the help! Usage:" << std::endl;
         printHelp();
@@ -157,6 +186,8 @@ void Args::parseArgs(const std::vector<std::string>& args) {
       } else if (args[ai] == "-saveOutput") {
         saveOutput = true;
         ai--;
+      } else if (args[ai] == "-seed") {
+        seed = std::stoi(args.at(ai + 1));
       } else if (args[ai] == "-qnorm") {
         qnorm = true;
         ai--;
@@ -170,6 +201,18 @@ void Args::parseArgs(const std::vector<std::string>& args) {
         cutoff = std::stoi(args.at(ai + 1));
       } else if (args[ai] == "-dsub") {
         dsub = std::stoi(args.at(ai + 1));
+      } else if (args[ai] == "-autotune-validation") {
+        autotuneValidationFile = std::string(args.at(ai + 1));
+      } else if (args[ai] == "-autotune-metric") {
+        autotuneMetric = std::string(args.at(ai + 1));
+        getAutotuneMetric(); // throws exception if not able to parse
+        getAutotuneMetricLabel(); // throws exception if not able to parse
+      } else if (args[ai] == "-autotune-predictions") {
+        autotunePredictions = std::stoi(args.at(ai + 1));
+      } else if (args[ai] == "-autotune-duration") {
+        autotuneDuration = std::stoi(args.at(ai + 1));
+      } else if (args[ai] == "-autotune-modelsize") {
+        autotuneModelSize = std::string(args.at(ai + 1));
       } else {
         std::cerr << "Unknown argument: " << args[ai] << std::endl;
         printHelp();
@@ -186,7 +229,7 @@ void Args::parseArgs(const std::vector<std::string>& args) {
     printHelp();
     exit(EXIT_FAILURE);
   }
-  if (wordNgrams <= 1 && maxn == 0) {
+  if (wordNgrams <= 1 && maxn == 0 && !hasAutotune()) {
     bucket = 0;
   }
 }
@@ -195,6 +238,7 @@ void Args::printHelp() {
   printBasicHelp();
   printDictionaryHelp();
   printTrainingHelp();
+  printAutotuneHelp();
   printQuantizationHelp();
 }
@@ -227,7 +271,8 @@ void Args::printTrainingHelp() {
   std::cerr
       << "\nThe following arguments for training are optional:\n"
       << "  -lr                 learning rate [" << lr << "]\n"
-      << "  -lrUpdateRate       change the rate of updates for the learning rate ["
+      << "  -lrUpdateRate       change the rate of updates for the learning "
+         "rate ["
       << lrUpdateRate << "]\n"
       << "  -dim                size of word vectors [" << dim << "]\n"
       << "  -ws                 size of the context window [" << ws << "]\n"
@@ -235,11 +280,31 @@ void Args::printTrainingHelp() {
       << "  -neg                number of negatives sampled [" << neg << "]\n"
       << "  -loss               loss function {ns, hs, softmax, one-vs-all} ["
       << lossToString(loss) << "]\n"
-      << "  -thread             number of threads [" << thread << "]\n"
-      << "  -pretrainedVectors  pretrained word vectors for supervised learning ["
+      << "  -thread             number of threads (set to 1 to ensure "
+         "reproducible results) ["
+      << thread << "]\n"
+      << "  -pretrainedVectors  pretrained word vectors for supervised "
+         "learning ["
       << pretrainedVectors << "]\n"
       << "  -saveOutput         whether output params should be saved ["
-      << boolToString(saveOutput) << "]\n";
+      << boolToString(saveOutput) << "]\n"
+      << "  -seed               random generator seed  [" << seed << "]\n";
+}
+void Args::printAutotuneHelp() {
+  std::cerr << "\nThe following arguments are for autotune:\n"
+            << "  -autotune-validation            validation file to be used "
+               "for evaluation\n"
+            << "  -autotune-metric                metric objective {f1, "
+               "f1:labelname} ["
+            << autotuneMetric << "]\n"
+            << "  -autotune-predictions           number of predictions used "
+               "for evaluation  ["
+            << autotunePredictions << "]\n"
+            << "  -autotune-duration              maximum duration in seconds ["
+            << autotuneDuration << "]\n"
+            << "  -autotune-modelsize             constraint model file size ["
+            << autotuneModelSize << "] (empty = do not quantize)\n";
 }
 void Args::printQuantizationHelp() {
@@ -247,7 +312,8 @@ void Args::printQuantizationHelp() {
       << "\nThe following arguments for quantization are optional:\n"
       << "  -cutoff             number of words and ngrams to retain ["
       << cutoff << "]\n"
-      << "  -retrain            whether embeddings are finetuned if a cutoff is applied ["
+      << "  -retrain            whether embeddings are finetuned if a cutoff "
+         "is applied ["
       << boolToString(retrain) << "]\n"
       << "  -qnorm              whether the norm is quantized separately ["
       << boolToString(qnorm) << "]\n"
@@ -317,4 +383,111 @@ void Args::dump(std::ostream& out) const {
       << " " << t << std::endl;
 }
+bool Args::hasAutotune() const {
+  return !autotuneValidationFile.empty();
+}
+bool Args::isManual(const std::string& argName) const {
+  return (manualArgs_.count(argName) != 0);
+}
+void Args::setManual(const std::string& argName) {
+  manualArgs_.emplace(argName);
+}
+metric_name Args::getAutotuneMetric() const {
+  if (autotuneMetric.substr(0, 3) == "f1:") {
+    return metric_name::f1scoreLabel;
+  } else if (autotuneMetric == "f1") {
+    return metric_name::f1score;
+  } else if (autotuneMetric.substr(0, 18) == "precisionAtRecall:") {
+    size_t semicolon = autotuneMetric.find(":", 18);
+    if (semicolon != std::string::npos) {
+      return metric_name::precisionAtRecallLabel;
+    }
+    return metric_name::precisionAtRecall;
+  } else if (autotuneMetric.substr(0, 18) == "recallAtPrecision:") {
+    size_t semicolon = autotuneMetric.find(":", 18);
+    if (semicolon != std::string::npos) {
+      return metric_name::recallAtPrecisionLabel;
+    }
+    return metric_name::recallAtPrecision;
+  }
+  throw std::runtime_error("Unknown metric : " + autotuneMetric);
+}
+std::string Args::getAutotuneMetricLabel() const {
+  metric_name metric = getAutotuneMetric();
+  std::string label;
+  if (metric == metric_name::f1scoreLabel) {
+    label = autotuneMetric.substr(3);
+  } else if (
+      metric == metric_name::precisionAtRecallLabel ||
+      metric == metric_name::recallAtPrecisionLabel) {
+    size_t semicolon = autotuneMetric.find(":", 18);
+    label = autotuneMetric.substr(semicolon + 1);
+  } else {
+    return label;
+  }
+  if (label.empty()) {
+    throw std::runtime_error("Empty metric label : " + autotuneMetric);
+  }
+  return label;
+}
+double Args::getAutotuneMetricValue() const {
+  metric_name metric = getAutotuneMetric();
+  double value = 0.0;
+  if (metric == metric_name::precisionAtRecallLabel ||
+      metric == metric_name::precisionAtRecall ||
+      metric == metric_name::recallAtPrecisionLabel ||
+      metric == metric_name::recallAtPrecision) {
+    size_t firstSemicolon = 18; // semicolon position in "precisionAtRecall:"
+    size_t secondSemicolon = autotuneMetric.find(":", firstSemicolon);
+    const std::string valueStr =
+        autotuneMetric.substr(firstSemicolon, secondSemicolon - firstSemicolon);
+    value = std::stof(valueStr) / 100.0;
+  }
+  return value;
+}
+int64_t Args::getAutotuneModelSize() const {
+  std::string modelSize = autotuneModelSize;
+  if (modelSize.empty()) {
+    return Args::kUnlimitedModelSize;
+  }
+  std::unordered_map<char, int> units = {
+      {'k', 1000},
+      {'K', 1000},
+      {'m', 1000000},
+      {'M', 1000000},
+      {'g', 1000000000},
+      {'G', 1000000000},
+  };
+  uint64_t multiplier = 1;
+  char lastCharacter = modelSize.back();
+  if (units.count(lastCharacter)) {
+    multiplier = units[lastCharacter];
+    modelSize = modelSize.substr(0, modelSize.size() - 1);
+  }
+  uint64_t size = 0;
+  size_t nonNumericCharacter = 0;
+  bool parseError = false;
+  try {
+    size = std::stol(modelSize, &nonNumericCharacter);
+  } catch (std::invalid_argument&) {
+    parseError = true;
+  }
+  if (!parseError && nonNumericCharacter != modelSize.size()) {
+    parseError = true;
+  }
+  if (parseError) {
+    throw std::invalid_argument(
+        "Unable to parse model size " + autotuneModelSize);
+  }
+  return size * multiplier;
+}
 } // namespace fasttext

data/vendor/fastText/src/args.h CHANGED

@@ -11,18 +11,28 @@
 #include <istream>
 #include <ostream>
 #include <string>
+#include <unordered_set>
 #include <vector>
 namespace fasttext {
 enum class model_name : int { cbow = 1, sg, sup };
 enum class loss_name : int { hs = 1, ns, softmax, ova };
+enum class metric_name : int {
+  f1score = 1,
+  f1scoreLabel,
+  precisionAtRecall,
+  precisionAtRecallLabel,
+  recallAtPrecision,
+  recallAtPrecisionLabel
+};
 class Args {
  protected:
-  std::string lossToString(loss_name) const;
   std::string boolToString(bool) const;
   std::string modelToString(model_name) const;
+  std::string metricToString(metric_name) const;
+  std::unordered_set<std::string> manualArgs_;
  public:
   Args();
@@ -48,6 +58,7 @@ class Args {
   int verbose;
   std::string pretrainedVectors;
   bool saveOutput;
+  int seed;
   bool qout;
   bool retrain;
@@ -55,14 +66,31 @@ class Args {
   size_t cutoff;
   size_t dsub;
+  std::string autotuneValidationFile;
+  std::string autotuneMetric;
+  int autotunePredictions;
+  int autotuneDuration;
+  std::string autotuneModelSize;
   void parseArgs(const std::vector<std::string>& args);
   void printHelp();
   void printBasicHelp();
   void printDictionaryHelp();
   void printTrainingHelp();
+  void printAutotuneHelp();
   void printQuantizationHelp();
   void save(std::ostream&);
   void load(std::istream&);
   void dump(std::ostream&) const;
+  bool hasAutotune() const;
+  bool isManual(const std::string& argName) const;
+  void setManual(const std::string& argName);
+  std::string lossToString(loss_name) const;
+  metric_name getAutotuneMetric() const;
+  std::string getAutotuneMetricLabel() const;
+  double getAutotuneMetricValue() const;
+  int64_t getAutotuneModelSize() const;
+  static constexpr double kUnlimitedModelSize = -1.0;
 };
 } // namespace fasttext