fasttext 0.2.0 → 0.2.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1bb25234ba16e6af2a30087522d498984a933274d24719be58cd348f6f456c11
4
- data.tar.gz: 2ec68073ce633dae21077be74264f686153abff5431d850c5cf87aa26304b299
3
+ metadata.gz: 2d8ffe46ec8c99cd82a51bae25a0459e80c0cae15cb659bdaf14bf2c2f7101db
4
+ data.tar.gz: 9dc3c29570f263f940b9d45e0bee85768f594283bb1b7fb1a50df2edf749cf62
5
5
  SHA512:
6
- metadata.gz: '058233f6d6dcfc698b4e00f940d927962c28bef1aa51048ffdfb969accadcaf0616fbc3353c56c35e6609f1324c5d2ba469c4ba242b8a87daf60fd2b1fa7a35b'
7
- data.tar.gz: e2bf6042ddc218398584b399acdde2ad89ece74e3009ac5348981e4ce958057233b10b5a5a504cb2e316c4a03fb4595fea524e3af0f30d787a1fb689885f6d5d
6
+ metadata.gz: 6cd42f0002111aa1a77531d514a35e7d9490e7be903a3b1051b0b4070b2da4daf8a6b28200abe4ff5c5cbc55abb4c914511c5a311e4070656d98587b660186d0
7
+ data.tar.gz: 8cbdf4eee1d98951222ab1e047c06d97edb0822d69b7260f7a97819b1cd3f043bc87b4dd9680907e94bd79d3ba420cddf324d46df9fe0d90e8e9a7e675e83b4d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ## 0.2.4 (2022-01-16)
2
+
3
+ - Improved ARM detection
4
+
5
+ ## 0.2.3 (2021-11-15)
6
+
7
+ - Fixed installation error with ARM Mac
8
+
9
+ ## 0.2.2 (2021-10-16)
10
+
11
+ - Fixed `file cannot be opened` errors
12
+
13
+ ## 0.2.1 (2021-05-23)
14
+
15
+ - Improved performance
16
+
1
17
  ## 0.2.0 (2021-05-17)
2
18
 
3
19
  - Updated to Rice 4
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  MIT License
2
2
 
3
3
  Copyright (c) 2016-present, Facebook, Inc.
4
- Copyright (c) 2019-2021 Andrew Kane
4
+ Copyright (c) 2019-2022 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
- # fastText
1
+ # fastText Ruby
2
2
 
3
3
  [fastText](https://fasttext.cc) - efficient text classification and representation learning - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/fastText/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText/actions)
5
+ [![Build Status](https://github.com/ankane/fastText-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -256,22 +256,22 @@ model.predict("bon appétit")
256
256
 
257
257
  ## History
258
258
 
259
- View the [changelog](https://github.com/ankane/fasttext/blob/master/CHANGELOG.md)
259
+ View the [changelog](https://github.com/ankane/fastText-ruby/blob/master/CHANGELOG.md)
260
260
 
261
261
  ## Contributing
262
262
 
263
263
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
264
264
 
265
- - [Report bugs](https://github.com/ankane/fasttext/issues)
266
- - Fix bugs and [submit pull requests](https://github.com/ankane/fasttext/pulls)
265
+ - [Report bugs](https://github.com/ankane/fastText-ruby/issues)
266
+ - Fix bugs and [submit pull requests](https://github.com/ankane/fastText-ruby/pulls)
267
267
  - Write, clarify, or fix documentation
268
268
  - Suggest or add new features
269
269
 
270
270
  To get started with development:
271
271
 
272
272
  ```sh
273
- git clone --recursive https://github.com/ankane/fastText.git
274
- cd fastText
273
+ git clone --recursive https://github.com/ankane/fastText-ruby.git
274
+ cd fastText-ruby
275
275
  bundle install
276
276
  bundle exec rake compile
277
277
  bundle exec rake test
data/ext/fasttext/ext.cpp CHANGED
@@ -16,13 +16,12 @@
16
16
  #include <rice/rice.hpp>
17
17
  #include <rice/stl.hpp>
18
18
 
19
+ using fasttext::Args;
19
20
  using fasttext::FastText;
20
21
 
21
22
  using Rice::Array;
22
23
  using Rice::Constructor;
23
- using Rice::Hash;
24
24
  using Rice::Module;
25
- using Rice::Object;
26
25
  using Rice::define_class_under;
27
26
  using Rice::define_module;
28
27
  using Rice::define_module_under;
@@ -47,107 +46,69 @@ namespace Rice::detail
47
46
  };
48
47
  }
49
48
 
50
- fasttext::Args buildArgs(Hash h) {
51
- fasttext::Args a;
52
-
53
- std::vector<Hash::Entry> v;
54
- Hash::iterator it = h.begin();
55
- Hash::iterator end = h.end();
56
-
57
- for(; it != end; ++it)
58
- {
59
- std::string name = it->key.to_s().str();
60
- VALUE value = (it->value).value();
61
-
62
- if (name == "input") {
63
- a.input = Rice::detail::From_Ruby<std::string>().convert(value);
64
- } else if (name == "output") {
65
- a.output = Rice::detail::From_Ruby<std::string>().convert(value);
66
- } else if (name == "lr") {
67
- a.lr = Rice::detail::From_Ruby<double>().convert(value);
68
- } else if (name == "lr_update_rate") {
69
- a.lrUpdateRate = Rice::detail::From_Ruby<int>().convert(value);
70
- } else if (name == "dim") {
71
- a.dim = Rice::detail::From_Ruby<int>().convert(value);
72
- } else if (name == "ws") {
73
- a.ws = Rice::detail::From_Ruby<int>().convert(value);
74
- } else if (name == "epoch") {
75
- a.epoch = Rice::detail::From_Ruby<int>().convert(value);
76
- } else if (name == "min_count") {
77
- a.minCount = Rice::detail::From_Ruby<int>().convert(value);
78
- } else if (name == "min_count_label") {
79
- a.minCountLabel = Rice::detail::From_Ruby<int>().convert(value);
80
- } else if (name == "neg") {
81
- a.neg = Rice::detail::From_Ruby<int>().convert(value);
82
- } else if (name == "word_ngrams") {
83
- a.wordNgrams = Rice::detail::From_Ruby<int>().convert(value);
84
- } else if (name == "loss") {
85
- std::string str = Rice::detail::From_Ruby<std::string>().convert(value);
86
- if (str == "softmax") {
87
- a.loss = fasttext::loss_name::softmax;
88
- } else if (str == "ns") {
89
- a.loss = fasttext::loss_name::ns;
90
- } else if (str == "hs") {
91
- a.loss = fasttext::loss_name::hs;
92
- } else if (str == "ova") {
93
- a.loss = fasttext::loss_name::ova;
94
- } else {
95
- throw std::invalid_argument("Unknown loss: " + str);
96
- }
97
- } else if (name == "model") {
98
- std::string str = Rice::detail::From_Ruby<std::string>().convert(value);
99
- if (str == "supervised") {
100
- a.model = fasttext::model_name::sup;
101
- } else if (str == "skipgram") {
102
- a.model = fasttext::model_name::sg;
103
- } else if (str == "cbow") {
104
- a.model = fasttext::model_name::cbow;
105
- } else {
106
- throw std::invalid_argument("Unknown model: " + str);
107
- }
108
- } else if (name == "bucket") {
109
- a.bucket = Rice::detail::From_Ruby<int>().convert(value);
110
- } else if (name == "minn") {
111
- a.minn = Rice::detail::From_Ruby<int>().convert(value);
112
- } else if (name == "maxn") {
113
- a.maxn = Rice::detail::From_Ruby<int>().convert(value);
114
- } else if (name == "thread") {
115
- a.thread = Rice::detail::From_Ruby<int>().convert(value);
116
- } else if (name == "t") {
117
- a.t = Rice::detail::From_Ruby<double>().convert(value);
118
- } else if (name == "label_prefix") {
119
- a.label = Rice::detail::From_Ruby<std::string>().convert(value);
120
- } else if (name == "verbose") {
121
- a.verbose = Rice::detail::From_Ruby<int>().convert(value);
122
- } else if (name == "pretrained_vectors") {
123
- a.pretrainedVectors = Rice::detail::From_Ruby<std::string>().convert(value);
124
- } else if (name == "save_output") {
125
- a.saveOutput = Rice::detail::From_Ruby<bool>().convert(value);
126
- } else if (name == "seed") {
127
- a.seed = Rice::detail::From_Ruby<int>().convert(value);
128
- } else if (name == "autotune_validation_file") {
129
- a.autotuneValidationFile = Rice::detail::From_Ruby<std::string>().convert(value);
130
- } else if (name == "autotune_metric") {
131
- a.autotuneMetric = Rice::detail::From_Ruby<std::string>().convert(value);
132
- } else if (name == "autotune_predictions") {
133
- a.autotunePredictions = Rice::detail::From_Ruby<int>().convert(value);
134
- } else if (name == "autotune_duration") {
135
- a.autotuneDuration = Rice::detail::From_Ruby<int>().convert(value);
136
- } else if (name == "autotune_model_size") {
137
- a.autotuneModelSize = Rice::detail::From_Ruby<std::string>().convert(value);
138
- } else {
139
- throw std::invalid_argument("Unknown argument: " + name);
140
- }
141
- }
142
- return a;
143
- }
144
-
145
49
  extern "C"
146
50
  void Init_ext()
147
51
  {
148
52
  Module rb_mFastText = define_module("FastText");
149
53
  Module rb_mExt = define_module_under(rb_mFastText, "Ext");
150
54
 
55
+ define_class_under<Args>(rb_mExt, "Args")
56
+ .define_constructor(Constructor<Args>())
57
+ .define_attr("input", &Args::input)
58
+ .define_attr("output", &Args::output)
59
+ .define_attr("lr", &Args::lr)
60
+ .define_attr("lr_update_rate", &Args::lrUpdateRate)
61
+ .define_attr("dim", &Args::dim)
62
+ .define_attr("ws", &Args::ws)
63
+ .define_attr("epoch", &Args::epoch)
64
+ .define_attr("min_count", &Args::minCount)
65
+ .define_attr("min_count_label", &Args::minCountLabel)
66
+ .define_attr("neg", &Args::neg)
67
+ .define_attr("word_ngrams", &Args::wordNgrams)
68
+ .define_method(
69
+ "loss=",
70
+ [](Args& a, const std::string& str) {
71
+ if (str == "softmax") {
72
+ a.loss = fasttext::loss_name::softmax;
73
+ } else if (str == "ns") {
74
+ a.loss = fasttext::loss_name::ns;
75
+ } else if (str == "hs") {
76
+ a.loss = fasttext::loss_name::hs;
77
+ } else if (str == "ova") {
78
+ a.loss = fasttext::loss_name::ova;
79
+ } else {
80
+ throw std::invalid_argument("Unknown loss: " + str);
81
+ }
82
+ })
83
+ .define_method(
84
+ "model=",
85
+ [](Args& a, const std::string& str) {
86
+ if (str == "supervised") {
87
+ a.model = fasttext::model_name::sup;
88
+ } else if (str == "skipgram") {
89
+ a.model = fasttext::model_name::sg;
90
+ } else if (str == "cbow") {
91
+ a.model = fasttext::model_name::cbow;
92
+ } else {
93
+ throw std::invalid_argument("Unknown model: " + str);
94
+ }
95
+ })
96
+ .define_attr("bucket", &Args::bucket)
97
+ .define_attr("minn", &Args::minn)
98
+ .define_attr("maxn", &Args::maxn)
99
+ .define_attr("thread", &Args::thread)
100
+ .define_attr("t", &Args::t)
101
+ .define_attr("label_prefix", &Args::label)
102
+ .define_attr("verbose", &Args::verbose)
103
+ .define_attr("pretrained_vectors", &Args::pretrainedVectors)
104
+ .define_attr("save_output", &Args::saveOutput)
105
+ .define_attr("seed", &Args::seed)
106
+ .define_attr("autotune_validation_file", &Args::autotuneValidationFile)
107
+ .define_attr("autotune_metric", &Args::autotuneMetric)
108
+ .define_attr("autotune_predictions", &Args::autotunePredictions)
109
+ .define_attr("autotune_duration", &Args::autotuneDuration)
110
+ .define_attr("autotune_model_size", &Args::autotuneModelSize);
111
+
151
112
  define_class_under<FastText>(rb_mExt, "Model")
152
113
  .define_constructor(Constructor<FastText>())
153
114
  .define_method(
@@ -235,13 +196,12 @@ void Init_ext()
235
196
  .define_method(
236
197
  "word_vector",
237
198
  [](FastText& m, const std::string& word) {
238
- int dimension = m.getDimension();
199
+ auto dimension = m.getDimension();
239
200
  fasttext::Vector vec = fasttext::Vector(dimension);
240
201
  m.getWordVector(vec, word);
241
- float* data = vec.data();
242
202
  Array ret;
243
- for (int i = 0; i < dimension; i++) {
244
- ret.push(data[i]);
203
+ for (size_t i = 0; i < vec.size(); i++) {
204
+ ret.push(vec[i]);
245
205
  }
246
206
  return ret;
247
207
  })
@@ -263,20 +223,18 @@ void Init_ext()
263
223
  "sentence_vector",
264
224
  [](FastText& m, const std::string& text) {
265
225
  std::istringstream in(text);
266
- int dimension = m.getDimension();
226
+ auto dimension = m.getDimension();
267
227
  fasttext::Vector vec = fasttext::Vector(dimension);
268
228
  m.getSentenceVector(in, vec);
269
- float* data = vec.data();
270
229
  Array ret;
271
- for (int i = 0; i < dimension; i++) {
272
- ret.push(data[i]);
230
+ for (size_t i = 0; i < vec.size(); i++) {
231
+ ret.push(vec[i]);
273
232
  }
274
233
  return ret;
275
234
  })
276
235
  .define_method(
277
236
  "train",
278
- [](FastText& m, Hash h) {
279
- auto a = buildArgs(h);
237
+ [](FastText& m, Args& a) {
280
238
  if (a.hasAutotune()) {
281
239
  fasttext::Autotune autotune(std::shared_ptr<fasttext::FastText>(&m, [](fasttext::FastText*) {}));
282
240
  autotune.train(a);
@@ -286,8 +244,8 @@ void Init_ext()
286
244
  })
287
245
  .define_method(
288
246
  "quantize",
289
- [](FastText& m, Hash h) {
290
- m.quantize(buildArgs(h));
247
+ [](FastText& m, Args& a) {
248
+ m.quantize(a);
291
249
  })
292
250
  .define_method(
293
251
  "supervised?",
@@ -1,7 +1,9 @@
1
1
  require "mkmf-rice"
2
2
 
3
+ # -march=native not supported with ARM Mac
4
+ default_optflags = RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i ? "" : "-march=native"
3
5
  # -pthread and -O3 set by default
4
- $CXXFLAGS << " -std=c++17 -funroll-loops " << with_config("optflags", "-march=native")
6
+ $CXXFLAGS << " -std=c++17 $(optflags) -funroll-loops " << with_config("optflags", default_optflags)
5
7
 
6
8
  ext = File.expand_path(".", __dir__)
7
9
  fasttext = File.expand_path("../../vendor/fastText/src", __dir__)
@@ -30,14 +30,16 @@ module FastText
30
30
  }
31
31
 
32
32
  def fit(x, y = nil, autotune_set: nil)
33
- input = input_path(x, y)
33
+ input, _ref = input_path(x, y)
34
34
  @m ||= Ext::Model.new
35
- opts = DEFAULT_OPTIONS.merge(@options).merge(input: input, model: "supervised")
35
+ a = build_args(DEFAULT_OPTIONS)
36
+ a.input = input
37
+ a.model = "supervised"
36
38
  if autotune_set
37
39
  x, y = autotune_set
38
- opts.merge!(autotune_validation_file: input_path(x, y))
40
+ a.autotune_validation_file, _autotune_ref = input_path(x, y)
39
41
  end
40
- m.train(opts)
42
+ m.train(a)
41
43
  end
42
44
 
43
45
  def predict(text, k: 1, threshold: 0.0)
@@ -47,16 +49,16 @@ module FastText
47
49
  # TODO predict multiple in C++ for performance
48
50
  result =
49
51
  text.map do |t|
50
- m.predict(prep_text(t), k, threshold).map do |v|
52
+ m.predict(prep_text(t), k, threshold).to_h do |v|
51
53
  [remove_prefix(v[1]), v[0]]
52
- end.to_h
54
+ end
53
55
  end
54
56
 
55
57
  multiple ? result : result.first
56
58
  end
57
59
 
58
60
  def test(x, y = nil, k: 1)
59
- input = input_path(x, y)
61
+ input, _ref = input_path(x, y)
60
62
  res = m.test(input, k)
61
63
  {
62
64
  examples: res[0],
@@ -67,7 +69,8 @@ module FastText
67
69
 
68
70
  # TODO support options
69
71
  def quantize
70
- m.quantize({})
72
+ a = Ext::Args.new
73
+ m.quantize(a)
71
74
  end
72
75
 
73
76
  def labels(include_freq: false)
@@ -85,7 +88,7 @@ module FastText
85
88
  def input_path(x, y)
86
89
  if x.is_a?(String)
87
90
  raise ArgumentError, "Cannot pass y with file" if y
88
- x
91
+ [x, nil]
89
92
  else
90
93
  tempfile = Tempfile.new("fasttext")
91
94
  x.zip(y) do |xi, yi|
@@ -95,7 +98,7 @@ module FastText
95
98
  tempfile.write("\n")
96
99
  end
97
100
  tempfile.close
98
- tempfile.path
101
+ [tempfile.path, tempfile]
99
102
  end
100
103
  end
101
104
 
@@ -56,5 +56,15 @@ module FastText
56
56
  def m
57
57
  @m || (raise Error, "Not fit")
58
58
  end
59
+
60
+ def build_args(default_options)
61
+ a = Ext::Args.new
62
+ opts = @options.dup
63
+ default_options.each do |k, v|
64
+ a.send("#{k}=", opts.delete(k) || v)
65
+ end
66
+ raise ArgumentError, "Unknown argument: #{opts.keys.first}" if opts.any?
67
+ a
68
+ end
59
69
  end
60
70
  end
@@ -29,9 +29,10 @@ module FastText
29
29
  }
30
30
 
31
31
  def fit(x)
32
- input = input_path(x)
33
32
  @m ||= Ext::Model.new
34
- m.train(DEFAULT_OPTIONS.merge(@options).merge(input: input))
33
+ a = build_args(DEFAULT_OPTIONS)
34
+ a.input, _ref = input_path(x)
35
+ m.train(a)
35
36
  end
36
37
 
37
38
  def nearest_neighbors(word, k: 10)
@@ -48,7 +49,7 @@ module FastText
48
49
  # https://github.com/facebookresearch/fastText/issues/518
49
50
  def input_path(x)
50
51
  if x.is_a?(String)
51
- x
52
+ [x, nil]
52
53
  else
53
54
  tempfile = Tempfile.new("fasttext")
54
55
  x.each do |xi|
@@ -56,7 +57,7 @@ module FastText
56
57
  tempfile.write("\n")
57
58
  end
58
59
  tempfile.close
59
- tempfile.path
60
+ [tempfile.path, tempfile]
60
61
  end
61
62
  end
62
63
  end
@@ -1,3 +1,3 @@
1
1
  module FastText
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fasttext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-05-17 00:00:00.000000000 Z
11
+ date: 2022-01-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -71,7 +71,7 @@ files:
71
71
  - vendor/fastText/src/utils.h
72
72
  - vendor/fastText/src/vector.cc
73
73
  - vendor/fastText/src/vector.h
74
- homepage: https://github.com/ankane/fastText
74
+ homepage: https://github.com/ankane/fastText-ruby
75
75
  licenses:
76
76
  - MIT
77
77
  metadata: {}
@@ -90,9 +90,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
90
  - !ruby/object:Gem::Version
91
91
  version: '0'
92
92
  requirements: []
93
- rubygems_version: 3.2.3
93
+ rubygems_version: 3.3.3
94
94
  signing_key:
95
95
  specification_version: 4
96
- summary: fastText - efficient text classification and representation learning - for
97
- Ruby
96
+ summary: Efficient text classification and representation learning for Ruby
98
97
  test_files: []