fasttext 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1bb25234ba16e6af2a30087522d498984a933274d24719be58cd348f6f456c11
4
- data.tar.gz: 2ec68073ce633dae21077be74264f686153abff5431d850c5cf87aa26304b299
3
+ metadata.gz: 2d8ffe46ec8c99cd82a51bae25a0459e80c0cae15cb659bdaf14bf2c2f7101db
4
+ data.tar.gz: 9dc3c29570f263f940b9d45e0bee85768f594283bb1b7fb1a50df2edf749cf62
5
5
  SHA512:
6
- metadata.gz: '058233f6d6dcfc698b4e00f940d927962c28bef1aa51048ffdfb969accadcaf0616fbc3353c56c35e6609f1324c5d2ba469c4ba242b8a87daf60fd2b1fa7a35b'
7
- data.tar.gz: e2bf6042ddc218398584b399acdde2ad89ece74e3009ac5348981e4ce958057233b10b5a5a504cb2e316c4a03fb4595fea524e3af0f30d787a1fb689885f6d5d
6
+ metadata.gz: 6cd42f0002111aa1a77531d514a35e7d9490e7be903a3b1051b0b4070b2da4daf8a6b28200abe4ff5c5cbc55abb4c914511c5a311e4070656d98587b660186d0
7
+ data.tar.gz: 8cbdf4eee1d98951222ab1e047c06d97edb0822d69b7260f7a97819b1cd3f043bc87b4dd9680907e94bd79d3ba420cddf324d46df9fe0d90e8e9a7e675e83b4d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,19 @@
1
+ ## 0.2.4 (2022-01-16)
2
+
3
+ - Improved ARM detection
4
+
5
+ ## 0.2.3 (2021-11-15)
6
+
7
+ - Fixed installation error with ARM Mac
8
+
9
+ ## 0.2.2 (2021-10-16)
10
+
11
+ - Fixed `file cannot be opened` errors
12
+
13
+ ## 0.2.1 (2021-05-23)
14
+
15
+ - Improved performance
16
+
1
17
  ## 0.2.0 (2021-05-17)
2
18
 
3
19
  - Updated to Rice 4
data/LICENSE.txt CHANGED
@@ -1,7 +1,7 @@
1
1
  MIT License
2
2
 
3
3
  Copyright (c) 2016-present, Facebook, Inc.
4
- Copyright (c) 2019-2021 Andrew Kane
4
+ Copyright (c) 2019-2022 Andrew Kane
5
5
 
6
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
7
7
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
- # fastText
1
+ # fastText Ruby
2
2
 
3
3
  [fastText](https://fasttext.cc) - efficient text classification and representation learning - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/fastText/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText/actions)
5
+ [![Build Status](https://github.com/ankane/fastText-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/fastText-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -256,22 +256,22 @@ model.predict("bon appétit")
256
256
 
257
257
  ## History
258
258
 
259
- View the [changelog](https://github.com/ankane/fasttext/blob/master/CHANGELOG.md)
259
+ View the [changelog](https://github.com/ankane/fastText-ruby/blob/master/CHANGELOG.md)
260
260
 
261
261
  ## Contributing
262
262
 
263
263
  Everyone is encouraged to help improve this project. Here are a few ways you can help:
264
264
 
265
- - [Report bugs](https://github.com/ankane/fasttext/issues)
266
- - Fix bugs and [submit pull requests](https://github.com/ankane/fasttext/pulls)
265
+ - [Report bugs](https://github.com/ankane/fastText-ruby/issues)
266
+ - Fix bugs and [submit pull requests](https://github.com/ankane/fastText-ruby/pulls)
267
267
  - Write, clarify, or fix documentation
268
268
  - Suggest or add new features
269
269
 
270
270
  To get started with development:
271
271
 
272
272
  ```sh
273
- git clone --recursive https://github.com/ankane/fastText.git
274
- cd fastText
273
+ git clone --recursive https://github.com/ankane/fastText-ruby.git
274
+ cd fastText-ruby
275
275
  bundle install
276
276
  bundle exec rake compile
277
277
  bundle exec rake test
data/ext/fasttext/ext.cpp CHANGED
@@ -16,13 +16,12 @@
16
16
  #include <rice/rice.hpp>
17
17
  #include <rice/stl.hpp>
18
18
 
19
+ using fasttext::Args;
19
20
  using fasttext::FastText;
20
21
 
21
22
  using Rice::Array;
22
23
  using Rice::Constructor;
23
- using Rice::Hash;
24
24
  using Rice::Module;
25
- using Rice::Object;
26
25
  using Rice::define_class_under;
27
26
  using Rice::define_module;
28
27
  using Rice::define_module_under;
@@ -47,107 +46,69 @@ namespace Rice::detail
47
46
  };
48
47
  }
49
48
 
50
- fasttext::Args buildArgs(Hash h) {
51
- fasttext::Args a;
52
-
53
- std::vector<Hash::Entry> v;
54
- Hash::iterator it = h.begin();
55
- Hash::iterator end = h.end();
56
-
57
- for(; it != end; ++it)
58
- {
59
- std::string name = it->key.to_s().str();
60
- VALUE value = (it->value).value();
61
-
62
- if (name == "input") {
63
- a.input = Rice::detail::From_Ruby<std::string>().convert(value);
64
- } else if (name == "output") {
65
- a.output = Rice::detail::From_Ruby<std::string>().convert(value);
66
- } else if (name == "lr") {
67
- a.lr = Rice::detail::From_Ruby<double>().convert(value);
68
- } else if (name == "lr_update_rate") {
69
- a.lrUpdateRate = Rice::detail::From_Ruby<int>().convert(value);
70
- } else if (name == "dim") {
71
- a.dim = Rice::detail::From_Ruby<int>().convert(value);
72
- } else if (name == "ws") {
73
- a.ws = Rice::detail::From_Ruby<int>().convert(value);
74
- } else if (name == "epoch") {
75
- a.epoch = Rice::detail::From_Ruby<int>().convert(value);
76
- } else if (name == "min_count") {
77
- a.minCount = Rice::detail::From_Ruby<int>().convert(value);
78
- } else if (name == "min_count_label") {
79
- a.minCountLabel = Rice::detail::From_Ruby<int>().convert(value);
80
- } else if (name == "neg") {
81
- a.neg = Rice::detail::From_Ruby<int>().convert(value);
82
- } else if (name == "word_ngrams") {
83
- a.wordNgrams = Rice::detail::From_Ruby<int>().convert(value);
84
- } else if (name == "loss") {
85
- std::string str = Rice::detail::From_Ruby<std::string>().convert(value);
86
- if (str == "softmax") {
87
- a.loss = fasttext::loss_name::softmax;
88
- } else if (str == "ns") {
89
- a.loss = fasttext::loss_name::ns;
90
- } else if (str == "hs") {
91
- a.loss = fasttext::loss_name::hs;
92
- } else if (str == "ova") {
93
- a.loss = fasttext::loss_name::ova;
94
- } else {
95
- throw std::invalid_argument("Unknown loss: " + str);
96
- }
97
- } else if (name == "model") {
98
- std::string str = Rice::detail::From_Ruby<std::string>().convert(value);
99
- if (str == "supervised") {
100
- a.model = fasttext::model_name::sup;
101
- } else if (str == "skipgram") {
102
- a.model = fasttext::model_name::sg;
103
- } else if (str == "cbow") {
104
- a.model = fasttext::model_name::cbow;
105
- } else {
106
- throw std::invalid_argument("Unknown model: " + str);
107
- }
108
- } else if (name == "bucket") {
109
- a.bucket = Rice::detail::From_Ruby<int>().convert(value);
110
- } else if (name == "minn") {
111
- a.minn = Rice::detail::From_Ruby<int>().convert(value);
112
- } else if (name == "maxn") {
113
- a.maxn = Rice::detail::From_Ruby<int>().convert(value);
114
- } else if (name == "thread") {
115
- a.thread = Rice::detail::From_Ruby<int>().convert(value);
116
- } else if (name == "t") {
117
- a.t = Rice::detail::From_Ruby<double>().convert(value);
118
- } else if (name == "label_prefix") {
119
- a.label = Rice::detail::From_Ruby<std::string>().convert(value);
120
- } else if (name == "verbose") {
121
- a.verbose = Rice::detail::From_Ruby<int>().convert(value);
122
- } else if (name == "pretrained_vectors") {
123
- a.pretrainedVectors = Rice::detail::From_Ruby<std::string>().convert(value);
124
- } else if (name == "save_output") {
125
- a.saveOutput = Rice::detail::From_Ruby<bool>().convert(value);
126
- } else if (name == "seed") {
127
- a.seed = Rice::detail::From_Ruby<int>().convert(value);
128
- } else if (name == "autotune_validation_file") {
129
- a.autotuneValidationFile = Rice::detail::From_Ruby<std::string>().convert(value);
130
- } else if (name == "autotune_metric") {
131
- a.autotuneMetric = Rice::detail::From_Ruby<std::string>().convert(value);
132
- } else if (name == "autotune_predictions") {
133
- a.autotunePredictions = Rice::detail::From_Ruby<int>().convert(value);
134
- } else if (name == "autotune_duration") {
135
- a.autotuneDuration = Rice::detail::From_Ruby<int>().convert(value);
136
- } else if (name == "autotune_model_size") {
137
- a.autotuneModelSize = Rice::detail::From_Ruby<std::string>().convert(value);
138
- } else {
139
- throw std::invalid_argument("Unknown argument: " + name);
140
- }
141
- }
142
- return a;
143
- }
144
-
145
49
  extern "C"
146
50
  void Init_ext()
147
51
  {
148
52
  Module rb_mFastText = define_module("FastText");
149
53
  Module rb_mExt = define_module_under(rb_mFastText, "Ext");
150
54
 
55
+ define_class_under<Args>(rb_mExt, "Args")
56
+ .define_constructor(Constructor<Args>())
57
+ .define_attr("input", &Args::input)
58
+ .define_attr("output", &Args::output)
59
+ .define_attr("lr", &Args::lr)
60
+ .define_attr("lr_update_rate", &Args::lrUpdateRate)
61
+ .define_attr("dim", &Args::dim)
62
+ .define_attr("ws", &Args::ws)
63
+ .define_attr("epoch", &Args::epoch)
64
+ .define_attr("min_count", &Args::minCount)
65
+ .define_attr("min_count_label", &Args::minCountLabel)
66
+ .define_attr("neg", &Args::neg)
67
+ .define_attr("word_ngrams", &Args::wordNgrams)
68
+ .define_method(
69
+ "loss=",
70
+ [](Args& a, const std::string& str) {
71
+ if (str == "softmax") {
72
+ a.loss = fasttext::loss_name::softmax;
73
+ } else if (str == "ns") {
74
+ a.loss = fasttext::loss_name::ns;
75
+ } else if (str == "hs") {
76
+ a.loss = fasttext::loss_name::hs;
77
+ } else if (str == "ova") {
78
+ a.loss = fasttext::loss_name::ova;
79
+ } else {
80
+ throw std::invalid_argument("Unknown loss: " + str);
81
+ }
82
+ })
83
+ .define_method(
84
+ "model=",
85
+ [](Args& a, const std::string& str) {
86
+ if (str == "supervised") {
87
+ a.model = fasttext::model_name::sup;
88
+ } else if (str == "skipgram") {
89
+ a.model = fasttext::model_name::sg;
90
+ } else if (str == "cbow") {
91
+ a.model = fasttext::model_name::cbow;
92
+ } else {
93
+ throw std::invalid_argument("Unknown model: " + str);
94
+ }
95
+ })
96
+ .define_attr("bucket", &Args::bucket)
97
+ .define_attr("minn", &Args::minn)
98
+ .define_attr("maxn", &Args::maxn)
99
+ .define_attr("thread", &Args::thread)
100
+ .define_attr("t", &Args::t)
101
+ .define_attr("label_prefix", &Args::label)
102
+ .define_attr("verbose", &Args::verbose)
103
+ .define_attr("pretrained_vectors", &Args::pretrainedVectors)
104
+ .define_attr("save_output", &Args::saveOutput)
105
+ .define_attr("seed", &Args::seed)
106
+ .define_attr("autotune_validation_file", &Args::autotuneValidationFile)
107
+ .define_attr("autotune_metric", &Args::autotuneMetric)
108
+ .define_attr("autotune_predictions", &Args::autotunePredictions)
109
+ .define_attr("autotune_duration", &Args::autotuneDuration)
110
+ .define_attr("autotune_model_size", &Args::autotuneModelSize);
111
+
151
112
  define_class_under<FastText>(rb_mExt, "Model")
152
113
  .define_constructor(Constructor<FastText>())
153
114
  .define_method(
@@ -235,13 +196,12 @@ void Init_ext()
235
196
  .define_method(
236
197
  "word_vector",
237
198
  [](FastText& m, const std::string& word) {
238
- int dimension = m.getDimension();
199
+ auto dimension = m.getDimension();
239
200
  fasttext::Vector vec = fasttext::Vector(dimension);
240
201
  m.getWordVector(vec, word);
241
- float* data = vec.data();
242
202
  Array ret;
243
- for (int i = 0; i < dimension; i++) {
244
- ret.push(data[i]);
203
+ for (size_t i = 0; i < vec.size(); i++) {
204
+ ret.push(vec[i]);
245
205
  }
246
206
  return ret;
247
207
  })
@@ -263,20 +223,18 @@ void Init_ext()
263
223
  "sentence_vector",
264
224
  [](FastText& m, const std::string& text) {
265
225
  std::istringstream in(text);
266
- int dimension = m.getDimension();
226
+ auto dimension = m.getDimension();
267
227
  fasttext::Vector vec = fasttext::Vector(dimension);
268
228
  m.getSentenceVector(in, vec);
269
- float* data = vec.data();
270
229
  Array ret;
271
- for (int i = 0; i < dimension; i++) {
272
- ret.push(data[i]);
230
+ for (size_t i = 0; i < vec.size(); i++) {
231
+ ret.push(vec[i]);
273
232
  }
274
233
  return ret;
275
234
  })
276
235
  .define_method(
277
236
  "train",
278
- [](FastText& m, Hash h) {
279
- auto a = buildArgs(h);
237
+ [](FastText& m, Args& a) {
280
238
  if (a.hasAutotune()) {
281
239
  fasttext::Autotune autotune(std::shared_ptr<fasttext::FastText>(&m, [](fasttext::FastText*) {}));
282
240
  autotune.train(a);
@@ -286,8 +244,8 @@ void Init_ext()
286
244
  })
287
245
  .define_method(
288
246
  "quantize",
289
- [](FastText& m, Hash h) {
290
- m.quantize(buildArgs(h));
247
+ [](FastText& m, Args& a) {
248
+ m.quantize(a);
291
249
  })
292
250
  .define_method(
293
251
  "supervised?",
@@ -1,7 +1,9 @@
1
1
  require "mkmf-rice"
2
2
 
3
+ # -march=native not supported with ARM Mac
4
+ default_optflags = RbConfig::CONFIG["host_os"] =~ /darwin/i && RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i ? "" : "-march=native"
3
5
  # -pthread and -O3 set by default
4
- $CXXFLAGS << " -std=c++17 -funroll-loops " << with_config("optflags", "-march=native")
6
+ $CXXFLAGS << " -std=c++17 $(optflags) -funroll-loops " << with_config("optflags", default_optflags)
5
7
 
6
8
  ext = File.expand_path(".", __dir__)
7
9
  fasttext = File.expand_path("../../vendor/fastText/src", __dir__)
@@ -30,14 +30,16 @@ module FastText
30
30
  }
31
31
 
32
32
  def fit(x, y = nil, autotune_set: nil)
33
- input = input_path(x, y)
33
+ input, _ref = input_path(x, y)
34
34
  @m ||= Ext::Model.new
35
- opts = DEFAULT_OPTIONS.merge(@options).merge(input: input, model: "supervised")
35
+ a = build_args(DEFAULT_OPTIONS)
36
+ a.input = input
37
+ a.model = "supervised"
36
38
  if autotune_set
37
39
  x, y = autotune_set
38
- opts.merge!(autotune_validation_file: input_path(x, y))
40
+ a.autotune_validation_file, _autotune_ref = input_path(x, y)
39
41
  end
40
- m.train(opts)
42
+ m.train(a)
41
43
  end
42
44
 
43
45
  def predict(text, k: 1, threshold: 0.0)
@@ -47,16 +49,16 @@ module FastText
47
49
  # TODO predict multiple in C++ for performance
48
50
  result =
49
51
  text.map do |t|
50
- m.predict(prep_text(t), k, threshold).map do |v|
52
+ m.predict(prep_text(t), k, threshold).to_h do |v|
51
53
  [remove_prefix(v[1]), v[0]]
52
- end.to_h
54
+ end
53
55
  end
54
56
 
55
57
  multiple ? result : result.first
56
58
  end
57
59
 
58
60
  def test(x, y = nil, k: 1)
59
- input = input_path(x, y)
61
+ input, _ref = input_path(x, y)
60
62
  res = m.test(input, k)
61
63
  {
62
64
  examples: res[0],
@@ -67,7 +69,8 @@ module FastText
67
69
 
68
70
  # TODO support options
69
71
  def quantize
70
- m.quantize({})
72
+ a = Ext::Args.new
73
+ m.quantize(a)
71
74
  end
72
75
 
73
76
  def labels(include_freq: false)
@@ -85,7 +88,7 @@ module FastText
85
88
  def input_path(x, y)
86
89
  if x.is_a?(String)
87
90
  raise ArgumentError, "Cannot pass y with file" if y
88
- x
91
+ [x, nil]
89
92
  else
90
93
  tempfile = Tempfile.new("fasttext")
91
94
  x.zip(y) do |xi, yi|
@@ -95,7 +98,7 @@ module FastText
95
98
  tempfile.write("\n")
96
99
  end
97
100
  tempfile.close
98
- tempfile.path
101
+ [tempfile.path, tempfile]
99
102
  end
100
103
  end
101
104
 
@@ -56,5 +56,15 @@ module FastText
56
56
  def m
57
57
  @m || (raise Error, "Not fit")
58
58
  end
59
+
60
+ def build_args(default_options)
61
+ a = Ext::Args.new
62
+ opts = @options.dup
63
+ default_options.each do |k, v|
64
+ a.send("#{k}=", opts.delete(k) || v)
65
+ end
66
+ raise ArgumentError, "Unknown argument: #{opts.keys.first}" if opts.any?
67
+ a
68
+ end
59
69
  end
60
70
  end
@@ -29,9 +29,10 @@ module FastText
29
29
  }
30
30
 
31
31
  def fit(x)
32
- input = input_path(x)
33
32
  @m ||= Ext::Model.new
34
- m.train(DEFAULT_OPTIONS.merge(@options).merge(input: input))
33
+ a = build_args(DEFAULT_OPTIONS)
34
+ a.input, _ref = input_path(x)
35
+ m.train(a)
35
36
  end
36
37
 
37
38
  def nearest_neighbors(word, k: 10)
@@ -48,7 +49,7 @@ module FastText
48
49
  # https://github.com/facebookresearch/fastText/issues/518
49
50
  def input_path(x)
50
51
  if x.is_a?(String)
51
- x
52
+ [x, nil]
52
53
  else
53
54
  tempfile = Tempfile.new("fasttext")
54
55
  x.each do |xi|
@@ -56,7 +57,7 @@ module FastText
56
57
  tempfile.write("\n")
57
58
  end
58
59
  tempfile.close
59
- tempfile.path
60
+ [tempfile.path, tempfile]
60
61
  end
61
62
  end
62
63
  end
@@ -1,3 +1,3 @@
1
1
  module FastText
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.4"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fasttext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-05-17 00:00:00.000000000 Z
11
+ date: 2022-01-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -71,7 +71,7 @@ files:
71
71
  - vendor/fastText/src/utils.h
72
72
  - vendor/fastText/src/vector.cc
73
73
  - vendor/fastText/src/vector.h
74
- homepage: https://github.com/ankane/fastText
74
+ homepage: https://github.com/ankane/fastText-ruby
75
75
  licenses:
76
76
  - MIT
77
77
  metadata: {}
@@ -90,9 +90,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
90
  - !ruby/object:Gem::Version
91
91
  version: '0'
92
92
  requirements: []
93
- rubygems_version: 3.2.3
93
+ rubygems_version: 3.3.3
94
94
  signing_key:
95
95
  specification_version: 4
96
- summary: fastText - efficient text classification and representation learning - for
97
- Ruby
96
+ summary: Efficient text classification and representation learning for Ruby
98
97
  test_files: []