fasttext 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/LICENSE.txt +1 -1
- data/README.md +3 -3
- data/ext/fasttext/ext.cpp +68 -66
- data/lib/fasttext/version.rb +1 -1
- data/lib/fasttext.rb +1 -1
- metadata +6 -10
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9978bfe50053f76326bfd7f97b7acb6ff7af67ca4acba77f21de5000175428b1
|
|
4
|
+
data.tar.gz: 7488a7dc79f8d2e62636468f18e916b8bfbccec59897909a1f666a3ff02a1d86
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ffd2b8f15be1fdbbd0a0b10dfe94c482645c2961ca119af160c8a9952bb95380c5512fb2de85b9885c773f3393fbcc776efa2dcaf94dd390c459470974a07ef7
|
|
7
|
+
data.tar.gz: 831fa02b20d7b594dc3ceefe9b33499df37d75436ea724369b3dc976d950b72c533ae400a1d4c26838381746c94c227bccb50f9ca341d5575e87dd88e2908e41
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
MIT License
|
|
2
2
|
|
|
3
3
|
Copyright (c) 2016-present, Facebook, Inc.
|
|
4
|
-
Copyright (c) 2019-
|
|
4
|
+
Copyright (c) 2019-2025 Andrew Kane
|
|
5
5
|
|
|
6
6
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
7
|
of this software and associated documentation files (the "Software"), to deal
|
data/README.md
CHANGED
|
@@ -167,8 +167,8 @@ FastText::Classifier.new(
|
|
|
167
167
|
dim: 100, # size of word vectors
|
|
168
168
|
ws: 5, # size of the context window
|
|
169
169
|
epoch: 5, # number of epochs
|
|
170
|
-
min_count: 1, # minimal number of word
|
|
171
|
-
min_count_label: 1, # minimal number of label
|
|
170
|
+
min_count: 1, # minimal number of word occurrences
|
|
171
|
+
min_count_label: 1, # minimal number of label occurrences
|
|
172
172
|
minn: 0, # min length of char ngram
|
|
173
173
|
maxn: 0, # max length of char ngram
|
|
174
174
|
neg: 5, # number of negatives sampled
|
|
@@ -197,7 +197,7 @@ FastText::Vectorizer.new(
|
|
|
197
197
|
dim: 100, # size of word vectors
|
|
198
198
|
ws: 5, # size of the context window
|
|
199
199
|
epoch: 5, # number of epochs
|
|
200
|
-
min_count: 5, # minimal number of word
|
|
200
|
+
min_count: 5, # minimal number of word occurrences
|
|
201
201
|
minn: 3, # min length of char ngram
|
|
202
202
|
maxn: 6, # max length of char ngram
|
|
203
203
|
neg: 5, # number of negatives sampled
|
data/ext/fasttext/ext.cpp
CHANGED
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
// stdlib
|
|
2
2
|
#include <cmath>
|
|
3
3
|
#include <cstdint>
|
|
4
|
-
#include <
|
|
4
|
+
#include <fstream>
|
|
5
|
+
#include <memory>
|
|
5
6
|
#include <sstream>
|
|
6
7
|
#include <stdexcept>
|
|
8
|
+
#include <string>
|
|
9
|
+
#include <utility>
|
|
10
|
+
#include <vector>
|
|
7
11
|
|
|
8
12
|
// fasttext
|
|
9
13
|
#include <args.h>
|
|
10
14
|
#include <autotune.h>
|
|
11
15
|
#include <densematrix.h>
|
|
16
|
+
#include <dictionary.h>
|
|
12
17
|
#include <fasttext.h>
|
|
18
|
+
#include <meter.h>
|
|
13
19
|
#include <real.h>
|
|
14
20
|
#include <vector.h>
|
|
15
21
|
|
|
@@ -20,41 +26,35 @@
|
|
|
20
26
|
using fasttext::Args;
|
|
21
27
|
using fasttext::FastText;
|
|
22
28
|
|
|
23
|
-
|
|
24
|
-
using Rice::Constructor;
|
|
25
|
-
using Rice::Module;
|
|
26
|
-
using Rice::define_class_under;
|
|
27
|
-
using Rice::define_module;
|
|
28
|
-
using Rice::define_module_under;
|
|
29
|
-
|
|
30
|
-
namespace Rice::detail
|
|
31
|
-
{
|
|
29
|
+
namespace Rice::detail {
|
|
32
30
|
template<>
|
|
33
|
-
class To_Ruby<std::vector<std::pair<fasttext::real, std::string>>>
|
|
34
|
-
{
|
|
31
|
+
class To_Ruby<std::vector<std::pair<fasttext::real, std::string>>> {
|
|
35
32
|
public:
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
33
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
34
|
+
|
|
35
|
+
VALUE convert(const std::vector<std::pair<fasttext::real, std::string>>& x) {
|
|
36
|
+
VALUE ret = detail::protect(rb_ary_new2, x.size());
|
|
39
37
|
for (const auto& v : x) {
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
a
|
|
43
|
-
ret
|
|
38
|
+
VALUE p1 = To_Ruby<fasttext::real>().convert(v.first);
|
|
39
|
+
VALUE p2 = To_Ruby<std::string>().convert(v.second);
|
|
40
|
+
VALUE a = detail::protect(rb_ary_new3, 2, p1, p2);
|
|
41
|
+
detail::protect(rb_ary_push, ret, a);
|
|
44
42
|
}
|
|
45
43
|
return ret;
|
|
46
44
|
}
|
|
45
|
+
|
|
46
|
+
private:
|
|
47
|
+
Arg* arg_ = nullptr;
|
|
47
48
|
};
|
|
48
|
-
}
|
|
49
|
+
} // namespace Rice::detail
|
|
49
50
|
|
|
50
51
|
extern "C"
|
|
51
|
-
void Init_ext()
|
|
52
|
-
|
|
53
|
-
Module
|
|
54
|
-
Module rb_mExt = define_module_under(rb_mFastText, "Ext");
|
|
52
|
+
void Init_ext() {
|
|
53
|
+
Rice::Module rb_mFastText = Rice::define_module("FastText");
|
|
54
|
+
Rice::Module rb_mExt = Rice::define_module_under(rb_mFastText, "Ext");
|
|
55
55
|
|
|
56
|
-
define_class_under<Args>(rb_mExt, "Args")
|
|
57
|
-
.define_constructor(Constructor<Args>())
|
|
56
|
+
Rice::define_class_under<Args>(rb_mExt, "Args")
|
|
57
|
+
.define_constructor(Rice::Constructor<Args>())
|
|
58
58
|
.define_attr("input", &Args::input)
|
|
59
59
|
.define_attr("output", &Args::output)
|
|
60
60
|
.define_attr("lr", &Args::lr)
|
|
@@ -110,24 +110,24 @@ void Init_ext()
|
|
|
110
110
|
.define_attr("autotune_duration", &Args::autotuneDuration)
|
|
111
111
|
.define_attr("autotune_model_size", &Args::autotuneModelSize);
|
|
112
112
|
|
|
113
|
-
define_class_under<FastText>(rb_mExt, "Model")
|
|
114
|
-
.define_constructor(Constructor<FastText>())
|
|
113
|
+
Rice::define_class_under<FastText>(rb_mExt, "Model")
|
|
114
|
+
.define_constructor(Rice::Constructor<FastText>())
|
|
115
115
|
.define_method(
|
|
116
116
|
"words",
|
|
117
117
|
[](FastText& m) {
|
|
118
118
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
|
119
119
|
std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::word);
|
|
120
120
|
|
|
121
|
-
Array vocab_list;
|
|
122
|
-
Array vocab_freq;
|
|
121
|
+
Rice::Array vocab_list;
|
|
122
|
+
Rice::Array vocab_freq;
|
|
123
123
|
for (int32_t i = 0; i < d->nwords(); i++) {
|
|
124
|
-
vocab_list.push(d->getWord(i));
|
|
125
|
-
vocab_freq.push(freq
|
|
124
|
+
vocab_list.push(d->getWord(i), false);
|
|
125
|
+
vocab_freq.push(freq.at(i), false);
|
|
126
126
|
}
|
|
127
127
|
|
|
128
|
-
Array ret;
|
|
129
|
-
ret.push(vocab_list);
|
|
130
|
-
ret.push(vocab_freq);
|
|
128
|
+
Rice::Array ret;
|
|
129
|
+
ret.push(vocab_list, false);
|
|
130
|
+
ret.push(vocab_freq, false);
|
|
131
131
|
return ret;
|
|
132
132
|
})
|
|
133
133
|
.define_method(
|
|
@@ -136,22 +136,22 @@ void Init_ext()
|
|
|
136
136
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
|
137
137
|
std::vector<int64_t> freq = d->getCounts(fasttext::entry_type::label);
|
|
138
138
|
|
|
139
|
-
Array vocab_list;
|
|
140
|
-
Array vocab_freq;
|
|
139
|
+
Rice::Array vocab_list;
|
|
140
|
+
Rice::Array vocab_freq;
|
|
141
141
|
for (int32_t i = 0; i < d->nlabels(); i++) {
|
|
142
|
-
vocab_list.push(d->getLabel(i));
|
|
143
|
-
vocab_freq.push(freq
|
|
142
|
+
vocab_list.push(d->getLabel(i), false);
|
|
143
|
+
vocab_freq.push(freq.at(i), false);
|
|
144
144
|
}
|
|
145
145
|
|
|
146
|
-
Array ret;
|
|
147
|
-
ret.push(vocab_list);
|
|
148
|
-
ret.push(vocab_freq);
|
|
146
|
+
Rice::Array ret;
|
|
147
|
+
ret.push(vocab_list, false);
|
|
148
|
+
ret.push(vocab_freq, false);
|
|
149
149
|
return ret;
|
|
150
150
|
})
|
|
151
151
|
.define_method(
|
|
152
152
|
"test",
|
|
153
153
|
[](FastText& m, const std::string& filename, int32_t k) {
|
|
154
|
-
std::ifstream ifs
|
|
154
|
+
std::ifstream ifs{filename};
|
|
155
155
|
if (!ifs.is_open()) {
|
|
156
156
|
throw std::invalid_argument("Test file cannot be opened!");
|
|
157
157
|
}
|
|
@@ -159,21 +159,21 @@ void Init_ext()
|
|
|
159
159
|
m.test(ifs, k, 0.0, meter);
|
|
160
160
|
ifs.close();
|
|
161
161
|
|
|
162
|
-
Array ret;
|
|
163
|
-
ret.push(meter.nexamples());
|
|
164
|
-
ret.push(meter.precision());
|
|
165
|
-
ret.push(meter.recall());
|
|
162
|
+
Rice::Array ret;
|
|
163
|
+
ret.push(meter.nexamples(), false);
|
|
164
|
+
ret.push(meter.precision(), false);
|
|
165
|
+
ret.push(meter.recall(), false);
|
|
166
166
|
return ret;
|
|
167
167
|
})
|
|
168
168
|
.define_method(
|
|
169
169
|
"load_model",
|
|
170
|
-
[](FastText& m, const std::string&
|
|
171
|
-
m.loadModel(
|
|
170
|
+
[](FastText& m, const std::string& filename) {
|
|
171
|
+
m.loadModel(filename);
|
|
172
172
|
})
|
|
173
173
|
.define_method(
|
|
174
174
|
"save_model",
|
|
175
|
-
[](FastText& m, const std::string&
|
|
176
|
-
m.saveModel(
|
|
175
|
+
[](FastText& m, const std::string& filename) {
|
|
176
|
+
m.saveModel(filename);
|
|
177
177
|
})
|
|
178
178
|
.define_method("dimension", &FastText::getDimension)
|
|
179
179
|
.define_method("quantized?", &FastText::isQuant)
|
|
@@ -182,7 +182,7 @@ void Init_ext()
|
|
|
182
182
|
.define_method(
|
|
183
183
|
"predict",
|
|
184
184
|
[](FastText& m, const std::string& text, int32_t k, float threshold) {
|
|
185
|
-
std::stringstream ioss
|
|
185
|
+
std::stringstream ioss{text};
|
|
186
186
|
std::vector<std::pair<fasttext::real, std::string>> predictions;
|
|
187
187
|
m.predictLine(ioss, predictions, k, threshold);
|
|
188
188
|
return predictions;
|
|
@@ -197,12 +197,13 @@ void Init_ext()
|
|
|
197
197
|
.define_method(
|
|
198
198
|
"word_vector",
|
|
199
199
|
[](FastText& m, const std::string& word) {
|
|
200
|
-
|
|
201
|
-
fasttext::Vector vec
|
|
200
|
+
int dimension = m.getDimension();
|
|
201
|
+
fasttext::Vector vec{dimension};
|
|
202
202
|
m.getWordVector(vec, word);
|
|
203
|
-
Array ret;
|
|
204
|
-
|
|
205
|
-
|
|
203
|
+
Rice::Array ret;
|
|
204
|
+
// fasttext::Vector uses int64_t for size and indexing
|
|
205
|
+
for (int64_t i = 0; i < vec.size(); i++) {
|
|
206
|
+
ret.push(vec[i], false);
|
|
206
207
|
}
|
|
207
208
|
return ret;
|
|
208
209
|
})
|
|
@@ -214,22 +215,23 @@ void Init_ext()
|
|
|
214
215
|
std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
|
|
215
216
|
d->getSubwords(word, ngrams, subwords);
|
|
216
217
|
|
|
217
|
-
Array ret;
|
|
218
|
+
Rice::Array ret;
|
|
218
219
|
for (const auto& subword : subwords) {
|
|
219
|
-
ret.push(subword);
|
|
220
|
+
ret.push(subword, false);
|
|
220
221
|
}
|
|
221
222
|
return ret;
|
|
222
223
|
})
|
|
223
224
|
.define_method(
|
|
224
225
|
"sentence_vector",
|
|
225
226
|
[](FastText& m, const std::string& text) {
|
|
226
|
-
std::istringstream in
|
|
227
|
-
|
|
228
|
-
fasttext::Vector vec
|
|
227
|
+
std::istringstream in{text};
|
|
228
|
+
int dimension = m.getDimension();
|
|
229
|
+
fasttext::Vector vec{dimension};
|
|
229
230
|
m.getSentenceVector(in, vec);
|
|
230
|
-
Array ret;
|
|
231
|
-
|
|
232
|
-
|
|
231
|
+
Rice::Array ret;
|
|
232
|
+
// fasttext::Vector uses int64_t for size and indexing
|
|
233
|
+
for (int64_t i = 0; i < vec.size(); i++) {
|
|
234
|
+
ret.push(vec[i], false);
|
|
233
235
|
}
|
|
234
236
|
return ret;
|
|
235
237
|
})
|
|
@@ -237,7 +239,7 @@ void Init_ext()
|
|
|
237
239
|
"train",
|
|
238
240
|
[](FastText& m, Args& a) {
|
|
239
241
|
if (a.hasAutotune()) {
|
|
240
|
-
fasttext::Autotune autotune
|
|
242
|
+
fasttext::Autotune autotune{std::shared_ptr<fasttext::FastText>(&m, [](fasttext::FastText*) {})};
|
|
241
243
|
autotune.train(a);
|
|
242
244
|
} else {
|
|
243
245
|
m.train(a);
|
data/lib/fasttext/version.rb
CHANGED
data/lib/fasttext.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fasttext
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: rice
|
|
@@ -16,15 +15,14 @@ dependencies:
|
|
|
16
15
|
requirements:
|
|
17
16
|
- - ">="
|
|
18
17
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 4.
|
|
18
|
+
version: '4.7'
|
|
20
19
|
type: :runtime
|
|
21
20
|
prerelease: false
|
|
22
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
22
|
requirements:
|
|
24
23
|
- - ">="
|
|
25
24
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 4.
|
|
27
|
-
description:
|
|
25
|
+
version: '4.7'
|
|
28
26
|
email: andrew@ankane.org
|
|
29
27
|
executables: []
|
|
30
28
|
extensions:
|
|
@@ -75,7 +73,6 @@ homepage: https://github.com/ankane/fastText-ruby
|
|
|
75
73
|
licenses:
|
|
76
74
|
- MIT
|
|
77
75
|
metadata: {}
|
|
78
|
-
post_install_message:
|
|
79
76
|
rdoc_options: []
|
|
80
77
|
require_paths:
|
|
81
78
|
- lib
|
|
@@ -83,15 +80,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
83
80
|
requirements:
|
|
84
81
|
- - ">="
|
|
85
82
|
- !ruby/object:Gem::Version
|
|
86
|
-
version: '3.
|
|
83
|
+
version: '3.3'
|
|
87
84
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
85
|
requirements:
|
|
89
86
|
- - ">="
|
|
90
87
|
- !ruby/object:Gem::Version
|
|
91
88
|
version: '0'
|
|
92
89
|
requirements: []
|
|
93
|
-
rubygems_version:
|
|
94
|
-
signing_key:
|
|
90
|
+
rubygems_version: 4.0.6
|
|
95
91
|
specification_version: 4
|
|
96
92
|
summary: Efficient text classification and representation learning for Ruby
|
|
97
93
|
test_files: []
|