wapiti 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/HISTORY.md +8 -0
- data/LICENSE +1 -1
- data/README.md +39 -95
- data/ext/wapiti/bcd.c +1 -1
- data/ext/wapiti/extconf.rb +15 -1
- data/ext/wapiti/lbfgs.c +6 -6
- data/ext/wapiti/model.c +2 -3
- data/ext/wapiti/model.h +0 -7
- data/ext/wapiti/native.c +89 -239
- data/ext/wapiti/native.h +0 -5
- data/ext/wapiti/pattern.c +1 -1
- data/ext/wapiti/progress.c +19 -44
- data/ext/wapiti/progress.h +1 -4
- data/ext/wapiti/rprop.c +3 -4
- data/ext/wapiti/sgdl1.c +3 -3
- data/ext/wapiti/tools.c +36 -30
- data/ext/wapiti/tools.h +9 -4
- data/ext/wapiti/trainers.c +55 -0
- data/ext/wapiti/trainers.h +4 -1
- data/lib/wapiti.rb +4 -24
- data/lib/wapiti/dataset.rb +162 -0
- data/lib/wapiti/errors.rb +0 -4
- data/lib/wapiti/log.rb +29 -0
- data/lib/wapiti/model.rb +63 -40
- data/lib/wapiti/options.rb +66 -29
- data/lib/wapiti/sequence.rb +105 -0
- data/lib/wapiti/token.rb +74 -0
- data/lib/wapiti/version.rb +1 -1
- metadata +20 -80
- data/.autotest +0 -13
- data/.rspec +0 -3
- data/.simplecov +0 -3
- data/Gemfile +0 -29
- data/Rakefile +0 -63
- data/ext/wapiti/wapiti.c +0 -410
- data/spec/fixtures/ch.mod +0 -18550
- data/spec/fixtures/chpattern.txt +0 -52
- data/spec/fixtures/chtest.txt +0 -1973
- data/spec/fixtures/chtrain.txt +0 -19995
- data/spec/fixtures/nppattern.txt +0 -52
- data/spec/fixtures/nptest.txt +0 -1973
- data/spec/fixtures/nptrain.txt +0 -19995
- data/spec/fixtures/pattern.txt +0 -14
- data/spec/fixtures/test.txt +0 -60000
- data/spec/fixtures/train.txt +0 -1200
- data/spec/spec_helper.rb +0 -41
- data/spec/wapiti/model_spec.rb +0 -233
- data/spec/wapiti/native_spec.rb +0 -11
- data/spec/wapiti/options_spec.rb +0 -185
- data/spec/wapiti/utility_spec.rb +0 -22
- data/wapiti.gemspec +0 -49
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: f0357489368d9bbe57ea34e2bee7be8f6ae7542875971acdf48f24b038aebb32
|
|
4
|
+
data.tar.gz: 7267f065c30e82f581942cae7789bc4d71999f7a418c1549b5cd41a8ed4a1b80
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 194656b3d90ed6fedf32a2e1a8dc34cec4eccaeec71098c2499054697c66a3209d9436cfcf66e1b4b43e19e2f2b50f4ad5e2682df9d21f126e812940591de226
|
|
7
|
+
data.tar.gz: 9d365d09193a7c1657b1583331f20d1f985ad0f8c351947c791dd407a163463f4eb21ee6ac74545bd0f61ca6f88ab0ed669aaeefdaefdd03f2543cc22078a4f5
|
data/HISTORY.md
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
1.0.0 / 2017-12-xx
|
|
2
|
+
==================
|
|
3
|
+
* Added support for Windows platform
|
|
4
|
+
* Open files only if names are untainted
|
|
5
|
+
* Finalized API
|
|
6
|
+
* Fixed error reporting
|
|
7
|
+
* Removed progress logging
|
|
8
|
+
|
|
1
9
|
0.1.1 / 2014-02-27
|
|
2
10
|
==================
|
|
3
11
|
* Updated train routine
|
data/LICENSE
CHANGED
data/README.md
CHANGED
|
@@ -5,18 +5,14 @@ The Wapiti-Ruby gem provides a wicked fast linear-chain CRF
|
|
|
5
5
|
API for sequence segmentation and labelling; it is based on the
|
|
6
6
|
codebase of [wapiti](http://wapiti.limsi.fr/).
|
|
7
7
|
|
|
8
|
-
[](https://travis-ci.org/inukshuk/wapiti-ruby)
|
|
9
|
+
[Build status (AppVeyor)](https://ci.appveyor.com/project/inukshuk/wapiti-ruby/branch/master)
|
|
10
|
+
[Coverage status (Coveralls)](https://coveralls.io/github/inukshuk/wapiti-ruby?branch=master)
|
|
10
11
|
|
|
11
12
|
Requirements
|
|
12
13
|
------------
|
|
13
|
-
Wapiti is written in C and Ruby and requires a compiler with C99
|
|
14
|
-
|
|
15
|
-
all necessary packages through your distribution.
|
|
16
|
-
|
|
17
|
-
The Wapiti Ruby gem has been confirmed to work with MRI 2.x, 1.9.x, 1.8.7,
|
|
18
|
-
and Rubinius.
|
|
19
|
-
|
|
14
|
+
Wapiti is written in C and Ruby and requires a compiler with C99 support;
|
|
15
|
+
it has been confirmed to work on Linux, macOS, and Windows.
|
|
20
16
|
|
|
21
17
|
Quickstart
|
|
22
18
|
----------
|
|
@@ -29,49 +25,35 @@ Quickstart
|
|
|
29
25
|
|
|
30
26
|
Using a pattern and training data stored in a file:
|
|
31
27
|
|
|
32
|
-
model = Wapiti.train('train.txt', :
|
|
33
|
-
|
|
28
|
+
model = Wapiti.train('train.txt', pattern: 'pattern.txt')
|
|
29
|
+
#=> #<Wapiti::Model:0x0000010188f868>
|
|
34
30
|
model.labels
|
|
35
|
-
|
|
31
|
+
#=> ["B-ADJP", "B-ADVP", "B-CONJP" ...]
|
|
36
32
|
model.save('ch.mod')
|
|
37
|
-
|
|
33
|
+
#=> saves the model as 'ch.mod'
|
|
38
34
|
|
|
39
|
-
Alternatively, you can pass in the training data as
|
|
40
|
-
|
|
35
|
+
Alternatively, you can pass in the training data as a `Wapiti::Dataset`;
|
|
36
|
+
this class supports the default text format used by Wapiti as well as
|
|
37
|
+
additional formats (such as YAML or XML) and an API to make it easier
|
|
38
|
+
to manage data sets used for input and training.
|
|
41
39
|
|
|
42
|
-
data =
|
|
43
|
-
data << ['Confidence NN B-NP', 'in IN B-PP', 'the DT B-NP', 'pound NN I-NP', '. . O']
|
|
44
|
-
...
|
|
40
|
+
data = Wapiti::Dataset.open('chtrain.xml')
|
|
45
41
|
model = Wapiti.train(data, options)
|
|
46
42
|
|
|
47
|
-
You can consult the `Wapiti::Options` class for a list of
|
|
48
|
-
configuration options and algorithms
|
|
49
|
-
|
|
50
|
-
Wapiti::Options.attribute_names
|
|
51
|
-
=> [:algorithm, :check, :compact, :convergence_window, :development_data,
|
|
52
|
-
:jobsize, :label, :max_iterations, :maxent, :pattern, :posterior, :rho1,
|
|
53
|
-
:rho2, :score, :sparse, :stop_epsilon, :stop_window, :threads]
|
|
54
|
-
Wapiti::Options.algorithms
|
|
55
|
-
=> ["l-bfgs", "sgd-l1", "bcd", "rprop", "rprop+", "rprop-", "auto"]
|
|
43
|
+
You can consult `Wapiti::Options.attribute_names` for a list of
|
|
44
|
+
supported configuration options and `Wapiti::Options.algorithms` for
|
|
45
|
+
all supported algorithms:
|
|
56
46
|
|
|
57
47
|
Use `#valid?` or `#validate` (which returns error messages) to make sure
|
|
58
48
|
your configuration is supported by Wapiti.
|
|
59
49
|
|
|
60
|
-
You can pass options either as an options hash or by adding a block to the
|
|
61
|
-
method invocation:
|
|
62
|
-
|
|
63
|
-
model = Wapiti::Model.train(data) do |config|
|
|
64
|
-
config.pattern = 'pattern.txt'
|
|
65
|
-
threads = 4
|
|
66
|
-
end
|
|
67
|
-
|
|
68
50
|
Before saving your model you can use `compact` to reduce the model's size:
|
|
69
51
|
|
|
70
52
|
model.save 'm1.mod'
|
|
71
|
-
|
|
53
|
+
#=> m1.mod file size 1.8M
|
|
72
54
|
model.compact
|
|
73
55
|
model.save 'm2.mod'
|
|
74
|
-
|
|
56
|
+
#=> m2.mod file size 471K
|
|
75
57
|
|
|
76
58
|
|
|
77
59
|
### Loading existing Models
|
|
@@ -80,50 +62,33 @@ Before saving your model you can use `compact` to reduce the model's size:
|
|
|
80
62
|
|
|
81
63
|
### Labelling
|
|
82
64
|
|
|
83
|
-
By calling `#label` on a Model instance you can add labels to
|
|
84
|
-
data:
|
|
65
|
+
By calling `#label` on a Model instance you can add labels to a dataset:
|
|
85
66
|
|
|
86
67
|
model = Wapiti.load('m2.mod')
|
|
87
|
-
|
|
88
|
-
|
|
68
|
+
input = Wapiti::Dataset.load('chtest.txt')
|
|
69
|
+
output = model.label(input, tagged: true)
|
|
89
70
|
|
|
90
|
-
The result is
|
|
91
|
-
|
|
92
|
-
|
|
71
|
+
The result is a new `Wapiti::Dataset` with the predicted labels for each
|
|
72
|
+
token. If your input data was already tagged, you can compare the input
|
|
73
|
+
and output datasets to evaluate your results:
|
|
93
74
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']]
|
|
98
|
-
=> [[["Confidence NN", "B-NP"], ["in IN", "B-PP"], ["the DT", "B-NP"],
|
|
99
|
-
["pound NN", "I-NP"], [". .", "O"]]]
|
|
75
|
+
output - input
|
|
76
|
+
# => new dataset of output sequences which are tagged differently than expected
|
|
100
77
|
|
|
101
78
|
If you pass a block to `#label` Wapiti will yield each token and the
|
|
102
79
|
corresponding label:
|
|
103
80
|
|
|
104
|
-
model.label
|
|
81
|
+
model.label input do |token, label|
|
|
105
82
|
[token.downcase, label.downcase]
|
|
106
83
|
end
|
|
107
|
-
=> [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
|
|
108
|
-
["pound nn", "i-np"], [". .", "o"]]]
|
|
109
84
|
|
|
110
85
|
Note that if you set the *:score* option (either in the Model's `#options` or
|
|
111
86
|
when calling `#label`), the score for each label will be appended to
|
|
112
87
|
each token/label tuple as a floating point number or passed as a third
|
|
113
88
|
argument to the passed-in block.
|
|
114
89
|
|
|
115
|
-
model.label
|
|
116
|
-
=>
|
|
117
|
-
|
|
118
|
-
Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
|
|
119
|
-
will append more label and, optionally, score values to each tuple.
|
|
120
|
-
|
|
121
|
-
model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
|
|
122
|
-
=> [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
|
|
123
|
-
|
|
124
|
-
Note how we also suppressed the output of the token string using the
|
|
125
|
-
*:skip_tokens* option.
|
|
126
|
-
|
|
90
|
+
model.label input, score: true
|
|
91
|
+
# => Dataset where each token will include a score
|
|
127
92
|
|
|
128
93
|
### Statistics
|
|
129
94
|
|
|
@@ -131,41 +96,20 @@ By setting the *:check* option you can tell Wapiti to keep statistics during
|
|
|
131
96
|
the labelling phase (for the statistics to be meaningful you obviously need
|
|
132
97
|
to provide input data that is already labelled). Wapiti does not reset the
|
|
133
98
|
counters during consecutive calls to `#label` to allow you to collect
|
|
134
|
-
accumulative
|
|
135
|
-
`#
|
|
99
|
+
cumulative stats; however, you can reset the counters at any time by calling
|
|
100
|
+
`#reset_counters`.
|
|
136
101
|
|
|
137
102
|
After calling `#label` with the *:check* option set and appropriately labelled
|
|
138
103
|
input, you can access the statistics via `#statistics` (the individual values
|
|
139
104
|
are also available through the associated attribute readers).
|
|
140
105
|
|
|
141
|
-
model.label
|
|
142
|
-
|
|
143
|
-
:
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
Citing
|
|
148
|
-
------
|
|
149
|
-
If you're using Wapiti-Ruby for research purposes, please use the following
|
|
150
|
-
citation of the original wapiti package:
|
|
151
|
-
|
|
152
|
-
@article{lavergne2010practical,
|
|
153
|
-
author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, Fran\c{c}ois},
|
|
154
|
-
title = {Practical Very Large Scale {CRFs}},
|
|
155
|
-
booktitle = {Proceedings the 48th Annual Meeting of the Association for
|
|
156
|
-
Computational Linguistics (ACL)},
|
|
157
|
-
month = {July},
|
|
158
|
-
year = {2010},
|
|
159
|
-
location = {Uppsala, Sweden},
|
|
160
|
-
publisher = {Association for Computational Linguistics},
|
|
161
|
-
pages = {504--513},
|
|
162
|
-
url = {http://www.aclweb.org/anthology/P10-1052}
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
If you're profiting from any of the Wapiti-Ruby specific features you are
|
|
166
|
-
welcome to also refer back to the
|
|
167
|
-
[Wapiti-Ruby homepage](http://github.com/inukshuk/wapiti-ruby/).
|
|
106
|
+
model.label input, check: true
|
|
107
|
+
model.stats
|
|
108
|
+
=> {:token=>{:count=>1896, :errors=>137, :rate=>7.225738396624472},
|
|
109
|
+
:sequence=>{:count=>77, :errors=>50, :rate=>64.93506493506494}}
|
|
168
110
|
|
|
111
|
+
For convenience, you can also use the `#check` method, which
|
|
112
|
+
will reset the counters, check your input, and return the stats.
|
|
169
113
|
|
|
170
114
|
Contributing
|
|
171
115
|
------------
|
|
@@ -183,7 +127,7 @@ example, fix the bug and submit a pull request.
|
|
|
183
127
|
|
|
184
128
|
License
|
|
185
129
|
-------
|
|
186
|
-
Copyright 2011-
|
|
130
|
+
Copyright 2011-2018 Sylvester Keil. All rights reserved.
|
|
187
131
|
|
|
188
132
|
Copyright 2009-2013 CNRS. All rights reserved.
|
|
189
133
|
|
data/ext/wapiti/bcd.c
CHANGED
data/ext/wapiti/extconf.rb
CHANGED
|
@@ -1,6 +1,20 @@
|
|
|
1
1
|
require 'mkmf'
|
|
2
|
+
require 'rbconfig'
|
|
2
3
|
|
|
3
|
-
|
|
4
|
+
cflags = %w{
|
|
5
|
+
-std=c99
|
|
6
|
+
-W
|
|
7
|
+
-Wall
|
|
8
|
+
-Wno-declaration-after-statement
|
|
9
|
+
-O3
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
case RbConfig::CONFIG['host_os']
|
|
13
|
+
when /^linux/i
|
|
14
|
+
cflags[0] = '-std=gnu99'
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
$CFLAGS << ' ' << cflags.join(' ')
|
|
4
18
|
|
|
5
19
|
have_library('pthread')
|
|
6
20
|
have_library('m')
|
data/ext/wapiti/lbfgs.c
CHANGED
|
@@ -104,12 +104,12 @@ void trn_lbfgs(mdl_t *mdl) {
|
|
|
104
104
|
uint64_t f;
|
|
105
105
|
if (fscanf(file, "%"PRIu64, &f) != 1)
|
|
106
106
|
fatal("1 %s", err);
|
|
107
|
-
if (fscanf(file, "%
|
|
107
|
+
if (fscanf(file, "%le %le", &xp[f], &gp[f]) != 2)
|
|
108
108
|
fatal("2 %s", err);
|
|
109
109
|
for (uint32_t m = 0; m < M; m++) {
|
|
110
|
-
if (fscanf(file, "%
|
|
110
|
+
if (fscanf(file, "%le", &s[m][f]) != 1)
|
|
111
111
|
fatal("3 %s", err);
|
|
112
|
-
if (fscanf(file, "%
|
|
112
|
+
if (fscanf(file, "%le", &y[m][f]) != 1)
|
|
113
113
|
fatal("4 %s", err);
|
|
114
114
|
}
|
|
115
115
|
}
|
|
@@ -271,7 +271,7 @@ void trn_lbfgs(mdl_t *mdl) {
|
|
|
271
271
|
memcpy(x, xp, sizeof(double) * F);
|
|
272
272
|
break;
|
|
273
273
|
}
|
|
274
|
-
if (uit_progress(mdl
|
|
274
|
+
if (uit_progress(mdl) == false)
|
|
275
275
|
break;
|
|
276
276
|
// 3rd step: we update the history used for approximating the
|
|
277
277
|
// inverse of the diagonal of the hessian
|
|
@@ -314,9 +314,9 @@ void trn_lbfgs(mdl_t *mdl) {
|
|
|
314
314
|
fprintf(file, "#state#0#%"PRIu32"#%"PRIu64"\n", M, F);
|
|
315
315
|
for (uint64_t f = 0; f < F; f++) {
|
|
316
316
|
fprintf(file, "%"PRIu64, f);
|
|
317
|
-
fprintf(file, " %
|
|
317
|
+
fprintf(file, " %le %le", xp[f], gp[f]);
|
|
318
318
|
for (uint32_t m = 0; m < M; m++)
|
|
319
|
-
fprintf(file, " %
|
|
319
|
+
fprintf(file, " %le %le", s[m][f], y[m][f]);
|
|
320
320
|
fprintf(file, "\n");
|
|
321
321
|
}
|
|
322
322
|
fclose(file);
|
data/ext/wapiti/model.c
CHANGED
|
@@ -74,7 +74,6 @@ mdl_t *mdl_new(rdr_t *rdr) {
|
|
|
74
74
|
mdl->train = mdl->devel = NULL;
|
|
75
75
|
mdl->reader = rdr;
|
|
76
76
|
mdl->werr = NULL;
|
|
77
|
-
mdl->total = 0.0;
|
|
78
77
|
return mdl;
|
|
79
78
|
}
|
|
80
79
|
|
|
@@ -272,7 +271,7 @@ void mdl_save(mdl_t *mdl, FILE *file) {
|
|
|
272
271
|
rdr_save(mdl->reader, file);
|
|
273
272
|
for (uint64_t f = 0; f < mdl->nftr; f++)
|
|
274
273
|
if (mdl->theta[f] != 0.0)
|
|
275
|
-
fprintf(file, "%"PRIu64"=%
|
|
274
|
+
fprintf(file, "%"PRIu64"=%le\n", f, mdl->theta[f]);
|
|
276
275
|
}
|
|
277
276
|
|
|
278
277
|
/* mdl_load:
|
|
@@ -298,7 +297,7 @@ void mdl_load(mdl_t *mdl, FILE *file) {
|
|
|
298
297
|
for (uint64_t i = 0; i < nact; i++) {
|
|
299
298
|
uint64_t f;
|
|
300
299
|
double v;
|
|
301
|
-
if (fscanf(file, "%"SCNu64"=%
|
|
300
|
+
if (fscanf(file, "%"SCNu64"=%le\n", &f, &v) != 2)
|
|
302
301
|
fatal(err);
|
|
303
302
|
mdl->theta[f] = v;
|
|
304
303
|
}
|
data/ext/wapiti/model.h
CHANGED
|
@@ -30,15 +30,12 @@
|
|
|
30
30
|
|
|
31
31
|
#include <stddef.h>
|
|
32
32
|
#include <stdint.h>
|
|
33
|
-
#include <sys/time.h>
|
|
34
33
|
|
|
35
34
|
#include "options.h"
|
|
36
35
|
#include "sequence.h"
|
|
37
36
|
#include "reader.h"
|
|
38
37
|
#include "wapiti.h"
|
|
39
38
|
|
|
40
|
-
typedef struct timeval tms_t;
|
|
41
|
-
|
|
42
39
|
/* mdl_t:
|
|
43
40
|
* Represent a linear-chain CRF model. The model contain both unigram and
|
|
44
41
|
* bigram features. It is caracterized by <nlbl> the number of labels, <nobs>
|
|
@@ -86,10 +83,6 @@ struct mdl_s {
|
|
|
86
83
|
double *werr; // Window of error rate of last iters
|
|
87
84
|
uint32_t wcnt; // Number of iters in the window
|
|
88
85
|
uint32_t wpos; // Position for the next iter
|
|
89
|
-
|
|
90
|
-
// Timing
|
|
91
|
-
tms_t timer; // start time of last iter
|
|
92
|
-
double total; // total training time
|
|
93
86
|
};
|
|
94
87
|
|
|
95
88
|
mdl_t *mdl_new(rdr_t *rdr);
|
data/ext/wapiti/native.c
CHANGED
|
@@ -10,43 +10,16 @@
|
|
|
10
10
|
#include "quark.h"
|
|
11
11
|
#include "tools.h"
|
|
12
12
|
#include "wapiti.h"
|
|
13
|
-
|
|
14
13
|
#include "native.h"
|
|
15
14
|
|
|
16
15
|
VALUE mWapiti;
|
|
17
16
|
VALUE mNative;
|
|
18
|
-
|
|
19
17
|
VALUE cOptions;
|
|
20
18
|
VALUE cModel;
|
|
21
|
-
|
|
19
|
+
VALUE cArgumentError;
|
|
22
20
|
VALUE cNativeError;
|
|
23
|
-
VALUE cConfigurationError;
|
|
24
21
|
VALUE cLogger;
|
|
25
22
|
|
|
26
|
-
|
|
27
|
-
/* --- Forward declarations --- */
|
|
28
|
-
|
|
29
|
-
int wapiti_main(int argc, char *argv[argc]);
|
|
30
|
-
|
|
31
|
-
void dolabel(mdl_t *mdl);
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
/* --- Utilities --- */
|
|
35
|
-
|
|
36
|
-
static const struct {
|
|
37
|
-
const char *name;
|
|
38
|
-
void (* train)(mdl_t *mdl);
|
|
39
|
-
} trn_lst[] = {
|
|
40
|
-
{"l-bfgs", trn_lbfgs},
|
|
41
|
-
{"sgd-l1", trn_sgdl1},
|
|
42
|
-
{"bcd", trn_bcd },
|
|
43
|
-
{"rprop", trn_rprop},
|
|
44
|
-
{"rprop+", trn_rprop},
|
|
45
|
-
{"rprop-", trn_rprop}
|
|
46
|
-
};
|
|
47
|
-
static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
|
48
|
-
|
|
49
|
-
|
|
50
23
|
/* --- Options Class --- */
|
|
51
24
|
|
|
52
25
|
// Auxiliary Methods
|
|
@@ -68,6 +41,14 @@ static void copy_string(char **dst, VALUE rb_string) {
|
|
|
68
41
|
memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
|
|
69
42
|
}
|
|
70
43
|
|
|
44
|
+
// Duplicates a NUL-terminated C string into newly allocated heap memory.
// Used during initialization to move default option values to the heap,
// so that every option string can later be released with free().
//
// Returns the heap copy, which the caller owns, or NULL if `string` is
// NULL or the allocation fails.
static char *to_heap(const char *string) {
  if (string == NULL) {
    return NULL;
  }

  // Reserve room for the terminating NUL as well; without it the copy is
  // not a valid C string and readers such as strlen() run past the buffer.
  size_t size = strlen(string) + 1;
  char *ptr = calloc(size, sizeof(char));

  if (ptr == NULL) {
    return NULL;
  }

  memcpy(ptr, string, size);
  return ptr;
}
|
|
51
|
+
|
|
71
52
|
|
|
72
53
|
// Constructor / Destructor
|
|
73
54
|
|
|
@@ -76,11 +57,11 @@ static void mark_options(opt_t* options __attribute__((__unused__))) {
|
|
|
76
57
|
}
|
|
77
58
|
|
|
78
59
|
static void deallocate_options(opt_t* options) {
|
|
79
|
-
|
|
80
60
|
// free string options
|
|
81
61
|
if (options->input) { free(options->input); }
|
|
82
62
|
if (options->output) { free(options->output); }
|
|
83
63
|
if (options->algo) { free((void*)options->algo); }
|
|
64
|
+
if (options->type) { free((void*)options->type); }
|
|
84
65
|
if (options->devel) { free(options->devel); }
|
|
85
66
|
if (options->pattern) { free((void*)options->pattern); }
|
|
86
67
|
|
|
@@ -101,21 +82,20 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
|
|
|
101
82
|
options->maxiter = INT_MAX;
|
|
102
83
|
}
|
|
103
84
|
|
|
104
|
-
//
|
|
105
|
-
// are on the heap
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
options->algo = tmp;
|
|
85
|
+
// Copy default algorithm and type name to the heap
|
|
86
|
+
// so that all options strings are on the heap.
|
|
87
|
+
options->algo = to_heap(options->algo);
|
|
88
|
+
options->type = to_heap(options->type);
|
|
109
89
|
|
|
110
90
|
if (argc > 1) {
|
|
111
|
-
rb_raise(
|
|
91
|
+
rb_raise(cArgumentError,
|
|
112
92
|
"wrong number of arguments (%d for 0..1)", argc);
|
|
113
93
|
}
|
|
114
94
|
|
|
115
95
|
// set defaults
|
|
116
96
|
if (argc) {
|
|
117
97
|
Check_Type(argv[0], T_HASH);
|
|
118
|
-
(void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
|
|
98
|
+
(void)rb_funcall(self, rb_intern("update!"), 1, argv[0]);
|
|
119
99
|
}
|
|
120
100
|
|
|
121
101
|
// yield self if block_given?
|
|
@@ -431,7 +411,6 @@ static VALUE options_model(VALUE self) {
|
|
|
431
411
|
static VALUE options_set_model(VALUE self, VALUE rb_string) {
|
|
432
412
|
opt_t *options = get_options(self);
|
|
433
413
|
copy_string(&(options->model), rb_string);
|
|
434
|
-
|
|
435
414
|
return rb_string;
|
|
436
415
|
}
|
|
437
416
|
|
|
@@ -443,19 +422,17 @@ static VALUE options_algorithm(VALUE self) {
|
|
|
443
422
|
static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
|
|
444
423
|
opt_t *options = get_options(self);
|
|
445
424
|
copy_string((char**)&(options->algo), rb_string);
|
|
446
|
-
|
|
447
425
|
return rb_string;
|
|
448
426
|
}
|
|
449
427
|
|
|
450
|
-
static VALUE
|
|
451
|
-
char *
|
|
452
|
-
return rb_str_new2(
|
|
428
|
+
// Reader for the `type` option: returns the configured type name as a
// new Ruby string. An unset (NULL) type is reported as the empty string.
static VALUE options_type(VALUE self) {
  const char *name = get_options(self)->type;

  if (!name) {
    name = "";
  }

  return rb_str_new2(name);
}
|
|
454
432
|
|
|
455
|
-
static VALUE
|
|
433
|
+
static VALUE options_set_type(VALUE self, VALUE rb_string) {
|
|
456
434
|
opt_t *options = get_options(self);
|
|
457
|
-
copy_string(&(options->
|
|
458
|
-
|
|
435
|
+
copy_string((char**)&(options->type), rb_string);
|
|
459
436
|
return rb_string;
|
|
460
437
|
}
|
|
461
438
|
|
|
@@ -565,11 +542,8 @@ void Init_options() {
|
|
|
565
542
|
rb_define_alias(cOptions, "algo", "algorithm");
|
|
566
543
|
rb_define_alias(cOptions, "algo=", "algorithm=");
|
|
567
544
|
|
|
568
|
-
rb_define_method(cOptions, "
|
|
569
|
-
rb_define_method(cOptions, "
|
|
570
|
-
|
|
571
|
-
rb_define_alias(cOptions, "devel", "development_data");
|
|
572
|
-
rb_define_alias(cOptions, "devel=", "development_data=");
|
|
545
|
+
rb_define_method(cOptions, "type", options_type, 0);
|
|
546
|
+
rb_define_method(cOptions, "type=", options_set_type, 1);
|
|
573
547
|
|
|
574
548
|
rb_define_method(cOptions, "clip", options_clip, 0);
|
|
575
549
|
rb_define_method(cOptions, "clip=", options_set_clip, 1);
|
|
@@ -640,7 +614,7 @@ static VALUE allocate_model(VALUE self) {
|
|
|
640
614
|
|
|
641
615
|
static VALUE model_set_options(VALUE self, VALUE rb_options) {
|
|
642
616
|
if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
|
|
643
|
-
rb_raise(
|
|
617
|
+
rb_raise(cArgumentError, "argument must be a Wapiti::Options instance");
|
|
644
618
|
}
|
|
645
619
|
|
|
646
620
|
mdl_t *model = get_model(self);
|
|
@@ -661,22 +635,20 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
|
661
635
|
VALUE options;
|
|
662
636
|
|
|
663
637
|
if (argc > 1) {
|
|
664
|
-
rb_raise(
|
|
638
|
+
rb_raise(cArgumentError,
|
|
665
639
|
"wrong number of arguments (%d for 0..1)", argc);
|
|
666
640
|
}
|
|
667
641
|
|
|
668
642
|
if (argc) {
|
|
669
643
|
if (TYPE(argv[0]) == T_HASH) {
|
|
670
644
|
options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
|
|
671
|
-
}
|
|
672
|
-
else {
|
|
645
|
+
} else {
|
|
673
646
|
if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
|
|
674
|
-
rb_raise(
|
|
647
|
+
rb_raise(cArgumentError, "argument must be a hash or an options instance");
|
|
675
648
|
}
|
|
676
649
|
options = argv[0];
|
|
677
650
|
}
|
|
678
|
-
}
|
|
679
|
-
else {
|
|
651
|
+
} else {
|
|
680
652
|
options = rb_funcall(cOptions, rb_intern("new"), 0);
|
|
681
653
|
}
|
|
682
654
|
|
|
@@ -693,7 +665,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
|
693
665
|
}
|
|
694
666
|
|
|
695
667
|
// initialize counters
|
|
696
|
-
rb_funcall(self, rb_intern("
|
|
668
|
+
rb_funcall(self, rb_intern("reset_counters"), 0);
|
|
697
669
|
|
|
698
670
|
return self;
|
|
699
671
|
}
|
|
@@ -713,10 +685,6 @@ static VALUE model_nftr(VALUE self) {
|
|
|
713
685
|
return INT2FIX(get_model(self)->nftr);
|
|
714
686
|
}
|
|
715
687
|
|
|
716
|
-
static VALUE model_total(VALUE self) {
|
|
717
|
-
return rb_float_new(get_model(self)->total);
|
|
718
|
-
}
|
|
719
|
-
|
|
720
688
|
|
|
721
689
|
// Instance methods
|
|
722
690
|
|
|
@@ -738,7 +706,7 @@ static VALUE model_compact(VALUE self) {
|
|
|
738
706
|
// otherwise uses the passed-in argument as the Model's path.
|
|
739
707
|
static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
|
740
708
|
if (argc > 1) {
|
|
741
|
-
rb_raise(
|
|
709
|
+
rb_raise(cArgumentError,
|
|
742
710
|
"wrong number of arguments (%d for 0..1)", argc);
|
|
743
711
|
}
|
|
744
712
|
|
|
@@ -751,17 +719,13 @@ static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
|
|
751
719
|
}
|
|
752
720
|
|
|
753
721
|
// open the output file
|
|
754
|
-
FILE *file = 0;
|
|
755
722
|
VALUE path = rb_ivar_get(self, rb_intern("@path"));
|
|
756
723
|
|
|
757
724
|
if (NIL_P(path)) {
|
|
758
|
-
|
|
759
|
-
}
|
|
760
|
-
|
|
761
|
-
if (!(file = fopen(StringValueCStr(path), "w"))) {
|
|
762
|
-
rb_raise(cNativeError, "failed to save model: failed to open model file");
|
|
725
|
+
fatal("failed to save model: no path given");
|
|
763
726
|
}
|
|
764
727
|
|
|
728
|
+
FILE *file = ufopen(path, "w");
|
|
765
729
|
mdl_save(model, file);
|
|
766
730
|
fclose(file);
|
|
767
731
|
|
|
@@ -770,7 +734,7 @@ static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
|
|
770
734
|
|
|
771
735
|
static VALUE model_load(int argc, VALUE *argv, VALUE self) {
|
|
772
736
|
if (argc > 1) {
|
|
773
|
-
rb_raise(
|
|
737
|
+
rb_raise(cArgumentError,
|
|
774
738
|
"wrong number of arguments (%d for 0..1)", argc);
|
|
775
739
|
}
|
|
776
740
|
|
|
@@ -783,17 +747,13 @@ static VALUE model_load(int argc, VALUE *argv, VALUE self) {
|
|
|
783
747
|
}
|
|
784
748
|
|
|
785
749
|
// open the model file
|
|
786
|
-
FILE *file = 0;
|
|
787
750
|
VALUE path = rb_ivar_get(self, rb_intern("@path"));
|
|
788
751
|
|
|
789
752
|
if (NIL_P(path)) {
|
|
790
|
-
|
|
791
|
-
}
|
|
792
|
-
|
|
793
|
-
if (!(file = fopen(StringValueCStr(path), "r"))) {
|
|
794
|
-
rb_raise(cNativeError, "failed to load model: failed to open model file");
|
|
753
|
+
fatal("failed to load model: no path given");
|
|
795
754
|
}
|
|
796
755
|
|
|
756
|
+
FILE *file = ufopen(path, "r");
|
|
797
757
|
mdl_load(model, file);
|
|
798
758
|
fclose(file);
|
|
799
759
|
|
|
@@ -849,31 +809,44 @@ static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
|
|
|
849
809
|
return dat;
|
|
850
810
|
}
|
|
851
811
|
|
|
812
|
+
static dat_t *ld_dat(rdr_t *reader, VALUE data, bool labelled) {
|
|
813
|
+
FILE *file;
|
|
814
|
+
dat_t *dat = (dat_t*)0;
|
|
852
815
|
|
|
853
|
-
|
|
816
|
+
switch (TYPE(data)) {
|
|
817
|
+
case T_STRING:
|
|
818
|
+
file = ufopen(data, "r");
|
|
819
|
+
dat = rdr_readdat(reader, file, labelled);
|
|
820
|
+
fclose(file);
|
|
821
|
+
break;
|
|
854
822
|
|
|
855
|
-
|
|
823
|
+
case T_ARRAY:
|
|
824
|
+
dat = to_dat(reader, data, labelled);
|
|
825
|
+
break;
|
|
856
826
|
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
if (!strcmp(model->opt->algo, trn_lst[trn].name)) break;
|
|
827
|
+
default:
|
|
828
|
+
fatal("invalid data type (expected instance of String or Array)");
|
|
860
829
|
}
|
|
861
830
|
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
}
|
|
831
|
+
return dat;
|
|
832
|
+
}
|
|
833
|
+
|
|
866
834
|
|
|
835
|
+
static VALUE model_train(VALUE self, VALUE train, VALUE devel) {
|
|
867
836
|
FILE *file;
|
|
837
|
+
mdl_t *model = get_model(self);
|
|
838
|
+
trn_t trn = trn_get(model->opt->algo);
|
|
839
|
+
model->type = typ_get(model->opt->type);
|
|
868
840
|
|
|
869
841
|
// Load the pattern file. This will unlock the database if previously
|
|
870
842
|
// locked by loading a model.
|
|
871
843
|
if (model->opt->pattern) {
|
|
844
|
+
info("load patterns");
|
|
872
845
|
file = fopen(model->opt->pattern, "r");
|
|
873
846
|
|
|
874
847
|
if (!file) {
|
|
875
|
-
|
|
876
|
-
|
|
848
|
+
pfatal("failed to train model: failed to load pattern file '%s'",
|
|
849
|
+
model->opt->pattern);
|
|
877
850
|
}
|
|
878
851
|
|
|
879
852
|
rdr_loadpat(model->reader, file);
|
|
@@ -886,58 +859,45 @@ static VALUE model_train(VALUE self, VALUE data) {
|
|
|
886
859
|
// Load the training data. When this is done we lock the quarks as we
|
|
887
860
|
// don't want to put in the model, informations present only in the
|
|
888
861
|
// development set.
|
|
889
|
-
|
|
890
|
-
switch (TYPE(data)) {
|
|
891
|
-
case T_STRING:
|
|
892
|
-
if (!(file = fopen(StringValuePtr(data), "r"))) {
|
|
893
|
-
rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
|
|
894
|
-
}
|
|
895
|
-
|
|
896
|
-
model->train = rdr_readdat(model->reader, file, true);
|
|
897
|
-
fclose(file);
|
|
898
|
-
|
|
899
|
-
break;
|
|
900
|
-
case T_ARRAY:
|
|
901
|
-
model->train = to_dat(model->reader, data, true);
|
|
902
|
-
|
|
903
|
-
break;
|
|
904
|
-
default:
|
|
905
|
-
rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
|
|
906
|
-
}
|
|
862
|
+
model->train = ld_dat(model->reader, train, true);
|
|
907
863
|
|
|
908
864
|
qrk_lock(model->reader->lbl, true);
|
|
909
865
|
qrk_lock(model->reader->obs, true);
|
|
910
866
|
|
|
911
867
|
if (!model->train || model->train->nseq == 0) {
|
|
912
|
-
|
|
868
|
+
fatal("failed to train model: no training data loaded");
|
|
913
869
|
}
|
|
914
870
|
|
|
915
871
|
// If present, load the development set in the model. If not specified,
|
|
916
872
|
// the training dataset will be used instead.
|
|
917
|
-
if (
|
|
918
|
-
|
|
919
|
-
rb_raise(cNativeError,
|
|
920
|
-
"failed to train model: cannot open development file '%s'", model->opt->devel);
|
|
921
|
-
}
|
|
922
|
-
|
|
923
|
-
model->devel = rdr_readdat(model->reader, file, true);
|
|
924
|
-
fclose(file);
|
|
873
|
+
if (TYPE(devel) != T_NIL) {
|
|
874
|
+
model->devel = ld_dat(model->reader, devel, true);
|
|
925
875
|
}
|
|
926
876
|
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
877
|
+
// Initialize the model. If a previous model was loaded, this will be
|
|
878
|
+
// just a resync, else the model structure will be created.
|
|
879
|
+
info((model->theta == NULL) ? "initialize model" : "re-sync model");
|
|
930
880
|
mdl_sync(model);
|
|
931
881
|
|
|
932
|
-
|
|
882
|
+
info("nb train: %"PRIu32"", model->train->nseq);
|
|
883
|
+
if (model->devel != NULL)
|
|
884
|
+
info("nb devel: %"PRIu32"", model->devel->nseq);
|
|
885
|
+
info("nb labels: %"PRIu32"", model->nlbl);
|
|
886
|
+
info("nb blocks: %"PRIu64"", model->nobs);
|
|
887
|
+
info("nb features: %"PRIu64"", model->nftr);
|
|
888
|
+
|
|
889
|
+
info("training model with %s", model->opt->algo);
|
|
933
890
|
uit_setup(model);
|
|
934
|
-
|
|
891
|
+
trn(model);
|
|
935
892
|
uit_cleanup(model);
|
|
936
893
|
|
|
937
|
-
// If requested compact the model.
|
|
938
894
|
if (model->opt->compact) {
|
|
939
|
-
|
|
895
|
+
const uint64_t O = model->nobs;
|
|
896
|
+
const uint64_t F = model->nftr;
|
|
897
|
+
info("compacting model");
|
|
940
898
|
mdl_compact(model);
|
|
899
|
+
info("%8"PRIu64" observations removed", O - model->nobs);
|
|
900
|
+
info("%8"PRIu64" features removed", F - model->nftr);
|
|
941
901
|
}
|
|
942
902
|
|
|
943
903
|
return self;
|
|
@@ -980,8 +940,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
|
980
940
|
|
|
981
941
|
if (N == 1) {
|
|
982
942
|
tag_viterbi(model, seq, out, scs, psc);
|
|
983
|
-
}
|
|
984
|
-
else {
|
|
943
|
+
} else {
|
|
985
944
|
tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
|
|
986
945
|
}
|
|
987
946
|
|
|
@@ -993,16 +952,13 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
|
993
952
|
if (!model->opt->label) {
|
|
994
953
|
VALUE token = rb_str_new2(raw->lines[t]);
|
|
995
954
|
|
|
996
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
|
997
955
|
int enc = rb_enc_find_index("UTF-8");
|
|
998
956
|
rb_enc_associate_index(token, enc);
|
|
999
|
-
#endif
|
|
1000
957
|
|
|
1001
958
|
rb_ary_push(tokens, token);
|
|
1002
959
|
}
|
|
1003
960
|
|
|
1004
961
|
for (n = 0; n < N; ++n) {
|
|
1005
|
-
|
|
1006
962
|
uint64_t lbl = out[t * N + n];
|
|
1007
963
|
rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
|
|
1008
964
|
|
|
@@ -1010,7 +966,6 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
|
1010
966
|
if (model->opt->outsc) {
|
|
1011
967
|
rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
|
|
1012
968
|
}
|
|
1013
|
-
|
|
1014
969
|
}
|
|
1015
970
|
|
|
1016
971
|
// yield token/label pair to block if given
|
|
@@ -1020,9 +975,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
|
1020
975
|
|
|
1021
976
|
rb_ary_push(sequence, tokens);
|
|
1022
977
|
|
|
1023
|
-
|
|
1024
978
|
// TODO output sequence score: scs[n] (float)
|
|
1025
|
-
|
|
1026
979
|
}
|
|
1027
980
|
|
|
1028
981
|
// Statistics
|
|
@@ -1036,8 +989,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
|
1036
989
|
if (seq->pos[t].lbl != out[t * N]) {
|
|
1037
990
|
terr++;
|
|
1038
991
|
err = 1;
|
|
1039
|
-
}
|
|
1040
|
-
else {
|
|
992
|
+
} else {
|
|
1041
993
|
stat[2][out[t * N]]++;
|
|
1042
994
|
}
|
|
1043
995
|
}
|
|
@@ -1053,10 +1005,8 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
|
1053
1005
|
|
|
1054
1006
|
serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
|
|
1055
1007
|
rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
|
|
1056
|
-
|
|
1057
1008
|
}
|
|
1058
1009
|
|
|
1059
|
-
|
|
1060
1010
|
// Cleanup memory used for this sequence
|
|
1061
1011
|
xfree(scs);
|
|
1062
1012
|
xfree(psc);
|
|
@@ -1090,7 +1040,6 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
|
|
1090
1040
|
for (j = 0; j < k; ++j) {
|
|
1091
1041
|
VALUE line = rb_ary_entry(sequence, j);
|
|
1092
1042
|
Check_Type(line, T_STRING);
|
|
1093
|
-
|
|
1094
1043
|
raw->lines[j] = StringValueCStr(line);
|
|
1095
1044
|
}
|
|
1096
1045
|
|
|
@@ -1103,13 +1052,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
|
|
1103
1052
|
}
|
|
1104
1053
|
|
|
1105
1054
|
static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1106
|
-
|
|
1107
|
-
FILE *file;
|
|
1108
|
-
|
|
1109
|
-
if (!(file = fopen(StringValueCStr(path), "r"))) {
|
|
1110
|
-
rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
|
|
1111
|
-
}
|
|
1112
|
-
|
|
1055
|
+
FILE *file = ufopen(path, "r");
|
|
1113
1056
|
mdl_t *model = get_model(self);
|
|
1114
1057
|
raw_t *raw;
|
|
1115
1058
|
|
|
@@ -1119,7 +1062,6 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
|
1119
1062
|
// to take care of not discarding the raw input as we want to send it
|
|
1120
1063
|
// back to the output with the additional predicted labels.
|
|
1121
1064
|
while (!feof(file)) {
|
|
1122
|
-
|
|
1123
1065
|
// So, first read an input sequence keeping the raw_t object
|
|
1124
1066
|
// available, and label it with Viterbi.
|
|
1125
1067
|
if ((raw = rdr_readraw(model->reader, file)) == 0) {
|
|
@@ -1133,12 +1075,12 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
|
1133
1075
|
return result;
|
|
1134
1076
|
}
|
|
1135
1077
|
|
|
1136
|
-
//
|
|
1078
|
+
// call-seq:
|
|
1137
1079
|
// m.label(tokens, options = {}) # => array of labelled tokens
|
|
1138
1080
|
// m.label(filename, options = {}) # => array of labelled tokens
|
|
1139
1081
|
//
|
|
1140
1082
|
static VALUE model_label(VALUE self, VALUE data) {
|
|
1141
|
-
VALUE result;
|
|
1083
|
+
VALUE result = (VALUE)0;
|
|
1142
1084
|
|
|
1143
1085
|
switch (TYPE(data)) {
|
|
1144
1086
|
case T_STRING:
|
|
@@ -1148,7 +1090,7 @@ static VALUE model_label(VALUE self, VALUE data) {
|
|
|
1148
1090
|
result = decode_sequence_array(self, data);
|
|
1149
1091
|
break;
|
|
1150
1092
|
default:
|
|
1151
|
-
|
|
1093
|
+
fatal("failed to label data: invalid data (expected type String or Array)");
|
|
1152
1094
|
}
|
|
1153
1095
|
|
|
1154
1096
|
return result;
|
|
@@ -1157,125 +1099,33 @@ static VALUE model_label(VALUE self, VALUE data) {
|
|
|
1157
1099
|
static void Init_model() {
|
|
1158
1100
|
cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
|
|
1159
1101
|
rb_define_alloc_func(cModel, allocate_model);
|
|
1160
|
-
|
|
1161
|
-
rb_define_method(cModel, "initialize", initialize_model, -1);
|
|
1162
|
-
|
|
1163
1102
|
rb_define_attr(cModel, "options", 1, 0);
|
|
1164
1103
|
|
|
1165
|
-
|
|
1104
|
+
rb_define_method(cModel, "initialize", initialize_model, -1);
|
|
1166
1105
|
rb_define_method(cModel, "nlbl", model_nlbl, 0);
|
|
1167
1106
|
rb_define_method(cModel, "labels", model_labels, 0);
|
|
1168
|
-
|
|
1169
1107
|
rb_define_method(cModel, "nobs", model_nobs, 0);
|
|
1170
1108
|
rb_define_alias(cModel, "observations", "nobs");
|
|
1171
|
-
|
|
1172
1109
|
rb_define_method(cModel, "nftr", model_nftr, 0);
|
|
1173
1110
|
rb_define_alias(cModel, "features", "nftr");
|
|
1174
|
-
|
|
1175
|
-
rb_define_method(cModel, "total", model_total, 0);
|
|
1176
|
-
|
|
1177
1111
|
rb_define_method(cModel, "sync", model_sync, 0);
|
|
1178
1112
|
rb_define_method(cModel, "compact", model_compact, 0);
|
|
1179
1113
|
rb_define_method(cModel, "save", model_save, -1);
|
|
1180
1114
|
rb_define_method(cModel, "load", model_load, -1);
|
|
1181
|
-
|
|
1182
|
-
rb_define_method(cModel, "train", model_train, 1);
|
|
1115
|
+
rb_define_method(cModel, "train", model_train, 2);
|
|
1183
1116
|
rb_define_method(cModel, "label", model_label, 1);
|
|
1184
1117
|
}
|
|
1185
1118
|
|
|
1186
|
-
/* --- Top-Level Utility Methods --- */
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
|
|
1190
|
-
if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
|
|
1191
|
-
rb_raise(cNativeError, "argument must be a native options instance");
|
|
1192
|
-
}
|
|
1193
|
-
|
|
1194
|
-
opt_t *options = get_options(rb_options);
|
|
1195
|
-
|
|
1196
|
-
if (options->mode != 1) {
|
|
1197
|
-
rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
|
|
1198
|
-
}
|
|
1199
|
-
|
|
1200
|
-
mdl_t *model = mdl_new(rdr_new(options->maxent));
|
|
1201
|
-
model->opt = options;
|
|
1202
|
-
|
|
1203
|
-
dolabel(model);
|
|
1204
|
-
|
|
1205
|
-
mdl_free(model);
|
|
1206
|
-
|
|
1207
|
-
return Qnil;
|
|
1208
|
-
}
|
|
1209
|
-
|
|
1210
|
-
#if defined EXTRA
|
|
1211
|
-
static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
|
|
1212
|
-
if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
|
|
1213
|
-
rb_raise(cNativeError, "argument must be a native options instance");
|
|
1214
|
-
}
|
|
1215
|
-
|
|
1216
|
-
opt_t *options = get_options(rb_options);
|
|
1217
|
-
|
|
1218
|
-
if (options->mode != 2) {
|
|
1219
|
-
rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
|
|
1220
|
-
}
|
|
1221
|
-
|
|
1222
|
-
mdl_t *model = mdl_new(rdr_new(options->maxent));
|
|
1223
|
-
model->opt = options;
|
|
1224
|
-
|
|
1225
|
-
dodump(model);
|
|
1226
|
-
|
|
1227
|
-
mdl_free(model);
|
|
1228
|
-
|
|
1229
|
-
return Qnil;
|
|
1230
|
-
}
|
|
1231
|
-
|
|
1232
|
-
// This function is a proxy for Wapiti's main entry point.
|
|
1233
|
-
static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
|
|
1234
|
-
int result = -1, argc = 0;
|
|
1235
|
-
char **ap, *argv[18], *input, *tmp;
|
|
1236
|
-
|
|
1237
|
-
Check_Type(arguments, T_STRING);
|
|
1238
|
-
tmp = StringValueCStr(arguments);
|
|
1239
|
-
|
|
1240
|
-
// allocate space for argument vector
|
|
1241
|
-
input = (char*)malloc(strlen(tmp) + 8);
|
|
1242
|
-
|
|
1243
|
-
// prepend command name
|
|
1244
|
-
strncpy(input, "wapiti ", 8);
|
|
1245
|
-
strncat(input, tmp, strlen(input) - 8);
|
|
1246
|
-
|
|
1247
|
-
// remember allocation pointer
|
|
1248
|
-
tmp = input;
|
|
1249
|
-
|
|
1250
|
-
// turn input string into argument vector (using
|
|
1251
|
-
// only the first seventeen tokens from input)
|
|
1252
|
-
for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
|
|
1253
|
-
if ((**ap != '\0') && (++ap >= &argv[18])) break;
|
|
1254
|
-
}
|
|
1255
|
-
|
|
1256
|
-
// call main entry point
|
|
1257
|
-
result = wapiti_main(argc, argv);
|
|
1258
|
-
|
|
1259
|
-
// free allocated memory
|
|
1260
|
-
free(tmp);
|
|
1261
|
-
|
|
1262
|
-
return INT2FIX(result);
|
|
1263
|
-
}
|
|
1264
|
-
#endif
|
|
1265
|
-
|
|
1266
1119
|
/* --- Wapiti Extension Entry Point --- */
|
|
1267
1120
|
|
|
1268
1121
|
void Init_native() {
|
|
1269
1122
|
mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
|
|
1270
1123
|
mNative = rb_define_module_under(mWapiti, "Native");
|
|
1271
1124
|
|
|
1125
|
+
cArgumentError = rb_const_get(rb_mKernel, rb_intern("ArgumentError"));
|
|
1272
1126
|
cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
|
|
1273
|
-
cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
|
|
1274
1127
|
cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
|
|
1275
1128
|
|
|
1276
|
-
rb_define_singleton_method(mNative, "label", label, 1);
|
|
1277
|
-
// rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
|
|
1278
|
-
|
|
1279
1129
|
rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
|
|
1280
1130
|
|
|
1281
1131
|
Init_options();
|