wapiti 0.1.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/HISTORY.md +8 -0
- data/LICENSE +1 -1
- data/README.md +39 -95
- data/ext/wapiti/bcd.c +1 -1
- data/ext/wapiti/extconf.rb +15 -1
- data/ext/wapiti/lbfgs.c +6 -6
- data/ext/wapiti/model.c +2 -3
- data/ext/wapiti/model.h +0 -7
- data/ext/wapiti/native.c +89 -239
- data/ext/wapiti/native.h +0 -5
- data/ext/wapiti/pattern.c +1 -1
- data/ext/wapiti/progress.c +19 -44
- data/ext/wapiti/progress.h +1 -4
- data/ext/wapiti/rprop.c +3 -4
- data/ext/wapiti/sgdl1.c +3 -3
- data/ext/wapiti/tools.c +36 -30
- data/ext/wapiti/tools.h +9 -4
- data/ext/wapiti/trainers.c +55 -0
- data/ext/wapiti/trainers.h +4 -1
- data/lib/wapiti.rb +4 -24
- data/lib/wapiti/dataset.rb +162 -0
- data/lib/wapiti/errors.rb +0 -4
- data/lib/wapiti/log.rb +29 -0
- data/lib/wapiti/model.rb +63 -40
- data/lib/wapiti/options.rb +66 -29
- data/lib/wapiti/sequence.rb +105 -0
- data/lib/wapiti/token.rb +74 -0
- data/lib/wapiti/version.rb +1 -1
- metadata +20 -80
- data/.autotest +0 -13
- data/.rspec +0 -3
- data/.simplecov +0 -3
- data/Gemfile +0 -29
- data/Rakefile +0 -63
- data/ext/wapiti/wapiti.c +0 -410
- data/spec/fixtures/ch.mod +0 -18550
- data/spec/fixtures/chpattern.txt +0 -52
- data/spec/fixtures/chtest.txt +0 -1973
- data/spec/fixtures/chtrain.txt +0 -19995
- data/spec/fixtures/nppattern.txt +0 -52
- data/spec/fixtures/nptest.txt +0 -1973
- data/spec/fixtures/nptrain.txt +0 -19995
- data/spec/fixtures/pattern.txt +0 -14
- data/spec/fixtures/test.txt +0 -60000
- data/spec/fixtures/train.txt +0 -1200
- data/spec/spec_helper.rb +0 -41
- data/spec/wapiti/model_spec.rb +0 -233
- data/spec/wapiti/native_spec.rb +0 -11
- data/spec/wapiti/options_spec.rb +0 -185
- data/spec/wapiti/utility_spec.rb +0 -22
- data/wapiti.gemspec +0 -49
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f0357489368d9bbe57ea34e2bee7be8f6ae7542875971acdf48f24b038aebb32
|
4
|
+
data.tar.gz: 7267f065c30e82f581942cae7789bc4d71999f7a418c1549b5cd41a8ed4a1b80
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 194656b3d90ed6fedf32a2e1a8dc34cec4eccaeec71098c2499054697c66a3209d9436cfcf66e1b4b43e19e2f2b50f4ad5e2682df9d21f126e812940591de226
|
7
|
+
data.tar.gz: 9d365d09193a7c1657b1583331f20d1f985ad0f8c351947c791dd407a163463f4eb21ee6ac74545bd0f61ca6f88ab0ed669aaeefdaefdd03f2543cc22078a4f5
|
data/HISTORY.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
1.0.0 / 2017-12-xx
|
2
|
+
==================
|
3
|
+
* Added support for Windows platform
|
4
|
+
* Open files only if names are untainted
|
5
|
+
* Finalized API
|
6
|
+
* Fixed error reporting
|
7
|
+
* Removed progress logging
|
8
|
+
|
1
9
|
0.1.1 / 2014-02-27
|
2
10
|
==================
|
3
11
|
* Updated train routine
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -5,18 +5,14 @@ The Wapiti-Ruby gem provides a wicked fast linear-chain CRF
|
|
5
5
|
API for sequence segmentation and labelling; it is based on the
|
6
6
|
codebase of [wapiti](http://wapiti.limsi.fr/).
|
7
7
|
|
8
|
-
[![Build Status](https://
|
9
|
-
[![
|
8
|
+
[![Linux Build Status](https://travis-ci.org/inukshuk/wapiti-ruby.svg?branch=master)](https://travis-ci.org/inukshuk/wapiti-ruby)
|
9
|
+
[![Windows Build Status](https://ci.appveyor.com/api/projects/status/12rtxe2o8p55g1w6/branch/master?svg=true)](https://ci.appveyor.com/project/inukshuk/wapiti-ruby/branch/master)
|
10
|
+
[![Coverage Status](https://coveralls.io/repos/github/inukshuk/wapiti-ruby/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/wapiti-ruby?branch=master)
|
10
11
|
|
11
12
|
Requirements
|
12
13
|
------------
|
13
|
-
Wapiti is written in C and Ruby and requires a compiler with C99
|
14
|
-
|
15
|
-
all necessary packages through your distribution.
|
16
|
-
|
17
|
-
The Wapiti Ruby gem has been confirmed to work with MRI 2.x, 1.9.x, 1.8.7,
|
18
|
-
and Rubinius.
|
19
|
-
|
14
|
+
Wapiti is written in C and Ruby and requires a compiler with C99 support;
|
15
|
+
it has been confirmed to work on Linux, macOS, and Windows.
|
20
16
|
|
21
17
|
Quickstart
|
22
18
|
----------
|
@@ -29,49 +25,35 @@ Quickstart
|
|
29
25
|
|
30
26
|
Using a pattern and training data stored in a file:
|
31
27
|
|
32
|
-
model = Wapiti.train('train.txt', :
|
33
|
-
|
28
|
+
model = Wapiti.train('train.txt', pattern: 'pattern.txt')
|
29
|
+
#=> #<Wapiti::Model:0x0000010188f868>
|
34
30
|
model.labels
|
35
|
-
|
31
|
+
#=> ["B-ADJP", "B-ADVP", "B-CONJP" ...]
|
36
32
|
model.save('ch.mod')
|
37
|
-
|
33
|
+
#=> saves the model as 'ch.mod'
|
38
34
|
|
39
|
-
Alternatively, you can pass in the training data as
|
40
|
-
|
35
|
+
Alternatively, you can pass in the training data as a `Wapiti::Dataset`;
|
36
|
+
this class supports the default text format used by Wapiti as well as
|
37
|
+
additional formats (such as YAML or XML) and an API to make it easier
|
38
|
+
to manage data sets used for input and training.
|
41
39
|
|
42
|
-
data =
|
43
|
-
data << ['Confidence NN B-NP', 'in IN B-PP', 'the DT B-NP', 'pound NN I-NP', '. . O']
|
44
|
-
...
|
40
|
+
data = Wapiti::Dataset.open('chtrain.xml')
|
45
41
|
model = Wapiti.train(data, options)
|
46
42
|
|
47
|
-
You can consult the `Wapiti::Options` class for a list of
|
48
|
-
configuration options and algorithms
|
49
|
-
|
50
|
-
Wapiti::Options.attribute_names
|
51
|
-
=> [:algorithm, :check, :compact, :convergence_window, :development_data,
|
52
|
-
:jobsize, :label, :max_iterations, :maxent, :pattern, :posterior, :rho1,
|
53
|
-
:rho2, :score, :sparse, :stop_epsilon, :stop_window, :threads]
|
54
|
-
Wapiti::Options.algorithms
|
55
|
-
=> ["l-bfgs", "sgd-l1", "bcd", "rprop", "rprop+", "rprop-", "auto"]
|
43
|
+
You can consult `Wapiti::Options.attribute_names` for a list of
|
44
|
+
supported configuration options and `Wapiti::Options.algorithms` for
|
45
|
+
all supported algorithms:
|
56
46
|
|
57
47
|
Use `#valid?` or `#validate` (which returns error messages) to make sure
|
58
48
|
your configuration is supported by Wapiti.
|
59
49
|
|
60
|
-
You can pass options either as an options hash or by adding a block to the
|
61
|
-
method invocation:
|
62
|
-
|
63
|
-
model = Wapiti::Model.train(data) do |config|
|
64
|
-
config.pattern = 'pattern.txt'
|
65
|
-
threads = 4
|
66
|
-
end
|
67
|
-
|
68
50
|
Before saving your model you can use `compact` to reduce the model's size:
|
69
51
|
|
70
52
|
model.save 'm1.mod'
|
71
|
-
|
53
|
+
#=> m1.mod file size 1.8M
|
72
54
|
model.compact
|
73
55
|
model.save 'm2.mod'
|
74
|
-
|
56
|
+
#=> m2.mod file size 471K
|
75
57
|
|
76
58
|
|
77
59
|
### Loading existing Models
|
@@ -80,50 +62,33 @@ Before saving your model you can use `compact` to reduce the model's size:
|
|
80
62
|
|
81
63
|
### Labelling
|
82
64
|
|
83
|
-
By calling `#label` on a Model instance you can add labels to
|
84
|
-
data:
|
65
|
+
By calling `#label` on a Model instance you can add labels to a dataset:
|
85
66
|
|
86
67
|
model = Wapiti.load('m2.mod')
|
87
|
-
|
88
|
-
|
68
|
+
input = Wapiti::Dataset.load('chtest.txt')
|
69
|
+
output = model.label(input, tagged: true)
|
89
70
|
|
90
|
-
The result is
|
91
|
-
|
92
|
-
|
71
|
+
The result is a new `Wapiti::Dataset` with the predicted labels for each
|
72
|
+
token. If your input data was already tagged, you can compare the input
|
73
|
+
and output datasets to evaluate your results:
|
93
74
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']]
|
98
|
-
=> [[["Confidence NN", "B-NP"], ["in IN", "B-PP"], ["the DT", "B-NP"],
|
99
|
-
["pound NN", "I-NP"], [". .", "O"]]]
|
75
|
+
output - input
|
76
|
+
# => new dataset of output sequences which are tagged differently than expected
|
100
77
|
|
101
78
|
If you pass a block to `#label` Wapiti will yield each token and the
|
102
79
|
corresponding label:
|
103
80
|
|
104
|
-
model.label
|
81
|
+
model.label input do |token, label|
|
105
82
|
[token.downcase, label.downcase]
|
106
83
|
end
|
107
|
-
=> [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
|
108
|
-
["pound nn", "i-np"], [". .", "o"]]]
|
109
84
|
|
110
85
|
Note that if you set the *:score* option (either in the Model's `#options` or
|
111
86
|
when calling `#label`), the score for each label will be appended to
|
112
87
|
each token/label tuple as a floating point number or passed as a third
|
113
88
|
argument to the passed-in block.
|
114
89
|
|
115
|
-
model.label
|
116
|
-
=>
|
117
|
-
|
118
|
-
Similarly, if you set the *:nbest* option to a value greater than one, Wapiti
|
119
|
-
will append more label and, optionally, score values to each tuple.
|
120
|
-
|
121
|
-
model.label [['Confidence NN']], :score => true, :nbest => 3, :skip_tokens => true
|
122
|
-
=> [[["B-NP", 4.642034838737357, "B-VP", 1.7040256847206927, "B-ADJP", 0.7636429298060177]]]
|
123
|
-
|
124
|
-
Note how we also suppressed the output of the token string using the
|
125
|
-
*:skip_tokens* option.
|
126
|
-
|
90
|
+
model.label input, score: true
|
91
|
+
# => Dataset where each token will include a score
|
127
92
|
|
128
93
|
### Statistics
|
129
94
|
|
@@ -131,41 +96,20 @@ By setting the *:check* option you can tell Wapiti to keep statistics during
|
|
131
96
|
the labelling phase (for the statistics to be meaningful you obviously need
|
132
97
|
to provide input data that is already labelled). Wapiti does not reset the
|
133
98
|
counters during consecutive calls to `#label` to allow you to collect
|
134
|
-
accumulative
|
135
|
-
`#
|
99
|
+
cumulative stats; however, you can reset the counters at any time by calling
|
100
|
+
`#reset_counters`.
|
136
101
|
|
137
102
|
After calling `#label` with the *:check* option set and appropriately labelled
|
138
103
|
input, you can access the statistics via `#statistics` (the individual values
|
139
104
|
are also available through the associated attribute readers).
|
140
105
|
|
141
|
-
model.label
|
142
|
-
|
143
|
-
:
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
Citing
|
148
|
-
------
|
149
|
-
If you're using Wapiti-Ruby for research purposes, please use the following
|
150
|
-
citation of the original wapiti package:
|
151
|
-
|
152
|
-
@article{lavergne2010practical,
|
153
|
-
author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, Fran\c{c}ois},
|
154
|
-
title = {Practical Very Large Scale {CRFs}},
|
155
|
-
booktitle = {Proceedings the 48th Annual Meeting of the Association for
|
156
|
-
Computational Linguistics (ACL)},
|
157
|
-
month = {July},
|
158
|
-
year = {2010},
|
159
|
-
location = {Uppsala, Sweden},
|
160
|
-
publisher = {Association for Computational Linguistics},
|
161
|
-
pages = {504--513},
|
162
|
-
url = {http://www.aclweb.org/anthology/P10-1052}
|
163
|
-
}
|
164
|
-
|
165
|
-
If you're profiting from any of the Wapiti-Ruby specific features you are
|
166
|
-
welcome to also refer back to the
|
167
|
-
[Wapiti-Ruby homepage](http://github.com/inukshuk/wapiti-ruby/).
|
106
|
+
model.label input, check: true
|
107
|
+
model.stats
|
108
|
+
=> {:token=>{:count=>1896, :errors=>137, :rate=>7.225738396624472},
|
109
|
+
:sequence=>{:count=>77, :errors=>50, :rate=>64.93506493506494}}
|
168
110
|
|
111
|
+
For convenience, you can also use the `#check` method, which
|
112
|
+
will reset the counters, check your input, and return the stats.
|
169
113
|
|
170
114
|
Contributing
|
171
115
|
------------
|
@@ -183,7 +127,7 @@ example, fix the bug and submit a pull request.
|
|
183
127
|
|
184
128
|
License
|
185
129
|
-------
|
186
|
-
Copyright 2011-
|
130
|
+
Copyright 2011-2018 Sylvester Keil. All rights reserved.
|
187
131
|
|
188
132
|
Copyright 2009-2013 CNRS. All rights reserved.
|
189
133
|
|
data/ext/wapiti/bcd.c
CHANGED
data/ext/wapiti/extconf.rb
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
require 'mkmf'
|
2
|
+
require 'rbconfig'
|
2
3
|
|
3
|
-
|
4
|
+
cflags = %w{
|
5
|
+
-std=c99
|
6
|
+
-W
|
7
|
+
-Wall
|
8
|
+
-Wno-declaration-after-statement
|
9
|
+
-O3
|
10
|
+
}
|
11
|
+
|
12
|
+
case RbConfig::CONFIG['host_os']
|
13
|
+
when /^linux/i
|
14
|
+
cflags[0] = '-std=gnu99'
|
15
|
+
end
|
16
|
+
|
17
|
+
$CFLAGS << ' ' << cflags.join(' ')
|
4
18
|
|
5
19
|
have_library('pthread')
|
6
20
|
have_library('m')
|
data/ext/wapiti/lbfgs.c
CHANGED
@@ -104,12 +104,12 @@ void trn_lbfgs(mdl_t *mdl) {
|
|
104
104
|
uint64_t f;
|
105
105
|
if (fscanf(file, "%"PRIu64, &f) != 1)
|
106
106
|
fatal("1 %s", err);
|
107
|
-
if (fscanf(file, "%
|
107
|
+
if (fscanf(file, "%le %le", &xp[f], &gp[f]) != 2)
|
108
108
|
fatal("2 %s", err);
|
109
109
|
for (uint32_t m = 0; m < M; m++) {
|
110
|
-
if (fscanf(file, "%
|
110
|
+
if (fscanf(file, "%le", &s[m][f]) != 1)
|
111
111
|
fatal("3 %s", err);
|
112
|
-
if (fscanf(file, "%
|
112
|
+
if (fscanf(file, "%le", &y[m][f]) != 1)
|
113
113
|
fatal("4 %s", err);
|
114
114
|
}
|
115
115
|
}
|
@@ -271,7 +271,7 @@ void trn_lbfgs(mdl_t *mdl) {
|
|
271
271
|
memcpy(x, xp, sizeof(double) * F);
|
272
272
|
break;
|
273
273
|
}
|
274
|
-
if (uit_progress(mdl
|
274
|
+
if (uit_progress(mdl) == false)
|
275
275
|
break;
|
276
276
|
// 3rd step: we update the history used for approximating the
|
277
277
|
// inverse of the diagonal of the hessian
|
@@ -314,9 +314,9 @@ void trn_lbfgs(mdl_t *mdl) {
|
|
314
314
|
fprintf(file, "#state#0#%"PRIu32"#%"PRIu64"\n", M, F);
|
315
315
|
for (uint64_t f = 0; f < F; f++) {
|
316
316
|
fprintf(file, "%"PRIu64, f);
|
317
|
-
fprintf(file, " %
|
317
|
+
fprintf(file, " %le %le", xp[f], gp[f]);
|
318
318
|
for (uint32_t m = 0; m < M; m++)
|
319
|
-
fprintf(file, " %
|
319
|
+
fprintf(file, " %le %le", s[m][f], y[m][f]);
|
320
320
|
fprintf(file, "\n");
|
321
321
|
}
|
322
322
|
fclose(file);
|
data/ext/wapiti/model.c
CHANGED
@@ -74,7 +74,6 @@ mdl_t *mdl_new(rdr_t *rdr) {
|
|
74
74
|
mdl->train = mdl->devel = NULL;
|
75
75
|
mdl->reader = rdr;
|
76
76
|
mdl->werr = NULL;
|
77
|
-
mdl->total = 0.0;
|
78
77
|
return mdl;
|
79
78
|
}
|
80
79
|
|
@@ -272,7 +271,7 @@ void mdl_save(mdl_t *mdl, FILE *file) {
|
|
272
271
|
rdr_save(mdl->reader, file);
|
273
272
|
for (uint64_t f = 0; f < mdl->nftr; f++)
|
274
273
|
if (mdl->theta[f] != 0.0)
|
275
|
-
fprintf(file, "%"PRIu64"=%
|
274
|
+
fprintf(file, "%"PRIu64"=%le\n", f, mdl->theta[f]);
|
276
275
|
}
|
277
276
|
|
278
277
|
/* mdl_load:
|
@@ -298,7 +297,7 @@ void mdl_load(mdl_t *mdl, FILE *file) {
|
|
298
297
|
for (uint64_t i = 0; i < nact; i++) {
|
299
298
|
uint64_t f;
|
300
299
|
double v;
|
301
|
-
if (fscanf(file, "%"SCNu64"=%
|
300
|
+
if (fscanf(file, "%"SCNu64"=%le\n", &f, &v) != 2)
|
302
301
|
fatal(err);
|
303
302
|
mdl->theta[f] = v;
|
304
303
|
}
|
data/ext/wapiti/model.h
CHANGED
@@ -30,15 +30,12 @@
|
|
30
30
|
|
31
31
|
#include <stddef.h>
|
32
32
|
#include <stdint.h>
|
33
|
-
#include <sys/time.h>
|
34
33
|
|
35
34
|
#include "options.h"
|
36
35
|
#include "sequence.h"
|
37
36
|
#include "reader.h"
|
38
37
|
#include "wapiti.h"
|
39
38
|
|
40
|
-
typedef struct timeval tms_t;
|
41
|
-
|
42
39
|
/* mdl_t:
|
43
40
|
* Represents a linear-chain CRF model. The model contains both unigram and
|
44
41
|
* bigram features. It is characterized by <nlbl> the number of labels, <nobs>
|
@@ -86,10 +83,6 @@ struct mdl_s {
|
|
86
83
|
double *werr; // Window of error rate of last iters
|
87
84
|
uint32_t wcnt; // Number of iters in the window
|
88
85
|
uint32_t wpos; // Position for the next iter
|
89
|
-
|
90
|
-
// Timing
|
91
|
-
tms_t timer; // start time of last iter
|
92
|
-
double total; // total training time
|
93
86
|
};
|
94
87
|
|
95
88
|
mdl_t *mdl_new(rdr_t *rdr);
|
data/ext/wapiti/native.c
CHANGED
@@ -10,43 +10,16 @@
|
|
10
10
|
#include "quark.h"
|
11
11
|
#include "tools.h"
|
12
12
|
#include "wapiti.h"
|
13
|
-
|
14
13
|
#include "native.h"
|
15
14
|
|
16
15
|
VALUE mWapiti;
|
17
16
|
VALUE mNative;
|
18
|
-
|
19
17
|
VALUE cOptions;
|
20
18
|
VALUE cModel;
|
21
|
-
|
19
|
+
VALUE cArgumentError;
|
22
20
|
VALUE cNativeError;
|
23
|
-
VALUE cConfigurationError;
|
24
21
|
VALUE cLogger;
|
25
22
|
|
26
|
-
|
27
|
-
/* --- Forward declarations --- */
|
28
|
-
|
29
|
-
int wapiti_main(int argc, char *argv[argc]);
|
30
|
-
|
31
|
-
void dolabel(mdl_t *mdl);
|
32
|
-
|
33
|
-
|
34
|
-
/* --- Utilities --- */
|
35
|
-
|
36
|
-
static const struct {
|
37
|
-
const char *name;
|
38
|
-
void (* train)(mdl_t *mdl);
|
39
|
-
} trn_lst[] = {
|
40
|
-
{"l-bfgs", trn_lbfgs},
|
41
|
-
{"sgd-l1", trn_sgdl1},
|
42
|
-
{"bcd", trn_bcd },
|
43
|
-
{"rprop", trn_rprop},
|
44
|
-
{"rprop+", trn_rprop},
|
45
|
-
{"rprop-", trn_rprop}
|
46
|
-
};
|
47
|
-
static const uint32_t trn_cnt = sizeof(trn_lst) / sizeof(trn_lst[0]);
|
48
|
-
|
49
|
-
|
50
23
|
/* --- Options Class --- */
|
51
24
|
|
52
25
|
// Auxiliary Methods
|
@@ -68,6 +41,14 @@ static void copy_string(char **dst, VALUE rb_string) {
|
|
68
41
|
memcpy(*dst, StringValuePtr(rb_string), RSTRING_LEN(rb_string) + 1);
|
69
42
|
}
|
70
43
|
|
44
|
+
// Returns a heap-allocated, NUL-terminated copy of `string`. This is used
// during initialization to move the default option strings to the heap, so
// that every option string can be released with free() in
// deallocate_options, regardless of whether it is a default or was set
// through copy_string.
//
// Returns NULL if `string` is NULL or if the allocation fails. The caller
// owns the returned buffer.
static char *to_heap(const char *string) {
  if (string == NULL) {
    return NULL;
  }

  // Reserve one extra byte for the terminating NUL: the copy is later read
  // as a C string, so it must be terminated.
  size_t size = strlen(string) + 1;
  char *ptr = malloc(size);

  if (ptr != NULL) {
    // `size` includes the terminator, so this copies it as well.
    memcpy(ptr, string, size);
  }

  return ptr;
}
|
51
|
+
|
71
52
|
|
72
53
|
// Constructor / Destructor
|
73
54
|
|
@@ -76,11 +57,11 @@ static void mark_options(opt_t* options __attribute__((__unused__))) {
|
|
76
57
|
}
|
77
58
|
|
78
59
|
static void deallocate_options(opt_t* options) {
|
79
|
-
|
80
60
|
// free string options
|
81
61
|
if (options->input) { free(options->input); }
|
82
62
|
if (options->output) { free(options->output); }
|
83
63
|
if (options->algo) { free((void*)options->algo); }
|
64
|
+
if (options->type) { free((void*)options->type); }
|
84
65
|
if (options->devel) { free(options->devel); }
|
85
66
|
if (options->pattern) { free((void*)options->pattern); }
|
86
67
|
|
@@ -101,21 +82,20 @@ static VALUE initialize_options(int argc, VALUE *argv, VALUE self) {
|
|
101
82
|
options->maxiter = INT_MAX;
|
102
83
|
}
|
103
84
|
|
104
|
-
//
|
105
|
-
// are on the heap
|
106
|
-
|
107
|
-
|
108
|
-
options->algo = tmp;
|
85
|
+
// Copy default algorithm and type name to the heap
|
86
|
+
// so that all options strings are on the heap.
|
87
|
+
options->algo = to_heap(options->algo);
|
88
|
+
options->type = to_heap(options->type);
|
109
89
|
|
110
90
|
if (argc > 1) {
|
111
|
-
rb_raise(
|
91
|
+
rb_raise(cArgumentError,
|
112
92
|
"wrong number of arguments (%d for 0..1)", argc);
|
113
93
|
}
|
114
94
|
|
115
95
|
// set defaults
|
116
96
|
if (argc) {
|
117
97
|
Check_Type(argv[0], T_HASH);
|
118
|
-
(void)rb_funcall(self, rb_intern("update"), 1, argv[0]);
|
98
|
+
(void)rb_funcall(self, rb_intern("update!"), 1, argv[0]);
|
119
99
|
}
|
120
100
|
|
121
101
|
// yield self if block_given?
|
@@ -431,7 +411,6 @@ static VALUE options_model(VALUE self) {
|
|
431
411
|
static VALUE options_set_model(VALUE self, VALUE rb_string) {
|
432
412
|
opt_t *options = get_options(self);
|
433
413
|
copy_string(&(options->model), rb_string);
|
434
|
-
|
435
414
|
return rb_string;
|
436
415
|
}
|
437
416
|
|
@@ -443,19 +422,17 @@ static VALUE options_algorithm(VALUE self) {
|
|
443
422
|
static VALUE options_set_algorithm(VALUE self, VALUE rb_string) {
|
444
423
|
opt_t *options = get_options(self);
|
445
424
|
copy_string((char**)&(options->algo), rb_string);
|
446
|
-
|
447
425
|
return rb_string;
|
448
426
|
}
|
449
427
|
|
450
|
-
static VALUE
|
451
|
-
char *
|
452
|
-
return rb_str_new2(
|
428
|
+
// Reader for the `type` option. Returns the option's current value as a new
// Ruby string, or an empty string when no type is set.
static VALUE options_type(VALUE self) {
  opt_t *options = get_options(self);

  if (options->type == NULL) {
    return rb_str_new2("");
  }

  return rb_str_new2(options->type);
}
|
454
432
|
|
455
|
-
static VALUE
|
433
|
+
// Writer for the `type` option. Copies the contents of `rb_string` into the
// options struct via copy_string and returns `rb_string` unchanged.
static VALUE options_set_type(VALUE self, VALUE rb_string) {
  // The field is declared const; the cast hands copy_string a writable slot.
  char **slot = (char**)&(get_options(self)->type);

  copy_string(slot, rb_string);
  return rb_string;
}
|
461
438
|
|
@@ -565,11 +542,8 @@ void Init_options() {
|
|
565
542
|
rb_define_alias(cOptions, "algo", "algorithm");
|
566
543
|
rb_define_alias(cOptions, "algo=", "algorithm=");
|
567
544
|
|
568
|
-
rb_define_method(cOptions, "
|
569
|
-
rb_define_method(cOptions, "
|
570
|
-
|
571
|
-
rb_define_alias(cOptions, "devel", "development_data");
|
572
|
-
rb_define_alias(cOptions, "devel=", "development_data=");
|
545
|
+
rb_define_method(cOptions, "type", options_type, 0);
|
546
|
+
rb_define_method(cOptions, "type=", options_set_type, 1);
|
573
547
|
|
574
548
|
rb_define_method(cOptions, "clip", options_clip, 0);
|
575
549
|
rb_define_method(cOptions, "clip=", options_set_clip, 1);
|
@@ -640,7 +614,7 @@ static VALUE allocate_model(VALUE self) {
|
|
640
614
|
|
641
615
|
static VALUE model_set_options(VALUE self, VALUE rb_options) {
|
642
616
|
if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
|
643
|
-
rb_raise(
|
617
|
+
rb_raise(cArgumentError, "argument must be a Wapiti::Options instance");
|
644
618
|
}
|
645
619
|
|
646
620
|
mdl_t *model = get_model(self);
|
@@ -661,22 +635,20 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
661
635
|
VALUE options;
|
662
636
|
|
663
637
|
if (argc > 1) {
|
664
|
-
rb_raise(
|
638
|
+
rb_raise(cArgumentError,
|
665
639
|
"wrong number of arguments (%d for 0..1)", argc);
|
666
640
|
}
|
667
641
|
|
668
642
|
if (argc) {
|
669
643
|
if (TYPE(argv[0]) == T_HASH) {
|
670
644
|
options = rb_funcall(cOptions, rb_intern("new"), 1, argv[0]);
|
671
|
-
}
|
672
|
-
else {
|
645
|
+
} else {
|
673
646
|
if (strncmp("Wapiti::Options", rb_obj_classname(argv[0]), 15) != 0) {
|
674
|
-
rb_raise(
|
647
|
+
rb_raise(cArgumentError, "argument must be a hash or an options instance");
|
675
648
|
}
|
676
649
|
options = argv[0];
|
677
650
|
}
|
678
|
-
}
|
679
|
-
else {
|
651
|
+
} else {
|
680
652
|
options = rb_funcall(cOptions, rb_intern("new"), 0);
|
681
653
|
}
|
682
654
|
|
@@ -693,7 +665,7 @@ static VALUE initialize_model(int argc, VALUE *argv, VALUE self) {
|
|
693
665
|
}
|
694
666
|
|
695
667
|
// initialize counters
|
696
|
-
rb_funcall(self, rb_intern("
|
668
|
+
rb_funcall(self, rb_intern("reset_counters"), 0);
|
697
669
|
|
698
670
|
return self;
|
699
671
|
}
|
@@ -713,10 +685,6 @@ static VALUE model_nftr(VALUE self) {
|
|
713
685
|
return INT2FIX(get_model(self)->nftr);
|
714
686
|
}
|
715
687
|
|
716
|
-
static VALUE model_total(VALUE self) {
|
717
|
-
return rb_float_new(get_model(self)->total);
|
718
|
-
}
|
719
|
-
|
720
688
|
|
721
689
|
// Instance methods
|
722
690
|
|
@@ -738,7 +706,7 @@ static VALUE model_compact(VALUE self) {
|
|
738
706
|
// otherwise uses the passed-in argument as the Model's path.
|
739
707
|
static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
740
708
|
if (argc > 1) {
|
741
|
-
rb_raise(
|
709
|
+
rb_raise(cArgumentError,
|
742
710
|
"wrong number of arguments (%d for 0..1)", argc);
|
743
711
|
}
|
744
712
|
|
@@ -751,17 +719,13 @@ static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
|
751
719
|
}
|
752
720
|
|
753
721
|
// open the output file
|
754
|
-
FILE *file = 0;
|
755
722
|
VALUE path = rb_ivar_get(self, rb_intern("@path"));
|
756
723
|
|
757
724
|
if (NIL_P(path)) {
|
758
|
-
|
759
|
-
}
|
760
|
-
|
761
|
-
if (!(file = fopen(StringValueCStr(path), "w"))) {
|
762
|
-
rb_raise(cNativeError, "failed to save model: failed to open model file");
|
725
|
+
fatal("failed to save model: no path given");
|
763
726
|
}
|
764
727
|
|
728
|
+
FILE *file = ufopen(path, "w");
|
765
729
|
mdl_save(model, file);
|
766
730
|
fclose(file);
|
767
731
|
|
@@ -770,7 +734,7 @@ static VALUE model_save(int argc, VALUE *argv, VALUE self) {
|
|
770
734
|
|
771
735
|
static VALUE model_load(int argc, VALUE *argv, VALUE self) {
|
772
736
|
if (argc > 1) {
|
773
|
-
rb_raise(
|
737
|
+
rb_raise(cArgumentError,
|
774
738
|
"wrong number of arguments (%d for 0..1)", argc);
|
775
739
|
}
|
776
740
|
|
@@ -783,17 +747,13 @@ static VALUE model_load(int argc, VALUE *argv, VALUE self) {
|
|
783
747
|
}
|
784
748
|
|
785
749
|
// open the model file
|
786
|
-
FILE *file = 0;
|
787
750
|
VALUE path = rb_ivar_get(self, rb_intern("@path"));
|
788
751
|
|
789
752
|
if (NIL_P(path)) {
|
790
|
-
|
791
|
-
}
|
792
|
-
|
793
|
-
if (!(file = fopen(StringValueCStr(path), "r"))) {
|
794
|
-
rb_raise(cNativeError, "failed to load model: failed to open model file");
|
753
|
+
fatal("failed to load model: no path given");
|
795
754
|
}
|
796
755
|
|
756
|
+
FILE *file = ufopen(path, "r");
|
797
757
|
mdl_load(model, file);
|
798
758
|
fclose(file);
|
799
759
|
|
@@ -849,31 +809,44 @@ static dat_t *to_dat(rdr_t *reader, VALUE data, bool labelled) {
|
|
849
809
|
return dat;
|
850
810
|
}
|
851
811
|
|
812
|
+
// Loads a dataset from a Ruby value using the given reader.
//
// - String: the value is passed to ufopen as the file to open for reading;
//   the file is parsed with rdr_readdat and closed again.
// - Array: the value is converted in memory via to_dat.
// - anything else: reported through fatal().
//
// `labelled` is forwarded unchanged to the underlying reader routine.
// Returns the loaded dataset; if fatal() returns for an unsupported type,
// a null pointer is returned.
static dat_t *ld_dat(rdr_t *reader, VALUE data, bool labelled) {
  const int kind = TYPE(data);

  if (kind == T_STRING) {
    FILE *input = ufopen(data, "r");
    dat_t *loaded = rdr_readdat(reader, input, labelled);
    fclose(input);
    return loaded;
  }

  if (kind == T_ARRAY) {
    return to_dat(reader, data, labelled);
  }

  fatal("invalid data type (expected instance of String or Array)");
  return NULL;
}
|
833
|
+
|
866
834
|
|
835
|
+
static VALUE model_train(VALUE self, VALUE train, VALUE devel) {
|
867
836
|
FILE *file;
|
837
|
+
mdl_t *model = get_model(self);
|
838
|
+
trn_t trn = trn_get(model->opt->algo);
|
839
|
+
model->type = typ_get(model->opt->type);
|
868
840
|
|
869
841
|
// Load the pattern file. This will unlock the database if previously
|
870
842
|
// locked by loading a model.
|
871
843
|
if (model->opt->pattern) {
|
844
|
+
info("load patterns");
|
872
845
|
file = fopen(model->opt->pattern, "r");
|
873
846
|
|
874
847
|
if (!file) {
|
875
|
-
|
876
|
-
|
848
|
+
pfatal("failed to train model: failed to load pattern file '%s'",
|
849
|
+
model->opt->pattern);
|
877
850
|
}
|
878
851
|
|
879
852
|
rdr_loadpat(model->reader, file);
|
@@ -886,58 +859,45 @@ static VALUE model_train(VALUE self, VALUE data) {
|
|
886
859
|
// Load the training data. When this is done we lock the quarks as we
|
887
860
|
// don't want to put in the model, informations present only in the
|
888
861
|
// development set.
|
889
|
-
|
890
|
-
switch (TYPE(data)) {
|
891
|
-
case T_STRING:
|
892
|
-
if (!(file = fopen(StringValuePtr(data), "r"))) {
|
893
|
-
rb_raise(cNativeError, "failed to train model: failed to open training data '%s", StringValuePtr(data));
|
894
|
-
}
|
895
|
-
|
896
|
-
model->train = rdr_readdat(model->reader, file, true);
|
897
|
-
fclose(file);
|
898
|
-
|
899
|
-
break;
|
900
|
-
case T_ARRAY:
|
901
|
-
model->train = to_dat(model->reader, data, true);
|
902
|
-
|
903
|
-
break;
|
904
|
-
default:
|
905
|
-
rb_raise(cNativeError, "failed to train model: invalid training data type (expected instance of String or Array)");
|
906
|
-
}
|
862
|
+
model->train = ld_dat(model->reader, train, true);
|
907
863
|
|
908
864
|
qrk_lock(model->reader->lbl, true);
|
909
865
|
qrk_lock(model->reader->obs, true);
|
910
866
|
|
911
867
|
if (!model->train || model->train->nseq == 0) {
|
912
|
-
|
868
|
+
fatal("failed to train model: no training data loaded");
|
913
869
|
}
|
914
870
|
|
915
871
|
// If present, load the development set in the model. If not specified,
|
916
872
|
// the training dataset will be used instead.
|
917
|
-
if (
|
918
|
-
|
919
|
-
rb_raise(cNativeError,
|
920
|
-
"failed to train model: cannot open development file '%s'", model->opt->devel);
|
921
|
-
}
|
922
|
-
|
923
|
-
model->devel = rdr_readdat(model->reader, file, true);
|
924
|
-
fclose(file);
|
873
|
+
if (TYPE(devel) != T_NIL) {
|
874
|
+
model->devel = ld_dat(model->reader, devel, true);
|
925
875
|
}
|
926
876
|
|
927
|
-
|
928
|
-
|
929
|
-
|
877
|
+
// Initialize the model. If a previous model was loaded, this will be
|
878
|
+
// just a resync, else the model structure will be created.
|
879
|
+
info((model->theta == NULL) ? "initialize model" : "re-sync model");
|
930
880
|
mdl_sync(model);
|
931
881
|
|
932
|
-
|
882
|
+
info("nb train: %"PRIu32"", model->train->nseq);
|
883
|
+
if (model->devel != NULL)
|
884
|
+
info("nb devel: %"PRIu32"", model->devel->nseq);
|
885
|
+
info("nb labels: %"PRIu32"", model->nlbl);
|
886
|
+
info("nb blocks: %"PRIu64"", model->nobs);
|
887
|
+
info("nb features: %"PRIu64"", model->nftr);
|
888
|
+
|
889
|
+
info("training model with %s", model->opt->algo);
|
933
890
|
uit_setup(model);
|
934
|
-
|
891
|
+
trn(model);
|
935
892
|
uit_cleanup(model);
|
936
893
|
|
937
|
-
// If requested compact the model.
|
938
894
|
if (model->opt->compact) {
|
939
|
-
|
895
|
+
const uint64_t O = model->nobs;
|
896
|
+
const uint64_t F = model->nftr;
|
897
|
+
info("compacting model");
|
940
898
|
mdl_compact(model);
|
899
|
+
info("%8"PRIu64" observations removed", O - model->nobs);
|
900
|
+
info("%8"PRIu64" features removed", F - model->nftr);
|
941
901
|
}
|
942
902
|
|
943
903
|
return self;
|
@@ -980,8 +940,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
980
940
|
|
981
941
|
if (N == 1) {
|
982
942
|
tag_viterbi(model, seq, out, scs, psc);
|
983
|
-
}
|
984
|
-
else {
|
943
|
+
} else {
|
985
944
|
tag_nbviterbi(model, seq, N, (void*)out, scs, (void*)psc);
|
986
945
|
}
|
987
946
|
|
@@ -993,16 +952,13 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
993
952
|
if (!model->opt->label) {
|
994
953
|
VALUE token = rb_str_new2(raw->lines[t]);
|
995
954
|
|
996
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
997
955
|
int enc = rb_enc_find_index("UTF-8");
|
998
956
|
rb_enc_associate_index(token, enc);
|
999
|
-
#endif
|
1000
957
|
|
1001
958
|
rb_ary_push(tokens, token);
|
1002
959
|
}
|
1003
960
|
|
1004
961
|
for (n = 0; n < N; ++n) {
|
1005
|
-
|
1006
962
|
uint64_t lbl = out[t * N + n];
|
1007
963
|
rb_ary_push(tokens, rb_str_new2(qrk_id2str(lbls, lbl)));
|
1008
964
|
|
@@ -1010,7 +966,6 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
1010
966
|
if (model->opt->outsc) {
|
1011
967
|
rb_ary_push(tokens, rb_float_new(psc[t * N + n]));
|
1012
968
|
}
|
1013
|
-
|
1014
969
|
}
|
1015
970
|
|
1016
971
|
// yield token/label pair to block if given
|
@@ -1020,9 +975,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
1020
975
|
|
1021
976
|
rb_ary_push(sequence, tokens);
|
1022
977
|
|
1023
|
-
|
1024
978
|
// TODO output sequence score: scs[n] (float)
|
1025
|
-
|
1026
979
|
}
|
1027
980
|
|
1028
981
|
// Statistics
|
@@ -1036,8 +989,7 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
1036
989
|
if (seq->pos[t].lbl != out[t * N]) {
|
1037
990
|
terr++;
|
1038
991
|
err = 1;
|
1039
|
-
}
|
1040
|
-
else {
|
992
|
+
} else {
|
1041
993
|
stat[2][out[t * N]]++;
|
1042
994
|
}
|
1043
995
|
}
|
@@ -1053,10 +1005,8 @@ static VALUE decode_sequence(VALUE self, mdl_t *model, raw_t *raw) {
|
|
1053
1005
|
|
1054
1006
|
serr = FIX2INT(rb_ivar_get(self, rb_intern("@sequence_errors")));
|
1055
1007
|
rb_ivar_set(self, rb_intern("@sequence_errors"), INT2FIX(serr + err));
|
1056
|
-
|
1057
1008
|
}
|
1058
1009
|
|
1059
|
-
|
1060
1010
|
// Cleanup memory used for this sequence
|
1061
1011
|
xfree(scs);
|
1062
1012
|
xfree(psc);
|
@@ -1090,7 +1040,6 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
|
1090
1040
|
for (j = 0; j < k; ++j) {
|
1091
1041
|
VALUE line = rb_ary_entry(sequence, j);
|
1092
1042
|
Check_Type(line, T_STRING);
|
1093
|
-
|
1094
1043
|
raw->lines[j] = StringValueCStr(line);
|
1095
1044
|
}
|
1096
1045
|
|
@@ -1103,13 +1052,7 @@ static VALUE decode_sequence_array(VALUE self, VALUE array) {
|
|
1103
1052
|
}
|
1104
1053
|
|
1105
1054
|
static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
1106
|
-
|
1107
|
-
FILE *file;
|
1108
|
-
|
1109
|
-
if (!(file = fopen(StringValueCStr(path), "r"))) {
|
1110
|
-
rb_raise(cNativeError, "failed to label data: could not open file '%s'", StringValueCStr(path));
|
1111
|
-
}
|
1112
|
-
|
1055
|
+
FILE *file = ufopen(path, "r");
|
1113
1056
|
mdl_t *model = get_model(self);
|
1114
1057
|
raw_t *raw;
|
1115
1058
|
|
@@ -1119,7 +1062,6 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1119
1062
|
// to take care of not discarding the raw input as we want to send it
|
1120
1063
|
// back to the output with the additional predicted labels.
|
1121
1064
|
while (!feof(file)) {
|
1122
|
-
|
1123
1065
|
// So, first read an input sequence keeping the raw_t object
|
1124
1066
|
// available, and label it with Viterbi.
|
1125
1067
|
if ((raw = rdr_readraw(model->reader, file)) == 0) {
|
@@ -1133,12 +1075,12 @@ static VALUE decode_sequence_file(VALUE self, VALUE path) {
|
|
1133
1075
|
return result;
|
1134
1076
|
}
|
1135
1077
|
|
1136
|
-
//
|
1078
|
+
// call-seq:
|
1137
1079
|
// m.label(tokens, options = {}) # => array of labelled tokens
|
1138
1080
|
// m.label(filename, options = {}) # => array of labelled tokens
|
1139
1081
|
//
|
1140
1082
|
static VALUE model_label(VALUE self, VALUE data) {
|
1141
|
-
VALUE result;
|
1083
|
+
VALUE result = (VALUE)0;
|
1142
1084
|
|
1143
1085
|
switch (TYPE(data)) {
|
1144
1086
|
case T_STRING:
|
@@ -1148,7 +1090,7 @@ static VALUE model_label(VALUE self, VALUE data) {
|
|
1148
1090
|
result = decode_sequence_array(self, data);
|
1149
1091
|
break;
|
1150
1092
|
default:
|
1151
|
-
|
1093
|
+
fatal("failed to label data: invalid data (expected type String or Array)");
|
1152
1094
|
}
|
1153
1095
|
|
1154
1096
|
return result;
|
@@ -1157,125 +1099,33 @@ static VALUE model_label(VALUE self, VALUE data) {
|
|
1157
1099
|
static void Init_model() {
|
1158
1100
|
cModel = rb_define_class_under(mWapiti, "Model", rb_cObject);
|
1159
1101
|
rb_define_alloc_func(cModel, allocate_model);
|
1160
|
-
|
1161
|
-
rb_define_method(cModel, "initialize", initialize_model, -1);
|
1162
|
-
|
1163
1102
|
rb_define_attr(cModel, "options", 1, 0);
|
1164
1103
|
|
1165
|
-
|
1104
|
+
rb_define_method(cModel, "initialize", initialize_model, -1);
|
1166
1105
|
rb_define_method(cModel, "nlbl", model_nlbl, 0);
|
1167
1106
|
rb_define_method(cModel, "labels", model_labels, 0);
|
1168
|
-
|
1169
1107
|
rb_define_method(cModel, "nobs", model_nobs, 0);
|
1170
1108
|
rb_define_alias(cModel, "observations", "nobs");
|
1171
|
-
|
1172
1109
|
rb_define_method(cModel, "nftr", model_nftr, 0);
|
1173
1110
|
rb_define_alias(cModel, "features", "nftr");
|
1174
|
-
|
1175
|
-
rb_define_method(cModel, "total", model_total, 0);
|
1176
|
-
|
1177
1111
|
rb_define_method(cModel, "sync", model_sync, 0);
|
1178
1112
|
rb_define_method(cModel, "compact", model_compact, 0);
|
1179
1113
|
rb_define_method(cModel, "save", model_save, -1);
|
1180
1114
|
rb_define_method(cModel, "load", model_load, -1);
|
1181
|
-
|
1182
|
-
rb_define_method(cModel, "train", model_train, 1);
|
1115
|
+
rb_define_method(cModel, "train", model_train, 2);
|
1183
1116
|
rb_define_method(cModel, "label", model_label, 1);
|
1184
1117
|
}
|
1185
1118
|
|
1186
|
-
/* --- Top-Level Utility Methods --- */
|
1187
|
-
|
1188
|
-
|
1189
|
-
static VALUE label(VALUE self __attribute__((__unused__)), VALUE rb_options) {
|
1190
|
-
if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
|
1191
|
-
rb_raise(cNativeError, "argument must be a native options instance");
|
1192
|
-
}
|
1193
|
-
|
1194
|
-
opt_t *options = get_options(rb_options);
|
1195
|
-
|
1196
|
-
if (options->mode != 1) {
|
1197
|
-
rb_raise(cNativeError, "invalid options argument: mode should be set to 1 for labelling");
|
1198
|
-
}
|
1199
|
-
|
1200
|
-
mdl_t *model = mdl_new(rdr_new(options->maxent));
|
1201
|
-
model->opt = options;
|
1202
|
-
|
1203
|
-
dolabel(model);
|
1204
|
-
|
1205
|
-
mdl_free(model);
|
1206
|
-
|
1207
|
-
return Qnil;
|
1208
|
-
}
|
1209
|
-
|
1210
|
-
#if defined EXTRA
|
1211
|
-
static VALUE dump(VALUE self __attribute__((__unused__)), VALUE rb_options) {
|
1212
|
-
if (strncmp("Wapiti::Options", rb_obj_classname(rb_options), 15) != 0) {
|
1213
|
-
rb_raise(cNativeError, "argument must be a native options instance");
|
1214
|
-
}
|
1215
|
-
|
1216
|
-
opt_t *options = get_options(rb_options);
|
1217
|
-
|
1218
|
-
if (options->mode != 2) {
|
1219
|
-
rb_raise(cNativeError, "invalid options argument: mode should be set to 2 for training");
|
1220
|
-
}
|
1221
|
-
|
1222
|
-
mdl_t *model = mdl_new(rdr_new(options->maxent));
|
1223
|
-
model->opt = options;
|
1224
|
-
|
1225
|
-
dodump(model);
|
1226
|
-
|
1227
|
-
mdl_free(model);
|
1228
|
-
|
1229
|
-
return Qnil;
|
1230
|
-
}
|
1231
|
-
|
1232
|
-
// This function is a proxy for Wapiti's main entry point.
|
1233
|
-
static VALUE wapiti(VALUE self __attribute__((__unused__)), VALUE arguments) {
|
1234
|
-
int result = -1, argc = 0;
|
1235
|
-
char **ap, *argv[18], *input, *tmp;
|
1236
|
-
|
1237
|
-
Check_Type(arguments, T_STRING);
|
1238
|
-
tmp = StringValueCStr(arguments);
|
1239
|
-
|
1240
|
-
// allocate space for argument vector
|
1241
|
-
input = (char*)malloc(strlen(tmp) + 8);
|
1242
|
-
|
1243
|
-
// prepend command name
|
1244
|
-
strncpy(input, "wapiti ", 8);
|
1245
|
-
strncat(input, tmp, strlen(input) - 8);
|
1246
|
-
|
1247
|
-
// remember allocation pointer
|
1248
|
-
tmp = input;
|
1249
|
-
|
1250
|
-
// turn input string into argument vector (using
|
1251
|
-
// only the first seventeen tokens from input)
|
1252
|
-
for (ap = argv; (*ap = strsep(&input, " \t")) != (char*)0; ++argc) {
|
1253
|
-
if ((**ap != '\0') && (++ap >= &argv[18])) break;
|
1254
|
-
}
|
1255
|
-
|
1256
|
-
// call main entry point
|
1257
|
-
result = wapiti_main(argc, argv);
|
1258
|
-
|
1259
|
-
// free allocated memory
|
1260
|
-
free(tmp);
|
1261
|
-
|
1262
|
-
return INT2FIX(result);
|
1263
|
-
}
|
1264
|
-
#endif
|
1265
|
-
|
1266
1119
|
/* --- Wapiti Extension Entry Point --- */
|
1267
1120
|
|
1268
1121
|
void Init_native() {
|
1269
1122
|
mWapiti = rb_const_get(rb_mKernel, rb_intern("Wapiti"));
|
1270
1123
|
mNative = rb_define_module_under(mWapiti, "Native");
|
1271
1124
|
|
1125
|
+
cArgumentError = rb_const_get(rb_mKernel, rb_intern("ArgumentError"));
|
1272
1126
|
cNativeError = rb_const_get(mWapiti, rb_intern("NativeError"));
|
1273
|
-
cConfigurationError = rb_const_get(mWapiti, rb_intern("ConfigurationError"));
|
1274
1127
|
cLogger = rb_funcall(mWapiti, rb_intern("log"), 0);
|
1275
1128
|
|
1276
|
-
rb_define_singleton_method(mNative, "label", label, 1);
|
1277
|
-
// rb_define_singleton_method(mNative, "wapiti", wapiti, 1);
|
1278
|
-
|
1279
1129
|
rb_define_const(mNative, "VERSION", rb_str_new2(VERSION));
|
1280
1130
|
|
1281
1131
|
Init_options();
|