wapiti 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/.autotest
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'autotest/fsevent' if RUBY_PLATFORM =~ /darwin/

Autotest.add_hook :initialize do |at|
  # Whenever a C source or header changes, re-run the specs that exercise the
  # native extension. Only the value of the block's last expression is
  # returned, so the three file lists are concatenated into a single result
  # instead of being computed one after the other and discarded.
  at.add_mapping(/.*\.[ch]$/) do |f, _|
    at.files_matching(/native_spec\.rb$/) +
      at.files_matching(/options_spec\.rb$/) +
      at.files_matching(/model_spec\.rb$/)
  end
end

# Rebuild the extension through rake every time the :run_command hook fires.
Autotest.add_hook :run_command do |at|
  system 'bundle exec rake compile'
end
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
Wapiti-Ruby
|
2
|
+
Copyright 2011 Sylvester Keil. All rights reserved.
|
3
|
+
|
4
|
+
Wapiti - A linear-chain CRF tool
|
5
|
+
Copyright 2009-2011 CNRS. All rights reserved.
|
6
|
+
|
7
|
+
Redistribution and use in source and binary forms, with or without
|
8
|
+
modification, are permitted provided that the following conditions are met:
|
9
|
+
|
10
|
+
1. Redistributions of source code must retain the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer.
|
12
|
+
|
13
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
14
|
+
this list of conditions and the following disclaimer in the documentation
|
15
|
+
and/or other materials provided with the distribution.
|
16
|
+
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
|
18
|
+
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
19
|
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
20
|
+
EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
21
|
+
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
22
|
+
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
23
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
24
|
+
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
25
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
26
|
+
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27
|
+
|
28
|
+
The views and conclusions contained in the software and documentation are
|
29
|
+
those of the authors and should not be interpreted as representing official
|
30
|
+
policies, either expressed or implied, of the copyright holder.
|
data/README.md
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
Wapiti-Ruby
|
2
|
+
===========
|
3
|
+
|
4
|
+
The Wapiti-Ruby gem provides a wicked fast linear-chain CRF
|
5
|
+
([Conditional Random Fields](http://en.wikipedia.org/wiki/Conditional_random_field))
|
6
|
+
API for sequence segmentation and labelling; it is based on the codebase of
|
7
|
+
Thomas Lavergne's awesome [wapiti](http://wapiti.limsi.fr/).
|
8
|
+
|
9
|
+
|
10
|
+
Requirements
|
11
|
+
------------
|
12
|
+
|
13
|
+
Wapiti-Ruby is written in C and Ruby and requires a compiler with C99
|
14
|
+
support (e.g., gcc); the gem has been confirmed to work with MRI 1.9, 1.8.7,
|
15
|
+
and Rubinius.
|
16
|
+
|
17
|
+
|
18
|
+
Quickstart
|
19
|
+
----------
|
20
|
+
|
21
|
+
### Installation
|
22
|
+
|
23
|
+
$ [sudo] gem install wapiti
|
24
|
+
|
25
|
+
### Creating a Model
|
26
|
+
|
27
|
+
Using a pattern and training data stored in a file:
|
28
|
+
|
29
|
+
model = Wapiti.train('train.txt', :pattern => 'pattern.txt')
|
30
|
+
=> #<Wapiti::Model:0x0000010188f868>
|
31
|
+
model.labels
|
32
|
+
=> ["B-ADJP", "B-ADVP", "B-CONJP" ...]
|
33
|
+
model.save('ch.mod')
|
34
|
+
=> # saves the model as 'ch.mod'
|
35
|
+
|
36
|
+
Alternatively, you can pass in the training data as an array; the array
|
37
|
+
should contain one array for each sequence of training data.
|
38
|
+
|
39
|
+
data = []
|
40
|
+
data << ['Confidence NN B-NP', 'in IN B-PP', 'the DT B-NP', 'pound NN I-NP', '. . O']
|
41
|
+
...
|
42
|
+
model = Wapiti.train(data, options)
|
43
|
+
|
44
|
+
You can consult the `Wapiti::Options` class for a list of supported
|
45
|
+
configuration options and algorithms:
|
46
|
+
|
47
|
+
Wapiti::Options.attribute_names
|
48
|
+
=> [:algorithm, :check, :compact, :convergence_window, :development_data,
|
49
|
+
:jobsize, :label, :max_iterations, :maxent, :pattern, :posterior, :rho1,
|
50
|
+
:rho2, :score, :sparse, :stop_epsilon, :stop_window, :threads]
|
51
|
+
Wapiti::Options.algorithms
|
52
|
+
=> ["l-bfgs", "sgd-l1", "bcd", "rprop", "rprop+", "rprop-", "auto"]
|
53
|
+
|
54
|
+
Use `#valid?` or `#validate` (which returns error messages) to make sure
|
55
|
+
your configuration is supported by Wapiti.
|
56
|
+
|
57
|
+
You can pass options either as an options hash or by adding a block to the
|
58
|
+
method invocation:
|
59
|
+
|
60
|
+
model = Wapiti::Model.train(data) do |config|
|
61
|
+
config.pattern = 'pattern.txt'
|
62
|
+
config.threads = 4
|
63
|
+
end
|
64
|
+
|
65
|
+
Before saving your model you can use `compact` to reduce the model's size:
|
66
|
+
|
67
|
+
model.save 'm1.mod'
|
68
|
+
=> # m1.mod file size 1.8M
|
69
|
+
model.compact
|
70
|
+
model.save 'm2.mod'
|
71
|
+
=> # m2.mod file size 471K
|
72
|
+
|
73
|
+
### Loading existing Models
|
74
|
+
|
75
|
+
model = Wapiti::Model.load('m1.mod')
|
76
|
+
|
77
|
+
### Labelling
|
78
|
+
|
79
|
+
By calling `#label` on a Model instance you can add labels to your sequence
|
80
|
+
data:
|
81
|
+
|
82
|
+
model = Wapiti.load('m2.mod')
|
83
|
+
model.label('test.txt')
|
84
|
+
=> [[["Confidence NN B-NP", "B-NP"], ["in IN B-PP", "B-PP"] ... ]
|
85
|
+
|
86
|
+
The result is an array of sequence arrays; each sequence array consists of
|
87
|
+
the original token and feature string (when using test data, the final
|
88
|
+
feature is usually the expected label) and the label calculated by Wapiti.
|
89
|
+
|
90
|
+
As with training data, you can pass in data either by filename or as
|
91
|
+
a Ruby Array:
|
92
|
+
|
93
|
+
model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']]
|
94
|
+
=> [[["Confidence NN", "B-NP"], ["in IN", "B-PP"], ["the DT", "B-NP"],
|
95
|
+
["pound NN", "I-NP"], [". .", "O"]]]
|
96
|
+
|
97
|
+
If you pass a block to `#label` Wapiti will yield each token and the
|
98
|
+
corresponding label:
|
99
|
+
|
100
|
+
model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']] do |token, label|
|
101
|
+
[token.downcase, label.downcase]
|
102
|
+
end
|
103
|
+
=> [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
|
104
|
+
["pound nn", "i-np"], [". .", "o"]]]
|
105
|
+
|
106
|
+
|
107
|
+
Citing
|
108
|
+
------
|
109
|
+
|
110
|
+
If you're using Wapiti-Ruby for research purposes, please use the following
|
111
|
+
citation of the original wapiti package:
|
112
|
+
|
113
|
+
@article{lavergne2010practical,
|
114
|
+
author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, Fran\c{c}ois},
|
115
|
+
title = {Practical Very Large Scale {CRFs}},
|
116
|
+
booktitle = {Proceedings of the 48th Annual Meeting of the Association for
|
117
|
+
Computational Linguistics (ACL)},
|
118
|
+
month = {July},
|
119
|
+
year = {2010},
|
120
|
+
location = {Uppsala, Sweden},
|
121
|
+
publisher = {Association for Computational Linguistics},
|
122
|
+
pages = {504--513},
|
123
|
+
url = {http://www.aclweb.org/anthology/P10-1052}
|
124
|
+
}
|
125
|
+
|
126
|
+
If you're profiting from any of the Wapiti-Ruby specific features you are
|
127
|
+
welcome to also refer back to the
|
128
|
+
[Wapiti-Ruby homepage](http://github.com/inukshuk/wapiti-ruby/).
|
129
|
+
|
130
|
+
|
131
|
+
Contributing
|
132
|
+
------------
|
133
|
+
|
134
|
+
The Wapiti-Ruby source code is
|
135
|
+
[hosted on GitHub](http://github.com/inukshuk/wapiti-ruby/).
|
136
|
+
You can check out a copy of the latest code using Git:
|
137
|
+
|
138
|
+
$ git clone https://github.com/inukshuk/wapiti-ruby.git
|
139
|
+
|
140
|
+
If you've found a bug or have a question, please open an issue on the
|
141
|
+
[Wapiti-Ruby issue tracker](http://github.com/inukshuk/wapiti-ruby/issues).
|
142
|
+
Or, for extra credit, clone the Wapiti-Ruby repository, write a failing
|
143
|
+
example, fix the bug and submit a pull request.
|
144
|
+
|
145
|
+
|
146
|
+
License
|
147
|
+
-------
|
148
|
+
|
149
|
+
Copyright 2011 Sylvester Keil. All rights reserved.
|
150
|
+
|
151
|
+
Copyright 2009-2011 CNRS. All rights reserved.
|
152
|
+
|
153
|
+
Wapiti-Ruby is distributed under a BSD-style license. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Make the gem's lib directory loadable so that wapiti/version can be required.
lib = File.expand_path('../lib/', __FILE__)
$:.unshift lib unless $:.include?(lib)

require 'rake/clean'
require 'rake/testtask'
require 'rake/extensiontask'

require 'wapiti/version'

# NOTE(review): no :test task is defined in this file; the default task
# depends on one being defined elsewhere -- confirm.
task :default => [:test]

# Native extension: sources live in ext/wapiti, the library goes to lib/wapiti.
Rake::ExtensionTask.new do |ext|
  ext.name = 'native'

  ext.ext_dir = 'ext/wapiti'
  ext.lib_dir = 'lib/wapiti'

  CLEAN.include("#{ext.lib_dir}/native.*")
  CLEAN.include("#{ext.tmp_dir}")
end

# The shell commands below go through `sh` rather than `system`: `sh` aborts
# the task when a command fails, so a broken build is never tagged or pushed.
task :build => [:clean] do
  sh 'gem build wapiti.gemspec'
end

task :release => [:build] do
  sh "git tag #{Wapiti::VERSION}"
  sh "gem push wapiti-#{Wapiti::VERSION}.gem"
end

CLEAN.include('*.gem')
CLEAN.include('*.rbc')
|
data/ext/wapiti/bcd.c
ADDED
@@ -0,0 +1,392 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <assert.h>
|
29
|
+
#include <math.h>
|
30
|
+
#include <stdbool.h>
|
31
|
+
#include <stddef.h>
|
32
|
+
#include <stdlib.h>
|
33
|
+
#include <string.h>
|
34
|
+
|
35
|
+
#include "wapiti.h"
|
36
|
+
#include "gradient.h"
|
37
|
+
#include "model.h"
|
38
|
+
#include "options.h"
|
39
|
+
#include "progress.h"
|
40
|
+
#include "sequence.h"
|
41
|
+
#include "tools.h"
|
42
|
+
#include "vmath.h"
|
43
|
+
|
44
|
+
/******************************************************************************
|
45
|
+
* Blockwise Coordinates descent trainer
|
46
|
+
* The gradient and hessian computation used for the BCD is very similar to
|
47
|
+
* the generic one defined below, but there are some important differences:
|
48
|
+
* - The forward and backward recursions don't have to be performed fully
|
49
|
+
* but just in the range of activity of the considered block. So if the
|
50
|
+
* block is active only at position t, the alpha recursion is done from 1
|
51
|
+
* to t and the beta one from T to t, dividing the amount of computations
|
52
|
+
* by 2.
|
53
|
+
* - Similarly, the update of the gradient and hessian has to be done only at
|
54
|
+
* position where the block is active, so in the common case where the
|
55
|
+
* block is active only once in the sequence, the improvement can be huge.
|
56
|
+
* - And finally, there is no need to compute the logloss, which can take a
|
57
|
+
* long time due to the computation of the log()s.
|
58
|
+
******************************************************************************/
|
59
|
+
typedef struct bcd_s bcd_t;
|
60
|
+
struct bcd_s {
|
61
|
+
double *ugrd; // [Y]
|
62
|
+
double *uhes; // [Y]
|
63
|
+
double *bgrd; // [Y][Y]
|
64
|
+
double *bhes; // [Y][Y]
|
65
|
+
size_t *actpos; // [T]
|
66
|
+
size_t actcnt;
|
67
|
+
grd_t *grd;
|
68
|
+
};
|
69
|
+
|
70
|
+
/* bcd_soft:
 *   The soft-thresholding function: shrink <z> toward zero by <r>, giving
 *   exactly zero when <z> lies within [-r, r].
 */
static double bcd_soft(double z, double r) {
	if (z > r)
		return z - r;
	return (z < -r) ? z + r : 0.0;
}
|
78
|
+
|
79
|
+
/* bcd_actpos:
|
80
|
+
* List position where the given block is active in the sequence and setup the
|
81
|
+
* limits for the fwd/bwd.
|
82
|
+
*/
|
83
|
+
static void bcd_actpos(mdl_t *mdl, bcd_t *bcd, const seq_t *seq, size_t o) {
|
84
|
+
const int T = seq->len;
|
85
|
+
size_t *actpos = bcd->actpos;
|
86
|
+
size_t actcnt = 0;
|
87
|
+
for (int t = 0; t < T; t++) {
|
88
|
+
const pos_t *pos = &(seq->pos[t]);
|
89
|
+
bool ok = false;
|
90
|
+
if (mdl->kind[o] & 1)
|
91
|
+
for (size_t n = 0; !ok && n < pos->ucnt; n++)
|
92
|
+
if (pos->uobs[n] == o)
|
93
|
+
ok = true;
|
94
|
+
if (mdl->kind[o] & 2)
|
95
|
+
for (size_t n = 0; !ok && n < pos->bcnt; n++)
|
96
|
+
if (pos->bobs[n] == o)
|
97
|
+
ok = true;
|
98
|
+
if (!ok)
|
99
|
+
continue;
|
100
|
+
actpos[actcnt++] = t;
|
101
|
+
}
|
102
|
+
assert(actcnt != 0);
|
103
|
+
bcd->actcnt = actcnt;
|
104
|
+
bcd->grd->first = actpos[0];
|
105
|
+
bcd->grd->last = actpos[actcnt - 1];
|
106
|
+
}
|
107
|
+
|
108
|
+
/* bct_flgradhes:
|
109
|
+
* Update the gradient and hessian for <blk> on sequence <seq>. This one is
|
110
|
+
* very similar than the trn_spupgrad function but does the computation only
|
111
|
+
* at active pos and approximate also the hessian.
|
112
|
+
*/
|
113
|
+
static void bcd_flgradhes(mdl_t *mdl, bcd_t *bcd, const seq_t *seq, size_t o) {
|
114
|
+
const grd_t *grd = bcd->grd;
|
115
|
+
const size_t Y = mdl->nlbl;
|
116
|
+
const size_t T = seq->len;
|
117
|
+
const double (*psi )[T][Y][Y] = (void *)grd->psi;
|
118
|
+
const double (*alpha)[T][Y] = (void *)grd->alpha;
|
119
|
+
const double (*beta )[T][Y] = (void *)grd->beta;
|
120
|
+
const double *unorm = grd->unorm;
|
121
|
+
const double *bnorm = grd->bnorm;
|
122
|
+
const size_t *actpos = bcd->actpos;
|
123
|
+
const size_t actcnt = bcd->actcnt;
|
124
|
+
double *ugrd = bcd->ugrd;
|
125
|
+
double *uhes = bcd->uhes;
|
126
|
+
double *bgrd = bcd->bgrd;
|
127
|
+
double *bhes = bcd->bhes;
|
128
|
+
// Update the gradient and the hessian but here we sum only on the
|
129
|
+
// positions where the block is active for unigrams features
|
130
|
+
if (mdl->kind[o] & 1) {
|
131
|
+
for (size_t n = 0; n < actcnt; n++) {
|
132
|
+
const size_t t = actpos[n];
|
133
|
+
for (size_t y = 0; y < Y; y++) {
|
134
|
+
const double e = (*alpha)[t][y] * (*beta)[t][y]
|
135
|
+
* unorm[t];
|
136
|
+
ugrd[y] += e;
|
137
|
+
uhes[y] += e * (1.0 - e);
|
138
|
+
}
|
139
|
+
const size_t y = seq->pos[t].lbl;
|
140
|
+
ugrd[y] -= 1.0;
|
141
|
+
}
|
142
|
+
}
|
143
|
+
if ((mdl->kind[o] & 2) == 0)
|
144
|
+
return;
|
145
|
+
// for bigrams features
|
146
|
+
for (size_t n = 0; n < actcnt; n++) {
|
147
|
+
const size_t t = actpos[n];
|
148
|
+
if (t == 0)
|
149
|
+
continue;
|
150
|
+
for (size_t yp = 0, d = 0; yp < Y; yp++) {
|
151
|
+
for (size_t y = 0; y < Y; y++, d++) {
|
152
|
+
double e = (*alpha)[t - 1][yp] * (*beta)[t][y]
|
153
|
+
* (*psi)[t][yp][y] * bnorm[t];
|
154
|
+
bgrd[d] += e;
|
155
|
+
bhes[d] += e * (1.0 - e);
|
156
|
+
}
|
157
|
+
}
|
158
|
+
const size_t yp = seq->pos[t - 1].lbl;
|
159
|
+
const size_t y = seq->pos[t ].lbl;
|
160
|
+
bgrd[yp * Y + y] -= 1.0;
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
/* bct_spgradhes:
|
165
|
+
* Update the gradient and hessian for <blk> on sequence <seq>. This one is
|
166
|
+
* very similar than the trn_spupgrad function but does the computation only
|
167
|
+
* at active pos and approximate also the hessian.
|
168
|
+
*/
|
169
|
+
static void bcd_spgradhes(mdl_t *mdl, bcd_t *bcd, const seq_t *seq, size_t o) {
|
170
|
+
const grd_t *grd = bcd->grd;
|
171
|
+
const size_t Y = mdl->nlbl;
|
172
|
+
const size_t T = seq->len;
|
173
|
+
const double (*psiuni)[T][Y] = (void *)grd->psiuni;
|
174
|
+
const double *psival = grd->psi;
|
175
|
+
const size_t *psiyp = grd->psiyp;
|
176
|
+
const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
|
177
|
+
const size_t *psioff = grd->psioff;
|
178
|
+
const double (*alpha)[T][Y] = (void *)grd->alpha;
|
179
|
+
const double (*beta )[T][Y] = (void *)grd->beta;
|
180
|
+
const double *unorm = grd->unorm;
|
181
|
+
const double *bnorm = grd->bnorm;
|
182
|
+
const size_t *actpos = bcd->actpos;
|
183
|
+
const size_t actcnt = bcd->actcnt;
|
184
|
+
double *ugrd = bcd->ugrd;
|
185
|
+
double *uhes = bcd->uhes;
|
186
|
+
double *bgrd = bcd->bgrd;
|
187
|
+
double *bhes = bcd->bhes;
|
188
|
+
// Update the gradient and the hessian but here we sum only on the
|
189
|
+
// positions where the block is active for unigrams features
|
190
|
+
if (mdl->kind[o] & 1) {
|
191
|
+
for (size_t n = 0; n < actcnt; n++) {
|
192
|
+
const size_t t = actpos[n];
|
193
|
+
for (size_t y = 0; y < Y; y++) {
|
194
|
+
const double e = (*alpha)[t][y] * (*beta)[t][y]
|
195
|
+
* unorm[t];
|
196
|
+
ugrd[y] += e;
|
197
|
+
uhes[y] += e * (1.0 - e);
|
198
|
+
}
|
199
|
+
const size_t y = seq->pos[t].lbl;
|
200
|
+
ugrd[y] -= 1.0;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
if ((mdl->kind[o] & 2) == 0)
|
204
|
+
return;
|
205
|
+
// for bigrams features
|
206
|
+
for (size_t n = 0; n < actcnt; n++) {
|
207
|
+
const size_t t = actpos[n];
|
208
|
+
if (t == 0)
|
209
|
+
continue;
|
210
|
+
// We build the expectation matrix
|
211
|
+
double e[Y][Y];
|
212
|
+
for (size_t yp = 0; yp < Y; yp++)
|
213
|
+
for (size_t y = 0; y < Y; y++)
|
214
|
+
e[yp][y] = (*alpha)[t - 1][yp] * (*beta)[t][y]
|
215
|
+
* (*psiuni)[t][y] * bnorm[t];
|
216
|
+
const size_t off = psioff[t];
|
217
|
+
for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
|
218
|
+
while (n >= (*psiidx)[t][y])
|
219
|
+
y++;
|
220
|
+
while (n < (*psiidx)[t][y]) {
|
221
|
+
const size_t yp = psiyp [off + n];
|
222
|
+
const double v = psival[off + n];
|
223
|
+
e[yp][y] += e[yp][y] * v;
|
224
|
+
n++;
|
225
|
+
}
|
226
|
+
}
|
227
|
+
// And use it
|
228
|
+
for (size_t yp = 0, d = 0; yp < Y; yp++) {
|
229
|
+
for (size_t y = 0; y < Y; y++, d++) {
|
230
|
+
bgrd[d] += e[yp][y];
|
231
|
+
bhes[d] += e[yp][y] * (1.0 - e[yp][y]);
|
232
|
+
}
|
233
|
+
}
|
234
|
+
const size_t yp = seq->pos[t - 1].lbl;
|
235
|
+
const size_t y = seq->pos[t ].lbl;
|
236
|
+
bgrd[yp * Y + y] -= 1.0;
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
/* bct_update:
|
241
|
+
* Update the model with the computed gradient and hessian.
|
242
|
+
*/
|
243
|
+
static void bcd_update(mdl_t *mdl, bcd_t *bcd, size_t o) {
|
244
|
+
const double rho1 = mdl->opt->rho1;
|
245
|
+
const double rho2 = mdl->opt->rho2;
|
246
|
+
const double kappa = mdl->opt->bcd.kappa;
|
247
|
+
const size_t Y = mdl->nlbl;
|
248
|
+
const double *ugrd = bcd->ugrd;
|
249
|
+
const double *bgrd = bcd->bgrd;
|
250
|
+
double *uhes = bcd->uhes;
|
251
|
+
double *bhes = bcd->bhes;
|
252
|
+
if (mdl->kind[o] & 1) {
|
253
|
+
// Adjust the hessian
|
254
|
+
double a = 1.0;
|
255
|
+
for (size_t y = 0; y < Y; y++)
|
256
|
+
a = max(a, fabs(ugrd[y] / uhes[y]));
|
257
|
+
xvm_scale(uhes, uhes, a * kappa, Y);
|
258
|
+
// Update the model
|
259
|
+
double *w = mdl->theta + mdl->uoff[o];
|
260
|
+
for (size_t y = 0; y < Y; y++) {
|
261
|
+
double z = uhes[y] * w[y] - ugrd[y];
|
262
|
+
double d = uhes[y] + rho2;
|
263
|
+
w[y] = bcd_soft(z, rho1) / d;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
if (mdl->kind[o] & 2) {
|
267
|
+
// Adjust the hessian
|
268
|
+
double a = 1.0;
|
269
|
+
for (size_t i = 0; i < Y * Y; i++)
|
270
|
+
a = max(a, fabs(bgrd[i] / bhes[i]));
|
271
|
+
xvm_scale(bhes, bhes, a * kappa, Y * Y);
|
272
|
+
// Update the model
|
273
|
+
double *bw = mdl->theta + mdl->boff[o];
|
274
|
+
for (size_t i = 0; i < Y * Y; i++) {
|
275
|
+
double z = bhes[i] * bw[i] - bgrd[i];
|
276
|
+
double d = bhes[i] + rho2;
|
277
|
+
bw[i] = bcd_soft(z, rho1) / d;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
/* trn_bcd
|
283
|
+
* Train the model using the blockwise coordinates descend method.
|
284
|
+
*/
|
285
|
+
void trn_bcd(mdl_t *mdl) {
|
286
|
+
const size_t Y = mdl->nlbl;
|
287
|
+
const size_t O = mdl->nobs;
|
288
|
+
const size_t T = mdl->train->mlen;
|
289
|
+
const size_t S = mdl->train->nseq;
|
290
|
+
const int K = mdl->opt->maxiter;
|
291
|
+
// Build the index:
|
292
|
+
// Count active sequences per blocks
|
293
|
+
info(" - Build the index\n");
|
294
|
+
info(" 1/2 -- scan the sequences\n");
|
295
|
+
size_t tot = 0, cnt[O], lcl[O];
|
296
|
+
for (size_t o = 0; o < O; o++)
|
297
|
+
cnt[o] = 0, lcl[o] = none;
|
298
|
+
for (size_t s = 0; s < S; s++) {
|
299
|
+
// List actives blocks
|
300
|
+
const seq_t *seq = mdl->train->seq[s];
|
301
|
+
for (int t = 0; t < seq->len; t++) {
|
302
|
+
for (size_t b = 0; b < seq->pos[t].ucnt; b++)
|
303
|
+
lcl[seq->pos[t].uobs[b]] = s;
|
304
|
+
for (size_t b = 0; b < seq->pos[t].bcnt; b++)
|
305
|
+
lcl[seq->pos[t].bobs[b]] = s;
|
306
|
+
}
|
307
|
+
// Updates blocks count
|
308
|
+
for (size_t o = 0; o < O; o++)
|
309
|
+
cnt[o] += (lcl[o] == s);
|
310
|
+
}
|
311
|
+
for (size_t o = 0; o < O; o++)
|
312
|
+
tot += cnt[o];
|
313
|
+
// Allocate memory
|
314
|
+
size_t *idx_cnt = xmalloc(sizeof(size_t ) * O);
|
315
|
+
size_t **idx_lst = xmalloc(sizeof(size_t *) * O);
|
316
|
+
for (size_t o = 0; o < O; o++) {
|
317
|
+
idx_cnt[o] = cnt[o];
|
318
|
+
idx_lst[o] = xmalloc(sizeof(size_t) * cnt[o]);
|
319
|
+
}
|
320
|
+
// Populate the index
|
321
|
+
info(" 2/2 -- Populate the index\n");
|
322
|
+
for (size_t o = 0; o < O; o++)
|
323
|
+
cnt[o] = 0, lcl[o] = none;
|
324
|
+
for (size_t s = 0; s < S; s++) {
|
325
|
+
// List actives blocks
|
326
|
+
const seq_t *seq = mdl->train->seq[s];
|
327
|
+
for (int t = 0; t < seq->len; t++) {
|
328
|
+
for (size_t b = 0; b < seq->pos[t].ucnt; b++)
|
329
|
+
lcl[seq->pos[t].uobs[b]] = s;
|
330
|
+
for (size_t b = 0; b < seq->pos[t].bcnt; b++)
|
331
|
+
lcl[seq->pos[t].bobs[b]] = s;
|
332
|
+
}
|
333
|
+
// Build index
|
334
|
+
for (size_t o = 0; o < O; o++)
|
335
|
+
if (lcl[o] == s)
|
336
|
+
idx_lst[o][cnt[o]++] = s;
|
337
|
+
}
|
338
|
+
info(" Done\n");
|
339
|
+
// Allocate the specific trainer of BCD
|
340
|
+
bcd_t *bcd = xmalloc(sizeof(bcd_t));
|
341
|
+
bcd->ugrd = xvm_new(Y);
|
342
|
+
bcd->uhes = xvm_new(Y);
|
343
|
+
bcd->bgrd = xvm_new(Y * Y);
|
344
|
+
bcd->bhes = xvm_new(Y * Y);
|
345
|
+
bcd->actpos = xmalloc(sizeof(size_t) * T);
|
346
|
+
bcd->grd = grd_new(mdl, NULL);
|
347
|
+
// And train the model
|
348
|
+
for (int i = 0; i < K; i++) {
|
349
|
+
for (size_t o = 0; o < O; o++) {
|
350
|
+
// Clear the gradient and the hessian
|
351
|
+
for (size_t y = 0, d = 0; y < Y; y++) {
|
352
|
+
bcd->ugrd[y] = 0.0;
|
353
|
+
bcd->uhes[y] = 0.0;
|
354
|
+
for (size_t yp = 0; yp < Y; yp++, d++) {
|
355
|
+
bcd->bgrd[d] = 0.0;
|
356
|
+
bcd->bhes[d] = 0.0;
|
357
|
+
}
|
358
|
+
}
|
359
|
+
// Process active sequences
|
360
|
+
for (size_t s = 0; s < idx_cnt[o]; s++) {
|
361
|
+
const size_t id = idx_lst[o][s];
|
362
|
+
const seq_t *seq = mdl->train->seq[id];
|
363
|
+
bcd_actpos(mdl, bcd, seq, o);
|
364
|
+
grd_check(bcd->grd, seq->len);
|
365
|
+
if (mdl->opt->sparse) {
|
366
|
+
grd_spdopsi(bcd->grd, seq);
|
367
|
+
grd_spfwdbwd(bcd->grd, seq);
|
368
|
+
bcd_spgradhes(mdl, bcd, seq, o);
|
369
|
+
} else {
|
370
|
+
grd_fldopsi(bcd->grd, seq);
|
371
|
+
grd_flfwdbwd(bcd->grd, seq);
|
372
|
+
bcd_flgradhes(mdl, bcd, seq, o);
|
373
|
+
}
|
374
|
+
}
|
375
|
+
// And update the model
|
376
|
+
bcd_update(mdl, bcd, o);
|
377
|
+
}
|
378
|
+
if (!uit_progress(mdl, i + 1, -1.0))
|
379
|
+
break;
|
380
|
+
}
|
381
|
+
// Cleanup memory
|
382
|
+
grd_free(bcd->grd);
|
383
|
+
xvm_free(bcd->ugrd); xvm_free(bcd->uhes);
|
384
|
+
xvm_free(bcd->bgrd); xvm_free(bcd->bhes);
|
385
|
+
free(bcd->actpos);
|
386
|
+
free(bcd);
|
387
|
+
for (size_t o = 0; o < O; o++)
|
388
|
+
free(idx_lst[o]);
|
389
|
+
free(idx_lst);
|
390
|
+
free(idx_cnt);
|
391
|
+
}
|
392
|
+
|