wapiti 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +13 -0
- data/.gitignore +5 -0
- data/.rspec +3 -0
- data/Gemfile +6 -0
- data/LICENSE +30 -0
- data/README.md +153 -0
- data/Rakefile +33 -0
- data/ext/wapiti/bcd.c +392 -0
- data/ext/wapiti/decoder.c +535 -0
- data/ext/wapiti/decoder.h +46 -0
- data/ext/wapiti/extconf.rb +8 -0
- data/ext/wapiti/gradient.c +818 -0
- data/ext/wapiti/gradient.h +81 -0
- data/ext/wapiti/lbfgs.c +294 -0
- data/ext/wapiti/model.c +296 -0
- data/ext/wapiti/model.h +100 -0
- data/ext/wapiti/native.c +1238 -0
- data/ext/wapiti/native.h +15 -0
- data/ext/wapiti/options.c +278 -0
- data/ext/wapiti/options.h +91 -0
- data/ext/wapiti/pattern.c +395 -0
- data/ext/wapiti/pattern.h +56 -0
- data/ext/wapiti/progress.c +167 -0
- data/ext/wapiti/progress.h +43 -0
- data/ext/wapiti/quark.c +272 -0
- data/ext/wapiti/quark.h +46 -0
- data/ext/wapiti/reader.c +553 -0
- data/ext/wapiti/reader.h +73 -0
- data/ext/wapiti/rprop.c +191 -0
- data/ext/wapiti/sequence.h +148 -0
- data/ext/wapiti/sgdl1.c +218 -0
- data/ext/wapiti/thread.c +171 -0
- data/ext/wapiti/thread.h +42 -0
- data/ext/wapiti/tools.c +202 -0
- data/ext/wapiti/tools.h +54 -0
- data/ext/wapiti/trainers.h +39 -0
- data/ext/wapiti/vmath.c +372 -0
- data/ext/wapiti/vmath.h +51 -0
- data/ext/wapiti/wapiti.c +288 -0
- data/ext/wapiti/wapiti.h +45 -0
- data/lib/wapiti.rb +30 -0
- data/lib/wapiti/errors.rb +17 -0
- data/lib/wapiti/model.rb +49 -0
- data/lib/wapiti/options.rb +113 -0
- data/lib/wapiti/utility.rb +15 -0
- data/lib/wapiti/version.rb +3 -0
- data/spec/fixtures/ch.mod +18550 -0
- data/spec/fixtures/chpattern.txt +52 -0
- data/spec/fixtures/chtest.txt +1973 -0
- data/spec/fixtures/chtrain.txt +19995 -0
- data/spec/fixtures/nppattern.txt +52 -0
- data/spec/fixtures/nptest.txt +1973 -0
- data/spec/fixtures/nptrain.txt +19995 -0
- data/spec/fixtures/pattern.txt +14 -0
- data/spec/fixtures/test.txt +60000 -0
- data/spec/fixtures/train.txt +1200 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/wapiti/model_spec.rb +173 -0
- data/spec/wapiti/native_spec.rb +12 -0
- data/spec/wapiti/options_spec.rb +175 -0
- data/spec/wapiti/utility_spec.rb +22 -0
- data/wapiti.gemspec +35 -0
- metadata +178 -0
data/.autotest
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'autotest/fsevent' if RUBY_PLATFORM =~ /darwin/

Autotest.add_hook :initialize do |at|
  # Whenever a C source or header changes, re-run the specs that exercise the
  # native extension. The value of the mapping block is the list of files to
  # run, so all the spec files have to be selected by ONE expression: with
  # three separate files_matching calls only the last result was returned and
  # the native and options specs were silently dropped.
  at.add_mapping(/.*\.[ch]$/) do |f, _|
    at.files_matching(/(?:native|options|model)_spec\.rb$/)
  end
end

# Rebuild the native extension before the specs are run.
Autotest.add_hook :run_command do |at|
  system 'bundle exec rake compile'
end
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
Wapiti-Ruby
|
2
|
+
Copyright 2011 Sylvester Keil. All rights reserved.
|
3
|
+
|
4
|
+
Wapiti - A linear-chain CRF tool
|
5
|
+
Copyright 2009-2011 CNRS. All rights reserved.
|
6
|
+
|
7
|
+
Redistribution and use in source and binary forms, with or without
|
8
|
+
modification, are permitted provided that the following conditions are met:
|
9
|
+
|
10
|
+
1. Redistributions of source code must retain the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer.
|
12
|
+
|
13
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
14
|
+
this list of conditions and the following disclaimer in the documentation
|
15
|
+
and/or other materials provided with the distribution.
|
16
|
+
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
|
18
|
+
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
19
|
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
20
|
+
EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
21
|
+
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
22
|
+
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
23
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
24
|
+
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
25
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
26
|
+
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
27
|
+
|
28
|
+
The views and conclusions contained in the software and documentation are
|
29
|
+
those of the authors and should not be interpreted as representing official
|
30
|
+
policies, either expressed or implied, of the copyright holder.
|
data/README.md
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
Wapiti-Ruby
|
2
|
+
===========
|
3
|
+
|
4
|
+
The Wapiti-Ruby gem provides a wicked fast linear-chain CRF
|
5
|
+
([Conditional Random Fields](http://en.wikipedia.org/wiki/Conditional_random_field))
|
6
|
+
API for sequence segmentation and labelling; it is based on the codebase of
|
7
|
+
Thomas Lavergne's awesome [wapiti](http://wapiti.limsi.fr/).
|
8
|
+
|
9
|
+
|
10
|
+
Requirements
|
11
|
+
------------
|
12
|
+
|
13
|
+
Wapiti-Ruby is written in C and Ruby and requires a compiler with C99
|
14
|
+
support (e.g., gcc); the gem has been confirmed to work with MRI 1.9, 1.8.7,
|
15
|
+
and Rubinius.
|
16
|
+
|
17
|
+
|
18
|
+
Quickstart
|
19
|
+
----------
|
20
|
+
|
21
|
+
### Installation
|
22
|
+
|
23
|
+
$ [sudo] gem install wapiti
|
24
|
+
|
25
|
+
### Creating a Model
|
26
|
+
|
27
|
+
Using a pattern and training data stored in a file:
|
28
|
+
|
29
|
+
model = Wapiti.train('train.txt', :pattern => 'pattern.txt')
|
30
|
+
=> #<Wapiti::Model:0x0000010188f868>
|
31
|
+
model.labels
|
32
|
+
=> ["B-ADJP", "B-ADVP", "B-CONJP" ...]
|
33
|
+
model.save('ch.mod')
|
34
|
+
=> # saves the model as 'ch.mod'
|
35
|
+
|
36
|
+
Alternatively, you can pass in the training data as an array; the array
|
37
|
+
should contain one array for each sequence of training data.
|
38
|
+
|
39
|
+
data = []
|
40
|
+
data << ['Confidence NN B-NP', 'in IN B-PP', 'the DT B-NP', 'pound NN I-NP', '. . O']
|
41
|
+
...
|
42
|
+
model = Wapiti.train(data, options)
|
43
|
+
|
44
|
+
You can consult the `Wapiti::Options` class for a list of supported
|
45
|
+
configuration options and algorithms:
|
46
|
+
|
47
|
+
Wapiti::Options.attribute_names
|
48
|
+
=> [:algorithm, :check, :compact, :convergence_window, :development_data,
|
49
|
+
:jobsize, :label, :max_iterations, :maxent, :pattern, :posterior, :rho1,
|
50
|
+
:rho2, :score, :sparse, :stop_epsilon, :stop_window, :threads]
|
51
|
+
Wapiti::Options.algorithms
|
52
|
+
=> ["l-bfgs", "sgd-l1", "bcd", "rprop", "rprop+", "rprop-", "auto"]
|
53
|
+
|
54
|
+
Use `#valid?` or `#validate` (which returns error messages) to make sure
|
55
|
+
your configuration is supported by Wapiti.
|
56
|
+
|
57
|
+
You can pass options either as an options hash or by adding a block to the
|
58
|
+
method invocation:
|
59
|
+
|
60
|
+
model = Wapiti::Model.train(data) do |config|
|
61
|
+
config.pattern = 'pattern.txt'
|
62
|
+
config.threads = 4
|
63
|
+
end
|
64
|
+
|
65
|
+
Before saving your model you can use `compact` to reduce the model's size:
|
66
|
+
|
67
|
+
model.save 'm1.mod'
|
68
|
+
=> # m1.mod file size 1.8M
|
69
|
+
model.compact
|
70
|
+
model.save 'm2.mod'
|
71
|
+
=> # m2.mod file size 471K
|
72
|
+
|
73
|
+
### Loading existing Models
|
74
|
+
|
75
|
+
model = Wapiti::Model.load('m1.mod')
|
76
|
+
|
77
|
+
### Labelling
|
78
|
+
|
79
|
+
By calling `#label` on a Model instance you can add labels to your sequence
|
80
|
+
data:
|
81
|
+
|
82
|
+
model = Wapiti.load('m2.mod')
|
83
|
+
model.label('test.txt')
|
84
|
+
=> [[["Confidence NN B-NP", "B-NP"], ["in IN B-PP", "B-PP"] ... ]]
|
85
|
+
|
86
|
+
The result is an array of sequence arrays; each sequence array consists of
|
87
|
+
the original token and feature string (when using test data, the final
|
88
|
+
feature is usually the expected label) and the label calculated by Wapiti.
|
89
|
+
|
90
|
+
As with training data, you can pass in data either by filename or as
|
91
|
+
a Ruby Array:
|
92
|
+
|
93
|
+
model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']]
|
94
|
+
=> [[["Confidence NN", "B-NP"], ["in IN", "B-PP"], ["the DT", "B-NP"],
|
95
|
+
["pound NN", "I-NP"], [". .", "O"]]]
|
96
|
+
|
97
|
+
If you pass a block to `#label` Wapiti will yield each token and the
|
98
|
+
corresponding label:
|
99
|
+
|
100
|
+
model.label [['Confidence NN', 'in IN', 'the DT', 'pound NN', '. .']] do |token, label|
|
101
|
+
[token.downcase, label.downcase]
|
102
|
+
end
|
103
|
+
=> [[["confidence nn", "b-np"], ["in in", "b-pp"], ["the dt", "b-np"],
|
104
|
+
["pound nn", "i-np"], [". .", "o"]]]
|
105
|
+
|
106
|
+
|
107
|
+
Citing
|
108
|
+
------
|
109
|
+
|
110
|
+
If you're using Wapiti-Ruby for research purposes, please use the following
|
111
|
+
citation of the original wapiti package:
|
112
|
+
|
113
|
+
@article{lavergne2010practical,
|
114
|
+
author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, Fran\c{c}ois},
|
115
|
+
title = {Practical Very Large Scale {CRFs}},
|
116
|
+
booktitle = {Proceedings of the 48th Annual Meeting of the Association for
|
117
|
+
Computational Linguistics (ACL)},
|
118
|
+
month = {July},
|
119
|
+
year = {2010},
|
120
|
+
location = {Uppsala, Sweden},
|
121
|
+
publisher = {Association for Computational Linguistics},
|
122
|
+
pages = {504--513},
|
123
|
+
url = {http://www.aclweb.org/anthology/P10-1052}
|
124
|
+
}
|
125
|
+
|
126
|
+
If you're profiting from any of the Wapiti-Ruby specific features you are
|
127
|
+
welcome to also refer back to the
|
128
|
+
[Wapiti-Ruby homepage](http://github.com/inukshuk/wapiti-ruby/).
|
129
|
+
|
130
|
+
|
131
|
+
Contributing
|
132
|
+
------------
|
133
|
+
|
134
|
+
The Wapiti-Ruby source code is
|
135
|
+
[hosted on GitHub](http://github.com/inukshuk/wapiti-ruby/).
|
136
|
+
You can check out a copy of the latest code using Git:
|
137
|
+
|
138
|
+
$ git clone https://github.com/inukshuk/wapiti-ruby.git
|
139
|
+
|
140
|
+
If you've found a bug or have a question, please open an issue on the
|
141
|
+
[Wapiti-Ruby issue tracker](http://github.com/inukshuk/wapiti-ruby/issues).
|
142
|
+
Or, for extra credit, clone the Wapiti-Ruby repository, write a failing
|
143
|
+
example, fix the bug and submit a pull request.
|
144
|
+
|
145
|
+
|
146
|
+
License
|
147
|
+
-------
|
148
|
+
|
149
|
+
Copyright 2011 Sylvester Keil. All rights reserved.
|
150
|
+
|
151
|
+
Copyright 2009-2011 CNRS. All rights reserved.
|
152
|
+
|
153
|
+
Wapiti-Ruby is distributed under a BSD-style license. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
lib = File.expand_path('../lib/', __FILE__)
$:.unshift lib unless $:.include?(lib)

require 'rake/clean'
require 'rake/testtask'
require 'rake/extensiontask'

require 'wapiti/version'

task :default => [:test]

# Builds the C extension found in ext/wapiti into lib/wapiti/native.*
Rake::ExtensionTask.new do |ext|
  ext.name = 'native'

  ext.ext_dir = 'ext/wapiti'
  ext.lib_dir = 'lib/wapiti'

  CLEAN.include("#{ext.lib_dir}/native.*")
  CLEAN.include("#{ext.tmp_dir}")
end

# The default task depends on :test, which was never defined, so a plain
# `rake` aborted with an unknown-task error. The specs need the compiled
# extension, hence the dependency on :compile.
desc 'Compile the native extension and run the specs'
task :test => [:compile] do
  sh 'bundle exec rspec'
end

# `sh` is used instead of `system` so that a failing command aborts the task
# instead of being silently ignored.
desc 'Build the gem package'
task :build => [:clean] do
  sh 'gem build wapiti.gemspec'
end

desc 'Tag the current version and push the gem'
task :release => [:build] do
  sh "git tag #{Wapiti::VERSION}"
  sh "gem push wapiti-#{Wapiti::VERSION}.gem"
end

CLEAN.include('*.gem')
CLEAN.include('*.rbc')
|
data/ext/wapiti/bcd.c
ADDED
@@ -0,0 +1,392 @@
|
|
1
|
+
/*
|
2
|
+
* Wapiti - A linear-chain CRF tool
|
3
|
+
*
|
4
|
+
* Copyright (c) 2009-2011 CNRS
|
5
|
+
* All rights reserved.
|
6
|
+
*
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
9
|
+
* * Redistributions of source code must retain the above copyright
|
10
|
+
* notice, this list of conditions and the following disclaimer.
|
11
|
+
* * Redistributions in binary form must reproduce the above copyright
|
12
|
+
* notice, this list of conditions and the following disclaimer in the
|
13
|
+
* documentation and/or other materials provided with the distribution.
|
14
|
+
*
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
16
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
18
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
19
|
+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
20
|
+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
21
|
+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
24
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
25
|
+
* POSSIBILITY OF SUCH DAMAGE.
|
26
|
+
*/
|
27
|
+
|
28
|
+
#include <assert.h>
|
29
|
+
#include <math.h>
|
30
|
+
#include <stdbool.h>
|
31
|
+
#include <stddef.h>
|
32
|
+
#include <stdlib.h>
|
33
|
+
#include <string.h>
|
34
|
+
|
35
|
+
#include "wapiti.h"
|
36
|
+
#include "gradient.h"
|
37
|
+
#include "model.h"
|
38
|
+
#include "options.h"
|
39
|
+
#include "progress.h"
|
40
|
+
#include "sequence.h"
|
41
|
+
#include "tools.h"
|
42
|
+
#include "vmath.h"
|
43
|
+
|
44
|
+
/******************************************************************************
|
45
|
+
* Blockwise Coordinates descent trainer
|
46
|
+
* The gradient and hessian computation used for the BCD is very similar to
|
47
|
+
* the generic one defined below, but there are some important differences:
|
48
|
+
* - The forward and backward recursions don't have to be performed fully
|
49
|
+
* but just in the range of activity of the considered block. So if the
|
50
|
+
* block is active only at position t, the alpha recursion is done from 1
|
51
|
+
* to t and the beta one from T to t, dividing the amount of computations
|
52
|
+
* by 2.
|
53
|
+
* - Similarly, the update of the gradient and hessian has to be done only at
|
54
|
+
* positions where the block is active, so in the common case where the
|
55
|
+
* block is active only once in the sequence, the improvement can be huge.
|
56
|
+
* - And finally, there is no need to compute the logloss, which can take a
|
57
|
+
* long time due to the computation of the log()s.
|
58
|
+
******************************************************************************/
|
59
|
+
typedef struct bcd_s bcd_t;
|
60
|
+
struct bcd_s {
|
61
|
+
double *ugrd; // [Y]
|
62
|
+
double *uhes; // [Y]
|
63
|
+
double *bgrd; // [Y][Y]
|
64
|
+
double *bhes; // [Y][Y]
|
65
|
+
size_t *actpos; // [T]
|
66
|
+
size_t actcnt;
|
67
|
+
grd_t *grd;
|
68
|
+
};
|
69
|
+
|
70
|
+
/* bcd_soft:
 *   The soft-thresholding operator: move <z> toward zero by <r>, returning
 *   exactly 0.0 when <z> is not strictly outside of the [-r, r] interval.
 */
static double bcd_soft(double z, double r) {
	if (z > r)
		return z - r;
	return (z < -r) ? z + r : 0.0;
}
|
78
|
+
|
79
|
+
/* bcd_actpos:
|
80
|
+
* List position where the given block is active in the sequence and setup the
|
81
|
+
* limits for the fwd/bwd.
|
82
|
+
*/
|
83
|
+
static void bcd_actpos(mdl_t *mdl, bcd_t *bcd, const seq_t *seq, size_t o) {
|
84
|
+
const int T = seq->len;
|
85
|
+
size_t *actpos = bcd->actpos;
|
86
|
+
size_t actcnt = 0;
|
87
|
+
for (int t = 0; t < T; t++) {
|
88
|
+
const pos_t *pos = &(seq->pos[t]);
|
89
|
+
bool ok = false;
|
90
|
+
if (mdl->kind[o] & 1)
|
91
|
+
for (size_t n = 0; !ok && n < pos->ucnt; n++)
|
92
|
+
if (pos->uobs[n] == o)
|
93
|
+
ok = true;
|
94
|
+
if (mdl->kind[o] & 2)
|
95
|
+
for (size_t n = 0; !ok && n < pos->bcnt; n++)
|
96
|
+
if (pos->bobs[n] == o)
|
97
|
+
ok = true;
|
98
|
+
if (!ok)
|
99
|
+
continue;
|
100
|
+
actpos[actcnt++] = t;
|
101
|
+
}
|
102
|
+
assert(actcnt != 0);
|
103
|
+
bcd->actcnt = actcnt;
|
104
|
+
bcd->grd->first = actpos[0];
|
105
|
+
bcd->grd->last = actpos[actcnt - 1];
|
106
|
+
}
|
107
|
+
|
108
|
+
/* bct_flgradhes:
|
109
|
+
* Update the gradient and hessian for <blk> on sequence <seq>. This one is
|
110
|
+
* very similar than the trn_spupgrad function but does the computation only
|
111
|
+
* at active pos and approximate also the hessian.
|
112
|
+
*/
|
113
|
+
static void bcd_flgradhes(mdl_t *mdl, bcd_t *bcd, const seq_t *seq, size_t o) {
|
114
|
+
const grd_t *grd = bcd->grd;
|
115
|
+
const size_t Y = mdl->nlbl;
|
116
|
+
const size_t T = seq->len;
|
117
|
+
const double (*psi )[T][Y][Y] = (void *)grd->psi;
|
118
|
+
const double (*alpha)[T][Y] = (void *)grd->alpha;
|
119
|
+
const double (*beta )[T][Y] = (void *)grd->beta;
|
120
|
+
const double *unorm = grd->unorm;
|
121
|
+
const double *bnorm = grd->bnorm;
|
122
|
+
const size_t *actpos = bcd->actpos;
|
123
|
+
const size_t actcnt = bcd->actcnt;
|
124
|
+
double *ugrd = bcd->ugrd;
|
125
|
+
double *uhes = bcd->uhes;
|
126
|
+
double *bgrd = bcd->bgrd;
|
127
|
+
double *bhes = bcd->bhes;
|
128
|
+
// Update the gradient and the hessian but here we sum only on the
|
129
|
+
// positions where the block is active for unigrams features
|
130
|
+
if (mdl->kind[o] & 1) {
|
131
|
+
for (size_t n = 0; n < actcnt; n++) {
|
132
|
+
const size_t t = actpos[n];
|
133
|
+
for (size_t y = 0; y < Y; y++) {
|
134
|
+
const double e = (*alpha)[t][y] * (*beta)[t][y]
|
135
|
+
* unorm[t];
|
136
|
+
ugrd[y] += e;
|
137
|
+
uhes[y] += e * (1.0 - e);
|
138
|
+
}
|
139
|
+
const size_t y = seq->pos[t].lbl;
|
140
|
+
ugrd[y] -= 1.0;
|
141
|
+
}
|
142
|
+
}
|
143
|
+
if ((mdl->kind[o] & 2) == 0)
|
144
|
+
return;
|
145
|
+
// for bigrams features
|
146
|
+
for (size_t n = 0; n < actcnt; n++) {
|
147
|
+
const size_t t = actpos[n];
|
148
|
+
if (t == 0)
|
149
|
+
continue;
|
150
|
+
for (size_t yp = 0, d = 0; yp < Y; yp++) {
|
151
|
+
for (size_t y = 0; y < Y; y++, d++) {
|
152
|
+
double e = (*alpha)[t - 1][yp] * (*beta)[t][y]
|
153
|
+
* (*psi)[t][yp][y] * bnorm[t];
|
154
|
+
bgrd[d] += e;
|
155
|
+
bhes[d] += e * (1.0 - e);
|
156
|
+
}
|
157
|
+
}
|
158
|
+
const size_t yp = seq->pos[t - 1].lbl;
|
159
|
+
const size_t y = seq->pos[t ].lbl;
|
160
|
+
bgrd[yp * Y + y] -= 1.0;
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
/* bct_spgradhes:
|
165
|
+
* Update the gradient and hessian for <blk> on sequence <seq>. This one is
|
166
|
+
* very similar than the trn_spupgrad function but does the computation only
|
167
|
+
* at active pos and approximate also the hessian.
|
168
|
+
*/
|
169
|
+
static void bcd_spgradhes(mdl_t *mdl, bcd_t *bcd, const seq_t *seq, size_t o) {
|
170
|
+
const grd_t *grd = bcd->grd;
|
171
|
+
const size_t Y = mdl->nlbl;
|
172
|
+
const size_t T = seq->len;
|
173
|
+
const double (*psiuni)[T][Y] = (void *)grd->psiuni;
|
174
|
+
const double *psival = grd->psi;
|
175
|
+
const size_t *psiyp = grd->psiyp;
|
176
|
+
const size_t (*psiidx)[T][Y] = (void *)grd->psiidx;
|
177
|
+
const size_t *psioff = grd->psioff;
|
178
|
+
const double (*alpha)[T][Y] = (void *)grd->alpha;
|
179
|
+
const double (*beta )[T][Y] = (void *)grd->beta;
|
180
|
+
const double *unorm = grd->unorm;
|
181
|
+
const double *bnorm = grd->bnorm;
|
182
|
+
const size_t *actpos = bcd->actpos;
|
183
|
+
const size_t actcnt = bcd->actcnt;
|
184
|
+
double *ugrd = bcd->ugrd;
|
185
|
+
double *uhes = bcd->uhes;
|
186
|
+
double *bgrd = bcd->bgrd;
|
187
|
+
double *bhes = bcd->bhes;
|
188
|
+
// Update the gradient and the hessian but here we sum only on the
|
189
|
+
// positions where the block is active for unigrams features
|
190
|
+
if (mdl->kind[o] & 1) {
|
191
|
+
for (size_t n = 0; n < actcnt; n++) {
|
192
|
+
const size_t t = actpos[n];
|
193
|
+
for (size_t y = 0; y < Y; y++) {
|
194
|
+
const double e = (*alpha)[t][y] * (*beta)[t][y]
|
195
|
+
* unorm[t];
|
196
|
+
ugrd[y] += e;
|
197
|
+
uhes[y] += e * (1.0 - e);
|
198
|
+
}
|
199
|
+
const size_t y = seq->pos[t].lbl;
|
200
|
+
ugrd[y] -= 1.0;
|
201
|
+
}
|
202
|
+
}
|
203
|
+
if ((mdl->kind[o] & 2) == 0)
|
204
|
+
return;
|
205
|
+
// for bigrams features
|
206
|
+
for (size_t n = 0; n < actcnt; n++) {
|
207
|
+
const size_t t = actpos[n];
|
208
|
+
if (t == 0)
|
209
|
+
continue;
|
210
|
+
// We build the expectation matrix
|
211
|
+
double e[Y][Y];
|
212
|
+
for (size_t yp = 0; yp < Y; yp++)
|
213
|
+
for (size_t y = 0; y < Y; y++)
|
214
|
+
e[yp][y] = (*alpha)[t - 1][yp] * (*beta)[t][y]
|
215
|
+
* (*psiuni)[t][y] * bnorm[t];
|
216
|
+
const size_t off = psioff[t];
|
217
|
+
for (size_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
|
218
|
+
while (n >= (*psiidx)[t][y])
|
219
|
+
y++;
|
220
|
+
while (n < (*psiidx)[t][y]) {
|
221
|
+
const size_t yp = psiyp [off + n];
|
222
|
+
const double v = psival[off + n];
|
223
|
+
e[yp][y] += e[yp][y] * v;
|
224
|
+
n++;
|
225
|
+
}
|
226
|
+
}
|
227
|
+
// And use it
|
228
|
+
for (size_t yp = 0, d = 0; yp < Y; yp++) {
|
229
|
+
for (size_t y = 0; y < Y; y++, d++) {
|
230
|
+
bgrd[d] += e[yp][y];
|
231
|
+
bhes[d] += e[yp][y] * (1.0 - e[yp][y]);
|
232
|
+
}
|
233
|
+
}
|
234
|
+
const size_t yp = seq->pos[t - 1].lbl;
|
235
|
+
const size_t y = seq->pos[t ].lbl;
|
236
|
+
bgrd[yp * Y + y] -= 1.0;
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
/* bct_update:
|
241
|
+
* Update the model with the computed gradient and hessian.
|
242
|
+
*/
|
243
|
+
static void bcd_update(mdl_t *mdl, bcd_t *bcd, size_t o) {
|
244
|
+
const double rho1 = mdl->opt->rho1;
|
245
|
+
const double rho2 = mdl->opt->rho2;
|
246
|
+
const double kappa = mdl->opt->bcd.kappa;
|
247
|
+
const size_t Y = mdl->nlbl;
|
248
|
+
const double *ugrd = bcd->ugrd;
|
249
|
+
const double *bgrd = bcd->bgrd;
|
250
|
+
double *uhes = bcd->uhes;
|
251
|
+
double *bhes = bcd->bhes;
|
252
|
+
if (mdl->kind[o] & 1) {
|
253
|
+
// Adjust the hessian
|
254
|
+
double a = 1.0;
|
255
|
+
for (size_t y = 0; y < Y; y++)
|
256
|
+
a = max(a, fabs(ugrd[y] / uhes[y]));
|
257
|
+
xvm_scale(uhes, uhes, a * kappa, Y);
|
258
|
+
// Update the model
|
259
|
+
double *w = mdl->theta + mdl->uoff[o];
|
260
|
+
for (size_t y = 0; y < Y; y++) {
|
261
|
+
double z = uhes[y] * w[y] - ugrd[y];
|
262
|
+
double d = uhes[y] + rho2;
|
263
|
+
w[y] = bcd_soft(z, rho1) / d;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
if (mdl->kind[o] & 2) {
|
267
|
+
// Adjust the hessian
|
268
|
+
double a = 1.0;
|
269
|
+
for (size_t i = 0; i < Y * Y; i++)
|
270
|
+
a = max(a, fabs(bgrd[i] / bhes[i]));
|
271
|
+
xvm_scale(bhes, bhes, a * kappa, Y * Y);
|
272
|
+
// Update the model
|
273
|
+
double *bw = mdl->theta + mdl->boff[o];
|
274
|
+
for (size_t i = 0; i < Y * Y; i++) {
|
275
|
+
double z = bhes[i] * bw[i] - bgrd[i];
|
276
|
+
double d = bhes[i] + rho2;
|
277
|
+
bw[i] = bcd_soft(z, rho1) / d;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
/* trn_bcd
|
283
|
+
* Train the model using the blockwise coordinates descend method.
|
284
|
+
*/
|
285
|
+
void trn_bcd(mdl_t *mdl) {
|
286
|
+
const size_t Y = mdl->nlbl;
|
287
|
+
const size_t O = mdl->nobs;
|
288
|
+
const size_t T = mdl->train->mlen;
|
289
|
+
const size_t S = mdl->train->nseq;
|
290
|
+
const int K = mdl->opt->maxiter;
|
291
|
+
// Build the index:
|
292
|
+
// Count active sequences per blocks
|
293
|
+
info(" - Build the index\n");
|
294
|
+
info(" 1/2 -- scan the sequences\n");
|
295
|
+
size_t tot = 0, cnt[O], lcl[O];
|
296
|
+
for (size_t o = 0; o < O; o++)
|
297
|
+
cnt[o] = 0, lcl[o] = none;
|
298
|
+
for (size_t s = 0; s < S; s++) {
|
299
|
+
// List actives blocks
|
300
|
+
const seq_t *seq = mdl->train->seq[s];
|
301
|
+
for (int t = 0; t < seq->len; t++) {
|
302
|
+
for (size_t b = 0; b < seq->pos[t].ucnt; b++)
|
303
|
+
lcl[seq->pos[t].uobs[b]] = s;
|
304
|
+
for (size_t b = 0; b < seq->pos[t].bcnt; b++)
|
305
|
+
lcl[seq->pos[t].bobs[b]] = s;
|
306
|
+
}
|
307
|
+
// Updates blocks count
|
308
|
+
for (size_t o = 0; o < O; o++)
|
309
|
+
cnt[o] += (lcl[o] == s);
|
310
|
+
}
|
311
|
+
for (size_t o = 0; o < O; o++)
|
312
|
+
tot += cnt[o];
|
313
|
+
// Allocate memory
|
314
|
+
size_t *idx_cnt = xmalloc(sizeof(size_t ) * O);
|
315
|
+
size_t **idx_lst = xmalloc(sizeof(size_t *) * O);
|
316
|
+
for (size_t o = 0; o < O; o++) {
|
317
|
+
idx_cnt[o] = cnt[o];
|
318
|
+
idx_lst[o] = xmalloc(sizeof(size_t) * cnt[o]);
|
319
|
+
}
|
320
|
+
// Populate the index
|
321
|
+
info(" 2/2 -- Populate the index\n");
|
322
|
+
for (size_t o = 0; o < O; o++)
|
323
|
+
cnt[o] = 0, lcl[o] = none;
|
324
|
+
for (size_t s = 0; s < S; s++) {
|
325
|
+
// List actives blocks
|
326
|
+
const seq_t *seq = mdl->train->seq[s];
|
327
|
+
for (int t = 0; t < seq->len; t++) {
|
328
|
+
for (size_t b = 0; b < seq->pos[t].ucnt; b++)
|
329
|
+
lcl[seq->pos[t].uobs[b]] = s;
|
330
|
+
for (size_t b = 0; b < seq->pos[t].bcnt; b++)
|
331
|
+
lcl[seq->pos[t].bobs[b]] = s;
|
332
|
+
}
|
333
|
+
// Build index
|
334
|
+
for (size_t o = 0; o < O; o++)
|
335
|
+
if (lcl[o] == s)
|
336
|
+
idx_lst[o][cnt[o]++] = s;
|
337
|
+
}
|
338
|
+
info(" Done\n");
|
339
|
+
// Allocate the specific trainer of BCD
|
340
|
+
bcd_t *bcd = xmalloc(sizeof(bcd_t));
|
341
|
+
bcd->ugrd = xvm_new(Y);
|
342
|
+
bcd->uhes = xvm_new(Y);
|
343
|
+
bcd->bgrd = xvm_new(Y * Y);
|
344
|
+
bcd->bhes = xvm_new(Y * Y);
|
345
|
+
bcd->actpos = xmalloc(sizeof(size_t) * T);
|
346
|
+
bcd->grd = grd_new(mdl, NULL);
|
347
|
+
// And train the model
|
348
|
+
for (int i = 0; i < K; i++) {
|
349
|
+
for (size_t o = 0; o < O; o++) {
|
350
|
+
// Clear the gradient and the hessian
|
351
|
+
for (size_t y = 0, d = 0; y < Y; y++) {
|
352
|
+
bcd->ugrd[y] = 0.0;
|
353
|
+
bcd->uhes[y] = 0.0;
|
354
|
+
for (size_t yp = 0; yp < Y; yp++, d++) {
|
355
|
+
bcd->bgrd[d] = 0.0;
|
356
|
+
bcd->bhes[d] = 0.0;
|
357
|
+
}
|
358
|
+
}
|
359
|
+
// Process active sequences
|
360
|
+
for (size_t s = 0; s < idx_cnt[o]; s++) {
|
361
|
+
const size_t id = idx_lst[o][s];
|
362
|
+
const seq_t *seq = mdl->train->seq[id];
|
363
|
+
bcd_actpos(mdl, bcd, seq, o);
|
364
|
+
grd_check(bcd->grd, seq->len);
|
365
|
+
if (mdl->opt->sparse) {
|
366
|
+
grd_spdopsi(bcd->grd, seq);
|
367
|
+
grd_spfwdbwd(bcd->grd, seq);
|
368
|
+
bcd_spgradhes(mdl, bcd, seq, o);
|
369
|
+
} else {
|
370
|
+
grd_fldopsi(bcd->grd, seq);
|
371
|
+
grd_flfwdbwd(bcd->grd, seq);
|
372
|
+
bcd_flgradhes(mdl, bcd, seq, o);
|
373
|
+
}
|
374
|
+
}
|
375
|
+
// And update the model
|
376
|
+
bcd_update(mdl, bcd, o);
|
377
|
+
}
|
378
|
+
if (!uit_progress(mdl, i + 1, -1.0))
|
379
|
+
break;
|
380
|
+
}
|
381
|
+
// Cleanup memory
|
382
|
+
grd_free(bcd->grd);
|
383
|
+
xvm_free(bcd->ugrd); xvm_free(bcd->uhes);
|
384
|
+
xvm_free(bcd->bgrd); xvm_free(bcd->bhes);
|
385
|
+
free(bcd->actpos);
|
386
|
+
free(bcd);
|
387
|
+
for (size_t o = 0; o < O; o++)
|
388
|
+
free(idx_lst[o]);
|
389
|
+
free(idx_lst);
|
390
|
+
free(idx_cnt);
|
391
|
+
}
|
392
|
+
|