wapiti 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.simplecov +3 -0
- data/Gemfile +25 -2
- data/HISTORY.md +5 -1
- data/LICENSE +14 -13
- data/README.md +9 -16
- data/Rakefile +38 -8
- data/ext/wapiti/bcd.c +126 -124
- data/ext/wapiti/decoder.c +203 -124
- data/ext/wapiti/decoder.h +6 -4
- data/ext/wapiti/extconf.rb +2 -2
- data/ext/wapiti/gradient.c +491 -320
- data/ext/wapiti/gradient.h +52 -34
- data/ext/wapiti/lbfgs.c +74 -33
- data/ext/wapiti/model.c +47 -37
- data/ext/wapiti/model.h +22 -20
- data/ext/wapiti/native.c +850 -839
- data/ext/wapiti/native.h +1 -1
- data/ext/wapiti/options.c +52 -20
- data/ext/wapiti/options.h +37 -30
- data/ext/wapiti/pattern.c +35 -33
- data/ext/wapiti/pattern.h +12 -11
- data/ext/wapiti/progress.c +14 -13
- data/ext/wapiti/progress.h +3 -2
- data/ext/wapiti/quark.c +14 -16
- data/ext/wapiti/quark.h +6 -5
- data/ext/wapiti/reader.c +83 -69
- data/ext/wapiti/reader.h +11 -9
- data/ext/wapiti/rprop.c +84 -43
- data/ext/wapiti/sequence.h +18 -16
- data/ext/wapiti/sgdl1.c +45 -43
- data/ext/wapiti/thread.c +19 -17
- data/ext/wapiti/thread.h +5 -4
- data/ext/wapiti/tools.c +7 -7
- data/ext/wapiti/tools.h +3 -4
- data/ext/wapiti/trainers.h +1 -1
- data/ext/wapiti/vmath.c +40 -38
- data/ext/wapiti/vmath.h +12 -11
- data/ext/wapiti/wapiti.c +159 -37
- data/ext/wapiti/wapiti.h +18 -4
- data/lib/wapiti.rb +15 -15
- data/lib/wapiti/errors.rb +15 -15
- data/lib/wapiti/model.rb +92 -84
- data/lib/wapiti/options.rb +123 -124
- data/lib/wapiti/utility.rb +14 -14
- data/lib/wapiti/version.rb +2 -2
- data/spec/spec_helper.rb +29 -9
- data/spec/wapiti/model_spec.rb +230 -194
- data/spec/wapiti/native_spec.rb +7 -8
- data/spec/wapiti/options_spec.rb +184 -174
- data/wapiti.gemspec +22 -8
- metadata +38 -42
- data/.gitignore +0 -5
data/ext/wapiti/decoder.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -29,6 +29,7 @@
|
|
29
29
|
#define decoder_h
|
30
30
|
|
31
31
|
#include <stddef.h>
|
32
|
+
#include <stdint.h>
|
32
33
|
#include <stdio.h>
|
33
34
|
|
34
35
|
#include "wapiti.h"
|
@@ -36,11 +37,12 @@
|
|
36
37
|
#include "sequence.h"
|
37
38
|
|
38
39
|
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
|
39
|
-
|
40
|
-
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq,
|
41
|
-
|
40
|
+
uint32_t out[], double *sc, double psc[]);
|
41
|
+
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
|
42
|
+
uint32_t out[][N], double sc[], double psc[][N]);
|
42
43
|
|
43
44
|
void tag_label(mdl_t *mdl, FILE *fin, FILE *fout);
|
44
45
|
void tag_eval(mdl_t *mdl, double *te, double *se);
|
46
|
+
|
45
47
|
#endif
|
46
48
|
|
data/ext/wapiti/extconf.rb
CHANGED
data/ext/wapiti/gradient.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
* Wapiti - A linear-chain CRF tool
|
3
3
|
*
|
4
|
-
* Copyright (c) 2009-
|
4
|
+
* Copyright (c) 2009-2013 CNRS
|
5
5
|
* All rights reserved.
|
6
6
|
*
|
7
7
|
* Redistribution and use in source and binary forms, with or without
|
@@ -26,8 +26,9 @@
|
|
26
26
|
*/
|
27
27
|
#include <math.h>
|
28
28
|
#include <stddef.h>
|
29
|
-
#include <
|
29
|
+
#include <stdint.h>
|
30
30
|
#include <stdio.h>
|
31
|
+
#include <stdlib.h>
|
31
32
|
#include <string.h>
|
32
33
|
|
33
34
|
#include "wapiti.h"
|
@@ -40,41 +41,71 @@
|
|
40
41
|
#include "thread.h"
|
41
42
|
#include "vmath.h"
|
42
43
|
|
44
|
+
/* atm_inc:
|
45
|
+
*   Atomically increment the value pointed to by [value] by [inc]. If ATM_ANSI is
|
46
|
+
*   defined this is NOT atomic at all, so the caller has to deal with it.
|
47
|
+
*/
|
48
|
+
#ifdef ATM_ANSI
|
49
|
+
static inline
|
50
|
+
void atm_inc(double *value, double inc) {
|
51
|
+
*value += inc;
|
52
|
+
}
|
53
|
+
#else
|
54
|
+
static inline
|
55
|
+
void atm_inc(volatile double *value, double inc) {
|
56
|
+
while (1) {
|
57
|
+
volatile union {
|
58
|
+
double d;
|
59
|
+
uint64_t u;
|
60
|
+
} old, new;
|
61
|
+
old.d = *value;
|
62
|
+
new.d = old.d + inc;
|
63
|
+
uint64_t *ptr = (uint64_t *)value;
|
64
|
+
if (__sync_bool_compare_and_swap(ptr, old.u, new.u))
|
65
|
+
break;
|
66
|
+
}
|
67
|
+
}
|
68
|
+
#endif
|
69
|
+
|
43
70
|
/******************************************************************************
|
44
|
-
* Maxent
|
71
|
+
* Maxent gradient computation
|
45
72
|
*
|
46
|
-
* Maxent or maximum entropy models are
|
47
|
-
*
|
48
|
-
*
|
49
|
-
*
|
73
|
+
* Maxent or maximum entropy models are multi class logistic regression (see
|
74
|
+
* [1]). They can be viewed as a special class of CRF models where there
|
75
|
+
* are no dependencies between the output labels. This means that the
|
76
|
+
* normalization is local to each node and can be done a lot more efficiently
|
77
|
+
* as we do not have to perform the forward backward procedure.
|
50
78
|
*
|
51
|
-
* This code
|
52
|
-
*
|
53
|
-
*
|
79
|
+
* This code is used both when the maxent type of model is used and in other
|
80
|
+
* modes if the sequence length is one or if there are no bigram features.
|
81
|
+
*
|
82
|
+
* [1] A maximum entropy approach to natural language processing, A. Berger
|
83
|
+
* and S. Della Pietra and V. Della Pietra, Computational Linguistics,
|
84
|
+
* (22-1), March 1996.
|
54
85
|
******************************************************************************/
|
55
|
-
void
|
56
|
-
const mdl_t *mdl =
|
57
|
-
const double
|
58
|
-
const
|
59
|
-
const
|
60
|
-
double *psi =
|
61
|
-
double *g =
|
62
|
-
for (
|
86
|
+
void grd_domaxent(grd_st_t *grd_st, const seq_t *seq) {
|
87
|
+
const mdl_t *mdl = grd_st->mdl;
|
88
|
+
const double *x = mdl->theta;
|
89
|
+
const uint32_t T = seq->len;
|
90
|
+
const uint32_t Y = mdl->nlbl;
|
91
|
+
double *psi = grd_st->psi;
|
92
|
+
double *g = grd_st->g;
|
93
|
+
for (uint32_t t = 0; t < T; t++) {
|
63
94
|
const pos_t *pos = &(seq->pos[t]);
|
64
95
|
// We first compute for each Y the sum of weights of all
|
65
96
|
// features active in the sample:
|
66
97
|
// Ψ(y,x^i) = \exp( ∑_k θ_k f_k(y,x^i) )
|
67
98
|
// Z_θ(x^i) = ∑_y Ψ(y,x^i)
|
68
99
|
double Z = 0.0;
|
69
|
-
for (
|
100
|
+
for (uint32_t y = 0; y < Y; y++)
|
70
101
|
psi[y] = 0.0;
|
71
|
-
for (
|
102
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
72
103
|
const double *wgh = x + mdl->uoff[pos->uobs[n]];
|
73
|
-
for (
|
104
|
+
for (uint32_t y = 0; y < Y; y++)
|
74
105
|
psi[y] += wgh[y];
|
75
106
|
}
|
76
107
|
double lloss = psi[pos->lbl];
|
77
|
-
for (
|
108
|
+
for (uint32_t y = 0; y < Y; y++) {
|
78
109
|
psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
|
79
110
|
Z += psi[y];
|
80
111
|
}
|
@@ -85,22 +116,111 @@ void grd_dosingle(grd_t *grd, const seq_t *seq) {
|
|
85
116
|
// E_{q_θ}(x,y) - E_{p}(x,y)
|
86
117
|
// and we can compute the expectation over the model with:
|
87
118
|
// E_{q_θ}(x,y) = f_k(y,x^i) * ψ(y,x) / Z_θ(x)
|
88
|
-
for (
|
119
|
+
for (uint32_t y = 0; y < Y; y++)
|
89
120
|
psi[y] /= Z;
|
90
|
-
for (
|
121
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
91
122
|
double *grd = g + mdl->uoff[pos->uobs[n]];
|
92
|
-
for (
|
93
|
-
grd
|
94
|
-
grd
|
123
|
+
for (uint32_t y = 0; y < Y; y++)
|
124
|
+
atm_inc(grd + y, psi[y]);
|
125
|
+
atm_inc(grd + pos->lbl, -1.0);
|
95
126
|
}
|
96
127
|
// And finally the log-likelihood with:
|
97
128
|
// L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
|
98
|
-
|
129
|
+
grd_st->lloss += log(Z) - lloss;
|
99
130
|
}
|
100
131
|
}
|
101
132
|
|
102
133
|
/******************************************************************************
|
103
|
-
*
|
134
|
+
* Maximum entropy markov model gradient computation
|
135
|
+
*
|
136
|
+
* Maximum entropy markov models are similar to linear-chains CRFs but with
|
137
|
+
* local normalization instead of global normalization (see [2]). This change
|
138
|
+
* makes the computation a lot simpler, as at training time the gradient
|
139
|
+
* can be computed similarly to the maxent cases with the previous output
|
140
|
+
* label observed.
|
141
|
+
*
|
142
|
+
* This means that for bigram features we only have to consider the reference
|
143
|
+
* label at previous position instead of all possible labels, so we don't have
|
144
|
+
* to perform the forward backward. Bigram features are handled in the same
|
145
|
+
* way as unigram features.
|
146
|
+
*
|
147
|
+
* [2] Maximum Entropy Markov Models for Information Extraction and
|
148
|
+
* Segmentation, A. McCallum and D. Freitag and F. Pereira, 2000,
|
149
|
+
* Proceedings of ICML 2000 , 591–598. Stanford, California.
|
150
|
+
******************************************************************************/
|
151
|
+
void grd_domemm(grd_st_t *grd_st, const seq_t *seq) {
|
152
|
+
const mdl_t *mdl = grd_st->mdl;
|
153
|
+
const double *x = mdl->theta;
|
154
|
+
const uint32_t T = seq->len;
|
155
|
+
const uint32_t Y = mdl->nlbl;
|
156
|
+
double *psi = grd_st->psi;
|
157
|
+
double *g = grd_st->g;
|
158
|
+
for (uint32_t t = 0; t < T; t++) {
|
159
|
+
const pos_t *pos = &(seq->pos[t]);
|
160
|
+
// We first compute for each Y the sum of weights of all
|
161
|
+
// features active in the sample:
|
162
|
+
// Ψ(y,x^i) = \exp( ∑_k θ_k f_k(y_t-1, y,x^i) )
|
163
|
+
// Z_θ(x^i) = ∑_y Ψ(y,x^i)
|
164
|
+
// Bigram features rely on the gold label at previous position
|
165
|
+
// for the markov dependency unlike in CRFs.
|
166
|
+
double Z = 0.0;
|
167
|
+
for (uint32_t y = 0; y < Y; y++)
|
168
|
+
psi[y] = 0.0;
|
169
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
170
|
+
const double *wgh = x + mdl->uoff[pos->uobs[n]];
|
171
|
+
for (uint32_t y = 0; y < Y; y++)
|
172
|
+
psi[y] += wgh[y];
|
173
|
+
}
|
174
|
+
if (t != 0) {
|
175
|
+
const uint32_t yp = seq->pos[t - 1].lbl;
|
176
|
+
const uint32_t d = yp * Y;
|
177
|
+
for (uint32_t y = 0; y < Y; y++) {
|
178
|
+
double sum = 0.0;
|
179
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
180
|
+
const uint64_t o = pos->bobs[n];
|
181
|
+
sum += x[mdl->boff[o] + d + y];
|
182
|
+
}
|
183
|
+
psi[y] += sum;
|
184
|
+
}
|
185
|
+
}
|
186
|
+
double lloss = psi[pos->lbl];
|
187
|
+
for (uint32_t y = 0; y < Y; y++) {
|
188
|
+
psi[y] = (psi[y] == 0.0) ? 1.0 : exp(psi[y]);
|
189
|
+
Z += psi[y];
|
190
|
+
}
|
191
|
+
// Now, we can compute the gradient update, for each active
|
192
|
+
// feature in the sample the update is the expectation over the
|
193
|
+
// current model minus the expectation over the observed
|
194
|
+
// distribution:
|
195
|
+
// E_{q_θ}(x,y) - E_{p}(x,y)
|
196
|
+
// and we can compute the expectation over the model with:
|
197
|
+
// E_{q_θ}(x,y) = f_k(y_t-1, y,x^i) * ψ(y,x) / Z_θ(x)
|
198
|
+
for (uint32_t y = 0; y < Y; y++)
|
199
|
+
psi[y] /= Z;
|
200
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
201
|
+
double *grd = g + mdl->uoff[pos->uobs[n]];
|
202
|
+
for (uint32_t y = 0; y < Y; y++)
|
203
|
+
atm_inc(grd + y, psi[y]);
|
204
|
+
atm_inc(grd + pos->lbl, -1.0);
|
205
|
+
}
|
206
|
+
if (t != 0) {
|
207
|
+
const uint32_t yp = seq->pos[t - 1].lbl;
|
208
|
+
const uint32_t d = yp * Y;
|
209
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
210
|
+
double *grd = g + mdl->boff[pos->bobs[n]] + d;
|
211
|
+
for (uint32_t y = 0; y < Y; y++)
|
212
|
+
atm_inc(grd + y, psi[y]);
|
213
|
+
atm_inc(grd + pos->lbl, -1.0);
|
214
|
+
}
|
215
|
+
}
|
216
|
+
// And finally the log-likelihood with:
|
217
|
+
// L_θ(x^i,y^i) = log(Z_θ(x^i)) - log(ψ(y^i,x^i))
|
218
|
+
grd_st->lloss += log(Z) - lloss;
|
219
|
+
}
|
220
|
+
}
|
221
|
+
|
222
|
+
/******************************************************************************
|
223
|
+
* Linear-chain CRF gradient computation
|
104
224
|
*
|
105
225
|
* This section is responsible for computing the gradient of the
|
106
226
|
* log-likelihood function to optimize over a single sequence.
|
@@ -140,80 +260,6 @@ void grd_dosingle(grd_t *grd, const seq_t *seq) {
|
|
140
260
|
* the worst case use as little memory as possible.
|
141
261
|
******************************************************************************/
|
142
262
|
|
143
|
-
/* grd_check:
|
144
|
-
* Check that enough memory is allocated in the gradient object so that the
|
145
|
-
* linear-chain codepath can be computed for a sequence of the given length.
|
146
|
-
*/
|
147
|
-
void grd_check(grd_t *grd, int len) {
|
148
|
-
// Check if user ask for clearing the state tracker or if he requested a
|
149
|
-
// bigger tracker. In this case we have to free the previous allocated
|
150
|
-
// memory.
|
151
|
-
if (len == 0 || (len > grd->len && grd->len != 0)) {
|
152
|
-
if (grd->mdl->opt->sparse) {
|
153
|
-
xvm_free(grd->psiuni); grd->psiuni = NULL;
|
154
|
-
free(grd->psiyp); grd->psiyp = NULL;
|
155
|
-
free(grd->psiidx); grd->psiidx = NULL;
|
156
|
-
free(grd->psioff); grd->psioff = NULL;
|
157
|
-
}
|
158
|
-
xvm_free(grd->psi); grd->psi = NULL;
|
159
|
-
xvm_free(grd->alpha); grd->alpha = NULL;
|
160
|
-
xvm_free(grd->beta); grd->beta = NULL;
|
161
|
-
xvm_free(grd->unorm); grd->unorm = NULL;
|
162
|
-
xvm_free(grd->bnorm); grd->bnorm = NULL;
|
163
|
-
xvm_free(grd->scale); grd->scale = NULL;
|
164
|
-
grd->len = 0;
|
165
|
-
}
|
166
|
-
if (len == 0 || len <= grd->len)
|
167
|
-
return;
|
168
|
-
// If we are here, we have to allocate a new state. This is simple, we
|
169
|
-
// just have to take care of the special case for sparse mode.
|
170
|
-
const size_t Y = grd->mdl->nlbl;
|
171
|
-
const int T = len;
|
172
|
-
grd->psi = xvm_new(T * Y * Y);
|
173
|
-
grd->alpha = xvm_new(T * Y);
|
174
|
-
grd->beta = xvm_new(T * Y);
|
175
|
-
grd->scale = xvm_new(T);
|
176
|
-
grd->unorm = xvm_new(T);
|
177
|
-
grd->bnorm = xvm_new(T);
|
178
|
-
if (grd->mdl->opt->sparse) {
|
179
|
-
grd->psiuni = xvm_new(T * Y);
|
180
|
-
grd->psiyp = wapiti_xmalloc(sizeof(size_t) * T * Y * Y);
|
181
|
-
grd->psiidx = wapiti_xmalloc(sizeof(size_t) * T * Y);
|
182
|
-
grd->psioff = wapiti_xmalloc(sizeof(size_t) * T);
|
183
|
-
}
|
184
|
-
grd->len = len;
|
185
|
-
}
|
186
|
-
|
187
|
-
/* grd_new:
|
188
|
-
* Allocation memory for gradient computation state. This allocate memory for
|
189
|
-
* the longest sequence present in the data set.
|
190
|
-
*/
|
191
|
-
grd_t *grd_new(mdl_t *mdl, double *g) {
|
192
|
-
grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
|
193
|
-
grd->mdl = mdl;
|
194
|
-
grd->len = 0;
|
195
|
-
grd->g = g;
|
196
|
-
grd->psi = NULL;
|
197
|
-
grd->psiuni = NULL;
|
198
|
-
grd->psiyp = NULL;
|
199
|
-
grd->psiidx = NULL;
|
200
|
-
grd->psioff = NULL;
|
201
|
-
grd->alpha = NULL;
|
202
|
-
grd->beta = NULL;
|
203
|
-
grd->unorm = NULL;
|
204
|
-
grd->bnorm = NULL;
|
205
|
-
grd->scale = NULL;
|
206
|
-
return grd;
|
207
|
-
}
|
208
|
-
|
209
|
-
/* grd_free:
|
210
|
-
* Free all memory used by gradient computation.
|
211
|
-
*/
|
212
|
-
void grd_free(grd_t *grd) {
|
213
|
-
grd_check(grd, 0);
|
214
|
-
free(grd);
|
215
|
-
}
|
216
|
-
|
217
263
|
/* grd_fldopsi:
|
218
264
|
* We first have to compute the Ψ_t(y',y,x) weights defined as
|
219
265
|
* Ψ_t(y',y,x) = \exp( ∑_k θ_k f_k(y',y,x_t) )
|
@@ -235,38 +281,38 @@ void grd_free(grd_t *grd) {
|
|
235
281
|
* 3/ we take the component-wise exponential of the resulting matrix
|
236
282
|
* (this can be done efficiently with vector maths)
|
237
283
|
*/
|
238
|
-
void grd_fldopsi(
|
239
|
-
const mdl_t *mdl =
|
240
|
-
const double
|
241
|
-
const
|
242
|
-
const
|
243
|
-
double (*psi)[T][Y][Y] = (void *)
|
244
|
-
for (
|
284
|
+
void grd_fldopsi(grd_st_t *grd_st, const seq_t *seq) {
|
285
|
+
const mdl_t *mdl = grd_st->mdl;
|
286
|
+
const double *x = mdl->theta;
|
287
|
+
const uint32_t Y = mdl->nlbl;
|
288
|
+
const uint32_t T = seq->len;
|
289
|
+
double (*psi)[T][Y][Y] = (void *)grd_st->psi;
|
290
|
+
for (uint32_t t = 0; t < T; t++) {
|
245
291
|
const pos_t *pos = &(seq->pos[t]);
|
246
|
-
for (
|
292
|
+
for (uint32_t y = 0; y < Y; y++) {
|
247
293
|
double sum = 0.0;
|
248
|
-
for (
|
249
|
-
const
|
294
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
295
|
+
const uint64_t o = pos->uobs[n];
|
250
296
|
sum += x[mdl->uoff[o] + y];
|
251
297
|
}
|
252
|
-
for (
|
298
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
253
299
|
(*psi)[t][yp][y] = sum;
|
254
300
|
}
|
255
301
|
}
|
256
|
-
for (
|
302
|
+
for (uint32_t t = 1; t < T; t++) {
|
257
303
|
const pos_t *pos = &(seq->pos[t]);
|
258
|
-
for (
|
259
|
-
for (
|
304
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
305
|
+
for (uint32_t y = 0; y < Y; y++, d++) {
|
260
306
|
double sum = 0.0;
|
261
|
-
for (
|
262
|
-
const
|
307
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
308
|
+
const uint64_t o = pos->bobs[n];
|
263
309
|
sum += x[mdl->boff[o] + d];
|
264
310
|
}
|
265
311
|
(*psi)[t][yp][y] += sum;
|
266
312
|
}
|
267
313
|
}
|
268
314
|
}
|
269
|
-
xvm_expma((double *)psi, (double *)psi, 0.0, (
|
315
|
+
xvm_expma((double *)psi, (double *)psi, 0.0, (uint64_t)T * Y * Y);
|
270
316
|
}
|
271
317
|
|
272
318
|
/* grd_spdopsi:
|
@@ -290,36 +336,36 @@ void grd_fldopsi(grd_t *grd, const seq_t *seq) {
|
|
290
336
|
* one. (here also this can be done efficiently with vector
|
291
337
|
* maths)
|
292
338
|
*/
|
293
|
-
void grd_spdopsi(
|
294
|
-
const mdl_t *mdl =
|
295
|
-
const double
|
296
|
-
const
|
297
|
-
const
|
298
|
-
double
|
299
|
-
double
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
for (
|
339
|
+
void grd_spdopsi(grd_st_t *grd_st, const seq_t *seq) {
|
340
|
+
const mdl_t *mdl = grd_st->mdl;
|
341
|
+
const double *x = mdl->theta;
|
342
|
+
const uint32_t Y = mdl->nlbl;
|
343
|
+
const uint32_t T = seq->len;
|
344
|
+
double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
|
345
|
+
double *psival = grd_st->psi;
|
346
|
+
uint32_t *psiyp = grd_st->psiyp;
|
347
|
+
uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
|
348
|
+
uint32_t *psioff = grd_st->psioff;
|
349
|
+
for (uint32_t t = 0; t < T; t++) {
|
304
350
|
const pos_t *pos = &(seq->pos[t]);
|
305
|
-
for (
|
351
|
+
for (uint32_t y = 0; y < Y; y++) {
|
306
352
|
double sum = 0.0;
|
307
|
-
for (
|
308
|
-
const
|
353
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
354
|
+
const uint64_t o = pos->uobs[n];
|
309
355
|
sum += x[mdl->uoff[o] + y];
|
310
356
|
}
|
311
357
|
(*psiuni)[t][y] = sum;
|
312
358
|
}
|
313
359
|
}
|
314
|
-
|
315
|
-
for (
|
360
|
+
uint32_t off = 0;
|
361
|
+
for (uint32_t t = 1; t < T; t++) {
|
316
362
|
const pos_t *pos = &(seq->pos[t]);
|
317
363
|
psioff[t] = off;
|
318
|
-
for (
|
319
|
-
for (
|
364
|
+
for (uint32_t y = 0, nnz = 0; y < Y; y++) {
|
365
|
+
for (uint32_t yp = 0; yp < Y; yp++) {
|
320
366
|
double sum = 0.0;
|
321
|
-
for (
|
322
|
-
const
|
367
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
368
|
+
const uint64_t o = pos->bobs[n];
|
323
369
|
sum += x[mdl->boff[o] + yp * Y + y];
|
324
370
|
}
|
325
371
|
if (sum == 0.0)
|
@@ -331,7 +377,7 @@ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
|
|
331
377
|
(*psiidx)[t][y] = nnz;
|
332
378
|
}
|
333
379
|
}
|
334
|
-
xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (
|
380
|
+
xvm_expma((double *)psiuni, (double *)psiuni, 0.0, (uint64_t)T * Y);
|
335
381
|
xvm_expma((double *)psival, (double *)psival, 1.0, off);
|
336
382
|
}
|
337
383
|
|
@@ -356,42 +402,42 @@ void grd_spdopsi(grd_t *grd, const seq_t *seq) {
|
|
356
402
|
* with α-scale_t the scaling factor used for the α vector at position t
|
357
403
|
* in the forward recursion.
|
358
404
|
*/
|
359
|
-
void grd_flfwdbwd(
|
360
|
-
const mdl_t *mdl =
|
361
|
-
const
|
362
|
-
const
|
363
|
-
const double (*psi)[T][Y][Y] = (void *)
|
364
|
-
double (*alpha)[T][Y] = (void *)
|
365
|
-
double (*beta )[T][Y] = (void *)
|
366
|
-
double *scale =
|
367
|
-
double *unorm =
|
368
|
-
double *bnorm =
|
369
|
-
for (
|
405
|
+
void grd_flfwdbwd(grd_st_t *grd_st, const seq_t *seq) {
|
406
|
+
const mdl_t *mdl = grd_st->mdl;
|
407
|
+
const uint64_t Y = mdl->nlbl;
|
408
|
+
const uint32_t T = seq->len;
|
409
|
+
const double (*psi)[T][Y][Y] = (void *)grd_st->psi;
|
410
|
+
double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
411
|
+
double (*beta )[T][Y] = (void *)grd_st->beta;
|
412
|
+
double *scale = grd_st->scale;
|
413
|
+
double *unorm = grd_st->unorm;
|
414
|
+
double *bnorm = grd_st->bnorm;
|
415
|
+
for (uint32_t y = 0; y < Y; y++)
|
370
416
|
(*alpha)[0][y] = (*psi)[0][0][y];
|
371
417
|
scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
|
372
|
-
for (
|
373
|
-
for (
|
418
|
+
for (uint32_t t = 1; t < grd_st->last + 1; t++) {
|
419
|
+
for (uint32_t y = 0; y < Y; y++) {
|
374
420
|
double sum = 0.0;
|
375
|
-
for (
|
421
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
376
422
|
sum += (*alpha)[t - 1][yp] * (*psi)[t][yp][y];
|
377
423
|
(*alpha)[t][y] = sum;
|
378
424
|
}
|
379
425
|
scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
|
380
426
|
}
|
381
|
-
for (
|
427
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
382
428
|
(*beta)[T - 1][yp] = 1.0 / Y;
|
383
|
-
for (
|
384
|
-
for (
|
429
|
+
for (uint32_t t = T - 1; t > grd_st->first; t--) {
|
430
|
+
for (uint32_t yp = 0; yp < Y; yp++) {
|
385
431
|
double sum = 0.0;
|
386
|
-
for (
|
432
|
+
for (uint32_t y = 0; y < Y; y++)
|
387
433
|
sum += (*beta)[t][y] * (*psi)[t][yp][y];
|
388
434
|
(*beta)[t - 1][yp] = sum;
|
389
435
|
}
|
390
436
|
xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
|
391
437
|
}
|
392
|
-
for (
|
438
|
+
for (uint32_t t = 0; t < T; t++) {
|
393
439
|
double z = 0.0;
|
394
|
-
for (
|
440
|
+
for (uint32_t y = 0; y < Y; y++)
|
395
441
|
z += (*alpha)[t][y] * (*beta)[t][y];
|
396
442
|
unorm[t] = 1.0 / z;
|
397
443
|
bnorm[t] = scale[t] / z;
|
@@ -416,67 +462,67 @@ void grd_flfwdbwd(grd_t *grd, const seq_t *seq) {
|
|
416
462
|
* And here also we reduce the number of multiplications if the matrix is
|
417
463
|
* really sparse.
|
418
464
|
*/
|
419
|
-
void grd_spfwdbwd(
|
420
|
-
const mdl_t *mdl =
|
421
|
-
const
|
422
|
-
const
|
423
|
-
const double
|
424
|
-
const double
|
425
|
-
const
|
426
|
-
const
|
427
|
-
const
|
428
|
-
double (*alpha)[T][Y] = (void *)
|
429
|
-
double (*beta )[T][Y] = (void *)
|
430
|
-
double *scale =
|
431
|
-
double *unorm =
|
432
|
-
double *bnorm =
|
433
|
-
for (
|
465
|
+
void grd_spfwdbwd(grd_st_t *grd_st, const seq_t *seq) {
|
466
|
+
const mdl_t *mdl = grd_st->mdl;
|
467
|
+
const uint32_t Y = mdl->nlbl;
|
468
|
+
const uint32_t T = seq->len;
|
469
|
+
const double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
|
470
|
+
const double *psival = grd_st->psi;
|
471
|
+
const uint32_t *psiyp = grd_st->psiyp;
|
472
|
+
const uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
|
473
|
+
const uint32_t *psioff = grd_st->psioff;
|
474
|
+
double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
475
|
+
double (*beta )[T][Y] = (void *)grd_st->beta;
|
476
|
+
double *scale = grd_st->scale;
|
477
|
+
double *unorm = grd_st->unorm;
|
478
|
+
double *bnorm = grd_st->bnorm;
|
479
|
+
for (uint32_t y = 0; y < Y; y++)
|
434
480
|
(*alpha)[0][y] = (*psiuni)[0][y];
|
435
481
|
scale[0] = xvm_unit((*alpha)[0], (*alpha)[0], Y);
|
436
|
-
for (
|
437
|
-
for (
|
482
|
+
for (uint32_t t = 1; t < grd_st->last + 1; t++) {
|
483
|
+
for (uint32_t y = 0; y < Y; y++)
|
438
484
|
(*alpha)[t][y] = 1.0;
|
439
|
-
const
|
440
|
-
for (
|
485
|
+
const uint32_t off = psioff[t];
|
486
|
+
for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
|
441
487
|
while (n >= (*psiidx)[t][y])
|
442
488
|
y++;
|
443
489
|
while (n < (*psiidx)[t][y]) {
|
444
|
-
const
|
445
|
-
const double
|
490
|
+
const uint32_t yp = psiyp [off + n];
|
491
|
+
const double v = psival[off + n];
|
446
492
|
(*alpha)[t][y] += (*alpha)[t - 1][yp] * v;
|
447
493
|
n++;
|
448
494
|
}
|
449
495
|
}
|
450
|
-
for (
|
496
|
+
for (uint32_t y = 0; y < Y; y++)
|
451
497
|
(*alpha)[t][y] *= (*psiuni)[t][y];
|
452
498
|
scale[t] = xvm_unit((*alpha)[t], (*alpha)[t], Y);
|
453
499
|
}
|
454
|
-
for (
|
500
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
455
501
|
(*beta)[T - 1][yp] = 1.0 / Y;
|
456
|
-
for (
|
502
|
+
for (uint32_t t = T - 1; t > grd_st->first; t--) {
|
457
503
|
double sum = 0.0, tmp[Y];
|
458
|
-
for (
|
504
|
+
for (uint32_t y = 0; y < Y; y++) {
|
459
505
|
tmp[y] = (*beta)[t][y] * (*psiuni)[t][y];
|
460
506
|
sum += tmp[y];
|
461
507
|
}
|
462
|
-
for (
|
508
|
+
for (uint32_t y = 0; y < Y; y++)
|
463
509
|
(*beta)[t - 1][y] = sum;
|
464
|
-
const
|
465
|
-
for (
|
510
|
+
const uint32_t off = psioff[t];
|
511
|
+
for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
|
466
512
|
while (n >= (*psiidx)[t][y])
|
467
513
|
y++;
|
468
514
|
while (n < (*psiidx)[t][y]) {
|
469
|
-
const
|
470
|
-
const double
|
515
|
+
const uint32_t yp = psiyp [off + n];
|
516
|
+
const double v = psival[off + n];
|
471
517
|
(*beta)[t - 1][yp] += v * tmp[y];
|
472
518
|
n++;
|
473
519
|
}
|
474
520
|
}
|
475
521
|
xvm_unit((*beta)[t - 1], (*beta)[t - 1], Y);
|
476
522
|
}
|
477
|
-
for (
|
523
|
+
for (uint32_t t = 0; t < T; t++) {
|
478
524
|
double z = 0.0;
|
479
|
-
for (
|
525
|
+
for (uint32_t y = 0; y < Y; y++)
|
480
526
|
z += (*alpha)[t][y] * (*beta)[t][y];
|
481
527
|
unorm[t] = 1.0 / z;
|
482
528
|
bnorm[t] = scale[t] / z;
|
@@ -509,35 +555,35 @@ void grd_spfwdbwd(grd_t *grd, const seq_t *seq) {
|
|
509
555
|
* vector but just adding the contribution of this sequence. This allows us to
|
510
556
|
* easily compute the gradient over more than one sequence.
|
511
557
|
*/
|
512
|
-
void grd_flupgrad(
|
513
|
-
const mdl_t *mdl =
|
514
|
-
const
|
515
|
-
const
|
516
|
-
const double (*psi )[T][Y][Y] = (void *)
|
517
|
-
const double (*alpha)[T][Y] = (void *)
|
518
|
-
const double (*beta )[T][Y] = (void *)
|
519
|
-
const double *unorm =
|
520
|
-
const double *bnorm =
|
521
|
-
double *g =
|
522
|
-
for (
|
558
|
+
void grd_flupgrad(grd_st_t *grd_st, const seq_t *seq) {
|
559
|
+
const mdl_t *mdl = grd_st->mdl;
|
560
|
+
const uint32_t Y = mdl->nlbl;
|
561
|
+
const uint32_t T = seq->len;
|
562
|
+
const double (*psi )[T][Y][Y] = (void *)grd_st->psi;
|
563
|
+
const double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
564
|
+
const double (*beta )[T][Y] = (void *)grd_st->beta;
|
565
|
+
const double *unorm = grd_st->unorm;
|
566
|
+
const double *bnorm = grd_st->bnorm;
|
567
|
+
double *g = grd_st->g;
|
568
|
+
for (uint32_t t = 0; t < T; t++) {
|
523
569
|
const pos_t *pos = &(seq->pos[t]);
|
524
|
-
for (
|
570
|
+
for (uint32_t y = 0; y < Y; y++) {
|
525
571
|
double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
|
526
|
-
for (
|
527
|
-
const
|
528
|
-
g
|
572
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
573
|
+
const uint64_t o = pos->uobs[n];
|
574
|
+
atm_inc(g + mdl->uoff[o] + y, e);
|
529
575
|
}
|
530
576
|
}
|
531
577
|
}
|
532
|
-
for (
|
578
|
+
for (uint32_t t = 1; t < T; t++) {
|
533
579
|
const pos_t *pos = &(seq->pos[t]);
|
534
|
-
for (
|
535
|
-
for (
|
580
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
581
|
+
for (uint32_t y = 0; y < Y; y++, d++) {
|
536
582
|
double e = (*alpha)[t - 1][yp] * (*beta)[t][y]
|
537
583
|
* (*psi)[t][yp][y] * bnorm[t];
|
538
|
-
for (
|
539
|
-
const
|
540
|
-
g
|
584
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
585
|
+
const uint64_t o = pos->bobs[n];
|
586
|
+
atm_inc(g + mdl->boff[o] + d, e);
|
541
587
|
}
|
542
588
|
}
|
543
589
|
}
|
@@ -552,55 +598,55 @@ void grd_flupgrad(grd_t *grd, const seq_t *seq) {
|
|
552
598
|
* matrix. We first fill it with the unigram component and next multiply it
|
553
599
|
* with the bigram one.
|
554
600
|
*/
|
555
|
-
void grd_spupgrad(
|
556
|
-
const mdl_t *mdl =
|
557
|
-
const
|
558
|
-
const
|
559
|
-
const double
|
560
|
-
const double
|
561
|
-
const
|
562
|
-
const
|
563
|
-
const
|
564
|
-
const double
|
565
|
-
const double
|
566
|
-
const double
|
567
|
-
const double
|
568
|
-
double *g =
|
569
|
-
for (
|
601
|
+
void grd_spupgrad(grd_st_t *grd_st, const seq_t *seq) {
|
602
|
+
const mdl_t *mdl = grd_st->mdl;
|
603
|
+
const uint32_t Y = mdl->nlbl;
|
604
|
+
const uint32_t T = seq->len;
|
605
|
+
const double (*psiuni)[T][Y] = (void *)grd_st->psiuni;
|
606
|
+
const double *psival = grd_st->psi;
|
607
|
+
const uint32_t *psiyp = grd_st->psiyp;
|
608
|
+
const uint32_t (*psiidx)[T][Y] = (void *)grd_st->psiidx;
|
609
|
+
const uint32_t *psioff = grd_st->psioff;
|
610
|
+
const double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
611
|
+
const double (*beta )[T][Y] = (void *)grd_st->beta;
|
612
|
+
const double *unorm = grd_st->unorm;
|
613
|
+
const double *bnorm = grd_st->bnorm;
|
614
|
+
double *g = grd_st->g;
|
615
|
+
for (uint32_t t = 0; t < T; t++) {
|
570
616
|
const pos_t *pos = &(seq->pos[t]);
|
571
|
-
for (
|
617
|
+
for (uint32_t y = 0; y < Y; y++) {
|
572
618
|
double e = (*alpha)[t][y] * (*beta)[t][y] * unorm[t];
|
573
|
-
for (
|
574
|
-
const
|
575
|
-
g
|
619
|
+
for (uint32_t n = 0; n < pos->ucnt; n++) {
|
620
|
+
const uint64_t o = pos->uobs[n];
|
621
|
+
atm_inc(g + mdl->uoff[o] + y, e);
|
576
622
|
}
|
577
623
|
}
|
578
624
|
}
|
579
|
-
for (
|
625
|
+
for (uint32_t t = 1; t < T; t++) {
|
580
626
|
const pos_t *pos = &(seq->pos[t]);
|
581
627
|
// We build the expectation matrix
|
582
628
|
double e[Y][Y];
|
583
|
-
for (
|
584
|
-
for (
|
629
|
+
for (uint32_t yp = 0; yp < Y; yp++)
|
630
|
+
for (uint32_t y = 0; y < Y; y++)
|
585
631
|
e[yp][y] = (*alpha)[t - 1][yp] * (*beta)[t][y]
|
586
632
|
* (*psiuni)[t][y] * bnorm[t];
|
587
|
-
const
|
588
|
-
for (
|
633
|
+
const uint32_t off = psioff[t];
|
634
|
+
for (uint32_t n = 0, y = 0; n < (*psiidx)[t][Y - 1]; ) {
|
589
635
|
while (n >= (*psiidx)[t][y])
|
590
636
|
y++;
|
591
637
|
while (n < (*psiidx)[t][y]) {
|
592
|
-
const
|
593
|
-
const double
|
638
|
+
const uint32_t yp = psiyp [off + n];
|
639
|
+
const double v = psival[off + n];
|
594
640
|
e[yp][y] += e[yp][y] * v;
|
595
641
|
n++;
|
596
642
|
}
|
597
643
|
}
|
598
644
|
// Add the expectation over the model distribution
|
599
|
-
for (
|
600
|
-
for (
|
601
|
-
for (
|
602
|
-
const
|
603
|
-
g
|
645
|
+
for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
|
646
|
+
for (uint32_t y = 0; y < Y; y++, d++) {
|
647
|
+
for (uint32_t n = 0; n < pos->bcnt; n++) {
|
648
|
+
const uint64_t o = pos->bobs[n];
|
649
|
+
atm_inc(g + mdl->boff[o] + d, e[yp][y]);
|
604
650
|
}
|
605
651
|
}
|
606
652
|
}
|
@@ -612,24 +658,24 @@ void grd_spupgrad(grd_t *grd, const seq_t *seq) {
|
|
612
658
|
* distribution. This is the second step of the gradient computation shared
|
613
659
|
* by the non-sparse and sparse version.
|
614
660
|
*/
|
615
|
-
void grd_subemp(
|
616
|
-
const mdl_t *mdl =
|
617
|
-
const
|
618
|
-
const
|
619
|
-
double *g =
|
620
|
-
for (
|
661
|
+
void grd_subemp(grd_st_t *grd_st, const seq_t *seq) {
|
662
|
+
const mdl_t *mdl = grd_st->mdl;
|
663
|
+
const uint32_t Y = mdl->nlbl;
|
664
|
+
const uint32_t T = seq->len;
|
665
|
+
double *g = grd_st->g;
|
666
|
+
for (uint32_t t = 0; t < T; t++) {
|
621
667
|
const pos_t *pos = &(seq->pos[t]);
|
622
|
-
const
|
623
|
-
for (
|
624
|
-
g
|
668
|
+
const uint32_t y = seq->pos[t].lbl;
|
669
|
+
for (uint32_t n = 0; n < pos->ucnt; n++)
|
670
|
+
atm_inc(g + mdl->uoff[pos->uobs[n]] + y, -1.0);
|
625
671
|
}
|
626
|
-
for (
|
672
|
+
for (uint32_t t = 1; t < T; t++) {
|
627
673
|
const pos_t *pos = &(seq->pos[t]);
|
628
|
-
const
|
629
|
-
const
|
630
|
-
const
|
631
|
-
for (
|
632
|
-
g
|
674
|
+
const uint32_t yp = seq->pos[t - 1].lbl;
|
675
|
+
const uint32_t y = seq->pos[t ].lbl;
|
676
|
+
const uint32_t d = yp * Y + y;
|
677
|
+
for (uint32_t n = 0; n < pos->bcnt; n++)
|
678
|
+
atm_inc(g + mdl->boff[pos->bobs[n]] + d, -1.0);
|
633
679
|
}
|
634
680
|
}
|
635
681
|
|
@@ -655,38 +701,38 @@ void grd_subemp(grd_t *grd, const seq_t *seq) {
|
|
655
701
|
* weights will be non-zero only for observations present in the sequence, we
|
656
702
|
* sum only over these ones.
|
657
703
|
*/
|
658
|
-
void grd_logloss(
|
659
|
-
const mdl_t *mdl =
|
660
|
-
const double
|
661
|
-
const
|
662
|
-
const
|
663
|
-
const double (*alpha)[T][Y] = (void *)
|
664
|
-
const double *scale =
|
704
|
+
void grd_logloss(grd_st_t *grd_st, const seq_t *seq) {
|
705
|
+
const mdl_t *mdl = grd_st->mdl;
|
706
|
+
const double *x = mdl->theta;
|
707
|
+
const uint32_t Y = mdl->nlbl;
|
708
|
+
const uint32_t T = seq->len;
|
709
|
+
const double (*alpha)[T][Y] = (void *)grd_st->alpha;
|
710
|
+
const double *scale = grd_st->scale;
|
665
711
|
double logz = 0.0;
|
666
|
-
for (
|
712
|
+
for (uint32_t y = 0; y < Y; y++)
|
667
713
|
logz += (*alpha)[T - 1][y];
|
668
714
|
logz = log(logz);
|
669
|
-
for (
|
715
|
+
for (uint32_t t = 0; t < T; t++)
|
670
716
|
logz -= log(scale[t]);
|
671
717
|
double lloss = logz;
|
672
|
-
for (
|
718
|
+
for (uint32_t t = 0; t < T; t++) {
|
673
719
|
const pos_t *pos = &(seq->pos[t]);
|
674
|
-
const
|
675
|
-
for (
|
720
|
+
const uint32_t y = seq->pos[t].lbl;
|
721
|
+
for (uint32_t n = 0; n < pos->ucnt; n++)
|
676
722
|
lloss -= x[mdl->uoff[pos->uobs[n]] + y];
|
677
723
|
}
|
678
|
-
for (
|
724
|
+
for (uint32_t t = 1; t < T; t++) {
|
679
725
|
const pos_t *pos = &(seq->pos[t]);
|
680
|
-
const
|
681
|
-
const
|
682
|
-
const
|
683
|
-
for (
|
726
|
+
const uint32_t yp = seq->pos[t - 1].lbl;
|
727
|
+
const uint32_t y = seq->pos[t ].lbl;
|
728
|
+
const uint32_t d = yp * Y + y;
|
729
|
+
for (uint32_t n = 0; n < pos->bcnt; n++)
|
684
730
|
lloss -= x[mdl->boff[pos->bobs[n]] + d];
|
685
731
|
}
|
686
|
-
|
732
|
+
grd_st->lloss += lloss;
|
687
733
|
}
|
688
734
|
|
689
|
-
/*
|
735
|
+
/* grd_docrf:
|
690
736
|
* This function compute the gradient and value of the negative log-likelihood
|
691
737
|
* of the model over a single training sequence.
|
692
738
|
*
|
@@ -694,21 +740,21 @@ void grd_logloss(grd_t *grd, const seq_t *seq) {
|
|
694
740
|
* just accumulate the values for the given sequence in it. This allow to
|
695
741
|
* easily compute the gradient over a set of sequences.
|
696
742
|
*/
|
697
|
-
void
|
698
|
-
const mdl_t *mdl =
|
699
|
-
|
700
|
-
|
743
|
+
void grd_docrf(grd_st_t *grd_st, const seq_t *seq) {
|
744
|
+
const mdl_t *mdl = grd_st->mdl;
|
745
|
+
grd_st->first = 0;
|
746
|
+
grd_st->last = seq->len - 1;
|
701
747
|
if (!mdl->opt->sparse) {
|
702
|
-
grd_fldopsi(
|
703
|
-
grd_flfwdbwd(
|
704
|
-
grd_flupgrad(
|
748
|
+
grd_fldopsi(grd_st, seq);
|
749
|
+
grd_flfwdbwd(grd_st, seq);
|
750
|
+
grd_flupgrad(grd_st, seq);
|
705
751
|
} else {
|
706
|
-
grd_spdopsi(
|
707
|
-
grd_spfwdbwd(
|
708
|
-
grd_spupgrad(
|
752
|
+
grd_spdopsi(grd_st, seq);
|
753
|
+
grd_spfwdbwd(grd_st, seq);
|
754
|
+
grd_spupgrad(grd_st, seq);
|
709
755
|
}
|
710
|
-
grd_subemp(
|
711
|
-
grd_logloss(
|
756
|
+
grd_subemp(grd_st, seq);
|
757
|
+
grd_logloss(grd_st, seq);
|
712
758
|
}
|
713
759
|
|
714
760
|
/******************************************************************************
|
@@ -731,16 +777,130 @@ void grd_doseq(grd_t *grd, const seq_t *seq) {
|
|
731
777
|
* cores, or to more thread than you have memory to hold vectors.
|
732
778
|
******************************************************************************/
|
733
779
|
|
780
|
+
/* grd_stcheck:
|
781
|
+
* Check that enough memory is allocated in the gradient object so that the
|
782
|
+
* linear-chain codepath can be computed for a sequence of the given length.
|
783
|
+
*/
|
784
|
+
void grd_stcheck(grd_st_t *grd_st, uint32_t len) {
|
785
|
+
// Check if user ask for clearing the state tracker or if he requested a
|
786
|
+
// bigger tracker. In this case we have to free the previous allocated
|
787
|
+
// memory.
|
788
|
+
if (len == 0 || (len > grd_st->len && grd_st->len != 0)) {
|
789
|
+
if (grd_st->mdl->opt->sparse) {
|
790
|
+
xvm_free(grd_st->psiuni); grd_st->psiuni = NULL;
|
791
|
+
free(grd_st->psiyp); grd_st->psiyp = NULL;
|
792
|
+
free(grd_st->psiidx); grd_st->psiidx = NULL;
|
793
|
+
free(grd_st->psioff); grd_st->psioff = NULL;
|
794
|
+
}
|
795
|
+
xvm_free(grd_st->psi); grd_st->psi = NULL;
|
796
|
+
xvm_free(grd_st->alpha); grd_st->alpha = NULL;
|
797
|
+
xvm_free(grd_st->beta); grd_st->beta = NULL;
|
798
|
+
xvm_free(grd_st->unorm); grd_st->unorm = NULL;
|
799
|
+
xvm_free(grd_st->bnorm); grd_st->bnorm = NULL;
|
800
|
+
xvm_free(grd_st->scale); grd_st->scale = NULL;
|
801
|
+
grd_st->len = 0;
|
802
|
+
}
|
803
|
+
if (len == 0 || len <= grd_st->len)
|
804
|
+
return;
|
805
|
+
// If we are here, we have to allocate a new state. This is simple, we
|
806
|
+
// just have to take care of the special case for sparse mode.
|
807
|
+
const uint32_t Y = grd_st->mdl->nlbl;
|
808
|
+
const uint32_t T = len;
|
809
|
+
grd_st->psi = xvm_new(T * Y * Y);
|
810
|
+
grd_st->alpha = xvm_new(T * Y);
|
811
|
+
grd_st->beta = xvm_new(T * Y);
|
812
|
+
grd_st->scale = xvm_new(T);
|
813
|
+
grd_st->unorm = xvm_new(T);
|
814
|
+
grd_st->bnorm = xvm_new(T);
|
815
|
+
if (grd_st->mdl->opt->sparse) {
|
816
|
+
grd_st->psiuni = xvm_new(T * Y);
|
817
|
+
grd_st->psiyp = wapiti_xmalloc(sizeof(uint32_t) * T * Y * Y);
|
818
|
+
grd_st->psiidx = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
|
819
|
+
grd_st->psioff = wapiti_xmalloc(sizeof(uint32_t) * T);
|
820
|
+
}
|
821
|
+
grd_st->len = len;
|
822
|
+
}
|
823
|
+
|
824
|
+
/* grd_stnew:
|
825
|
+
* Allocation memory for gradient computation state. This allocate memory for
|
826
|
+
* the longest sequence present in the data set.
|
827
|
+
*/
|
828
|
+
grd_st_t *grd_stnew(mdl_t *mdl, double *g) {
|
829
|
+
grd_st_t *grd_st = wapiti_xmalloc(sizeof(grd_st_t));
|
830
|
+
grd_st->mdl = mdl;
|
831
|
+
grd_st->len = 0;
|
832
|
+
grd_st->g = g;
|
833
|
+
grd_st->psi = NULL;
|
834
|
+
grd_st->psiuni = NULL;
|
835
|
+
grd_st->psiyp = NULL;
|
836
|
+
grd_st->psiidx = NULL;
|
837
|
+
grd_st->psioff = NULL;
|
838
|
+
grd_st->alpha = NULL;
|
839
|
+
grd_st->beta = NULL;
|
840
|
+
grd_st->unorm = NULL;
|
841
|
+
grd_st->bnorm = NULL;
|
842
|
+
grd_st->scale = NULL;
|
843
|
+
return grd_st;
|
844
|
+
}
|
845
|
+
|
846
|
+
/* grd_stfree:
|
847
|
+
* Free all memory used by gradient computation.
|
848
|
+
*/
|
849
|
+
void grd_stfree(grd_st_t *grd_st) {
|
850
|
+
grd_stcheck(grd_st, 0);
|
851
|
+
free(grd_st);
|
852
|
+
}
|
853
|
+
|
734
854
|
/* grd_dospl:
|
735
855
|
* Compute the gradient of a single sample choosing between the maxent
|
736
856
|
* optimised codepath and classical one depending of the sample.
|
737
857
|
*/
|
738
|
-
void grd_dospl(
|
739
|
-
|
740
|
-
|
741
|
-
|
858
|
+
void grd_dospl(grd_st_t *grd_st, const seq_t *seq) {
|
859
|
+
grd_stcheck(grd_st, seq->len);
|
860
|
+
rdr_t *rdr = grd_st->mdl->reader;
|
861
|
+
if (seq->len == 1 || (rdr->npats != 0 && rdr->nbi == 0))
|
862
|
+
grd_domaxent(grd_st, seq);
|
863
|
+
else if (grd_st->mdl->type == 0)
|
864
|
+
grd_domaxent(grd_st, seq);
|
865
|
+
else if (grd_st->mdl->type == 1)
|
866
|
+
grd_domemm(grd_st, seq);
|
742
867
|
else
|
743
|
-
|
868
|
+
grd_docrf(grd_st, seq);
|
869
|
+
}
|
870
|
+
|
871
|
+
/* grd_new:
|
872
|
+
* Allocate a new parallel gradient computer. Return a grd_t object who can
|
873
|
+
* compute gradient over the full data set and store it in the vector <g>.
|
874
|
+
*/
|
875
|
+
grd_t *grd_new(mdl_t *mdl, double *g) {
|
876
|
+
const uint32_t W = mdl->opt->nthread;
|
877
|
+
grd_t *grd = wapiti_xmalloc(sizeof(grd_t));
|
878
|
+
grd->mdl = mdl;
|
879
|
+
grd->grd_st = wapiti_xmalloc(sizeof(grd_st_t *) * W);
|
880
|
+
#ifdef ATM_ANSI
|
881
|
+
grd->grd_st[0] = grd_stnew(mdl, g);
|
882
|
+
for (uint32_t w = 1; w < W; w++)
|
883
|
+
grd->grd_st[w] = grd_stnew(mdl, xvm_new(mdl->nftr));
|
884
|
+
#else
|
885
|
+
for (uint32_t w = 0; w < W; w++)
|
886
|
+
grd->grd_st[w] = grd_stnew(mdl, g);
|
887
|
+
#endif
|
888
|
+
return grd;
|
889
|
+
}
|
890
|
+
|
891
|
+
/* grd_free:
|
892
|
+
* Free all memory allocated for the given gradient computer object.
|
893
|
+
*/
|
894
|
+
void grd_free(grd_t *grd) {
|
895
|
+
const uint32_t W = grd->mdl->opt->nthread;
|
896
|
+
#ifdef ATM_ANSI
|
897
|
+
for (uint32_t w = 1; w < W; w++)
|
898
|
+
xvm_free(grd->grd_st[w]->g);
|
899
|
+
#endif
|
900
|
+
for (uint32_t w = 0; w < W; w++)
|
901
|
+
grd_stfree(grd->grd_st[w]);
|
902
|
+
free(grd->grd_st);
|
903
|
+
free(grd);
|
744
904
|
}
|
745
905
|
|
746
906
|
/* grd_worker:
|
@@ -748,22 +908,25 @@ void grd_dospl(grd_t *grd, const seq_t *seq) {
|
|
748
908
|
* training set. It is mean to be called by the thread spawner in order to
|
749
909
|
* compute the gradient over the full training set.
|
750
910
|
*/
|
751
|
-
static
|
911
|
+
static
|
912
|
+
void grd_worker(job_t *job, uint32_t id, uint32_t cnt, grd_st_t *grd_st) {
|
752
913
|
unused(id && cnt);
|
753
|
-
mdl_t *mdl =
|
914
|
+
mdl_t *mdl = grd_st->mdl;
|
754
915
|
const dat_t *dat = mdl->train;
|
755
|
-
const size_t F = mdl->nftr;
|
756
916
|
// We first cleanup the gradient and value as our parent don't do it (it
|
757
917
|
// is better to do this also in parallel)
|
758
|
-
|
759
|
-
|
760
|
-
|
918
|
+
grd_st->lloss = 0.0;
|
919
|
+
#ifdef ATM_ANSI
|
920
|
+
const uint64_t F = mdl->nftr;
|
921
|
+
for (uint64_t f = 0; f < F; f++)
|
922
|
+
grd_st->g[f] = 0.0;
|
923
|
+
#endif
|
761
924
|
// Now all is ready, we can process our sequences and accumulate the
|
762
925
|
// gradient and inverse log-likelihood
|
763
|
-
|
926
|
+
uint32_t count, pos;
|
764
927
|
while (mth_getjob(job, &count, &pos)) {
|
765
|
-
for (
|
766
|
-
grd_dospl(
|
928
|
+
for (uint32_t s = pos; !uit_stop && s < pos + count; s++)
|
929
|
+
grd_dospl(grd_st, dat->seq[s]);
|
767
930
|
if (uit_stop)
|
768
931
|
break;
|
769
932
|
}
|
@@ -775,30 +938,38 @@ static void grd_worker(job_t *job, int id, int cnt, grd_t *grd) {
|
|
775
938
|
* the fact that the gradient over the full training set is just the sum of
|
776
939
|
* the gradient of each sequence.
|
777
940
|
*/
|
778
|
-
double grd_gradient(
|
779
|
-
|
780
|
-
const
|
781
|
-
const
|
941
|
+
double grd_gradient(grd_t *grd) {
|
942
|
+
mdl_t *mdl = grd->mdl;
|
943
|
+
const double *x = mdl->theta;
|
944
|
+
const uint64_t F = mdl->nftr;
|
945
|
+
const uint32_t W = mdl->opt->nthread;
|
946
|
+
double *g = grd->grd_st[0]->g;
|
947
|
+
#ifndef ATM_ANSI
|
948
|
+
for (uint64_t f = 0; f < F; f++)
|
949
|
+
g[f] = 0.0;
|
950
|
+
#endif
|
782
951
|
// All is ready to compute the gradient, we spawn the threads of
|
783
952
|
// workers, each one working on a part of the data. As the gradient and
|
784
953
|
// log-likelihood are additive, computing the final values will be
|
785
954
|
// trivial.
|
786
|
-
mth_spawn((func_t *)grd_worker, W, (void **)
|
787
|
-
mdl->opt->jobsize);
|
955
|
+
mth_spawn((func_t *)grd_worker, W, (void **)grd->grd_st,
|
956
|
+
mdl->train->nseq, mdl->opt->jobsize);
|
788
957
|
if (uit_stop)
|
789
958
|
return -1.0;
|
790
959
|
// All computations are done, it just remain to add all the gradients
|
791
|
-
// and
|
792
|
-
double fx =
|
793
|
-
for (
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
960
|
+
// and negative log-likelihood from all the workers.
|
961
|
+
double fx = grd->grd_st[0]->lloss;
|
962
|
+
for (uint32_t w = 1; w < W; w++)
|
963
|
+
fx += grd->grd_st[w]->lloss;
|
964
|
+
#ifdef ATM_ANSI
|
965
|
+
for (uint32_t w = 1; w < W; w++)
|
966
|
+
for (uint64_t f = 0; f < F; f++)
|
967
|
+
g[f] += grd->grd_st[w]->g[f];
|
968
|
+
#endif
|
798
969
|
// If needed we clip the gradient: setting to 0.0 all coordinates where
|
799
970
|
// the function is 0.0.
|
800
971
|
if (mdl->opt->lbfgs.clip == true)
|
801
|
-
for (
|
972
|
+
for (uint64_t f = 0; f < F; f++)
|
802
973
|
if (x[f] == 0.0)
|
803
974
|
g[f] = 0.0;
|
804
975
|
// Now we can apply the elastic-net penalty. Depending of the values of
|
@@ -806,7 +977,7 @@ double grd_gradient(mdl_t *mdl, double *g, grd_t *grds[]) {
|
|
806
977
|
const double rho1 = mdl->opt->rho1;
|
807
978
|
const double rho2 = mdl->opt->rho2;
|
808
979
|
double nl1 = 0.0, nl2 = 0.0;
|
809
|
-
for (
|
980
|
+
for (uint64_t f = 0; f < F; f++) {
|
810
981
|
const double v = x[f];
|
811
982
|
g[f] += rho2 * v;
|
812
983
|
nl1 += fabs(v);
|