chaine 3.13.1__cp310-cp310-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chaine might be problematic. Click here for more details.

Files changed (70):
  1. chaine/__init__.py +2 -0
  2. chaine/_core/crf.cpp +19854 -0
  3. chaine/_core/crf.cpython-310-x86_64-linux-gnu.so +0 -0
  4. chaine/_core/crf.pyx +271 -0
  5. chaine/_core/crfsuite/COPYING +27 -0
  6. chaine/_core/crfsuite/README +183 -0
  7. chaine/_core/crfsuite/include/crfsuite.h +1077 -0
  8. chaine/_core/crfsuite/include/crfsuite.hpp +649 -0
  9. chaine/_core/crfsuite/include/crfsuite_api.hpp +406 -0
  10. chaine/_core/crfsuite/include/os.h +65 -0
  11. chaine/_core/crfsuite/lib/cqdb/COPYING +28 -0
  12. chaine/_core/crfsuite/lib/cqdb/include/cqdb.h +518 -0
  13. chaine/_core/crfsuite/lib/cqdb/src/cqdb.c +639 -0
  14. chaine/_core/crfsuite/lib/cqdb/src/lookup3.c +1271 -0
  15. chaine/_core/crfsuite/lib/cqdb/src/main.c +184 -0
  16. chaine/_core/crfsuite/lib/crf/src/crf1d.h +354 -0
  17. chaine/_core/crfsuite/lib/crf/src/crf1d_context.c +788 -0
  18. chaine/_core/crfsuite/lib/crf/src/crf1d_encode.c +1020 -0
  19. chaine/_core/crfsuite/lib/crf/src/crf1d_feature.c +382 -0
  20. chaine/_core/crfsuite/lib/crf/src/crf1d_model.c +1085 -0
  21. chaine/_core/crfsuite/lib/crf/src/crf1d_tag.c +582 -0
  22. chaine/_core/crfsuite/lib/crf/src/crfsuite.c +500 -0
  23. chaine/_core/crfsuite/lib/crf/src/crfsuite_internal.h +233 -0
  24. chaine/_core/crfsuite/lib/crf/src/crfsuite_train.c +302 -0
  25. chaine/_core/crfsuite/lib/crf/src/dataset.c +115 -0
  26. chaine/_core/crfsuite/lib/crf/src/dictionary.c +127 -0
  27. chaine/_core/crfsuite/lib/crf/src/holdout.c +83 -0
  28. chaine/_core/crfsuite/lib/crf/src/json.c +1497 -0
  29. chaine/_core/crfsuite/lib/crf/src/json.h +120 -0
  30. chaine/_core/crfsuite/lib/crf/src/logging.c +85 -0
  31. chaine/_core/crfsuite/lib/crf/src/logging.h +49 -0
  32. chaine/_core/crfsuite/lib/crf/src/params.c +370 -0
  33. chaine/_core/crfsuite/lib/crf/src/params.h +84 -0
  34. chaine/_core/crfsuite/lib/crf/src/quark.c +180 -0
  35. chaine/_core/crfsuite/lib/crf/src/quark.h +46 -0
  36. chaine/_core/crfsuite/lib/crf/src/rumavl.c +1178 -0
  37. chaine/_core/crfsuite/lib/crf/src/rumavl.h +144 -0
  38. chaine/_core/crfsuite/lib/crf/src/train_arow.c +409 -0
  39. chaine/_core/crfsuite/lib/crf/src/train_averaged_perceptron.c +237 -0
  40. chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c +491 -0
  41. chaine/_core/crfsuite/lib/crf/src/train_lbfgs.c +323 -0
  42. chaine/_core/crfsuite/lib/crf/src/train_passive_aggressive.c +442 -0
  43. chaine/_core/crfsuite/lib/crf/src/vecmath.h +360 -0
  44. chaine/_core/crfsuite/swig/crfsuite.cpp +1 -0
  45. chaine/_core/crfsuite_api.pxd +67 -0
  46. chaine/_core/liblbfgs/COPYING +22 -0
  47. chaine/_core/liblbfgs/README +71 -0
  48. chaine/_core/liblbfgs/include/lbfgs.h +745 -0
  49. chaine/_core/liblbfgs/lib/arithmetic_ansi.h +142 -0
  50. chaine/_core/liblbfgs/lib/arithmetic_sse_double.h +303 -0
  51. chaine/_core/liblbfgs/lib/arithmetic_sse_float.h +312 -0
  52. chaine/_core/liblbfgs/lib/lbfgs.c +1531 -0
  53. chaine/_core/tagger_wrapper.hpp +58 -0
  54. chaine/_core/trainer_wrapper.cpp +32 -0
  55. chaine/_core/trainer_wrapper.hpp +26 -0
  56. chaine/crf.py +505 -0
  57. chaine/logging.py +214 -0
  58. chaine/optimization/__init__.py +10 -0
  59. chaine/optimization/metrics.py +129 -0
  60. chaine/optimization/spaces.py +394 -0
  61. chaine/optimization/trial.py +103 -0
  62. chaine/optimization/utils.py +119 -0
  63. chaine/training.py +184 -0
  64. chaine/typing.py +18 -0
  65. chaine/validation.py +43 -0
  66. chaine-3.13.1.dist-info/METADATA +348 -0
  67. chaine-3.13.1.dist-info/RECORD +70 -0
  68. chaine-3.13.1.dist-info/WHEEL +4 -0
  69. chaine.libs/libgcc_s-a0b57c20.so.1 +0 -0
  70. chaine.libs/libstdc++-0d31ccbe.so.6.0.32 +0 -0
@@ -0,0 +1,1020 @@
1
+ /*
2
+ * CRF1d encoder (routines for training).
3
+ *
4
+ * Copyright (c) 2007-2010, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the names of the authors nor the names of its contributors
15
+ * may be used to endorse or promote products derived from this
16
+ * software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ /* $Id$ */
32
+
33
+ #ifdef HAVE_CONFIG_H
34
+ #include <config.h>
35
+ #endif /*HAVE_CONFIG_H*/
36
+
37
+ #include <os.h>
38
+
39
+ #include <stdio.h>
40
+ #include <stdlib.h>
41
+ #include <memory.h>
42
+ #include <time.h>
43
+
44
+ #include <crfsuite.h>
45
+ #include "crfsuite_internal.h"
46
+ #include "crf1d.h"
47
+ #include "params.h"
48
+ #include "logging.h"
49
+
50
+ /**
51
+ * Parameters for feature generation.
52
+ */
53
+ typedef struct
54
+ {
55
+ floatval_t feature_minfreq; /** The threshold for occurrences of features. */
56
+ int feature_possible_states; /** Dense state features. */
57
+ int feature_possible_transitions; /** Dense transition features. */
58
+ } crf1de_option_t;
59
+
60
+ /**
61
+ * CRF1d internal data.
62
+ */
63
+ typedef struct
64
+ {
65
+ int num_labels; /**< Number of distinct output labels (L). */
66
+ int num_attributes; /**< Number of distinct attributes (A). */
67
+
68
+ int cap_items; /**< Maximum length of sequences in the data set. */
69
+
70
+ int num_features; /**< Number of distinct features (K). */
71
+ crf1df_feature_t *features; /**< Array of feature descriptors [K]. */
72
+ feature_refs_t *attributes; /**< References to attribute features [A]. */
73
+ feature_refs_t *forward_trans; /**< References to transition features [L]. */
74
+
75
+ crf1d_context_t *ctx; /**< CRF1d context. */
76
+ crf1de_option_t opt; /**< CRF1d options. */
77
+ } crf1de_t;
78
+
79
/* Accessors yielding a pointer to the k-th feature, the a-th attribute
 * reference list and the i-th forward-transition reference list. */
#define FEATURE(crf1de, k)    (&(crf1de)->features[(k)])
#define ATTRIBUTE(crf1de, a)  (&(crf1de)->attributes[(a)])
#define TRANSITION(crf1de, i) (&(crf1de)->forward_trans[(i)])
85
+
86
+ static void crf1de_init(crf1de_t *crf1de)
87
+ {
88
+ crf1de->num_labels = 0;
89
+ crf1de->num_attributes = 0;
90
+ crf1de->cap_items = 0;
91
+ crf1de->num_features = 0;
92
+ crf1de->features = NULL;
93
+ crf1de->attributes = NULL;
94
+ crf1de->forward_trans = NULL;
95
+ crf1de->ctx = NULL;
96
+ /* Initialize except for opt. */
97
+ }
98
+
99
+ static void crf1de_finish(crf1de_t *crf1de)
100
+ {
101
+ int i;
102
+
103
+ if (crf1de->ctx != NULL)
104
+ {
105
+ crf1dc_delete(crf1de->ctx);
106
+ crf1de->ctx = NULL;
107
+ }
108
+ if (crf1de->features != NULL)
109
+ {
110
+ free(crf1de->features);
111
+ crf1de->features = NULL;
112
+ }
113
+ if (crf1de->attributes != NULL)
114
+ {
115
+ for (i = 0; i < crf1de->num_attributes; ++i)
116
+ {
117
+ free(crf1de->attributes[i].fids);
118
+ }
119
+ free(crf1de->attributes);
120
+ crf1de->attributes = NULL;
121
+ }
122
+ if (crf1de->forward_trans != NULL)
123
+ {
124
+ for (i = 0; i < crf1de->num_labels; ++i)
125
+ {
126
+ free(crf1de->forward_trans[i].fids);
127
+ }
128
+ free(crf1de->forward_trans);
129
+ crf1de->forward_trans = NULL;
130
+ }
131
+ }
132
+
133
+ static void crf1de_state_score(
134
+ crf1de_t *crf1de,
135
+ const crfsuite_instance_t *inst,
136
+ const floatval_t *w)
137
+ {
138
+ int i, t, r;
139
+ crf1d_context_t *ctx = crf1de->ctx;
140
+ const int T = inst->num_items;
141
+ const int L = crf1de->num_labels;
142
+
143
+ /* Loop over the items in the sequence. */
144
+ for (t = 0; t < T; ++t)
145
+ {
146
+ const crfsuite_item_t *item = &inst->items[t];
147
+ floatval_t *state = STATE_SCORE(ctx, t);
148
+
149
+ /* Loop over the contents (attributes) attached to the item. */
150
+ for (i = 0; i < item->num_contents; ++i)
151
+ {
152
+ /* Access the list of state features associated with the attribute. */
153
+ int a = item->contents[i].aid;
154
+ const feature_refs_t *attr = ATTRIBUTE(crf1de, a);
155
+ floatval_t value = item->contents[i].value;
156
+
157
+ /* Loop over the state features associated with the attribute. */
158
+ for (r = 0; r < attr->num_features; ++r)
159
+ {
160
+ /* State feature associates the attribute #a with the label #(f->dst). */
161
+ int fid = attr->fids[r];
162
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
163
+ state[f->dst] += w[fid] * value;
164
+ }
165
+ }
166
+ }
167
+ }
168
+
169
+ static void
170
+ crf1de_state_score_scaled(
171
+ crf1de_t *crf1de,
172
+ const crfsuite_instance_t *inst,
173
+ const floatval_t *w,
174
+ const floatval_t scale)
175
+ {
176
+ int i, t, r;
177
+ crf1d_context_t *ctx = crf1de->ctx;
178
+ const int T = inst->num_items;
179
+ const int L = crf1de->num_labels;
180
+
181
+ /* Forward to the non-scaling version for fast computation when scale == 1. */
182
+ if (scale == 1.)
183
+ {
184
+ crf1de_state_score(crf1de, inst, w);
185
+ return;
186
+ }
187
+
188
+ /* Loop over the items in the sequence. */
189
+ for (t = 0; t < T; ++t)
190
+ {
191
+ const crfsuite_item_t *item = &inst->items[t];
192
+ floatval_t *state = STATE_SCORE(ctx, t);
193
+
194
+ /* Loop over the contents (attributes) attached to the item. */
195
+ for (i = 0; i < item->num_contents; ++i)
196
+ {
197
+ /* Access the list of state features associated with the attribute. */
198
+ int a = item->contents[i].aid;
199
+ const feature_refs_t *attr = ATTRIBUTE(crf1de, a);
200
+ floatval_t value = item->contents[i].value * scale;
201
+
202
+ /* Loop over the state features associated with the attribute. */
203
+ for (r = 0; r < attr->num_features; ++r)
204
+ {
205
+ /* State feature associates the attribute #a with the label #(f->dst). */
206
+ int fid = attr->fids[r];
207
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
208
+ state[f->dst] += w[fid] * value;
209
+ }
210
+ }
211
+ }
212
+ }
213
+
214
+ static void
215
+ crf1de_transition_score(
216
+ crf1de_t *crf1de,
217
+ const floatval_t *w)
218
+ {
219
+ int i, r;
220
+ crf1d_context_t *ctx = crf1de->ctx;
221
+ const int L = crf1de->num_labels;
222
+
223
+ /* Compute transition scores between two labels. */
224
+ for (i = 0; i < L; ++i)
225
+ {
226
+ floatval_t *trans = TRANS_SCORE(ctx, i);
227
+ const feature_refs_t *edge = TRANSITION(crf1de, i);
228
+ for (r = 0; r < edge->num_features; ++r)
229
+ {
230
+ /* Transition feature from #i to #(f->dst). */
231
+ int fid = edge->fids[r];
232
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
233
+ trans[f->dst] = w[fid];
234
+ }
235
+ }
236
+ }
237
+
238
+ static void
239
+ crf1de_transition_score_scaled(
240
+ crf1de_t *crf1de,
241
+ const floatval_t *w,
242
+ const floatval_t scale)
243
+ {
244
+ int i, r;
245
+ crf1d_context_t *ctx = crf1de->ctx;
246
+ const int L = crf1de->num_labels;
247
+
248
+ /* Forward to the non-scaling version for fast computation when scale == 1. */
249
+ if (scale == 1.)
250
+ {
251
+ crf1de_transition_score(crf1de, w);
252
+ return;
253
+ }
254
+
255
+ /* Compute transition scores between two labels. */
256
+ for (i = 0; i < L; ++i)
257
+ {
258
+ floatval_t *trans = TRANS_SCORE(ctx, i);
259
+ const feature_refs_t *edge = TRANSITION(crf1de, i);
260
+ for (r = 0; r < edge->num_features; ++r)
261
+ {
262
+ /* Transition feature from #i to #(f->dst). */
263
+ int fid = edge->fids[r];
264
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
265
+ trans[f->dst] = w[fid] * scale;
266
+ }
267
+ }
268
+ }
269
+
270
+ static void
271
+ crf1de_features_on_path(
272
+ crf1de_t *crf1de,
273
+ const crfsuite_instance_t *inst,
274
+ const int *labels,
275
+ crfsuite_encoder_features_on_path_callback func,
276
+ void *instance)
277
+ {
278
+ int c, i = -1, t, r;
279
+ crf1d_context_t *ctx = crf1de->ctx;
280
+ const int T = inst->num_items;
281
+ const int L = crf1de->num_labels;
282
+
283
+ /* Loop over the items in the sequence. */
284
+ for (t = 0; t < T; ++t)
285
+ {
286
+ const crfsuite_item_t *item = &inst->items[t];
287
+ const int j = labels[t];
288
+
289
+ /* Loop over the contents (attributes) attached to the item. */
290
+ for (c = 0; c < item->num_contents; ++c)
291
+ {
292
+ /* Access the list of state features associated with the attribute. */
293
+ int a = item->contents[c].aid;
294
+ const feature_refs_t *attr = ATTRIBUTE(crf1de, a);
295
+ floatval_t value = item->contents[c].value;
296
+
297
+ /* Loop over the state features associated with the attribute. */
298
+ for (r = 0; r < attr->num_features; ++r)
299
+ {
300
+ /* State feature associates the attribute #a with the label #(f->dst). */
301
+ int fid = attr->fids[r];
302
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
303
+ if (f->dst == j)
304
+ {
305
+ func(instance, fid, value);
306
+ }
307
+ }
308
+ }
309
+
310
+ if (i != -1)
311
+ {
312
+ const feature_refs_t *edge = TRANSITION(crf1de, i);
313
+ for (r = 0; r < edge->num_features; ++r)
314
+ {
315
+ /* Transition feature from #i to #(f->dst). */
316
+ int fid = edge->fids[r];
317
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
318
+ if (f->dst == j)
319
+ {
320
+ func(instance, fid, 1.);
321
+ }
322
+ }
323
+ }
324
+
325
+ i = j;
326
+ }
327
+ }
328
+
329
+ static void
330
+ crf1de_observation_expectation(
331
+ crf1de_t *crf1de,
332
+ const crfsuite_instance_t *inst,
333
+ const int *labels,
334
+ floatval_t *w,
335
+ const floatval_t scale)
336
+ {
337
+ int c, i = -1, t, r;
338
+ crf1d_context_t *ctx = crf1de->ctx;
339
+ const int T = inst->num_items;
340
+ const int L = crf1de->num_labels;
341
+
342
+ /* Loop over the items in the sequence. */
343
+ for (t = 0; t < T; ++t)
344
+ {
345
+ const crfsuite_item_t *item = &inst->items[t];
346
+ const int j = labels[t];
347
+
348
+ /* Loop over the contents (attributes) attached to the item. */
349
+ for (c = 0; c < item->num_contents; ++c)
350
+ {
351
+ /* Access the list of state features associated with the attribute. */
352
+ int a = item->contents[c].aid;
353
+ const feature_refs_t *attr = ATTRIBUTE(crf1de, a);
354
+ floatval_t value = item->contents[c].value;
355
+
356
+ /* Loop over the state features associated with the attribute. */
357
+ for (r = 0; r < attr->num_features; ++r)
358
+ {
359
+ /* State feature associates the attribute #a with the label #(f->dst). */
360
+ int fid = attr->fids[r];
361
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
362
+ if (f->dst == j)
363
+ {
364
+ w[fid] += value * scale;
365
+ }
366
+ }
367
+ }
368
+
369
+ if (i != -1)
370
+ {
371
+ const feature_refs_t *edge = TRANSITION(crf1de, i);
372
+ for (r = 0; r < edge->num_features; ++r)
373
+ {
374
+ /* Transition feature from #i to #(f->dst). */
375
+ int fid = edge->fids[r];
376
+ const crf1df_feature_t *f = FEATURE(crf1de, fid);
377
+ if (f->dst == j)
378
+ {
379
+ w[fid] += scale;
380
+ }
381
+ }
382
+ }
383
+
384
+ i = j;
385
+ }
386
+ }
387
+
388
+ static void
389
+ crf1de_model_expectation(
390
+ crf1de_t *crf1de,
391
+ const crfsuite_instance_t *inst,
392
+ floatval_t *w,
393
+ const floatval_t scale)
394
+ {
395
+ int a, c, i, t, r;
396
+ crf1d_context_t *ctx = crf1de->ctx;
397
+ const feature_refs_t *attr = NULL, *trans = NULL;
398
+ const crfsuite_item_t *item = NULL;
399
+ const int T = inst->num_items;
400
+ const int L = crf1de->num_labels;
401
+
402
+ for (t = 0; t < T; ++t)
403
+ {
404
+ floatval_t *prob = STATE_MEXP(ctx, t);
405
+
406
+ /* Compute expectations for state features at position #t. */
407
+ item = &inst->items[t];
408
+ for (c = 0; c < item->num_contents; ++c)
409
+ {
410
+ /* Access the attribute. */
411
+ floatval_t value = item->contents[c].value;
412
+ a = item->contents[c].aid;
413
+ attr = ATTRIBUTE(crf1de, a);
414
+
415
+ /* Loop over state features for the attribute. */
416
+ for (r = 0; r < attr->num_features; ++r)
417
+ {
418
+ int fid = attr->fids[r];
419
+ crf1df_feature_t *f = FEATURE(crf1de, fid);
420
+ w[fid] += prob[f->dst] * value * scale;
421
+ }
422
+ }
423
+ }
424
+
425
+ /* Loop over the labels (t, i) */
426
+ for (i = 0; i < L; ++i)
427
+ {
428
+ const floatval_t *prob = TRANS_MEXP(ctx, i);
429
+ const feature_refs_t *edge = TRANSITION(crf1de, i);
430
+ for (r = 0; r < edge->num_features; ++r)
431
+ {
432
+ /* Transition feature from #i to #(f->dst). */
433
+ int fid = edge->fids[r];
434
+ crf1df_feature_t *f = FEATURE(crf1de, fid);
435
+ w[fid] += prob[f->dst] * scale;
436
+ }
437
+ }
438
+ }
439
+
440
+ static int
441
+ crf1de_set_data(
442
+ crf1de_t *crf1de,
443
+ dataset_t *ds,
444
+ int num_labels,
445
+ int num_attributes,
446
+ logging_t *lg)
447
+ {
448
+ int i, ret = 0;
449
+ clock_t begin = 0;
450
+ int T = 0;
451
+ const int L = num_labels;
452
+ const int A = num_attributes;
453
+ const int N = ds->num_instances;
454
+ crf1de_option_t *opt = &crf1de->opt;
455
+
456
+ /* Initialize the member variables. */
457
+ crf1de_init(crf1de);
458
+ crf1de->num_attributes = A;
459
+ crf1de->num_labels = L;
460
+
461
+ /* Find the maximum length of items in the data set. */
462
+ for (i = 0; i < N; ++i)
463
+ {
464
+ const crfsuite_instance_t *inst = dataset_get(ds, i);
465
+ if (T < inst->num_items)
466
+ {
467
+ T = inst->num_items;
468
+ }
469
+ }
470
+
471
+ /* Construct a CRF context. */
472
+ crf1de->ctx = crf1dc_new(CTXF_MARGINALS | CTXF_VITERBI, L, T);
473
+ if (crf1de->ctx == NULL)
474
+ {
475
+ ret = CRFSUITEERR_OUTOFMEMORY;
476
+ goto error_exit;
477
+ }
478
+
479
+ /* Feature generation. */
480
+ logging(lg, "Processing training data");
481
+ begin = clock();
482
+ crf1de->features = crf1df_generate(
483
+ &crf1de->num_features,
484
+ ds,
485
+ L,
486
+ A,
487
+ opt->feature_possible_states ? 1 : 0,
488
+ opt->feature_possible_transitions ? 1 : 0,
489
+ opt->feature_minfreq,
490
+ lg->func,
491
+ lg->instance);
492
+ if (crf1de->features == NULL)
493
+ {
494
+ ret = CRFSUITEERR_OUTOFMEMORY;
495
+ goto error_exit;
496
+ }
497
+
498
+ /* Initialize the feature references. */
499
+ crf1df_init_references(
500
+ &crf1de->attributes,
501
+ &crf1de->forward_trans,
502
+ crf1de->features,
503
+ crf1de->num_features,
504
+ A,
505
+ L);
506
+ if (crf1de->attributes == NULL || crf1de->forward_trans == NULL)
507
+ {
508
+ ret = CRFSUITEERR_OUTOFMEMORY;
509
+ goto error_exit;
510
+ }
511
+
512
+ return ret;
513
+
514
+ error_exit:
515
+ crf1de_finish(crf1de);
516
+ return ret;
517
+ }
518
+
519
+ static int
520
+ crf1de_save_model(
521
+ crf1de_t *crf1de,
522
+ const char *filename,
523
+ const floatval_t *w,
524
+ crfsuite_dictionary_t *attrs,
525
+ crfsuite_dictionary_t *labels,
526
+ logging_t *lg)
527
+ {
528
+ int a, k, l, ret;
529
+ clock_t begin;
530
+ int *fmap = NULL, *amap = NULL;
531
+ crf1dmw_t *writer = NULL;
532
+ const feature_refs_t *edge = NULL, *attr = NULL;
533
+ const floatval_t threshold = 0.01;
534
+ const int L = crf1de->num_labels;
535
+ const int A = crf1de->num_attributes;
536
+ const int K = crf1de->num_features;
537
+ int J = 0, B = 0;
538
+
539
+ /* Start storing the model. */
540
+ logging(lg, "Saving model");
541
+ begin = clock();
542
+
543
+ /* Allocate and initialize the feature mapping. */
544
+ fmap = (int *)calloc(K, sizeof(int));
545
+ if (fmap == NULL)
546
+ {
547
+ goto error_exit;
548
+ }
549
+ #ifdef CRF_TRAIN_SAVE_NO_PRUNING
550
+ for (k = 0; k < K; ++k)
551
+ fmap[k] = k;
552
+ J = K;
553
+ #else
554
+ for (k = 0; k < K; ++k)
555
+ fmap[k] = -1;
556
+ #endif /*CRF_TRAIN_SAVE_NO_PRUNING*/
557
+
558
+ /* Allocate and initialize the attribute mapping. */
559
+ amap = (int *)calloc(A, sizeof(int));
560
+ if (amap == NULL)
561
+ {
562
+ goto error_exit;
563
+ }
564
+ #ifdef CRF_TRAIN_SAVE_NO_PRUNING
565
+ for (a = 0; a < A; ++a)
566
+ amap[a] = a;
567
+ B = A;
568
+ #else
569
+ for (a = 0; a < A; ++a)
570
+ amap[a] = -1;
571
+ #endif /*CRF_TRAIN_SAVE_NO_PRUNING*/
572
+
573
+ /*
574
+ * Open a model writer.
575
+ */
576
+ writer = crf1mmw(filename);
577
+ if (writer == NULL)
578
+ {
579
+ goto error_exit;
580
+ }
581
+
582
+ /* Open a feature chunk in the model file. */
583
+ if (ret = crf1dmw_open_features(writer))
584
+ {
585
+ goto error_exit;
586
+ }
587
+
588
+ /*
589
+ * Write the feature values.
590
+ * (with determining active features and attributes).
591
+ */
592
+ for (k = 0; k < K; ++k)
593
+ {
594
+ crf1df_feature_t *f = &crf1de->features[k];
595
+ if (w[k] != 0)
596
+ {
597
+ int src;
598
+ crf1dm_feature_t feat;
599
+
600
+ #ifndef CRF_TRAIN_SAVE_NO_PRUNING
601
+ /* The feature (#k) will have a new feature id (#J). */
602
+ fmap[k] = J++; /* Feature #k -> #fmap[k]. */
603
+
604
+ /* Map the source of the field. */
605
+ if (f->type == FT_STATE)
606
+ {
607
+ /* The attribute #(f->src) will have a new attribute id (#B). */
608
+ if (amap[f->src] < 0)
609
+ amap[f->src] = B++; /* Attribute #a -> #amap[a]. */
610
+ src = amap[f->src];
611
+ }
612
+ else
613
+ {
614
+ src = f->src;
615
+ }
616
+ #endif /*CRF_TRAIN_SAVE_NO_PRUNING*/
617
+
618
+ feat.type = f->type;
619
+ feat.src = src;
620
+ feat.dst = f->dst;
621
+ feat.weight = w[k];
622
+
623
+ /* Write the feature. */
624
+ if (ret = crf1dmw_put_feature(writer, fmap[k], &feat))
625
+ {
626
+ goto error_exit;
627
+ }
628
+ }
629
+ }
630
+
631
+ /* Close the feature chunk. */
632
+ if (ret = crf1dmw_close_features(writer))
633
+ {
634
+ goto error_exit;
635
+ }
636
+
637
+ /* Write labels. */
638
+ if (ret = crf1dmw_open_labels(writer, L))
639
+ {
640
+ goto error_exit;
641
+ }
642
+ for (l = 0; l < L; ++l)
643
+ {
644
+ const char *str = NULL;
645
+ labels->to_string(labels, l, &str);
646
+ if (str != NULL)
647
+ {
648
+ if (ret = crf1dmw_put_label(writer, l, str))
649
+ {
650
+ goto error_exit;
651
+ }
652
+ labels->free(labels, str);
653
+ }
654
+ }
655
+ if (ret = crf1dmw_close_labels(writer))
656
+ {
657
+ goto error_exit;
658
+ }
659
+
660
+ /* Write attributes. */
661
+ if (ret = crf1dmw_open_attrs(writer, B))
662
+ {
663
+ goto error_exit;
664
+ }
665
+ for (a = 0; a < A; ++a)
666
+ {
667
+ if (0 <= amap[a])
668
+ {
669
+ const char *str = NULL;
670
+ attrs->to_string(attrs, a, &str);
671
+ if (str != NULL)
672
+ {
673
+ if (ret = crf1dmw_put_attr(writer, amap[a], str))
674
+ {
675
+ goto error_exit;
676
+ }
677
+ attrs->free(attrs, str);
678
+ }
679
+ }
680
+ }
681
+ if (ret = crf1dmw_close_attrs(writer))
682
+ {
683
+ goto error_exit;
684
+ }
685
+
686
+ /* Write label feature references. */
687
+ if (ret = crf1dmw_open_labelrefs(writer, L + 2))
688
+ {
689
+ goto error_exit;
690
+ }
691
+ for (l = 0; l < L; ++l)
692
+ {
693
+ edge = TRANSITION(crf1de, l);
694
+ if (ret = crf1dmw_put_labelref(writer, l, edge, fmap))
695
+ {
696
+ goto error_exit;
697
+ }
698
+ }
699
+ if (ret = crf1dmw_close_labelrefs(writer))
700
+ {
701
+ goto error_exit;
702
+ }
703
+
704
+ /* Write attribute feature references. */
705
+ if (ret = crf1dmw_open_attrrefs(writer, B))
706
+ {
707
+ goto error_exit;
708
+ }
709
+ for (a = 0; a < A; ++a)
710
+ {
711
+ if (0 <= amap[a])
712
+ {
713
+ attr = ATTRIBUTE(crf1de, a);
714
+ if (ret = crf1dmw_put_attrref(writer, amap[a], attr, fmap))
715
+ {
716
+ goto error_exit;
717
+ }
718
+ }
719
+ }
720
+ if (ret = crf1dmw_close_attrrefs(writer))
721
+ {
722
+ goto error_exit;
723
+ }
724
+
725
+ /* Close the writer. */
726
+ crf1dmw_close(writer);
727
+
728
+ free(amap);
729
+ free(fmap);
730
+ return 0;
731
+
732
+ error_exit:
733
+ if (writer != NULL)
734
+ {
735
+ crf1dmw_close(writer);
736
+ }
737
+ if (amap != NULL)
738
+ {
739
+ free(amap);
740
+ }
741
+ if (fmap != NULL)
742
+ {
743
+ free(fmap);
744
+ }
745
+ return ret;
746
+ }
747
+
748
+ static int crf1de_exchange_options(crfsuite_params_t *params, crf1de_option_t *opt, int mode)
749
+ {
750
+ BEGIN_PARAM_MAP(params, mode)
751
+ DDX_PARAM_FLOAT(
752
+ "feature.minfreq", opt->feature_minfreq, 0.0,
753
+ "The minimum frequency of features.")
754
+ DDX_PARAM_INT(
755
+ "feature.possible_states", opt->feature_possible_states, 0,
756
+ "Force to generate possible state features.")
757
+ DDX_PARAM_INT(
758
+ "feature.possible_transitions", opt->feature_possible_transitions, 0,
759
+ "Force to generate possible transition features.")
760
+ END_PARAM_MAP()
761
+
762
+ return 0;
763
+ }
764
+
765
+ /*
766
+ * Implementation of encoder_t object.
767
+ */
768
+
769
/* Precomputation levels of the encoder; each level includes the ones before it. */
enum
{
    LEVEL_NONE = 0,      /**< No precomputation. */
    LEVEL_WEIGHT = 1,    /**< Feature weights are set. */
    LEVEL_INSTANCE = 2,  /**< Instance is set. */
    LEVEL_ALPHABETA = 3, /**< Performed the forward-backward algorithm. */
    LEVEL_MARGINAL = 4   /**< Computed marginal probabilities. */
};
782
+
783
+ static void set_level(encoder_t *self, int level)
784
+ {
785
+ int prev = self->level;
786
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
787
+
788
+ /*
789
+ Each training algorithm has a different requirement for processing a
790
+ training instance. For example, the perceptron algorithm need compute
791
+ Viterbi paths whereas gradient-based algorithms (e.g., SGD) need
792
+ marginal probabilities computed by the forward-backward algorithm.
793
+ */
794
+
795
+ /* LEVEL_WEIGHT: set transition scores. */
796
+ if (LEVEL_WEIGHT <= level && prev < LEVEL_WEIGHT)
797
+ {
798
+ crf1dc_reset(crf1de->ctx, RF_TRANS);
799
+ crf1de_transition_score_scaled(crf1de, self->w, self->scale);
800
+ }
801
+
802
+ /* LEVEL_INSTANCE: set state scores. */
803
+ if (LEVEL_INSTANCE <= level && prev < LEVEL_INSTANCE)
804
+ {
805
+ crf1dc_set_num_items(crf1de->ctx, self->inst->num_items);
806
+ crf1dc_reset(crf1de->ctx, RF_STATE);
807
+ crf1de_state_score_scaled(crf1de, self->inst, self->w, self->scale);
808
+ }
809
+
810
+ /* LEVEL_ALPHABETA: perform the forward-backward algorithm. */
811
+ if (LEVEL_ALPHABETA <= level && prev < LEVEL_ALPHABETA)
812
+ {
813
+ crf1dc_exp_transition(crf1de->ctx);
814
+ crf1dc_exp_state(crf1de->ctx);
815
+ crf1dc_alpha_score(crf1de->ctx);
816
+ crf1dc_beta_score(crf1de->ctx);
817
+ }
818
+
819
+ /* LEVEL_MARGINAL: compute the marginal probability. */
820
+ if (LEVEL_MARGINAL <= level && prev < LEVEL_MARGINAL)
821
+ {
822
+ crf1dc_marginals(crf1de->ctx);
823
+ }
824
+
825
+ self->level = level;
826
+ }
827
+
828
+ static int encoder_exchange_options(encoder_t *self, crfsuite_params_t *params, int mode)
829
+ {
830
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
831
+ return crf1de_exchange_options(params, &crf1de->opt, mode);
832
+ }
833
+
834
+ static int encoder_initialize(encoder_t *self, dataset_t *ds, logging_t *lg)
835
+ {
836
+ int ret;
837
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
838
+
839
+ ret = crf1de_set_data(
840
+ crf1de,
841
+ ds,
842
+ ds->data->labels->num(ds->data->labels),
843
+ ds->data->attrs->num(ds->data->attrs),
844
+ lg);
845
+ self->ds = ds;
846
+ self->num_features = crf1de->num_features;
847
+ self->cap_items = crf1de->ctx->cap_items;
848
+ return ret;
849
+ }
850
+
851
+ /* LEVEL_NONE -> LEVEL_NONE. */
852
+ static int encoder_objective_and_gradients_batch(encoder_t *self, dataset_t *ds, const floatval_t *w, floatval_t *f, floatval_t *g)
853
+ {
854
+ int i;
855
+ floatval_t logp = 0, logl = 0;
856
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
857
+ const int N = ds->num_instances;
858
+ const int K = crf1de->num_features;
859
+
860
+ /*
861
+ Initialize the gradients with observation expectations.
862
+ */
863
+ for (i = 0; i < K; ++i)
864
+ {
865
+ crf1df_feature_t *f = &crf1de->features[i];
866
+ g[i] = -f->freq;
867
+ }
868
+
869
+ /*
870
+ Set the scores (weights) of transition features here because
871
+ these are independent of input label sequences.
872
+ */
873
+ crf1dc_reset(crf1de->ctx, RF_TRANS);
874
+ crf1de_transition_score(crf1de, w);
875
+ crf1dc_exp_transition(crf1de->ctx);
876
+
877
+ /*
878
+ Compute model expectations.
879
+ */
880
+ for (i = 0; i < N; ++i)
881
+ {
882
+ const crfsuite_instance_t *seq = dataset_get(ds, i);
883
+
884
+ /* Set label sequences and state scores. */
885
+ crf1dc_set_num_items(crf1de->ctx, seq->num_items);
886
+ crf1dc_reset(crf1de->ctx, RF_STATE);
887
+ crf1de_state_score(crf1de, seq, w);
888
+ crf1dc_exp_state(crf1de->ctx);
889
+
890
+ /* Compute forward/backward scores. */
891
+ crf1dc_alpha_score(crf1de->ctx);
892
+ crf1dc_beta_score(crf1de->ctx);
893
+ crf1dc_marginals(crf1de->ctx);
894
+
895
+ /* Compute the probability of the input sequence on the model. */
896
+ logp = crf1dc_score(crf1de->ctx, seq->labels) - crf1dc_lognorm(crf1de->ctx);
897
+ /* Update the log-likelihood. */
898
+ logl += logp * seq->weight;
899
+
900
+ /* Update the model expectations of features. */
901
+ crf1de_model_expectation(crf1de, seq, g, seq->weight);
902
+ }
903
+
904
+ *f = -logl;
905
+ return 0;
906
+ }
907
+
908
+ /* LEVEL_NONE -> LEVEL_NONE. */
909
+ static int encoder_features_on_path(encoder_t *self, const crfsuite_instance_t *inst, const int *path, crfsuite_encoder_features_on_path_callback func, void *instance)
910
+ {
911
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
912
+ crf1de_features_on_path(crf1de, inst, path, func, instance);
913
+ return 0;
914
+ }
915
+
916
+ /* LEVEL_NONE -> LEVEL_NONE. */
917
+ static int encoder_save_model(encoder_t *self, const char *filename, const floatval_t *w, logging_t *lg)
918
+ {
919
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
920
+ return crf1de_save_model(crf1de, filename, w, self->ds->data->attrs, self->ds->data->labels, lg);
921
+ }
922
+
923
+ /* LEVEL_NONE -> LEVEL_WEIGHT. */
924
+ static int encoder_set_weights(encoder_t *self, const floatval_t *w, floatval_t scale)
925
+ {
926
+ self->w = w;
927
+ self->scale = scale;
928
+ self->level = LEVEL_WEIGHT - 1;
929
+ set_level(self, LEVEL_WEIGHT);
930
+ return 0;
931
+ }
932
+
933
+ /* LEVEL_WEIGHT -> LEVEL_INSTANCE. */
934
+ static int encoder_set_instance(encoder_t *self, const crfsuite_instance_t *inst)
935
+ {
936
+ self->inst = inst;
937
+ self->level = LEVEL_INSTANCE - 1;
938
+ set_level(self, LEVEL_INSTANCE);
939
+ return 0;
940
+ }
941
+
942
+ /* LEVEL_INSTANCE -> LEVEL_INSTANCE. */
943
+ static int encoder_score(encoder_t *self, const int *path, floatval_t *ptr_score)
944
+ {
945
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
946
+ *ptr_score = crf1dc_score(crf1de->ctx, path);
947
+ return 0;
948
+ }
949
+
950
+ /* LEVEL_INSTANCE -> LEVEL_INSTANCE. */
951
+ static int encoder_viterbi(encoder_t *self, int *path, floatval_t *ptr_score)
952
+ {
953
+ int i;
954
+ floatval_t score;
955
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
956
+ score = crf1dc_viterbi(crf1de->ctx, path);
957
+ if (ptr_score != NULL)
958
+ {
959
+ *ptr_score = score;
960
+ }
961
+ return 0;
962
+ }
963
+
964
+ /* LEVEL_INSTANCE -> LEVEL_ALPHABETA. */
965
+ static int encoder_partition_factor(encoder_t *self, floatval_t *ptr_pf)
966
+ {
967
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
968
+ set_level(self, LEVEL_ALPHABETA);
969
+ *ptr_pf = crf1dc_lognorm(crf1de->ctx);
970
+ return 0;
971
+ }
972
+
973
+ /* LEVEL_INSTANCE -> LEVEL_MARGINAL. */
974
+ static int encoder_objective_and_gradients(encoder_t *self, floatval_t *f, floatval_t *g, floatval_t gain, floatval_t weight)
975
+ {
976
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
977
+ set_level(self, LEVEL_MARGINAL);
978
+ gain *= weight;
979
+ crf1de_observation_expectation(crf1de, self->inst, self->inst->labels, g, gain);
980
+ crf1de_model_expectation(crf1de, self->inst, g, -gain);
981
+ *f = (-crf1dc_score(crf1de->ctx, self->inst->labels) + crf1dc_lognorm(crf1de->ctx)) * weight;
982
+ return 0;
983
+ }
984
+
985
+ static void encoder_release(encoder_t *self)
986
+ {
987
+ crf1de_t *crf1de = (crf1de_t *)self->internal;
988
+ crf1de_finish(crf1de);
989
+ free(crf1de);
990
+ free(self);
991
+ }
992
+
993
+ encoder_t *crf1d_create_encoder()
994
+ {
995
+ encoder_t *self = (encoder_t *)calloc(1, sizeof(encoder_t));
996
+ if (self != NULL)
997
+ {
998
+ crf1de_t *enc = (crf1de_t *)calloc(1, sizeof(crf1de_t));
999
+ if (enc != NULL)
1000
+ {
1001
+ crf1de_init(enc);
1002
+
1003
+ self->exchange_options = encoder_exchange_options;
1004
+ self->initialize = encoder_initialize;
1005
+ self->objective_and_gradients_batch = encoder_objective_and_gradients_batch;
1006
+ self->save_model = encoder_save_model;
1007
+ self->features_on_path = encoder_features_on_path;
1008
+ self->set_weights = encoder_set_weights;
1009
+ self->set_instance = encoder_set_instance;
1010
+ self->score = encoder_score;
1011
+ self->viterbi = encoder_viterbi;
1012
+ self->partition_factor = encoder_partition_factor;
1013
+ self->objective_and_gradients = encoder_objective_and_gradients;
1014
+ self->release = encoder_release;
1015
+ self->internal = enc;
1016
+ }
1017
+ }
1018
+
1019
+ return self;
1020
+ }