opener-opinion-detector-basic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. checksums.yaml +7 -0
  2. data/README.md +30 -0
  3. data/bin/opinion-detector-basic +19 -0
  4. data/bin/opinion-detector-basic-server +10 -0
  5. data/config.ru +4 -0
  6. data/core/opinion_detector_basic_multi.py +499 -0
  7. data/core/packages/KafNafParser-1.3.tar.gz +0 -0
  8. data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
  9. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  10. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  11. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  14. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  15. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  16. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  20. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  21. data/core/vendor/src/crfsuite/COPYING +27 -0
  22. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  23. data/core/vendor/src/crfsuite/INSTALL +236 -0
  24. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  25. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  26. data/core/vendor/src/crfsuite/README +183 -0
  27. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  28. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  29. data/core/vendor/src/crfsuite/compile +143 -0
  30. data/core/vendor/src/crfsuite/config.guess +1502 -0
  31. data/core/vendor/src/crfsuite/config.h.in +198 -0
  32. data/core/vendor/src/crfsuite/config.sub +1714 -0
  33. data/core/vendor/src/crfsuite/configure +14273 -0
  34. data/core/vendor/src/crfsuite/configure.in +149 -0
  35. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  36. data/core/vendor/src/crfsuite/depcomp +630 -0
  37. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  38. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  39. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  40. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  41. data/core/vendor/src/crfsuite/example/template.py +88 -0
  42. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  43. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  44. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  45. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  46. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  47. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  48. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  49. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  50. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  51. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  52. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  53. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  54. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  55. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  56. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  57. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  58. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  59. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  60. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  61. data/core/vendor/src/crfsuite/include/os.h +61 -0
  62. data/core/vendor/src/crfsuite/install-sh +520 -0
  63. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  64. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  65. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  66. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  67. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  68. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  69. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  70. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  71. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  72. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  73. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  74. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  75. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  76. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  77. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  78. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  79. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  80. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  81. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  82. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  83. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  84. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  85. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  86. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  87. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  88. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  89. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  90. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  91. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  92. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  93. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  94. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  95. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  96. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  97. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  98. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  99. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  100. data/core/vendor/src/crfsuite/missing +376 -0
  101. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  102. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  103. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  104. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  105. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  106. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  107. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  108. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  109. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  110. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  111. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  112. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  113. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  114. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  115. data/core/vendor/src/liblbfgs/COPYING +22 -0
  116. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  117. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  118. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  119. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  120. data/core/vendor/src/liblbfgs/NEWS +0 -0
  121. data/core/vendor/src/liblbfgs/README +71 -0
  122. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  123. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  124. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  125. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  126. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  127. data/core/vendor/src/liblbfgs/configure +21146 -0
  128. data/core/vendor/src/liblbfgs/configure.in +107 -0
  129. data/core/vendor/src/liblbfgs/depcomp +522 -0
  130. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  131. data/core/vendor/src/liblbfgs/install-sh +322 -0
  132. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  133. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  134. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  135. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  136. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  137. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  138. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  139. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  140. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  141. data/core/vendor/src/liblbfgs/missing +353 -0
  142. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  143. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  144. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  145. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  146. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  147. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  148. data/core/vendor/src/svm_light/Makefile +105 -0
  149. data/core/vendor/src/svm_light/kernel.h +40 -0
  150. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  151. data/core/vendor/src/svm_light/svm_common.c +985 -0
  152. data/core/vendor/src/svm_light/svm_common.h +301 -0
  153. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  154. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  155. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  156. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  157. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  158. data/ext/hack/Rakefile +17 -0
  159. data/ext/hack/support.rb +88 -0
  160. data/lib/opener/opinion_detector_basic.rb +91 -0
  161. data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
  162. data/lib/opener/opinion_detector_basic/server.rb +16 -0
  163. data/lib/opener/opinion_detector_basic/version.rb +5 -0
  164. data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
  165. data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
  166. data/opener-opinion-detector-basic.gemspec +36 -0
  167. data/pre_build_requirements.txt +1 -0
  168. metadata +309 -0
@@ -0,0 +1,242 @@
1
+ /*
2
+ * Online training with averaged perceptron.
3
+ *
4
+ * Copyright (c) 2007-2010, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the names of the authors nor the names of its contributors
15
+ * may be used to endorse or promote products derived from this
16
+ * software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ /* $Id$ */
32
+
33
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif/*HAVE_CONFIG_H*/

#include <os.h>

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <crfsuite.h>
#include "crfsuite_internal.h"
#include "logging.h"
#include "params.h"
#include "vecmath.h"
48
+
49
+ /**
50
+ * Training parameters (configurable with crfsuite_params_t interface).
51
+ */
52
+ typedef struct {
53
+ int max_iterations;
54
+ floatval_t epsilon;
55
+ } training_option_t;
56
+
57
+ /**
58
+ * Internal data structure for updating (averaging) feature weights.
59
+ */
60
+ typedef struct {
61
+ floatval_t *w;
62
+ floatval_t *ws;
63
+ floatval_t c;
64
+ floatval_t cs;
65
+ } update_data;
66
+
67
+ static void update_weights(void *instance, int fid, floatval_t value)
68
+ {
69
+ update_data *ud = (update_data*)instance;
70
+ ud->w[fid] += ud->c * value;
71
+ ud->ws[fid] += ud->cs * value;
72
+ }
73
+
74
+ static int diff(int *x, int *y, int n)
75
+ {
76
+ int i, d = 0;
77
+ for (i = 0;i < n;++i) {
78
+ if (x[i] != y[i]) {
79
+ ++d;
80
+ }
81
+ }
82
+ return d;
83
+ }
84
+
85
+ static int exchange_options(crfsuite_params_t* params, training_option_t* opt, int mode)
86
+ {
87
+ BEGIN_PARAM_MAP(params, mode)
88
+ DDX_PARAM_INT(
89
+ "max_iterations", opt->max_iterations, 100,
90
+ "The maximum number of iterations."
91
+ )
92
+ DDX_PARAM_FLOAT(
93
+ "epsilon", opt->epsilon, 0.,
94
+ "The stopping criterion (the ratio of incorrect label predictions)."
95
+ )
96
+ END_PARAM_MAP()
97
+
98
+ return 0;
99
+ }
100
+
101
+ void crfsuite_train_averaged_perceptron_init(crfsuite_params_t* params)
102
+ {
103
+ exchange_options(params, NULL, 0);
104
+ }
105
+
106
+ int crfsuite_train_averaged_perceptron(
107
+ encoder_t *gm,
108
+ dataset_t *trainset,
109
+ dataset_t *testset,
110
+ crfsuite_params_t *params,
111
+ logging_t *lg,
112
+ floatval_t **ptr_w
113
+ )
114
+ {
115
+ int n, i, c, ret = 0;
116
+ int *viterbi = NULL;
117
+ floatval_t *w = NULL;
118
+ floatval_t *ws = NULL;
119
+ floatval_t *wa = NULL;
120
+ const int N = trainset->num_instances;
121
+ const int K = gm->num_features;
122
+ const int T = gm->cap_items;
123
+ training_option_t opt;
124
+ update_data ud;
125
+ clock_t begin = clock();
126
+
127
+ /* Initialize the variable. */
128
+ memset(&ud, 0, sizeof(ud));
129
+
130
+ /* Obtain parameter values. */
131
+ exchange_options(params, &opt, -1);
132
+
133
+ /* Allocate arrays. */
134
+ w = (floatval_t*)calloc(sizeof(floatval_t), K);
135
+ ws = (floatval_t*)calloc(sizeof(floatval_t), K);
136
+ wa = (floatval_t*)calloc(sizeof(floatval_t), K);
137
+ viterbi = (int*)calloc(sizeof(int), T);
138
+ if (w == NULL || ws == NULL || wa == NULL || viterbi == NULL) {
139
+ ret = CRFSUITEERR_OUTOFMEMORY;
140
+ goto error_exit;
141
+ }
142
+
143
+ /* Show the parameters. */
144
+ logging(lg, "Averaged perceptron\n");
145
+ logging(lg, "max_iterations: %d\n", opt.max_iterations);
146
+ logging(lg, "epsilon: %f\n", opt.epsilon);
147
+ logging(lg, "\n");
148
+
149
+ c = 1;
150
+ ud.w = w;
151
+ ud.ws = ws;
152
+
153
+ /* Loop for epoch. */
154
+ for (i = 0;i < opt.max_iterations;++i) {
155
+ floatval_t norm = 0., loss = 0.;
156
+ clock_t iteration_begin = clock();
157
+
158
+ /* Shuffle the instances. */
159
+ dataset_shuffle(trainset);
160
+
161
+ /* Loop for each instance. */
162
+ for (n = 0;n < N;++n) {
163
+ int d = 0;
164
+ floatval_t score;
165
+ const crfsuite_instance_t *inst = dataset_get(trainset, n);
166
+
167
+ /* Set the feature weights to the encoder. */
168
+ gm->set_weights(gm, w, 1.);
169
+ gm->set_instance(gm, inst);
170
+
171
+ /* Tag the sequence with the current model. */
172
+ gm->viterbi(gm, viterbi, &score);
173
+
174
+ /* Compute the number of different labels. */
175
+ d = diff(inst->labels, viterbi, inst->num_items);
176
+ if (0 < d) {
177
+ /*
178
+ For every feature k on the correct path:
179
+ w[k] += 1; ws[k] += c;
180
+ */
181
+ ud.c = 1;
182
+ ud.cs = c;
183
+ gm->features_on_path(gm, inst, inst->labels, update_weights, &ud);
184
+
185
+ /*
186
+ For every feature k on the Viterbi path:
187
+ w[k] -= 1; ws[k] -= c;
188
+ */
189
+ ud.c = -1;
190
+ ud.cs = -c;
191
+ gm->features_on_path(gm, inst, viterbi, update_weights, &ud);
192
+
193
+ /* We define the loss as the ratio of wrongly predicted labels. */
194
+ loss += d / (floatval_t)inst->num_items;
195
+ }
196
+
197
+ ++c;
198
+ }
199
+
200
+ /* Perform averaging to wa. */
201
+ veccopy(wa, w, K);
202
+ vecasub(wa, 1./c, ws, K);
203
+
204
+ /* Output the progress. */
205
+ logging(lg, "***** Iteration #%d *****\n", i+1);
206
+ logging(lg, "Loss: %f\n", loss);
207
+ logging(lg, "Feature norm: %f\n", sqrt(vecdot(wa, wa, K)));
208
+ logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC);
209
+
210
+ /* Holdout evaluation if necessary. */
211
+ if (testset != NULL) {
212
+ holdout_evaluation(gm, testset, wa, lg);
213
+ }
214
+
215
+ logging(lg, "\n");
216
+
217
+ /* Convergence test. */
218
+ if (loss / N < opt.epsilon) {
219
+ logging(lg, "Terminated with the stopping criterion\n");
220
+ logging(lg, "\n");
221
+ break;
222
+ }
223
+ }
224
+
225
+ logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
226
+ logging(lg, "\n");
227
+
228
+ free(viterbi);
229
+ free(ws);
230
+ free(w);
231
+ *ptr_w = wa;
232
+ return ret;
233
+
234
+ error_exit:
235
+ free(viterbi);
236
+ free(wa);
237
+ free(ws);
238
+ free(w);
239
+ *ptr_w = NULL;
240
+
241
+ return ret;
242
+ }
@@ -0,0 +1,507 @@
1
+ /*
2
+ * Online training with L2-regularized Stochastic Gradient Descent (SGD).
3
+ *
4
+ * Copyright (c) 2007-2010, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the names of the authors nor the names of its contributors
15
+ * may be used to endorse or promote products derived from this
16
+ * software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ /* $Id$ */
32
+
33
+ /*
34
+ SGD for L2-regularized MAP estimation.
35
+
36
+ The iterative algorithm is inspired by Pegasos:
37
+
38
+ Shai Shalev-Shwartz, Yoram Singer, and Nathan Srebro.
39
+ Pegasos: Primal Estimated sub-GrAdient SOlver for SVM.
40
+ In Proc. of ICML 2007, pp 807-814, 2007.
41
+
42
+ The calibration strategy is inspired by the implementation of sgd:
43
+ http://leon.bottou.org/projects/sgd
44
+ written by Léon Bottou.
45
+
46
+ The objective function to minimize is:
47
+
48
+ f(w) = (lambda/2) * ||w||^2 + (1/N) * \sum_i^N log P^i(y|x)
49
+ lambda = 2 * C / N
50
+
51
+ The original version of the Pegasos algorithm.
52
+
53
+ 0) Initialization
54
+ t = t0
55
+ k = [the batch size]
56
+ 1) Computing the learning rate (eta).
57
+ eta = 1 / (lambda * t)
58
+ 2) Updating feature weights.
59
+ w = (1 - eta * lambda) w - (eta / k) \sum_i (oexp - mexp)
60
+ 3) Projecting feature weights within an L2-ball.
61
+ w = min{1, (1/sqrt(lambda))/||w||} * w
62
+ 4) Goto 1 until convergence.
63
+
64
+ This implementation omit the step 3) because it makes the source code
65
+ tricky (in order to maintain L2-norm of feature weights at any time) and
66
+ because the project step does not have a strong impact to the quality of
67
+ solution.
68
+
69
+ A naive implementation requires O(K) computations for steps 2,
70
+ where K is the total number of features. This code implements the procedure
71
+ in an efficient way:
72
+
73
+ 0) Initialization
74
+ decay = 1
75
+ 1) Computing various factors
76
+ eta = 1 / (lambda * t)
77
+ decay *= (1 - eta * lambda)
78
+ gain = (eta / k) / decay
79
+ 2) Updating feature weights
80
+ Updating feature weights from observation expectation:
81
+ delta = gain * (1.0) * f(x,y)
82
+ w += delta
83
+ Updating feature weights from model expectation:
84
+ delta = gain * (-P(y|x)) * f(x,y)
85
+ w += delta
86
+ 4) Goto 1 until convergence.
87
+ */
88
+
89
+
90
+ #ifdef HAVE_CONFIG_H
91
+ #include <config.h>
92
+ #endif/*HAVE_CONFIG_H*/
93
+
94
+ #include <os.h>
95
+
96
+ #include <float.h>
97
+ #include <stdio.h>
98
+ #include <stdlib.h>
99
+ #include <string.h>
100
+ #include <time.h>
101
+ #include <math.h>
102
+
103
+ #include <crfsuite.h>
104
+ #include "crfsuite_internal.h"
105
+
106
+ #include "logging.h"
107
+ #include "params.h"
108
+ #include "crf1d.h"
109
+ #include "vecmath.h"
110
+
111
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
112
+
113
+ typedef struct {
114
+ floatval_t c2;
115
+ floatval_t lambda;
116
+ floatval_t t0;
117
+ int max_iterations;
118
+ int period;
119
+ floatval_t delta;
120
+ floatval_t calibration_eta;
121
+ floatval_t calibration_rate;
122
+ int calibration_samples;
123
+ int calibration_candidates;
124
+ int calibration_max_trials;
125
+ } training_option_t;
126
+
127
+ static int l2sgd(
128
+ encoder_t *gm,
129
+ dataset_t *trainset,
130
+ dataset_t *testset,
131
+ floatval_t *w,
132
+ logging_t *lg,
133
+ const int N,
134
+ const floatval_t t0,
135
+ const floatval_t lambda,
136
+ const int num_epochs,
137
+ int calibration,
138
+ int period,
139
+ const floatval_t epsilon,
140
+ floatval_t *ptr_loss
141
+ )
142
+ {
143
+ int i, epoch, ret = 0;
144
+ floatval_t t = 0;
145
+ floatval_t loss = 0, sum_loss = 0;
146
+ floatval_t best_sum_loss = DBL_MAX;
147
+ floatval_t eta, gain, decay = 1.;
148
+ floatval_t improvement = 0.;
149
+ floatval_t norm2 = 0.;
150
+ floatval_t *pf = NULL;
151
+ floatval_t *best_w = NULL;
152
+ clock_t clk_prev, clk_begin = clock();
153
+ const int K = gm->num_features;
154
+
155
+ if (!calibration) {
156
+ pf = (floatval_t*)malloc(sizeof(floatval_t) * period);
157
+ best_w = (floatval_t*)calloc(K, sizeof(floatval_t));
158
+ if (pf == NULL || best_w == NULL) {
159
+ ret = CRFSUITEERR_OUTOFMEMORY;
160
+ goto error_exit;
161
+ }
162
+ }
163
+
164
+ /* Initialize the feature weights. */
165
+ vecset(w, 0, K);
166
+
167
+ /* Loop for epochs. */
168
+ for (epoch = 1;epoch <= num_epochs;++epoch) {
169
+ clk_prev = clock();
170
+
171
+ if (!calibration) {
172
+ logging(lg, "***** Epoch #%d *****\n", epoch);
173
+ /* Shuffle the training instances. */
174
+ dataset_shuffle(trainset);
175
+ }
176
+
177
+ /* Loop for instances. */
178
+ sum_loss = 0.;
179
+ for (i = 0;i < N;++i) {
180
+ const crfsuite_instance_t *inst = dataset_get(trainset, i);
181
+
182
+ /* Update various factors. */
183
+ eta = 1 / (lambda * (t0 + t));
184
+ decay *= (1.0 - eta * lambda);
185
+ gain = eta / decay;
186
+
187
+ /* Compute the loss and gradients for the instance. */
188
+ gm->set_weights(gm, w, decay);
189
+ gm->set_instance(gm, inst);
190
+ gm->objective_and_gradients(gm, &loss, w, gain);
191
+
192
+ sum_loss += loss;
193
+ ++t;
194
+ }
195
+
196
+ /* Terminate when the loss is abnormal (NaN, -Inf, +Inf). */
197
+ if (!isfinite(loss)) {
198
+ logging(lg, "ERROR: overflow loss\n");
199
+ ret = CRFSUITEERR_OVERFLOW;
200
+ sum_loss = loss;
201
+ goto error_exit;
202
+ }
203
+
204
+ /* Scale the feature weights. */
205
+ vecscale(w, decay, K);
206
+ decay = 1.;
207
+
208
+ /* Include the L2 norm of feature weights to the objective. */
209
+ /* The factor N is necessary because lambda = 2 * C / N. */
210
+ norm2 = vecdot(w, w, K);
211
+ sum_loss += 0.5 * lambda * norm2 * N;
212
+
213
+ /* One epoch finished. */
214
+ if (!calibration) {
215
+ /* Check if the current epoch is the best. */
216
+ if (sum_loss < best_sum_loss) {
217
+ /* Store the feature weights to best_w. */
218
+ best_sum_loss = sum_loss;
219
+ veccopy(best_w, w, K);
220
+ }
221
+
222
+ /* We don't test the stopping criterion while period < epoch. */
223
+ if (period < epoch) {
224
+ improvement = (pf[(epoch-1) % period] - sum_loss) / sum_loss;
225
+ } else {
226
+ improvement = epsilon;
227
+ }
228
+
229
+ /* Store the current value of the objective function. */
230
+ pf[(epoch-1) % period] = sum_loss;
231
+
232
+ logging(lg, "Loss: %f\n", sum_loss);
233
+ if (period < epoch) {
234
+ logging(lg, "Improvement ratio: %f\n", improvement);
235
+ }
236
+ logging(lg, "Feature L2-norm: %f\n", sqrt(norm2));
237
+ logging(lg, "Learning rate (eta): %f\n", eta);
238
+ logging(lg, "Total number of feature updates: %.0f\n", t);
239
+ logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - clk_prev) / (double)CLOCKS_PER_SEC);
240
+
241
+ /* Holdout evaluation if necessary. */
242
+ if (testset != NULL) {
243
+ holdout_evaluation(gm, testset, w, lg);
244
+ }
245
+ logging(lg, "\n");
246
+
247
+ /* Check for the stopping criterion. */
248
+ if (improvement < epsilon) {
249
+ ret = 0;
250
+ break;
251
+ }
252
+ }
253
+ }
254
+
255
+ /* Output the optimization result. */
256
+ if (!calibration) {
257
+ if (ret == 0) {
258
+ if (epoch < num_epochs) {
259
+ logging(lg, "SGD terminated with the stopping criteria\n");
260
+ } else {
261
+ logging(lg, "SGD terminated with the maximum number of iterations\n");
262
+ }
263
+ } else {
264
+ logging(lg, "SGD terminated with error code (%d)\n", ret);
265
+ }
266
+ }
267
+
268
+ /* Restore the best weights. */
269
+ if (best_w != NULL) {
270
+ sum_loss = best_sum_loss;
271
+ veccopy(w, best_w, K);
272
+ }
273
+
274
+ error_exit:
275
+ free(best_w);
276
+ free(pf);
277
+ if (ptr_loss != NULL) {
278
+ *ptr_loss = sum_loss;
279
+ }
280
+ return ret;
281
+ }
282
+
283
+ static floatval_t
284
+ l2sgd_calibration(
285
+ encoder_t *gm,
286
+ dataset_t *ds,
287
+ floatval_t *w,
288
+ logging_t *lg,
289
+ const training_option_t* opt
290
+ )
291
+ {
292
+ int i, s;
293
+ int dec = 0, ok, trials = 1;
294
+ int num = opt->calibration_candidates;
295
+ clock_t clk_begin = clock();
296
+ floatval_t loss = 0.;
297
+ floatval_t init_loss = 0.;
298
+ floatval_t best_loss = DBL_MAX;
299
+ floatval_t eta = opt->calibration_eta;
300
+ floatval_t best_eta = opt->calibration_eta;
301
+ const int N = ds->num_instances;
302
+ const int S = MIN(N, opt->calibration_samples);
303
+ const int K = gm->num_features;
304
+ const floatval_t init_eta = opt->calibration_eta;
305
+ const floatval_t rate = opt->calibration_rate;
306
+ const floatval_t lambda = opt->lambda;
307
+
308
+ logging(lg, "Calibrating the learning rate (eta)\n");
309
+ logging(lg, "calibration.eta: %f\n", eta);
310
+ logging(lg, "calibration.rate: %f\n", rate);
311
+ logging(lg, "calibration.samples: %d\n", S);
312
+ logging(lg, "calibration.candidates: %d\n", num);
313
+ logging(lg, "calibration.max_trials: %d\n", opt->calibration_max_trials);
314
+
315
+ /* Initialize a permutation that shuffles the instances. */
316
+ dataset_shuffle(ds);
317
+
318
+ /* Initialize feature weights as zero. */
319
+ vecset(w, 0, K);
320
+
321
+ /* Compute the initial loss. */
322
+ gm->set_weights(gm, w, 1.);
323
+ init_loss = 0;
324
+ for (i = 0;i < S;++i) {
325
+ floatval_t score;
326
+ const crfsuite_instance_t *inst = dataset_get(ds, i);
327
+ gm->set_instance(gm, inst);
328
+ gm->score(gm, inst->labels, &score);
329
+ init_loss -= score;
330
+ gm->partition_factor(gm, &score);
331
+ init_loss += score;
332
+ }
333
+ init_loss += 0.5 * lambda * vecdot(w, w, K) * N;
334
+ logging(lg, "Initial loss: %f\n", init_loss);
335
+
336
+ while (num > 0 || !dec) {
337
+ logging(lg, "Trial #%d (eta = %f): ", trials, eta);
338
+
339
+ /* Perform SGD for one epoch. */
340
+ l2sgd(
341
+ gm,
342
+ ds,
343
+ NULL,
344
+ w,
345
+ lg,
346
+ S, 1.0 / (lambda * eta), lambda, 1, 1, 1, 0., &loss);
347
+
348
+ /* Make sure that the learning rate decreases the log-likelihood. */
349
+ ok = isfinite(loss) && (loss < init_loss);
350
+ if (ok) {
351
+ logging(lg, "%f\n", loss);
352
+ --num;
353
+ } else {
354
+ logging(lg, "%f (worse)\n", loss);
355
+ }
356
+
357
+ if (isfinite(loss) && loss < best_loss) {
358
+ best_loss = loss;
359
+ best_eta = eta;
360
+ }
361
+
362
+ if (!dec) {
363
+ if (ok && 0 < num) {
364
+ eta *= rate;
365
+ } else {
366
+ dec = 1;
367
+ num = opt->calibration_candidates;
368
+ eta = init_eta / rate;
369
+ }
370
+ } else {
371
+ eta /= rate;
372
+ }
373
+
374
+ ++trials;
375
+ if (opt->calibration_max_trials <= trials) {
376
+ break;
377
+ }
378
+ }
379
+
380
+ eta = best_eta;
381
+ logging(lg, "Best learning rate (eta): %f\n", eta);
382
+ logging(lg, "Seconds required: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
383
+ logging(lg, "\n");
384
+
385
+ return 1.0 / (lambda * eta);
386
+ }
387
+
388
+ int exchange_options(crfsuite_params_t* params, training_option_t* opt, int mode)
389
+ {
390
+ BEGIN_PARAM_MAP(params, mode)
391
+ DDX_PARAM_FLOAT(
392
+ "c2", opt->c2, 1.,
393
+ "Coefficient for L2 regularization."
394
+ )
395
+ DDX_PARAM_INT(
396
+ "max_iterations", opt->max_iterations, 1000,
397
+ "The maximum number of iterations (epochs) for SGD optimization."
398
+ )
399
+ DDX_PARAM_INT(
400
+ "period", opt->period, 10,
401
+ "The duration of iterations to test the stopping criterion."
402
+ )
403
+ DDX_PARAM_FLOAT(
404
+ "delta", opt->delta, 1e-6,
405
+ "The threshold for the stopping criterion; an optimization process stops when\n"
406
+ "the improvement of the log likelihood over the last ${period} iterations is no\n"
407
+ "greater than this threshold."
408
+ )
409
+ DDX_PARAM_FLOAT(
410
+ "calibration.eta", opt->calibration_eta, 0.1,
411
+ "The initial value of learning rate (eta) used for calibration."
412
+ )
413
+ DDX_PARAM_FLOAT(
414
+ "calibration.rate", opt->calibration_rate, 2.,
415
+ "The rate of increase/decrease of learning rate for calibration."
416
+ )
417
+ DDX_PARAM_INT(
418
+ "calibration.samples", opt->calibration_samples, 1000,
419
+ "The number of instances used for calibration."
420
+ )
421
+ DDX_PARAM_INT(
422
+ "calibration.candidates", opt->calibration_candidates, 10,
423
+ "The number of candidates of learning rate."
424
+ )
425
+ DDX_PARAM_INT(
426
+ "calibration.max_trials", opt->calibration_max_trials, 20,
427
+ "The maximum number of trials of learning rates for calibration."
428
+ )
429
+ END_PARAM_MAP()
430
+
431
+ return 0;
432
+ }
433
+
434
+ void crfsuite_train_l2sgd_init(crfsuite_params_t* params)
435
+ {
436
+ exchange_options(params, NULL, 0);
437
+ }
438
+
439
+ int crfsuite_train_l2sgd(
440
+ encoder_t *gm,
441
+ dataset_t *trainset,
442
+ dataset_t *testset,
443
+ crfsuite_params_t *params,
444
+ logging_t *lg,
445
+ floatval_t **ptr_w
446
+ )
447
+ {
448
+ int ret = 0;
449
+ floatval_t *w = NULL;
450
+ clock_t clk_begin;
451
+ floatval_t loss = 0;
452
+ const int N = trainset->num_instances;
453
+ const int K = gm->num_features;
454
+ const int T = gm->cap_items;
455
+ training_option_t opt;
456
+
457
+ /* Obtain parameter values. */
458
+ exchange_options(params, &opt, -1);
459
+
460
+ /* Allocate arrays. */
461
+ w = (floatval_t*)calloc(sizeof(floatval_t), K);
462
+ if (w == NULL) {
463
+ ret = CRFSUITEERR_OUTOFMEMORY;
464
+ goto error_exit;
465
+ }
466
+
467
+ opt.lambda = 2. * opt.c2 / N;
468
+
469
+ logging(lg, "Stochastic Gradient Descent (SGD)\n");
470
+ logging(lg, "c2: %f\n", opt.c2);
471
+ logging(lg, "max_iterations: %d\n", opt.max_iterations);
472
+ logging(lg, "period: %d\n", opt.period);
473
+ logging(lg, "delta: %f\n", opt.delta);
474
+ logging(lg, "\n");
475
+ clk_begin = clock();
476
+
477
+ /* Calibrate the training rate (eta). */
478
+ opt.t0 = l2sgd_calibration(gm, trainset, w, lg, &opt);
479
+
480
+ /* Perform stochastic gradient descent. */
481
+ ret = l2sgd(
482
+ gm,
483
+ trainset,
484
+ testset,
485
+ w,
486
+ lg,
487
+ N,
488
+ opt.t0,
489
+ opt.lambda,
490
+ opt.max_iterations,
491
+ 0,
492
+ opt.period,
493
+ opt.delta,
494
+ &loss
495
+ );
496
+
497
+ logging(lg, "Loss: %f\n", loss);
498
+ logging(lg, "Total seconds required for training: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
499
+ logging(lg, "\n");
500
+
501
+ *ptr_w = w;
502
+ return ret;
503
+
504
+ error_exit:
505
+ free(w);
506
+ return ret;
507
+ }