opener-opinion-detector-base 2.0.0
- checksums.yaml +7 -0
- data/README.md +101 -0
- data/bin/opinion-detector-base +19 -0
- data/core/annotation.cfg.erb +9 -0
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/python-scripts/LICENSE +339 -0
- data/core/python-scripts/README.md +226 -0
- data/core/python-scripts/classify_kaf_naf_file.py +499 -0
- data/core/python-scripts/cross_validation.py +634 -0
- data/core/python-scripts/generate_folds.py +134 -0
- data/core/python-scripts/models.cfg +10 -0
- data/core/python-scripts/my_templates/README +33 -0
- data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
- data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
- data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
- data/core/python-scripts/my_templates/templates_exp.txt +10 -0
- data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_holder.txt +10 -0
- data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_target.txt +10 -0
- data/core/python-scripts/run_all_experiments.sh +49 -0
- data/core/python-scripts/run_basic.py +20 -0
- data/core/python-scripts/run_experiment.sh +42 -0
- data/core/python-scripts/scripts/__init__.py +1 -0
- data/core/python-scripts/scripts/config_manager.py +314 -0
- data/core/python-scripts/scripts/crfutils.py +215 -0
- data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
- data/core/python-scripts/scripts/extract_features.py +376 -0
- data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
- data/core/python-scripts/scripts/lexicons.py +44 -0
- data/core/python-scripts/scripts/link_entities_distance.py +77 -0
- data/core/python-scripts/scripts/relation_classifier.py +250 -0
- data/core/python-scripts/train.py +566 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detectors/base.rb +112 -0
- data/lib/opener/opinion_detectors/base/version.rb +7 -0
- data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
- data/lib/opener/opinion_detectors/de.rb +7 -0
- data/lib/opener/opinion_detectors/en.rb +7 -0
- data/lib/opener/opinion_detectors/it.rb +7 -0
- data/lib/opener/opinion_detectors/nl.rb +6 -0
- data/opener-opinion-detector-base.gemspec +35 -0
- data/pre_build_requirements.txt +3 -0
- metadata +374 -0
--- /dev/null
+++ data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c
@@ -0,0 +1,242 @@
+/*
+ *      Online training with averaged perceptron.
+ *
+ * Copyright (c) 2007-2010, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the names of the authors nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $Id$ */
+
+#ifdef    HAVE_CONFIG_H
+#include <config.h>
+#endif/*HAVE_CONFIG_H*/
+
+#include <os.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <crfsuite.h>
+#include "crfsuite_internal.h"
+#include "logging.h"
+#include "params.h"
+#include "vecmath.h"
+
+/**
+ * Training parameters (configurable with crfsuite_params_t interface).
+ */
+typedef struct {
+    int max_iterations;
+    floatval_t epsilon;
+} training_option_t;
+
+/**
+ * Internal data structure for updating (averaging) feature weights.
+ */
+typedef struct {
+    floatval_t *w;
+    floatval_t *ws;
+    floatval_t c;
+    floatval_t cs;
+} update_data;
+
+static void update_weights(void *instance, int fid, floatval_t value)
+{
+    update_data *ud = (update_data*)instance;
+    ud->w[fid] += ud->c * value;
+    ud->ws[fid] += ud->cs * value;
+}
+
+static int diff(int *x, int *y, int n)
+{
+    int i, d = 0;
+    for (i = 0;i < n;++i) {
+        if (x[i] != y[i]) {
+            ++d;
+        }
+    }
+    return d;
+}
+
+static int exchange_options(crfsuite_params_t* params, training_option_t* opt, int mode)
+{
+    BEGIN_PARAM_MAP(params, mode)
+        DDX_PARAM_INT(
+            "max_iterations", opt->max_iterations, 100,
+            "The maximum number of iterations."
+            )
+        DDX_PARAM_FLOAT(
+            "epsilon", opt->epsilon, 0.,
+            "The stopping criterion (the ratio of incorrect label predictions)."
+            )
+    END_PARAM_MAP()
+
+    return 0;
+}
+
+void crfsuite_train_averaged_perceptron_init(crfsuite_params_t* params)
+{
+    exchange_options(params, NULL, 0);
+}
+
+int crfsuite_train_averaged_perceptron(
+    encoder_t *gm,
+    dataset_t *trainset,
+    dataset_t *testset,
+    crfsuite_params_t *params,
+    logging_t *lg,
+    floatval_t **ptr_w
+    )
+{
+    int n, i, c, ret = 0;
+    int *viterbi = NULL;
+    floatval_t *w = NULL;
+    floatval_t *ws = NULL;
+    floatval_t *wa = NULL;
+    const int N = trainset->num_instances;
+    const int K = gm->num_features;
+    const int T = gm->cap_items;
+    training_option_t opt;
+    update_data ud;
+    clock_t begin = clock();
+
+    /* Initialize the variable. */
+    memset(&ud, 0, sizeof(ud));
+
+    /* Obtain parameter values. */
+    exchange_options(params, &opt, -1);
+
+    /* Allocate arrays. */
+    w = (floatval_t*)calloc(sizeof(floatval_t), K);
+    ws = (floatval_t*)calloc(sizeof(floatval_t), K);
+    wa = (floatval_t*)calloc(sizeof(floatval_t), K);
+    viterbi = (int*)calloc(sizeof(int), T);
+    if (w == NULL || ws == NULL || wa == NULL || viterbi == NULL) {
+        ret = CRFSUITEERR_OUTOFMEMORY;
+        goto error_exit;
+    }
+
+    /* Show the parameters. */
+    logging(lg, "Averaged perceptron\n");
+    logging(lg, "max_iterations: %d\n", opt.max_iterations);
+    logging(lg, "epsilon: %f\n", opt.epsilon);
+    logging(lg, "\n");
+
+    c = 1;
+    ud.w = w;
+    ud.ws = ws;
+
+    /* Loop for epoch. */
+    for (i = 0;i < opt.max_iterations;++i) {
+        floatval_t norm = 0., loss = 0.;
+        clock_t iteration_begin = clock();
+
+        /* Shuffle the instances. */
+        dataset_shuffle(trainset);
+
+        /* Loop for each instance. */
+        for (n = 0;n < N;++n) {
+            int d = 0;
+            floatval_t score;
+            const crfsuite_instance_t *inst = dataset_get(trainset, n);
+
+            /* Set the feature weights to the encoder. */
+            gm->set_weights(gm, w, 1.);
+            gm->set_instance(gm, inst);
+
+            /* Tag the sequence with the current model. */
+            gm->viterbi(gm, viterbi, &score);
+
+            /* Compute the number of different labels. */
+            d = diff(inst->labels, viterbi, inst->num_items);
+            if (0 < d) {
+                /*
+                    For every feature k on the correct path:
+                        w[k] += 1; ws[k] += c;
+                 */
+                ud.c = 1;
+                ud.cs = c;
+                gm->features_on_path(gm, inst, inst->labels, update_weights, &ud);
+
+                /*
+                    For every feature k on the Viterbi path:
+                        w[k] -= 1; ws[k] -= c;
+                 */
+                ud.c = -1;
+                ud.cs = -c;
+                gm->features_on_path(gm, inst, viterbi, update_weights, &ud);
+
+                /* We define the loss as the ratio of wrongly predicted labels. */
+                loss += d / (floatval_t)inst->num_items;
+            }
+
+            ++c;
+        }
+
+        /* Perform averaging to wa. */
+        veccopy(wa, w, K);
+        vecasub(wa, 1./c, ws, K);
+
+        /* Output the progress. */
+        logging(lg, "***** Iteration #%d *****\n", i+1);
+        logging(lg, "Loss: %f\n", loss);
+        logging(lg, "Feature norm: %f\n", sqrt(vecdot(wa, wa, K)));
+        logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC);
+
+        /* Holdout evaluation if necessary. */
+        if (testset != NULL) {
+            holdout_evaluation(gm, testset, wa, lg);
+        }
+
+        logging(lg, "\n");
+
+        /* Convergence test. */
+        if (loss / N < opt.epsilon) {
+            logging(lg, "Terminated with the stopping criterion\n");
+            logging(lg, "\n");
+            break;
+        }
+    }
+
+    logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
+    logging(lg, "\n");
+
+    free(viterbi);
+    free(ws);
+    free(w);
+    *ptr_w = wa;
+    return ret;
+
+error_exit:
+    free(viterbi);
+    free(wa);
+    free(ws);
+    free(w);
+    *ptr_w = NULL;
+
+    return ret;
+}
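The averaged perceptron above never re-averages the whole history of weight vectors: besides w it accumulates ws, the same updates weighted by the running counter c, so the averaged model wa can be recovered as w - ws/c in one pass over the K features (the veccopy/vecasub pair at the end of each iteration). Below is a minimal standalone sketch of that bookkeeping, using scalar weights and made-up toy updates rather than anything from CRFsuite.

    #include <stdio.h>

    /* Illustrative only: with ws accumulating c-weighted updates, wa = w - ws/c
     * equals the average of the initial weight and every intermediate weight. */
    int main(void)
    {
        double updates[] = { 1.0, -1.0, 1.0, 1.0 };   /* toy per-step updates      */
        double w = 0.0, ws = 0.0, naive_sum = 0.0;    /* naive_sum checks the trick */
        int c = 1;                                    /* counter starts at 1        */
        for (int n = 0; n < 4; ++n) {
            w  += updates[n];          /* w[k]  += d, as in update_weights()        */
            ws += c * updates[n];      /* ws[k] += c * d                            */
            naive_sum += w;            /* explicit sum of intermediate weights      */
            ++c;
        }
        /* Mirrors "veccopy(wa, w, K); vecasub(wa, 1./c, ws, K);" above. */
        double wa = w - ws / c;
        printf("lazy average %.3f == naive average %.3f\n", wa, naive_sum / c);
        return 0;
    }

Both printed numbers agree (0.800 for these toy updates): the lazily maintained wa equals the plain average over the initial weight and the weights after each update, without storing that history.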
--- /dev/null
+++ data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c
@@ -0,0 +1,507 @@
+/*
+ *      Online training with L2-regularized Stochastic Gradient Descent (SGD).
+ *
+ * Copyright (c) 2007-2010, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the names of the authors nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $Id$ */
+
+/*
+    SGD for L2-regularized MAP estimation.
+
+    The iterative algorithm is inspired by Pegasos:
+
+    Shai Shalev-Shwartz, Yoram Singer, and Nathan Srebro.
+    Pegasos: Primal Estimated sub-GrAdient SOlver for SVM.
+    In Proc. of ICML 2007, pp 807-814, 2007.
+
+    The calibration strategy is inspired by the implementation of sgd:
+    http://leon.bottou.org/projects/sgd
+    written by Léon Bottou.
+
+    The objective function to minimize is:
+
+        f(w) = (lambda/2) * ||w||^2 + (1/N) * \sum_i^N log P^i(y|x)
+        lambda = 2 * C / N
+
+    The original version of the Pegasos algorithm.
+
+    0) Initialization
+        t = t0
+        k = [the batch size]
+    1) Computing the learning rate (eta).
+        eta = 1 / (lambda * t)
+    2) Updating feature weights.
+        w = (1 - eta * lambda) w - (eta / k) \sum_i (oexp - mexp)
+    3) Projecting feature weights within an L2-ball.
+        w = min{1, (1/sqrt(lambda))/||w||} * w
+    4) Goto 1 until convergence.
+
+    This implementation omit the step 3) because it makes the source code
+    tricky (in order to maintain L2-norm of feature weights at any time) and
+    because the project step does not have a strong impact to the quality of
+    solution.
+
+    A naive implementation requires O(K) computations for steps 2,
+    where K is the total number of features. This code implements the procedure
+    in an efficient way:
+
+    0) Initialization
+        decay = 1
+    1) Computing various factors
+        eta = 1 / (lambda * t)
+        decay *= (1 - eta * lambda)
+        gain = (eta / k) / decay
+    2) Updating feature weights
+        Updating feature weights from observation expectation:
+            delta = gain * (1.0) * f(x,y)
+            w += delta
+        Updating feature weights from model expectation:
+            delta = gain * (-P(y|x)) * f(x,y)
+            w += delta
+    4) Goto 1 until convergence.
+ */
+
+
+#ifdef    HAVE_CONFIG_H
+#include <config.h>
+#endif/*HAVE_CONFIG_H*/
+
+#include <os.h>
+
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <crfsuite.h>
+#include "crfsuite_internal.h"
+
+#include "logging.h"
+#include "params.h"
+#include "crf1d.h"
+#include "vecmath.h"
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+typedef struct {
+    floatval_t c2;
+    floatval_t lambda;
+    floatval_t t0;
+    int max_iterations;
+    int period;
+    floatval_t delta;
+    floatval_t calibration_eta;
+    floatval_t calibration_rate;
+    int calibration_samples;
+    int calibration_candidates;
+    int calibration_max_trials;
+} training_option_t;
+
+static int l2sgd(
+    encoder_t *gm,
+    dataset_t *trainset,
+    dataset_t *testset,
+    floatval_t *w,
+    logging_t *lg,
+    const int N,
+    const floatval_t t0,
+    const floatval_t lambda,
+    const int num_epochs,
+    int calibration,
+    int period,
+    const floatval_t epsilon,
+    floatval_t *ptr_loss
+    )
+{
+    int i, epoch, ret = 0;
+    floatval_t t = 0;
+    floatval_t loss = 0, sum_loss = 0;
+    floatval_t best_sum_loss = DBL_MAX;
+    floatval_t eta, gain, decay = 1.;
+    floatval_t improvement = 0.;
+    floatval_t norm2 = 0.;
+    floatval_t *pf = NULL;
+    floatval_t *best_w = NULL;
+    clock_t clk_prev, clk_begin = clock();
+    const int K = gm->num_features;
+
+    if (!calibration) {
+        pf = (floatval_t*)malloc(sizeof(floatval_t) * period);
+        best_w = (floatval_t*)calloc(K, sizeof(floatval_t));
+        if (pf == NULL || best_w == NULL) {
+            ret = CRFSUITEERR_OUTOFMEMORY;
+            goto error_exit;
+        }
+    }
+
+    /* Initialize the feature weights. */
+    vecset(w, 0, K);
+
+    /* Loop for epochs. */
+    for (epoch = 1;epoch <= num_epochs;++epoch) {
+        clk_prev = clock();
+
+        if (!calibration) {
+            logging(lg, "***** Epoch #%d *****\n", epoch);
+            /* Shuffle the training instances. */
+            dataset_shuffle(trainset);
+        }
+
+        /* Loop for instances. */
+        sum_loss = 0.;
+        for (i = 0;i < N;++i) {
+            const crfsuite_instance_t *inst = dataset_get(trainset, i);
+
+            /* Update various factors. */
+            eta = 1 / (lambda * (t0 + t));
+            decay *= (1.0 - eta * lambda);
+            gain = eta / decay;
+
+            /* Compute the loss and gradients for the instance. */
+            gm->set_weights(gm, w, decay);
+            gm->set_instance(gm, inst);
+            gm->objective_and_gradients(gm, &loss, w, gain);
+
+            sum_loss += loss;
+            ++t;
+        }
+
+        /* Terminate when the loss is abnormal (NaN, -Inf, +Inf). */
+        if (!isfinite(loss)) {
+            logging(lg, "ERROR: overflow loss\n");
+            ret = CRFSUITEERR_OVERFLOW;
+            sum_loss = loss;
+            goto error_exit;
+        }
+
+        /* Scale the feature weights. */
+        vecscale(w, decay, K);
+        decay = 1.;
+
+        /* Include the L2 norm of feature weights to the objective. */
+        /* The factor N is necessary because lambda = 2 * C / N. */
+        norm2 = vecdot(w, w, K);
+        sum_loss += 0.5 * lambda * norm2 * N;
+
+        /* One epoch finished. */
+        if (!calibration) {
+            /* Check if the current epoch is the best. */
+            if (sum_loss < best_sum_loss) {
+                /* Store the feature weights to best_w. */
+                best_sum_loss = sum_loss;
+                veccopy(best_w, w, K);
+            }
+
+            /* We don't test the stopping criterion while period < epoch. */
+            if (period < epoch) {
+                improvement = (pf[(epoch-1) % period] - sum_loss) / sum_loss;
+            } else {
+                improvement = epsilon;
+            }
+
+            /* Store the current value of the objective function. */
+            pf[(epoch-1) % period] = sum_loss;
+
+            logging(lg, "Loss: %f\n", sum_loss);
+            if (period < epoch) {
+                logging(lg, "Improvement ratio: %f\n", improvement);
+            }
+            logging(lg, "Feature L2-norm: %f\n", sqrt(norm2));
+            logging(lg, "Learning rate (eta): %f\n", eta);
+            logging(lg, "Total number of feature updates: %.0f\n", t);
+            logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - clk_prev) / (double)CLOCKS_PER_SEC);
+
+            /* Holdout evaluation if necessary. */
+            if (testset != NULL) {
+                holdout_evaluation(gm, testset, w, lg);
+            }
+            logging(lg, "\n");
+
+            /* Check for the stopping criterion. */
+            if (improvement < epsilon) {
+                ret = 0;
+                break;
+            }
+        }
+    }
+
+    /* Output the optimization result. */
+    if (!calibration) {
+        if (ret == 0) {
+            if (epoch < num_epochs) {
+                logging(lg, "SGD terminated with the stopping criteria\n");
+            } else {
+                logging(lg, "SGD terminated with the maximum number of iterations\n");
+            }
+        } else {
+            logging(lg, "SGD terminated with error code (%d)\n", ret);
+        }
+    }
+
+    /* Restore the best weights. */
+    if (best_w != NULL) {
+        sum_loss = best_sum_loss;
+        veccopy(w, best_w, K);
+    }
+
+error_exit:
+    free(best_w);
+    free(pf);
+    if (ptr_loss != NULL) {
+        *ptr_loss = sum_loss;
+    }
+    return ret;
+}
+
+static floatval_t
+l2sgd_calibration(
+    encoder_t *gm,
+    dataset_t *ds,
+    floatval_t *w,
+    logging_t *lg,
+    const training_option_t* opt
+    )
+{
+    int i, s;
+    int dec = 0, ok, trials = 1;
+    int num = opt->calibration_candidates;
+    clock_t clk_begin = clock();
+    floatval_t loss = 0.;
+    floatval_t init_loss = 0.;
+    floatval_t best_loss = DBL_MAX;
+    floatval_t eta = opt->calibration_eta;
+    floatval_t best_eta = opt->calibration_eta;
+    const int N = ds->num_instances;
+    const int S = MIN(N, opt->calibration_samples);
+    const int K = gm->num_features;
+    const floatval_t init_eta = opt->calibration_eta;
+    const floatval_t rate = opt->calibration_rate;
+    const floatval_t lambda = opt->lambda;
+
+    logging(lg, "Calibrating the learning rate (eta)\n");
+    logging(lg, "calibration.eta: %f\n", eta);
+    logging(lg, "calibration.rate: %f\n", rate);
+    logging(lg, "calibration.samples: %d\n", S);
+    logging(lg, "calibration.candidates: %d\n", num);
+    logging(lg, "calibration.max_trials: %d\n", opt->calibration_max_trials);
+
+    /* Initialize a permutation that shuffles the instances. */
+    dataset_shuffle(ds);
+
+    /* Initialize feature weights as zero. */
+    vecset(w, 0, K);
+
+    /* Compute the initial loss. */
+    gm->set_weights(gm, w, 1.);
+    init_loss = 0;
+    for (i = 0;i < S;++i) {
+        floatval_t score;
+        const crfsuite_instance_t *inst = dataset_get(ds, i);
+        gm->set_instance(gm, inst);
+        gm->score(gm, inst->labels, &score);
+        init_loss -= score;
+        gm->partition_factor(gm, &score);
+        init_loss += score;
+    }
+    init_loss += 0.5 * lambda * vecdot(w, w, K) * N;
+    logging(lg, "Initial loss: %f\n", init_loss);
+
+    while (num > 0 || !dec) {
+        logging(lg, "Trial #%d (eta = %f): ", trials, eta);
+
+        /* Perform SGD for one epoch. */
+        l2sgd(
+            gm,
+            ds,
+            NULL,
+            w,
+            lg,
+            S, 1.0 / (lambda * eta), lambda, 1, 1, 1, 0., &loss);
+
+        /* Make sure that the learning rate decreases the log-likelihood. */
+        ok = isfinite(loss) && (loss < init_loss);
+        if (ok) {
+            logging(lg, "%f\n", loss);
+            --num;
+        } else {
+            logging(lg, "%f (worse)\n", loss);
+        }
+
+        if (isfinite(loss) && loss < best_loss) {
+            best_loss = loss;
+            best_eta = eta;
+        }
+
+        if (!dec) {
+            if (ok && 0 < num) {
+                eta *= rate;
+            } else {
+                dec = 1;
+                num = opt->calibration_candidates;
+                eta = init_eta / rate;
+            }
+        } else {
+            eta /= rate;
+        }
+
+        ++trials;
+        if (opt->calibration_max_trials <= trials) {
+            break;
+        }
+    }
+
+    eta = best_eta;
+    logging(lg, "Best learning rate (eta): %f\n", eta);
+    logging(lg, "Seconds required: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
+    logging(lg, "\n");
+
+    return 1.0 / (lambda * eta);
+}
+
+int exchange_options(crfsuite_params_t* params, training_option_t* opt, int mode)
+{
+    BEGIN_PARAM_MAP(params, mode)
+        DDX_PARAM_FLOAT(
+            "c2", opt->c2, 1.,
+            "Coefficient for L2 regularization."
+            )
+        DDX_PARAM_INT(
+            "max_iterations", opt->max_iterations, 1000,
+            "The maximum number of iterations (epochs) for SGD optimization."
+            )
+        DDX_PARAM_INT(
+            "period", opt->period, 10,
+            "The duration of iterations to test the stopping criterion."
+            )
+        DDX_PARAM_FLOAT(
+            "delta", opt->delta, 1e-6,
+            "The threshold for the stopping criterion; an optimization process stops when\n"
+            "the improvement of the log likelihood over the last ${period} iterations is no\n"
+            "greater than this threshold."
+            )
+        DDX_PARAM_FLOAT(
+            "calibration.eta", opt->calibration_eta, 0.1,
+            "The initial value of learning rate (eta) used for calibration."
+            )
+        DDX_PARAM_FLOAT(
+            "calibration.rate", opt->calibration_rate, 2.,
+            "The rate of increase/decrease of learning rate for calibration."
+            )
+        DDX_PARAM_INT(
+            "calibration.samples", opt->calibration_samples, 1000,
+            "The number of instances used for calibration."
+            )
+        DDX_PARAM_INT(
+            "calibration.candidates", opt->calibration_candidates, 10,
+            "The number of candidates of learning rate."
+            )
+        DDX_PARAM_INT(
+            "calibration.max_trials", opt->calibration_max_trials, 20,
+            "The maximum number of trials of learning rates for calibration."
+            )
+    END_PARAM_MAP()
+
+    return 0;
+}
+
+void crfsuite_train_l2sgd_init(crfsuite_params_t* params)
+{
+    exchange_options(params, NULL, 0);
+}
+
+int crfsuite_train_l2sgd(
+    encoder_t *gm,
+    dataset_t *trainset,
+    dataset_t *testset,
+    crfsuite_params_t *params,
+    logging_t *lg,
+    floatval_t **ptr_w
+    )
+{
+    int ret = 0;
+    floatval_t *w = NULL;
+    clock_t clk_begin;
+    floatval_t loss = 0;
+    const int N = trainset->num_instances;
+    const int K = gm->num_features;
+    const int T = gm->cap_items;
+    training_option_t opt;
+
+    /* Obtain parameter values. */
+    exchange_options(params, &opt, -1);
+
+    /* Allocate arrays. */
+    w = (floatval_t*)calloc(sizeof(floatval_t), K);
+    if (w == NULL) {
+        ret = CRFSUITEERR_OUTOFMEMORY;
+        goto error_exit;
+    }
+
+    opt.lambda = 2. * opt.c2 / N;
+
+    logging(lg, "Stochastic Gradient Descent (SGD)\n");
+    logging(lg, "c2: %f\n", opt.c2);
+    logging(lg, "max_iterations: %d\n", opt.max_iterations);
+    logging(lg, "period: %d\n", opt.period);
+    logging(lg, "delta: %f\n", opt.delta);
+    logging(lg, "\n");
+    clk_begin = clock();
+
+    /* Calibrate the training rate (eta). */
+    opt.t0 = l2sgd_calibration(gm, trainset, w, lg, &opt);
+
+    /* Perform stochastic gradient descent. */
+    ret = l2sgd(
+        gm,
+        trainset,
+        testset,
+        w,
+        lg,
+        N,
+        opt.t0,
+        opt.lambda,
+        opt.max_iterations,
+        0,
+        opt.period,
+        opt.delta,
+        &loss
+        );
+
+    logging(lg, "Loss: %f\n", loss);
+    logging(lg, "Total seconds required for training: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
+    logging(lg, "\n");
+
+    *ptr_w = w;
+    return ret;
+
+error_exit:
+    free(w);
+    return ret;
+}