opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,160 @@
1
+ /*
2
+ * RumAVL - Threaded AVL Tree Implementation
3
+ *
4
+ * Copyright (c) 2005-2007 Jesse Long <jpl@unknown.za.net>
5
+ * All rights reserved.
6
+ *
7
+ * Permission is hereby granted, free of charge, to any person obtaining a
8
+ * copy of this software and associated documentation files (the "Software"),
9
+ * to deal in the Software without restriction, including without limitation
10
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
+ * and/or sell copies of the Software, and to permit persons to whom the
12
+ * Software is furnished to do so, subject to the following conditions:
13
+ *
14
+ * 1. The above copyright notice and this permission notice shall be
15
+ * included in all copies or substantial portions of the Software.
16
+ * 2. The origin of the Software must not be misrepresented; you must not
17
+ * claim that you wrote the original Software.
18
+ * 3. Altered source versions of the Software must be plainly marked as
19
+ * such, and must not be misrepresented as being the original Software.
20
+ *
21
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27
+ * DEALINGS IN THE SOFTWARE.
28
+ */
29
+
30
+ /*
31
+ * Please see the `README' file, the documentation in the `doc' directory and
32
+ * the `rumavl.c' source file for more information.
33
+ */
34
+
35
+ #ifndef RUMAVL_H
36
+ #define RUMAVL_H 1
37
+
38
+ #ifdef __cplusplus
39
+ extern "C" {
40
+ #endif
41
+
42
+ #include <stddef.h> /* size_t */
43
+
44
+
45
+
46
+
47
+ /*----------------------------------------------------------------------------
48
+ * DATA TYPES
49
+ *--------------------------------------------------------------------------*/
50
+
51
+ /* Opaque context handle for the tree */
52
+ typedef struct rumavl RUMAVL;
53
+
54
+ /* Node type - used for iterating */
55
+ typedef struct rumavl_node RUMAVL_NODE;
56
+
57
+
58
+
59
+
60
+ /*----------------------------------------------------------------------------
61
+ * FUNDAMENTAL FUNCTIONS
62
+ *--------------------------------------------------------------------------*/
63
+
64
+ /* Create a new RumAVL tree */
65
+ RUMAVL *rumavl_new (size_t reclen,
66
+ int (*cmp)(const void *, const void *, size_t, void *),
67
+ void *(*alloc)(void *, size_t, void *),
68
+ void *udata);
69
+
70
+ /* Destroy a RumAVL tree */
71
+ void rumavl_destroy (RUMAVL *tree);
72
+
73
+ /* This function returns the size of each record in a tree */
74
+ size_t rumavl_record_size (RUMAVL *tree);
75
+
76
+ /* Get a pointer to the udata pointer */
77
+ void **rumavl_udata (RUMAVL *tree);
78
+
79
+ /* Insert a record into a tree, overwriting an existing record if necessary */
80
+ int rumavl_set (RUMAVL *tree, const void *record);
81
+ /* Insert a record into a tree, never overwrites an existing record */
82
+ int rumavl_insert (RUMAVL *tree, const void *record);
83
+
84
+ /* Retrieve record from tree, or NULL */
85
+ void *rumavl_find (RUMAVL *tree, const void *find);
86
+
87
+ /* Remove record from tree */
88
+ int rumavl_delete (RUMAVL *tree, const void *record);
89
+
90
+
91
+
92
+
93
+ /*----------------------------------------------------------------------------
94
+ * ITERATOR FUNCTIONS
95
+ *--------------------------------------------------------------------------*/
96
+
97
+ /* Get a pointer to the node containing a specific record */
98
+ RUMAVL_NODE *rumavl_node_find (RUMAVL *tree, const void *find, void **record);
99
+
100
+ /* Get the next node in sequence after a specific node, in a specific
101
+ * direction, or get the first node on either end of a tree */
102
+ RUMAVL_NODE *rumavl_node_next (RUMAVL *tree, RUMAVL_NODE *node, int dir,
103
+ void **record);
104
+ /* Possible directions */
105
+ #define RUMAVL_DESC (-1)
106
+ #define RUMAVL_ASC (+1)
107
+
108
+ /* Get a record held by a specific node */
109
+ void *rumavl_node_record (RUMAVL_NODE *node);
110
+
111
+ /* Pass each record in a tree to a user defined callback function */
112
+ extern int rumavl_foreach (RUMAVL *tree, int dir,
113
+ int (*cbfn)(RUMAVL *, void *, void *), void *udata);
114
+
115
+
116
+
117
+
118
+ /*----------------------------------------------------------------------------
119
+ * CALLBACK FUNCTIONS
120
+ *
121
+ * Functions giving you more control over the actions of this library.
122
+ *--------------------------------------------------------------------------*/
123
+
124
+ int (**rumavl_owcb(RUMAVL *tree))(RUMAVL *, RUMAVL_NODE *, void *,
125
+ const void *, void *);
126
+ int (**rumavl_delcb(RUMAVL *tree))(RUMAVL *, RUMAVL_NODE *, void *, void *);
127
+
128
+
129
+
130
+
131
+ /*----------------------------------------------------------------------------
132
+ * MEMORY MANAGEMENT
133
+ *
134
+ * The rumavl_mem struct is used to define how a RUMAVL object allocates
135
+ * and frees memory.
136
+ *--------------------------------------------------------------------------*/
137
+ void *(**rumavl_alloc(RUMAVL *tree))(void *ptr, size_t size, void *udata);
138
+
139
+
140
+
141
+ /*----------------------------------------------------------------------------
142
+ * ERROR CODES
143
+ *
144
+ * The functions returning int's will return these errors
145
+ *--------------------------------------------------------------------------*/
146
+
147
+ #define RUMAVL_ERR_INVAL (-1) /* Invalid argument */
148
+ #define RUMAVL_ERR_NOMEM (-2) /* Insufficient memory */
149
+ #define RUMAVL_ERR_NOENT (-3) /* Entry does not exist */
150
+ #define RUMAVL_ERR_EORNG (-5) /* No nodes left in range */
151
+ #define RUMAVL_ERR_EXIST (-6) /* Entry already exists */
152
+
153
/* Returns a static string describing the error number `errnum`.
 *
 * The parameter is deliberately not named `errno`: <errno.h> defines `errno`
 * as a macro, so a parameter of that name would be macro-expanded (or be
 * undefined behaviour) in any translation unit that includes <errno.h>
 * before this header. */
extern const char *rumavl_strerror (int errnum);
155
+
156
+ #ifdef __cplusplus
157
+ }
158
+ #endif
159
+
160
+ #endif /* ifndef RUMAVL_H */
@@ -0,0 +1,408 @@
1
+ /*
2
+ * Online training with Adaptive Regularization of Weights (AROW).
3
+ *
4
+ * Copyright (c) 2007-2010, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the names of the authors nor the names of its contributors
15
+ * may be used to endorse or promote products derived from this
16
+ * software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ /* $Id$ */
32
+
33
+ #ifdef HAVE_CONFIG_H
34
+ #include <config.h>
35
+ #endif/*HAVE_CONFIG_H*/
36
+
37
+ #include <os.h>
38
+
39
+ #include <stdio.h>
40
+ #include <stdlib.h>
41
+ #include <time.h>
42
+
43
+ #include <crfsuite.h>
44
+ #include "crfsuite_internal.h"
45
+ #include "logging.h"
46
+ #include "params.h"
47
+ #include "vecmath.h"
48
+
49
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
50
+
51
+ /**
52
+ * Training parameters (configurable with crfsuite_params_t interface).
53
+ */
54
+ typedef struct {
55
+ floatval_t variance;
56
+ floatval_t gamma;
57
+ int max_iterations;
58
+ floatval_t epsilon;
59
+ } training_option_t;
60
+
61
+ /**
62
+ * Internal data structure for computing the sparse vector F(x, y) - F(x, y').
63
+ */
64
+ typedef struct {
65
+ /* An array of feature indices relevant to the instance. */
66
+ int *actives;
67
+ int num_actives;
68
+ int cap_actives;
69
+ char *used;
70
+
71
+ /* Coefficient for collecting feature weights. */
72
+ floatval_t c;
73
+ /* The difference vector [K]. */
74
+ floatval_t *delta;
75
+ /* The number of features. */
76
+ int K;
77
+ } delta_t;
78
+
79
+ static int delta_init(delta_t *dc, const int K)
80
+ {
81
+ memset(dc, 0, sizeof(*dc));
82
+ dc->used = (char*)calloc(K, sizeof(char));
83
+ dc->delta = (floatval_t*)calloc(K, sizeof(floatval_t));
84
+ dc->K = K;
85
+ if (dc->delta == NULL || dc->used == NULL) {
86
+ return 1;
87
+ }
88
+ return 0;
89
+ }
90
+
91
+ static void delta_finish(delta_t *dc)
92
+ {
93
+ free(dc->actives);
94
+ free(dc->used);
95
+ free(dc->delta);
96
+ memset(dc, 0, sizeof(*dc));
97
+ }
98
+
99
+ static void delta_reset(delta_t *dc)
100
+ {
101
+ int i;
102
+ for (i = 0;i < dc->num_actives;++i) {
103
+ int k = dc->actives[i];
104
+ dc->delta[k] = 0;
105
+ }
106
+ dc->num_actives = 0;
107
+ }
108
+
109
+ static void delta_collect(void *instance, int fid, floatval_t value)
110
+ {
111
+ delta_t *dc = (delta_t*)instance;
112
+
113
+ /* Expand the active feature list if necessary. */
114
+ if (dc->cap_actives <= dc->num_actives) {
115
+ ++dc->cap_actives;
116
+ dc->cap_actives *= 2;
117
+ dc->actives = (int*)realloc(dc->actives, sizeof(int) * dc->cap_actives);
118
+ }
119
+
120
+ dc->actives[dc->num_actives++] = fid;
121
+ dc->delta[fid] += dc->c * value;
122
+ }
123
+
124
+ static void delta_finalize(delta_t *dc)
125
+ {
126
+ int i, j = 0, k;
127
+
128
+ /* Collapse the duplicated indices. */
129
+ for (i = 0;i < dc->num_actives;++i) {
130
+ k = dc->actives[i];
131
+ if (!dc->used[k]) {
132
+ dc->actives[j++] = k;
133
+ dc->used[k] = 1;
134
+ }
135
+ }
136
+ dc->num_actives = j; /* This is the distinct number of indices. */
137
+
138
+ /* Reset the used flag. */
139
+ for (i = 0;i < dc->num_actives;++i) {
140
+ k = dc->actives[i];
141
+ dc->used[k] = 0;
142
+ }
143
+ }
144
+
145
+ static floatval_t delta_norm2(delta_t *dc)
146
+ {
147
+ int i;
148
+ floatval_t norm2 = 0.;
149
+
150
+ for (i = 0;i < dc->num_actives;++i) {
151
+ int k = dc->actives[i];
152
+ norm2 += dc->delta[k] * dc->delta[k];
153
+ }
154
+ return norm2;
155
+ }
156
+
157
+ static void delta_add(delta_t *dc, floatval_t *w, floatval_t *ws, const floatval_t tau, const floatval_t u)
158
+ {
159
+ int i;
160
+ const floatval_t tauu = tau * u;
161
+
162
+ for (i = 0;i < dc->num_actives;++i) {
163
+ int k = dc->actives[i];
164
+ w[k] += tau * dc->delta[k];
165
+ ws[k] += tauu * dc->delta[k];
166
+ }
167
+ }
168
+
169
+ static int diff(int *x, int *y, int n)
170
+ {
171
+ int i, d = 0;
172
+ for (i = 0;i < n;++i) {
173
+ if (x[i] != y[i]) {
174
+ ++d;
175
+ }
176
+ }
177
+ return d;
178
+ }
179
+
180
+ static floatval_t cost_insensitive(floatval_t err, floatval_t d)
181
+ {
182
+ return err + 1.;
183
+ }
184
+
185
+ static floatval_t cost_sensitive(floatval_t err, floatval_t d)
186
+ {
187
+ return err + sqrt(d);
188
+ }
189
+
190
+ static floatval_t tau0(floatval_t cost, floatval_t norm, floatval_t c)
191
+ {
192
+ return cost / norm;
193
+ }
194
+
195
+ static floatval_t tau1(floatval_t cost, floatval_t norm, floatval_t c)
196
+ {
197
+ return MIN(c, cost / norm);
198
+ }
199
+
200
+ static floatval_t tau2(floatval_t cost, floatval_t norm, floatval_t c)
201
+ {
202
+ return cost / (norm + 0.5 / c);
203
+ }
204
+
205
+ static int exchange_options(crfsuite_params_t* params, training_option_t* opt, int mode)
206
+ {
207
+ BEGIN_PARAM_MAP(params, mode)
208
+ DDX_PARAM_FLOAT(
209
+ "variance", opt->variance, 1.,
210
+ "The initial variance of every feature weight."
211
+ )
212
+ DDX_PARAM_FLOAT(
213
+ "gamma", opt->gamma, 1.,
214
+ "Tradeoff parameter."
215
+ )
216
+ DDX_PARAM_INT(
217
+ "max_iterations", opt->max_iterations, 100,
218
+ "The maximum number of iterations."
219
+ )
220
+ DDX_PARAM_FLOAT(
221
+ "epsilon", opt->epsilon, 0.,
222
+ "The stopping criterion (the mean loss)."
223
+ )
224
+ END_PARAM_MAP()
225
+
226
+ return 0;
227
+ }
228
+
229
+ void crfsuite_train_arow_init(crfsuite_params_t* params)
230
+ {
231
+ exchange_options(params, NULL, 0);
232
+ }
233
+
234
+ int crfsuite_train_arow(
235
+ encoder_t *gm,
236
+ dataset_t *trainset,
237
+ dataset_t *testset,
238
+ crfsuite_params_t *params,
239
+ logging_t *lg,
240
+ floatval_t **ptr_w
241
+ )
242
+ {
243
+ int n, i, j, k, ret = 0;
244
+ int *viterbi = NULL;
245
+ floatval_t beta;
246
+ floatval_t *mean = NULL, *cov = NULL, *prod = NULL;
247
+ const int N = trainset->num_instances;
248
+ const int K = gm->num_features;
249
+ const int T = gm->cap_items;
250
+ training_option_t opt;
251
+ delta_t dc;
252
+ clock_t begin = clock();
253
+
254
+ /* Initialize the variable. */
255
+ if (delta_init(&dc, K) != 0) {
256
+ ret = CRFSUITEERR_OUTOFMEMORY;
257
+ goto error_exit;
258
+ }
259
+
260
+ /* Obtain parameter values. */
261
+ exchange_options(params, &opt, -1);
262
+
263
+ /* Allocate arrays. */
264
+ mean = (floatval_t*)calloc(sizeof(floatval_t), K);
265
+ cov = (floatval_t*)calloc(sizeof(floatval_t), K);
266
+ prod = (floatval_t*)calloc(sizeof(floatval_t), K);
267
+ viterbi = (int*)calloc(sizeof(int), T);
268
+ if (mean == NULL || cov == NULL || prod == NULL || viterbi == NULL) {
269
+ ret = CRFSUITEERR_OUTOFMEMORY;
270
+ goto error_exit;
271
+ }
272
+
273
+ /* Initialize the covariance vector (diagnal matrix). */
274
+ vecset(cov, opt.variance, K);
275
+
276
+ /* Show the parameters. */
277
+ logging(lg, "Adaptive Regularization of Weights (AROW)\n");
278
+ logging(lg, "variance: %f\n", opt.variance);
279
+ logging(lg, "gamma: %f\n", opt.gamma);
280
+ logging(lg, "max_iterations: %d\n", opt.max_iterations);
281
+ logging(lg, "epsilon: %f\n", opt.epsilon);
282
+ logging(lg, "\n");
283
+
284
+ beta = 1.0 / opt.gamma;
285
+
286
+ /* Loop for epoch. */
287
+ for (i = 0;i < opt.max_iterations;++i) {
288
+ floatval_t norm = 0., sum_loss = 0.;
289
+ clock_t iteration_begin = clock();
290
+
291
+ /* Shuffle the instances. */
292
+ dataset_shuffle(trainset);
293
+
294
+ /* Loop for each instance. */
295
+ for (n = 0;n < N;++n) {
296
+ int d = 0;
297
+ floatval_t sv;
298
+ const crfsuite_instance_t *inst = dataset_get(trainset, n);
299
+
300
+ /* Set the feature weights to the encoder. */
301
+ gm->set_weights(gm, mean, 1.);
302
+ gm->set_instance(gm, inst);
303
+
304
+ /* Tag the sequence with the current model. */
305
+ gm->viterbi(gm, viterbi, &sv);
306
+
307
+ /* Compute the number of different labels. */
308
+ d = diff(inst->labels, viterbi, inst->num_items);
309
+ if (0 < d) {
310
+ floatval_t alpha, frac;
311
+ floatval_t sc, norm2;
312
+ floatval_t tau, cost;
313
+
314
+ /*
315
+ Compute the cost of this instance.
316
+ */
317
+ gm->score(gm, inst->labels, &sc);
318
+ cost = sv - sc + (double)d;
319
+
320
+ /* Initialize delta[k] = 0. */
321
+ delta_reset(&dc);
322
+
323
+ /*
324
+ For every feature k on the correct path:
325
+ delta[k] += 1;
326
+ */
327
+ dc.c = 1;
328
+ gm->features_on_path(gm, inst, inst->labels, delta_collect, &dc);
329
+
330
+ /*
331
+ For every feature k on the Viterbi path:
332
+ delta[k] -= 1;
333
+ */
334
+ dc.c = -1;
335
+ gm->features_on_path(gm, inst, viterbi, delta_collect, &dc);
336
+
337
+ delta_finalize(&dc);
338
+
339
+ /* Compute prod[k] = delta[k] * delta[k]. */
340
+ for (j = 0;j < dc.num_actives;++j) {
341
+ k = dc.actives[j];
342
+ prod[k] = dc.delta[k] * dc.delta[k];
343
+ }
344
+
345
+ /*
346
+ Compute alpha.
347
+ */
348
+ frac = opt.gamma;
349
+ for (j = 0;j < dc.num_actives;++j) {
350
+ k = dc.actives[j];
351
+ frac += prod[k] * cov[k];
352
+ }
353
+ alpha = cost / frac;
354
+
355
+ /*
356
+ Update.
357
+ */
358
+ for (j = 0;j < dc.num_actives;++j) {
359
+ k = dc.actives[j];
360
+ mean[k] += alpha * cov[k] * dc.delta[k];
361
+ cov[k] = 1.0 / ((1.0 / cov[k]) + prod[k] / opt.gamma);
362
+ }
363
+
364
+ sum_loss += cost;
365
+ }
366
+ }
367
+
368
+ /* Output the progress. */
369
+ logging(lg, "***** Iteration #%d *****\n", i+1);
370
+ logging(lg, "Loss: %f\n", sum_loss);
371
+ logging(lg, "Feature norm: %f\n", sqrt(vecdot(mean, mean, K)));
372
+ logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC);
373
+
374
+ /* Holdout evaluation if necessary. */
375
+ if (testset != NULL) {
376
+ holdout_evaluation(gm, testset, mean, lg);
377
+ }
378
+
379
+ logging(lg, "\n");
380
+
381
+ /* Convergence test. */
382
+ if (sum_loss / N <= opt.epsilon) {
383
+ logging(lg, "Terminated with the stopping criterion\n");
384
+ logging(lg, "\n");
385
+ break;
386
+ }
387
+ }
388
+
389
+ logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
390
+ logging(lg, "\n");
391
+
392
+ free(viterbi);
393
+ free(prod);
394
+ free(cov);
395
+ *ptr_w = mean;
396
+ delta_finish(&dc);
397
+ return ret;
398
+
399
+ error_exit:
400
+ free(viterbi);
401
+ free(prod);
402
+ free(cov);
403
+ free(mean);
404
+ *ptr_w = NULL;
405
+ delta_finish(&dc);
406
+
407
+ return ret;
408
+ }