opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,65 @@
1
+ /*
2
+ * A parser for Item With Attributes (IWA) format.
3
+ *
4
+ * Copyright (c) 2007-2010, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the names of the authors nor the names of its contributors
15
+ * may be used to endorse or promote products derived from this
16
+ * software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ /* $Id$ */
32
+
33
+ #ifndef __IWA_H__
34
+ #define __IWA_H__
35
+
36
+ #ifdef __cplusplus
37
+ extern "C" {
38
+ #endif/*__cplusplus*/
39
+
40
+ typedef struct tag_iwa iwa_t;
41
+
42
/*
 * Token kinds stored in iwa_token_t.type.
 * The values are consecutive, starting at zero.
 * NOTE(review): BOI/EOI presumably mean "beginning/end of item" -- confirm
 * against iwa.c.
 */
enum {
    IWA_NONE = 0,
    IWA_EOF  = 1,
    IWA_BOI  = 2,
    IWA_EOI  = 3,
    IWA_ITEM = 4
};
49
+
50
/*
 * A single token handed out by the IWA reader.
 *
 *  type    token kind (presumably one of the IWA_* constants -- confirm)
 *  attr    attribute string; read-only for the consumer
 *  value   value string; read-only for the consumer
 *
 * NOTE(review): ownership/lifetime of attr and value is not visible in this
 * header -- verify in iwa.c before keeping the pointers around.
 */
typedef struct tag_iwa_token {
    int         type;
    const char *attr;
    const char *value;
} iwa_token_t;
56
+
57
+ iwa_t* iwa_reader(FILE *fp);
58
+ const iwa_token_t* iwa_read(iwa_t* iwa);
59
+ void iwa_delete(iwa_t* iwa);
60
+
61
+ #ifdef __cplusplus
62
+ }
63
+ #endif/*__cplusplus*/
64
+
65
+ #endif/*__IWA_H__*/
@@ -0,0 +1,439 @@
1
+ /*
2
+ * Learn command for CRFsuite frontend.
3
+ *
4
+ * Copyright (c) 2007-2010, Naoaki Okazaki
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ * * Redistributions of source code must retain the above copyright
10
+ * notice, this list of conditions and the following disclaimer.
11
+ * * Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in the
13
+ * documentation and/or other materials provided with the distribution.
14
+ * * Neither the names of the authors nor the names of its contributors
15
+ * may be used to endorse or promote products derived from this
16
+ * software without specific prior written permission.
17
+ *
18
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
22
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ */
30
+
31
+ /* $Id$ */
32
+
33
+ #include <os.h>
34
+
35
+ #include <stdio.h>
36
+ #include <stdlib.h>
37
+ #include <string.h>
38
+ #include <time.h>
39
+
40
+ #include <crfsuite.h>
41
+ #include "option.h"
42
+ #include "readdata.h"
43
+
44
/*
 * Call the object's own release() member (if the pointer is non-NULL) and
 * then clear the pointer so that it cannot be released twice.
 *
 * Wrapped in do { ... } while (0) so that the macro behaves as one statement:
 * "if (c) SAFE_RELEASE(x); else ..." now compiles and binds as written.
 * The argument is evaluated more than once; pass a plain lvalue.
 */
#define SAFE_RELEASE(obj) \
    do { \
        if ((obj) != NULL) { \
            (obj)->release(obj); \
            (obj) = NULL; \
        } \
    } while (0)

/* Larger of the two arguments.  Each argument may be evaluated twice, so do
   not pass expressions with side effects. */
#define MAX(a, b)   ((a) < (b) ? (b) : (a))
46
+
47
+
48
/*
 * Command-line options of the "learn" command.
 * All string members are heap-allocated copies owned by this structure.
 */
typedef struct {
    char *type;             /* graphical model identifier, e.g. "crf1d" */
    char *algorithm;        /* training algorithm name, e.g. "lbfgs" */
    char *model;            /* file to store the model in; "" by default */
    char *logbase;          /* base name used to build the log file name */

    int split;              /* -g: number of groups to split instances into */
    int cross_validation;   /* -x: non-zero to run cross validation */
    int holdout;            /* -e: zero-based holdout group, -1 when unset */
    int logfile;            /* -l: non-zero to write the log to a file */

    int help;               /* -h: show the usage and exit */
    int help_params;        /* -H: show algorithm-specific parameters */

    int num_params;         /* number of entries in params */
    char **params;          /* raw "-p" arguments, one string per entry */
} learn_option_t;
65
+
66
/*
 * Return a freshly malloc()ed copy of the NUL-terminated string src, or NULL
 * when the allocation fails.  The caller owns the copy and must free() it.
 */
static char* mystrdup(const char *src)
{
    char *copy = (char*)malloc(strlen(src)+1);

    if (copy == NULL) {
        return NULL;
    }
    strcpy(copy, src);
    return copy;
}
74
+
75
/*
 * Append src to the heap-allocated string dst, growing the buffer as needed.
 *
 *  dst     a string obtained from malloc()/realloc(), or NULL to start a
 *          new string
 *  src     the NUL-terminated string to append
 *
 * Returns the (possibly moved) buffer holding the concatenation.  When the
 * allocation fails, the old buffer is freed and NULL is returned, so the
 * usual "p = mystrcat(p, s)" call pattern never leaks.
 */
static char* mystrcat(char *dst, const char *src)
{
    size_t dst_len = (dst != NULL) ? strlen(dst) : 0;
    size_t src_len = strlen(src);
    char *grown = (char*)realloc(dst, dst_len + src_len + 1);

    if (grown == NULL) {
        free(dst);
        return NULL;
    }

    /* Copy at a known offset instead of calling strcat(): when dst was NULL
       the new buffer is uninitialized and has no terminator to search for. */
    memcpy(grown + dst_len, src, src_len + 1);
    return grown;
}
82
+
83
+ static void learn_option_init(learn_option_t* opt)
84
+ {
85
+ memset(opt, 0, sizeof(*opt));
86
+ opt->num_params = 0;
87
+ opt->holdout = -1;
88
+ opt->type = mystrdup("crf1d");
89
+ opt->algorithm = mystrdup("lbfgs");
90
+ opt->model = mystrdup("");
91
+ opt->logbase = mystrdup("log.crfsuite");
92
+ }
93
+
94
+ static void learn_option_finish(learn_option_t* opt)
95
+ {
96
+ int i;
97
+
98
+ free(opt->model);
99
+
100
+ for (i = 0;i < opt->num_params;++i) {
101
+ free(opt->params[i]);
102
+ }
103
+ free(opt->params);
104
+ }
105
+
106
+ BEGIN_OPTION_MAP(parse_learn_options, learn_option_t)
107
+
108
+ ON_OPTION_WITH_ARG(SHORTOPT('t') || LONGOPT("type"))
109
+ if (strcmp(arg, "1d") == 0) {
110
+ free(opt->type);
111
+ opt->type = mystrdup("crf1d");
112
+ } else {
113
+ fprintf(stderr, "ERROR: Unknown graphical model: %s\n", arg);
114
+ return 1;
115
+ }
116
+
117
+ ON_OPTION_WITH_ARG(SHORTOPT('a') || LONGOPT("algorithm"))
118
+ if (strcmp(arg, "lbfgs") == 0) {
119
+ free(opt->algorithm);
120
+ opt->algorithm = mystrdup("lbfgs");
121
+ } else if (strcmp(arg, "l2sgd") == 0) {
122
+ free(opt->algorithm);
123
+ opt->algorithm = mystrdup("l2sgd");
124
+ } else if (strcmp(arg, "ap") == 0 || strcmp(arg, "averaged-perceptron") == 0) {
125
+ free(opt->algorithm);
126
+ opt->algorithm = mystrdup("averaged-perceptron");
127
+ } else if (strcmp(arg, "pa") == 0 || strcmp(arg, "passive-aggressive") == 0) {
128
+ free(opt->algorithm);
129
+ opt->algorithm = mystrdup("passive-aggressive");
130
+ } else if (strcmp(arg, "arow") == 0) {
131
+ free(opt->algorithm);
132
+ opt->algorithm = mystrdup("arow");
133
+ } else {
134
+ fprintf(stderr, "ERROR: Unknown algorithm: %s\n", arg);
135
+ return 1;
136
+ }
137
+
138
+ ON_OPTION_WITH_ARG(SHORTOPT('p') || LONGOPT("set"))
139
+ opt->params = (char **)realloc(opt->params, sizeof(char*) * (opt->num_params + 1));
140
+ opt->params[opt->num_params] = mystrdup(arg);
141
+ ++opt->num_params;
142
+
143
+ ON_OPTION_WITH_ARG(SHORTOPT('m') || LONGOPT("model"))
144
+ free(opt->model);
145
+ opt->model = mystrdup(arg);
146
+
147
+ ON_OPTION_WITH_ARG(SHORTOPT('g') || LONGOPT("split"))
148
+ opt->split = atoi(arg);
149
+
150
+ ON_OPTION_WITH_ARG(SHORTOPT('e') || LONGOPT("holdout"))
151
+ opt->holdout = atoi(arg)-1;
152
+
153
+ ON_OPTION(SHORTOPT('x') || LONGOPT("cross-validate"))
154
+ opt->cross_validation = 1;
155
+
156
+ ON_OPTION(SHORTOPT('l') || LONGOPT("log-to-file"))
157
+ opt->logfile = 1;
158
+
159
+ ON_OPTION_WITH_ARG(SHORTOPT('L') || LONGOPT("logbase"))
160
+ free(opt->logbase);
161
+ opt->logbase = mystrdup(arg);
162
+
163
+ ON_OPTION(SHORTOPT('h') || LONGOPT("help"))
164
+ opt->help = 1;
165
+
166
+ ON_OPTION(SHORTOPT('H') || LONGOPT("help-params"))
167
+ opt->help_params = 1;
168
+
169
+ END_OPTION_MAP()
170
+
171
/*
 * Print the usage message of the "learn" command to fp.
 *
 *  fp       stream that receives the message
 *  argv0    program name shown on the USAGE line
 *  command  sub-command name shown on the USAGE line
 */
static void show_usage(FILE *fp, const char *argv0, const char *command)
{
    /* Everything after the USAGE line is fixed text without conversion
       specifiers, so it is kept in a table and written verbatim. */
    static const char *const body[] = {
        "Trains a model using training data set(s).\n",
        "\n",
        " DATA file(s) corresponding to data set(s) for training; if multiple N files\n",
        " are specified, this utility assigns a group number (1...N) to the\n",
        " instances in each file; if a file name is '-', the utility reads a\n",
        " data set from STDIN\n",
        "\n",
        "OPTIONS:\n",
        " -t, --type=TYPE specify a graphical model (DEFAULT='1d'):\n",
        " (this option is reserved for the future use)\n",
        " 1d 1st-order Markov CRF with state and transition\n",
        " features; transition features are not conditioned\n",
        " on observations\n",
        " -a, --algorithm=NAME specify a training algorithm (DEFAULT='lbfgs')\n",
        " lbfgs L-BFGS with L1/L2 regularization\n",
        " l2sgd SGD with L2-regularization\n",
        " ap Averaged Perceptron\n",
        " pa Passive Aggressive\n",
        " arow Adaptive Regularization of Weights (AROW)\n",
        " -p, --set=NAME=VALUE set the algorithm-specific parameter NAME to VALUE;\n",
        " use '-H' or '--help-parameters' with the algorithm name\n",
        " specified by '-a' or '--algorithm' and the graphical\n",
        " model specified by '-t' or '--type' to see the list of\n",
        " algorithm-specific parameters\n",
        " -m, --model=FILE store the model to FILE (DEFAULT=''); if the value is\n",
        " empty, this utility does not store the model\n",
        " -g, --split=N split the instances into N groups; this option is\n",
        " useful for holdout evaluation and cross validation\n",
        " -e, --holdout=M use the M-th data for holdout evaluation and the rest\n",
        " for training\n",
        " -x, --cross-validate repeat holdout evaluations for #i in {1, ..., N} groups\n",
        " (N-fold cross validation)\n",
        " -l, --log-to-file write the training log to a file instead of to STDOUT;\n",
        " The filename is determined automatically by the training\n",
        " algorithm, parameters, and source files\n",
        " -L, --logbase=BASE set the base name for a log file (used with -l option)\n",
        " -h, --help show the usage of this command and exit\n",
        " -H, --help-parameters show the help message of algorithm-specific parameters;\n",
        " specify an algorithm with '-a' or '--algorithm' option,\n",
        " and specify a graphical model with '-t' or '--type' option\n"
    };
    size_t k;

    fprintf(fp, "USAGE: %s %s [OPTIONS] [DATA1] [DATA2] ...\n", argv0, command);
    for (k = 0;k < sizeof(body) / sizeof(body[0]);++k) {
        fputs(body[k], fp);
    }
}
215
+
216
+
217
+
218
/*
 * Message callback registered with the trainer: formats the message to
 * stdout and flushes immediately so that output appears without delay.
 *
 *  instance  user pointer supplied at registration time (unused)
 *  format    printf-style format string
 *  args      arguments for the format string
 *
 * Always returns 0.
 * NOTE(review): the output goes to stdout even when main_learn() has opened
 * a log file for the '-l' option -- confirm that this is intended.
 */
static int message_callback(void *instance, const char *format, va_list args)
{
    (void)instance;

    (void)vfprintf(stdout, format, args);
    (void)fflush(stdout);
    return 0;
}
224
+
225
+ int main_learn(int argc, char *argv[], const char *argv0)
226
+ {
227
+ int i, n, groups = 1, ret = 0, arg_used = 0;
228
+ time_t ts;
229
+ char timestamp[80];
230
+ char trainer_id[128];
231
+ clock_t clk_begin, clk_current;
232
+ learn_option_t opt;
233
+ const char *command = argv[0];
234
+ FILE *fpi = stdin, *fpo = stdout, *fpe = stderr;
235
+ crfsuite_data_t data;
236
+ crfsuite_trainer_t *trainer = NULL;
237
+ crfsuite_dictionary_t *attrs = NULL, *labels = NULL;
238
+
239
+ /* Initializations. */
240
+ learn_option_init(&opt);
241
+ crfsuite_data_init(&data);
242
+
243
+ /* Parse the command-line option. */
244
+ arg_used = option_parse(++argv, --argc, parse_learn_options, &opt);
245
+ if (arg_used < 0) {
246
+ ret = 1;
247
+ goto force_exit;
248
+ }
249
+
250
+ /* Show the help message for this command if specified. */
251
+ if (opt.help) {
252
+ show_usage(fpo, argv0, command);
253
+ goto force_exit;
254
+ }
255
+
256
+ /* Open a log file if necessary. */
257
+ if (opt.logfile) {
258
+ /* Generate a filename for the log file. */
259
+ char *fname = NULL;
260
+ fname = mystrcat(fname, opt.logbase);
261
+ fname = mystrcat(fname, "_");
262
+ fname = mystrcat(fname, opt.algorithm);
263
+ for (i = 0;i < opt.num_params;++i) {
264
+ fname = mystrcat(fname, "_");
265
+ fname = mystrcat(fname, opt.params[i]);
266
+ }
267
+
268
+ fpo = fopen(fname, "w");
269
+ if (fpo == NULL) {
270
+ fprintf(fpe, "ERROR: Failed to open the log file.\n");
271
+ ret = 1;
272
+ goto force_exit;
273
+ }
274
+ }
275
+
276
+ /* Create dictionaries for attributes and labels. */
277
+ ret = crfsuite_create_instance("dictionary", (void**)&data.attrs);
278
+ if (!ret) {
279
+ fprintf(fpe, "ERROR: Failed to create a dictionary instance.\n");
280
+ ret = 1;
281
+ goto force_exit;
282
+ }
283
+ ret = crfsuite_create_instance("dictionary", (void**)&data.labels);
284
+ if (!ret) {
285
+ fprintf(fpe, "ERROR: Failed to create a dictionary instance.\n");
286
+ ret = 1;
287
+ goto force_exit;
288
+ }
289
+
290
+ /* Create a trainer instance. */
291
+ sprintf(trainer_id, "train/%s/%s", opt.type, opt.algorithm);
292
+ ret = crfsuite_create_instance(trainer_id, (void**)&trainer);
293
+ if (!ret) {
294
+ fprintf(fpe, "ERROR: Failed to create a trainer instance.\n");
295
+ ret = 1;
296
+ goto force_exit;
297
+ }
298
+
299
+ /* Show the help message for the training algorithm if specified. */
300
+ if (opt.help_params) {
301
+ crfsuite_params_t* params = trainer->params(trainer);
302
+
303
+ fprintf(fpo, "PARAMETERS for %s (%s):\n", opt.algorithm, opt.type);
304
+ fprintf(fpo, "\n");
305
+
306
+ for (i = 0;i < params->num(params);++i) {
307
+ char *name = NULL;
308
+ char *type = NULL;
309
+ char *value = NULL;
310
+ char *help = NULL;
311
+
312
+ params->name(params, i, &name);
313
+ params->get(params, name, &value);
314
+ params->help(params, name, &type, &help);
315
+
316
+ fprintf(fpo, "%s %s = %s;\n", type, name, value);
317
+ fprintf(fpo, "%s\n", help);
318
+ fprintf(fpo, "\n");
319
+
320
+ params->free(params, help);
321
+ params->free(params, type);
322
+ params->free(params, value);
323
+ params->free(params, name);
324
+ }
325
+
326
+ params->release(params);
327
+ goto force_exit;
328
+ }
329
+
330
+ /* Set parameters. */
331
+ for (i = 0;i < opt.num_params;++i) {
332
+ char *value = NULL;
333
+ char *name = opt.params[i];
334
+ crfsuite_params_t* params = trainer->params(trainer);
335
+
336
+ /* Split the parameter argument by the first '=' character. */
337
+ value = strchr(name, '=');
338
+ if (value != NULL) {
339
+ *value++ = 0;
340
+ }
341
+
342
+ if (params->set(params, name, value) != 0) {
343
+ fprintf(fpe, "ERROR: paraneter not found: %s\n", name);
344
+ goto force_exit;
345
+ }
346
+ params->release(params);
347
+ }
348
+
349
+ /* Log the start time. */
350
+ time(&ts);
351
+ strftime(timestamp, sizeof(timestamp), "%Y-%m-%dT%H:%M:%SZ", gmtime(&ts));
352
+ fprintf(fpo, "Start time of the training: %s\n", timestamp);
353
+ fprintf(fpo, "\n");
354
+
355
+ /* Read the training data. */
356
+ fprintf(fpo, "Reading the data set(s)\n");
357
+ for (i = arg_used;i < argc;++i) {
358
+ FILE *fp = (strcmp(argv[i], "-") == 0) ? fpi : fopen(argv[i], "r");
359
+ if (fp == NULL) {
360
+ fprintf(fpe, "ERROR: Failed to open the data set: %s\n", argv[i]);
361
+ ret = 1;
362
+ goto force_exit;
363
+ }
364
+
365
+ fprintf(fpo, "[%d] %s\n", i-arg_used+1, argv[i]);
366
+ clk_begin = clock();
367
+ n = read_data(fp, fpo, &data, i-arg_used);
368
+ clk_current = clock();
369
+ fprintf(fpo, "Number of instances: %d\n", n);
370
+ fprintf(fpo, "Seconds required: %.3f\n", (clk_current - clk_begin) / (double)CLOCKS_PER_SEC);
371
+ fclose(fp);
372
+ }
373
+ groups = argc-arg_used;
374
+ fprintf(fpo, "\n");
375
+
376
+ /* Split into data sets if necessary. */
377
+ if (0 < opt.split) {
378
+ /* Shuffle the instances. */
379
+ for (i = 0;i < data.num_instances;++i) {
380
+ int j = rand() % data.num_instances;
381
+ crfsuite_instance_swap(&data.instances[i], &data.instances[j]);
382
+ }
383
+
384
+ /* Assign group numbers. */
385
+ for (i = 0;i < data.num_instances;++i) {
386
+ data.instances[i].group = i % opt.split;
387
+ }
388
+ groups = opt.split;
389
+ }
390
+
391
+ /* Report the statistics of the training data. */
392
+ fprintf(fpo, "Statistics the data set(s)\n");
393
+ fprintf(fpo, "Number of data sets (groups): %d\n", groups);
394
+ fprintf(fpo, "Number of instances: %d\n", data.num_instances);
395
+ fprintf(fpo, "Number of items: %d\n", crfsuite_data_totalitems(&data));
396
+ fprintf(fpo, "Number of attributes: %d\n", data.attrs->num(data.attrs));
397
+ fprintf(fpo, "Number of labels: %d\n", data.labels->num(data.labels));
398
+ fprintf(fpo, "\n");
399
+ fflush(fpo);
400
+
401
+ /* Set callback procedures that receive messages and taggers. */
402
+ trainer->set_message_callback(trainer, NULL, message_callback);
403
+
404
+ /* Start training. */
405
+ if (opt.cross_validation) {
406
+ for (i = 0;i < groups;++i) {
407
+ fprintf(fpo, "===== Cross validation (%d/%d) =====\n", i+1, groups);
408
+ if (ret = trainer->train(trainer, &data, "", i)) {
409
+ goto force_exit;
410
+ }
411
+ fprintf(fpo, "\n");
412
+ }
413
+
414
+ } else {
415
+ if (ret = trainer->train(trainer, &data, opt.model, opt.holdout)) {
416
+ goto force_exit;
417
+ }
418
+
419
+ }
420
+
421
+ /* Log the end time. */
422
+ time(&ts);
423
+ strftime(timestamp, sizeof(timestamp), "%Y-%m-%dT%H:%M:%SZ", gmtime(&ts));
424
+ fprintf(fpo, "End time of the training: %s\n", timestamp);
425
+ fprintf(fpo, "\n");
426
+
427
+ force_exit:
428
+ SAFE_RELEASE(trainer);
429
+ SAFE_RELEASE(data.labels);
430
+ SAFE_RELEASE(data.attrs);
431
+
432
+ crfsuite_data_finish(&data);
433
+ learn_option_finish(&opt);
434
+ if (fpo != NULL) {
435
+ fclose(fpo);
436
+ }
437
+
438
+ return ret;
439
+ }