opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import os
5
+ import csv
6
+ from subprocess import Popen,PIPE
7
+
8
def create_lexicons(path_to_script, training_file,exp_filename, tar_filename):
    """Run the external lexicon-generation script and log its output.

    The script is started as::

        python <path_to_script> -exp_csv <exp_filename> -tar_csv <tar_filename> -l <training_file>

    Its standard output and standard error are written to ``log.out`` and
    ``log.err`` inside the folder that contains ``exp_filename``. When the
    script has finished, a one-line summary with its return code is written
    to stderr. Nothing is returned.
    """
    cmd = ['python', path_to_script,
           '-exp_csv', exp_filename,
           '-tar_csv', tar_filename,
           '-l', training_file]
    folder = os.path.dirname(exp_filename)

    # The command is passed as an argument list (no shell), so file names
    # containing spaces or shell metacharacters reach the script unchanged.
    # The "with" blocks close both log files even if Popen fails.
    with open(os.path.join(folder, 'log.out'), 'wb') as log_out:
        with open(os.path.join(folder, 'log.err'), 'wb') as log_err:
            lexicon_generator = Popen(cmd, stdout=log_out, stderr=log_err)
            ret_code = lexicon_generator.wait()

    sys.stderr.write(' Lexicons created, on %s  ret code: %s\n' % (folder, ret_code))
27
+
28
+
29
def load_lexicon(lexicon_filename):
    """Load a domain lexicon from a ';'-separated CSV file.

    The first row is treated as a header and skipped. Of every other row only
    the first column is used; it has the form ``text#type`` and is split at
    the LAST '#'. Empty rows (e.g. a trailing blank line) are ignored.

    Returns a dict mapping ``text`` to ``type``; the file content is decoded
    as UTF-8, so keys and values are unicode text.
    """
    # The Python 2 csv module needs a byte stream, the Python 3 one a text
    # stream; open the file accordingly and decode by hand only on Python 2.
    py2 = bytes is str
    if py2:
        fd = open(lexicon_filename, 'rb')
    else:
        fd = open(lexicon_filename, 'r', encoding='utf-8', newline='')

    my_lexicon = {}
    try:
        lex_reader = csv.reader(fd, delimiter=';')
        next(lex_reader, None)  # skip the header row
        for row in lex_reader:
            if not row:
                continue
            text_type = row[0]
            if py2:
                text_type = text_type.decode('utf-8')
            # NOTE(review): when an entry has no '#', rfind returns -1, so the
            # key loses its last character and the value is the whole entry.
            # Kept as in the original -- confirm whether such rows can occur.
            this_pos = text_type.rfind('#')
            my_lexicon[text_type[:this_pos]] = text_type[this_pos+1:]
    finally:
        # Always release the file handle, also when a row fails to parse.
        fd.close()
    return my_lexicon
@@ -0,0 +1,77 @@
1
+ #####
2
+ import sys
3
+ import logging
4
+ from operator import itemgetter
5
+
6
+
7
def get_min(l):
    """Return the minimum position from a list of token ids.

    Every id is reduced to the integer formed by all of its digit characters
    (for example 'w12' -> 12). Returns None when the list is empty. An id
    that contains no digit at all raises ValueError.
    """
    positions = [int(''.join(c for c in ele if c.isdigit())) for ele in l]
    if not positions:
        return None
    return min(positions)
17
+
18
def get_max(l):
    """Return the maximum position from a list of token ids.

    Every id is reduced to the integer formed by all of its digit characters
    (for example 'w12' -> 12). Returns -1 when the list is empty. An id that
    contains no digit at all raises ValueError.
    """
    positions = [int(''.join(c for c in ele if c.isdigit())) for ele in l]
    if not positions:
        return -1
    return max(positions)
29
+
30
+
31
def get_distance(list1, list2):
    """Return the distance, in number of tokens, between two lists of ids.

    The positions covered by each list are taken from get_min / get_max.
    If one list ends before the other starts, the gap between the end of the
    first and the start of the second is returned; otherwise the spans
    overlap and the distance is 0.
    """
    first_start = get_min(list1)
    first_end = get_max(list1)
    second_start = get_min(list2)
    second_end = get_max(list2)

    if first_end < second_start:
        # list1 lies completely before list2
        return second_start - first_end
    if second_end < first_start:
        # list2 lies completely before list1
        return first_start - second_end
    return 0
45
+
46
def _closest_entity(exp_ids, sentence_exp, entities, sentence_for_token, weight_crossing_sentence):
    """Return the ids of the entity closest to the expression, or [] if there is none."""
    best_ids = []
    best_distance = None
    for ent_ids, _label in entities:
        sentence_ent = int(sentence_for_token[ent_ids[0]])
        distance = get_distance(exp_ids, ent_ids)
        distance += weight_crossing_sentence * abs(sentence_exp - sentence_ent)
        # Strict '<' keeps the first entity on ties, exactly like taking the
        # first element after a stable sort on the distance.
        if best_distance is None or distance < best_distance:
            best_ids = ent_ids
            best_distance = distance
    return best_ids


def link_entities_distance(expressions,targets,holders, sentence_for_token, weight_crossing_sentence=200):
    """Link every expression to its nearest target and nearest holder.

    Parameters:
      expressions: iterable of (token_ids, expression_type) pairs.
      targets, holders: iterables of (token_ids, label) pairs; the label is
        not used.
      sentence_for_token: mapping from token id to its sentence number; the
        first token id of every entity is looked up and converted with int().
      weight_crossing_sentence: penalty added to the token distance for every
        sentence separating the expression and the candidate (default 200,
        the value that used to be hard-coded).

    Returns a list with one tuple (expression_type, expression_ids,
    target_ids, holder_ids) per expression; target_ids / holder_ids is []
    when there are no candidates of that kind.
    """
    triples = []
    for exp_ids, type_exp in expressions:
        sentence_exp = int(sentence_for_token[exp_ids[0]])
        final_tar = _closest_entity(exp_ids, sentence_exp, targets,
                                    sentence_for_token, weight_crossing_sentence)
        final_hol = _closest_entity(exp_ids, sentence_exp, holders,
                                    sentence_for_token, weight_crossing_sentence)
        triples.append((type_exp, exp_ids, final_tar, final_hol))
    return triples
77
+
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python
2
+
3
+ from extract_feats_relations import *
4
+ from tempfile import NamedTemporaryFile
5
+ from subprocess import Popen, PIPE
6
+ from VUA_pylib.io import Cfeature_index
7
+ import os
8
+
9
+ config_manager = None
10
+
11
+
12
def link_exp_tar(expressions,targets, knaf_obj,use_dependencies=True,use_tokens=True, use_lemmas=True):
    """Assign to every expression the target with the highest classifier score.

    Returns a list parallel to ``expressions``:
      * no targets    -> [] for every expression,
      * one target    -> that target for every expression (no classification),
      * otherwise     -> for every expression, the target whose
                         expression/target pair got the highest value from
                         run_svm_classify.

    For the classification, one example per (expression, target) pair is
    written to a temporary file in expression-major order:
        exp1 --> tar1, exp1 --> tar2, ..., exp2 --> tar1, exp2 --> tar2, ...
    and the results are read back in that same order.
    """
    if len(targets) == 0:
        return [[] for _ in expressions]
    if len(targets) == 1:
        return [targets[0] for _ in expressions]

    feat_index_filename = config_manager.get_index_features_exp_tar_filename()
    feat_index = Cfeature_index()
    feat_index.load_from_file(feat_index_filename)

    examples_file = NamedTemporaryFile(delete=False)
    try:
        try:
            for exp_ids in expressions:
                for tar_ids in targets:
                    feats = extract_feats_exp_tar(exp_ids, tar_ids, knaf_obj,
                                                  use_dependencies=use_dependencies,
                                                  use_tokens=use_tokens,
                                                  use_lemmas=use_lemmas)
                    feat_index.encode_example_for_classification(feats, examples_file, my_class='0')
        finally:
            examples_file.close()

        model_file = config_manager.get_filename_model_exp_tar()
        results = run_svm_classify(examples_file.name, model_file)
    finally:
        # delete=False means nobody else removes the file; do it here so it
        # is not left behind when feature extraction or classification fails.
        os.remove(examples_file.name)

    assigned_targets = []
    idx = 0  # runs from 0 to num_expressions * num_targets over `results`
    for _exp in expressions:
        # Start from "no candidate yet" instead of a magic low score, so the
        # first target is always a valid choice however low its value is.
        best_idx = None
        best_value = None
        for num_tar in range(len(targets)):
            value = results[idx]
            idx += 1
            if best_value is None or value > best_value:
                best_value = value
                best_idx = num_tar
        assigned_targets.append(targets[best_idx])
    return assigned_targets
76
+
77
def link_exp_tar_all(expressions,targets, knaf_obj,threshold, use_dependencies=True,use_tokens=True, use_lemmas=True):
    """Pair every expression with ALL targets whose score reaches the threshold.

    Parameters:
      expressions: iterable of (expression_ids, expression_type) pairs.
      targets: sequence of target id lists.
      threshold: minimum value returned by run_svm_classify for a pair to be
        kept. None selects -0.75, the value this function previously forced
        regardless of the argument.

    Returns a list of (expression_ids, expression_type, target_ids) tuples.
    An expression for which no target reaches the threshold (or when there
    are no targets at all) yields a single tuple with [] as target_ids.
    """
    if len(targets) == 0:
        return [(exp_ids, exp_type, []) for exp_ids, exp_type in expressions]

    # Honour the caller's threshold; it used to be silently overwritten.
    if threshold is None:
        threshold = -0.75
    threshold = float(threshold)

    feat_index_filename = config_manager.get_index_features_exp_tar_filename()
    feat_index = Cfeature_index()
    feat_index.load_from_file(feat_index_filename)

    examples_file = NamedTemporaryFile(delete=False)
    try:
        try:
            # One example per pair, in expression-major order.
            for exp_ids, exp_type in expressions:
                for tar_ids in targets:
                    feats = extract_feats_exp_tar(exp_ids, tar_ids, knaf_obj,
                                                  use_dependencies=use_dependencies,
                                                  use_tokens=use_tokens,
                                                  use_lemmas=use_lemmas)
                    feat_index.encode_example_for_classification(feats, examples_file, my_class='0')
        finally:
            examples_file.close()

        model_file = config_manager.get_filename_model_exp_tar()
        results = run_svm_classify(examples_file.name, model_file)
    finally:
        # The temporary file was created with delete=False; remove it even
        # when feature extraction or classification fails.
        os.remove(examples_file.name)

    pairs = []
    idx = 0  # position in `results`, same order as the examples written above
    for exp, exp_type in expressions:
        at_least_one = False
        for tar in targets:
            value = results[idx]
            idx += 1
            if value >= threshold:
                pairs.append((exp, exp_type, tar))
                at_least_one = True
        if not at_least_one:
            pairs.append((exp, exp_type, []))
    return pairs
114
+
115
def link_exp_hol(expressions,holders, knaf_obj,threshold_hol,use_dependencies=True,use_tokens=True,use_lemmas=True):
    """Assign to every expression its best-scoring holder, if good enough.

    Returns a list parallel to ``expressions``. For every expression the
    holder whose expression/holder pair got the highest value from
    run_svm_classify is selected; it is kept only when that value is
    >= threshold_hol, otherwise [] is stored. With no holders at all, every
    expression gets [].

    One example per (expression, holder) pair is written to a temporary file
    in expression-major order:
        exp1 --> hol1, exp1 --> hol2, ..., exp2 --> hol1, exp2 --> hol2, ...
    and the results are read back in that same order.
    """
    if len(holders) == 0:
        return [[] for _ in expressions]

    feat_index_filename = config_manager.get_index_features_exp_hol_filename()
    feat_index = Cfeature_index()
    feat_index.load_from_file(feat_index_filename)

    examples_file = NamedTemporaryFile(delete=False)
    try:
        try:
            for exp_ids in expressions:
                for hol_ids in holders:
                    feats = extract_feats_exp_hol(exp_ids, hol_ids, knaf_obj,
                                                  use_dependencies=use_dependencies,
                                                  use_tokens=use_tokens,
                                                  use_lemmas=use_lemmas)
                    feat_index.encode_example_for_classification(feats, examples_file, my_class='0')
        finally:
            examples_file.close()

        model_file = config_manager.get_filename_model_exp_hol()
        results = run_svm_classify(examples_file.name, model_file)
    finally:
        # The temporary file was created with delete=False; remove it even
        # when feature extraction or classification fails.
        os.remove(examples_file.name)

    assigned_holders = []
    idx = 0  # runs from 0 to num_expressions * num_holders over `results`
    for _exp in expressions:
        # Track the real maximum. The previous start value of -1 meant that,
        # when every score was <= -1, index -1 (the LAST holder) was reported
        # with a made-up score of -1.
        best_idx = None
        best_value = None
        for num_hol in range(len(holders)):
            value = results[idx]
            idx += 1
            if best_value is None or value > best_value:
                best_value = value
                best_idx = num_hol

        if best_value >= threshold_hol:
            assigned_holders.append(holders[best_idx])
        else:
            assigned_holders.append([])
    return assigned_holders
169
+
170
+
171
+
172
def run_svm_classify(example_file,model_file):
    """Run the SVMlight classify binary and return its predictions.

    The binary configured in config_manager is called as
        svm_classify example_file model_file output_file
    with a temporary output file. Every line of that output file is parsed
    as a float; the floats are returned as a list in file order.

    The process is terminated with sys.exit(-1) when the binary does not
    exist or when it writes anything to its standard error.
    """
    # `sys` is not imported explicitly at the top of this module, so import
    # it here instead of relying on a star-import to provide it.
    import sys

    svmlight = config_manager.get_svm_classify_binary()
    if not os.path.exists(svmlight):
        sys.stderr.write('SVMlight learn not found on %s\n' % svmlight)
        sys.stderr.write('Check the config filename and make sure the path is correctly set\n')
        sys.stderr.write('[svmlight]\npath_to_binary_learn = yourpathtolocalsvmlightlearn\n')
        sys.exit(-1)

    # Only the name of the temporary file is needed: the binary writes to it.
    tempout = NamedTemporaryFile(delete=False)
    tempout.close()
    try:
        # Argument list without a shell: paths with spaces are passed intact.
        cmd = [svmlight, example_file, model_file, tempout.name]
        svm_process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE,
                            universal_newlines=True)
        # communicate() drains both pipes while waiting; wait() followed by
        # read() can block forever once a pipe buffer is full.
        _, str_err = svm_process.communicate()
        if len(str_err) != 0:
            sys.stderr.write('SVM light classify error ' + str_err + '\n')
            sys.exit(-1)

        with open(tempout.name, 'r') as fout:
            results = [float(line.strip()) for line in fout]
    finally:
        # Also reached on sys.exit(), so the output file is never left behind.
        os.remove(tempout.name)
    return results
202
+
203
+
204
+
205
def link_entities_svm(expressions, targets, holders, knaf_obj,this_config_manager):
    """Link expressions to targets and holders using the SVM classifiers.

    Parameters:
      expressions, targets, holders: iterables of (token_ids, type) pairs.
        Token ids are converted with knaf_obj.map_tokens_to_terms; the type
        is kept only for expressions.
      knaf_obj: parsed document object, passed on to the link_* functions.
      this_config_manager: configuration object; installed as the
        module-level ``config_manager`` for the duration of the call.

    Returns a list of (expression_type, expression_term_ids, target_term_ids,
    holder_term_ids) tuples, one per pair produced by link_exp_tar_all.
    """
    global config_manager
    config_manager = this_config_manager
    try:
        all_exp_ids = [(knaf_obj.map_tokens_to_terms(exp_ids), exp_type)
                       for exp_ids, exp_type in expressions]
        all_tar_ids = [knaf_obj.map_tokens_to_terms(tar_ids)
                       for tar_ids, _tar_type in targets]
        all_hol_ids = [knaf_obj.map_tokens_to_terms(hol_ids)
                       for hol_ids, _hol_type in holders]

        svm_thres_exp_tar = config_manager.get_svm_threshold_exp_tar()
        use_deps_now = config_manager.get_use_dependencies()
        use_tokens_lemmas = config_manager.get_use_training_lexicons()
        pairs_exp_tar = link_exp_tar_all(all_exp_ids, all_tar_ids, knaf_obj, svm_thres_exp_tar,
                                         use_dependencies=use_deps_now,
                                         use_tokens=use_tokens_lemmas,
                                         use_lemmas=use_tokens_lemmas)

        # One entry per (expression, target) pair, so an expression linked to
        # several targets appears several times and gets a holder each time.
        sets_exp_ids = [exp_ids for exp_ids, _exp_type, _tar_ids in pairs_exp_tar]

        # The holders are calculated in the old fashion: best holder only.
        svm_thres_exp_hol = config_manager.get_svm_threshold_exp_hol()
        assigned_holders = link_exp_hol(sets_exp_ids, all_hol_ids, knaf_obj, svm_thres_exp_hol,
                                        use_dependencies=use_deps_now,
                                        use_tokens=use_tokens_lemmas,
                                        use_lemmas=use_tokens_lemmas)

        results = []
        for index, (exp_ids, exp_type, tar_ids) in enumerate(pairs_exp_tar):
            results.append((exp_type, exp_ids, tar_ids, assigned_holders[index]))
        return results
    finally:
        # Reset the module-level configuration on every exit path, so a
        # failed call does not leave a stale configuration behind.
        config_manager = None
249
+
250
+
@@ -0,0 +1,566 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import os
5
+ import logging
6
+ import shutil
7
+ import glob
8
+ from subprocess import Popen, PIPE
9
+ import cPickle
10
+ import time
11
+ import csv
12
+ from collections import defaultdict
13
+
14
+
15
+ from scripts import lexicons as lexicons_manager
16
+ from scripts.config_manager import Cconfig_manager, internal_config_filename
17
+ from scripts.extract_features import extract_features_from_kaf_naf_file
18
+ from scripts.crfutils import extract_features_to_crf
19
+ from scripts.extract_feats_relations import create_rel_exp_tar_training, create_rel_exp_hol_training
20
+ from VUA_pylib.io import Cfeature_file, Cfeature_index
21
+ from KafNafParserPy import KafNafParser
22
+
23
+
24
+
25
# Global configuration manager shared by all the training steps of this script
my_config_manager = Cconfig_manager()

# Folder that contains this script (realpath resolves symlinks); relative paths
# read from the configuration are joined to it
__this_folder = os.path.dirname(os.path.realpath(__file__))
29
+
30
+
31
def save_obj_to_file(obj,filename):
    """
    Serialize obj with cPickle into filename, opened in binary write mode.

    The file is closed even when the serialization fails.
    """
    with open(filename,'wb') as fic:
        cPickle.dump(obj,fic)
35
+
36
def create_folders(config_filename):
    """
    Load the configuration and create a fresh output folder structure.

    The global configuration manager is initialised from config_filename.
    An existing output folder is removed completely before being recreated,
    the configuration file is copied into it and all the working subfolders
    (features, CRF data, datasets, models, relation classifier, templates and
    lexicons) are created.
    """
    def make_folder(folder):
        # Log only once the folder really exists
        os.mkdir(folder)
        logging.debug('Created '+folder)

    # Read configuration from the config file
    my_config_manager.set_current_folder(__this_folder)
    my_config_manager.set_config(config_filename)

    out_folder = my_config_manager.get_output_folder()
    logging.debug('Complete path to output folder: '+out_folder)

    # Remove the folder if it exists, so every training starts from scratch
    if os.path.exists(out_folder):
        shutil.rmtree(out_folder)
        logging.debug('Output folder exists and was removed')
    make_folder(out_folder)

    # Copy the config filename to out_folder/<internal_config_filename>
    my_cfg = os.path.join(out_folder,internal_config_filename)
    shutil.copyfile(config_filename,my_cfg)

    make_folder(my_config_manager.get_feature_folder_name())
    make_folder(my_config_manager.get_crf_expression_folder())
    make_folder(my_config_manager.get_crf_target_folder())
    make_folder(my_config_manager.get_crf_holder_folder())
    make_folder(my_config_manager.get_training_datasets_folder())
    make_folder(my_config_manager.get_model_foldername())
    make_folder(my_config_manager.get_folder_relation_classifier())

    # Templates folder: it has to exist before the template files are copied
    make_folder(my_config_manager.get_feature_template_folder_name())
    my_config_manager.copy_feature_templates()

    # Folder for lexicons
    make_folder(my_config_manager.get_lexicons_folder())
100
+
101
def load_training_files():
    """
    Return the list of training files named in the configured list file.

    The list file contains one path per line. A relative list filename is
    resolved against the folder of this script. Blank lines are ignored so a
    trailing newline does not produce an empty filename.

    The process exits with status -1 when the list file cannot be read.
    """
    file_training_files_cfg = my_config_manager.get_file_training_list()
    if os.path.isabs(file_training_files_cfg):
        path_to_file = file_training_files_cfg
    else:
        path_to_file = os.path.join(__this_folder,file_training_files_cfg)
    logging.debug('Reading training files from '+path_to_file)

    train_files = []
    try:
        with open(path_to_file,'r') as fic:
            for line in fic:
                train_file = line.strip()
                if train_file:
                    train_files.append(train_file)
    except (IOError, OSError) as e:
        print>>sys.stderr,'Exception reading '+path_to_file,' -->'+str(e)
        sys.exit(-1)
    return train_files
119
+
120
+
121
+
122
def extract_all_features():
    """
    Extract the features and the relation training data from all training files.

    For every training file:
      * a feature file 'file#<n>#<basename>.feat' is written in the feature folder
      * the expression-target and expression-holder relation instances are
        appended to the two relation training files, after a '#<filename>' line

    Optionally the domain lexicons are created/copied and loaded first (when
    the configuration enables the training lexicons). Finally the labels of
    the features are saved in the feature description file.

    The process exits with status -1 when there are no training files.
    """
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')
    if not train_files:
        # Without files there are no feature labels to describe and no language to guess
        print>>sys.stderr,'No training files found, features can not be extracted'
        sys.exit(-1)

    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = None
    # Kept to restore the standard streams at the end, as the original code does
    # (presumably the feature extractor may redirect them -- TODO confirm)
    my_stdout, my_stderr = sys.stdout,sys.stderr

    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    lang = None
    if my_config_manager.get_use_training_lexicons():
        # Guess the language from the first training file
        lang = KafNafParser(train_files[0]).get_language()

        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()

        this_exp_lex = my_config_manager.get_use_this_expression_lexicon()
        this_tar_lex = my_config_manager.get_use_this_target_lexicon()

        if this_exp_lex is None or this_tar_lex is None:
            # NOTE(review): hard-coded path of a developer machine, it should come from the configuration
            path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
            training_filename = my_config_manager.get_file_training_list()
            lexicons_manager.create_lexicons(path_to_lex_creator,training_filename,expression_lexicon_filename,target_lexicon_filename)

        # Once created, a lexicon given explicitly in the configuration replaces the generated one
        if this_exp_lex is not None:
            if "$LANG" in this_exp_lex:
                this_exp_lex = this_exp_lex.replace('$LANG',lang)
            shutil.copy(this_exp_lex, expression_lexicon_filename)

        if this_tar_lex is not None:
            if "$LANG" in this_tar_lex:
                this_tar_lex = this_tar_lex.replace('$LANG',lang)
            shutil.copy(this_tar_lex,target_lexicon_filename)

        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon = lexicons_manager.load_lexicon(target_lexicon_filename)

    # NOTE(review): the propagation lexicon is resolved regardless of the training
    # lexicons, so the name is always bound when passed to the extractor below.
    # The language is guessed on demand when it was not needed before.
    this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
    if this_propagation_lexicon is not None and "$LANG" in this_propagation_lexicon:
        if lang is None:
            lang = KafNafParser(train_files[0]).get_language()
        this_propagation_lexicon = this_propagation_lexicon.replace('$LANG',lang)
    print>>sys.stderr,'Propagated lexicon',this_propagation_lexicon

    ## Configuration for the relation classifier
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()
    polarities_found_and_skipped = []

    exp_tar_rel_fic = exp_hol_rel_fic = None
    try:
        exp_tar_rel_fic = open(my_config_manager.get_relation_exp_tar_training_filename(),'w')
        exp_hol_rel_fic = open(my_config_manager.get_relation_exp_hol_training_filename(),'w')

        for num_file, train_file in enumerate(train_files):
            base_name = os.path.basename(train_file)
            logging.debug('Extracting features '+base_name)
            out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
            err_file = out_file+'.log'

            kaf_naf_obj = KafNafParser(train_file)

            # Creates the output file and returns the labels of the features,
            # the separator used and the polarities that were skipped
            label_feats, _separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file,
                                                                                            accepted_opinions=accepted_opinions,
                                                                                            exp_lex=expressions_lexicon,
                                                                                            tar_lex=targets_lexicon,
                                                                                            propagation_lex_filename=this_propagation_lexicon)
            polarities_found_and_skipped.extend(pols_skipped_this)

            exp_tar_rel_fic.write('#'+train_file+'\n')
            exp_hol_rel_fic.write('#'+train_file+'\n')
            # Set valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
            create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
            create_rel_exp_hol_training(kaf_naf_obj ,output=exp_hol_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
    finally:
        # Close the relation files and re-set stdout and stderr also when something failed
        for fic in (exp_tar_rel_fic, exp_hol_rel_fic):
            if fic is not None:
                fic.close()
        sys.stdout,sys.stderr = my_stdout, my_stderr

    ## Show just for information how many instances have been skipped because
    ## the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)

    # Save the labels of the features in a file (taken from the last file processed)
    filename = my_config_manager.get_feature_desc_filename()
    with open(filename,'w') as fic:
        fic.write(' '.join(label_feats)+'\n')
    logging.debug('Description of features --> '+filename)
245
+
246
+
247
+
248
def _train_crf_classifier(role, crf_folder, templates, possible_classes, dataset_filename, model_file, failure_msg, training_msg):
    """
    Build the CRF training data for one kind of entity and train its model.

    role              label used in the log message of the training data (e.g. 'op.exp')
    crf_folder        folder where one CRF file per feature file is written
    templates         feature templates passed to extract_features_to_crf
    possible_classes  classes kept as target class; for other tokens
                      extract_features_to_crf sets O (out of sequence)
    dataset_filename  file where all the CRF files are concatenated
    model_file        file where crfsuite stores the model
    failure_msg       message printed when one feature file can not be converted
    training_msg      message logged just before the training starts
    """
    # Load the feature description
    with open(my_config_manager.get_feature_desc_filename()) as fic:
        fields = fic.read().strip()
    separator = '\t'
    feat_folder = my_config_manager.get_feature_folder_name()

    # Create all the CRF files calling to crfutils.extract_features_to_crf
    crf_out_files = []
    for feat_file in glob.glob(feat_folder+'/*.feat'):
        base_name = os.path.basename(feat_file)[:-5]    # Removes the '.feat' extension
        out_crf = os.path.join(crf_folder,base_name)
        logging.debug('Creating crf file in --> '+out_crf)
        try:
            extract_features_to_crf(feat_file,out_crf,fields,separator,templates,possible_classes)
        except Exception as e:
            # Best effort: a file that can not be converted is reported and left out of the training data
            print>>sys.stderr,failure_msg,feat_file,'-->',str(e)
        else:
            crf_out_files.append(out_crf)

    # Concatenate all the crf files just created
    with open(dataset_filename,'w') as out_f:
        for crf_file in crf_out_files:
            with open(crf_file) as f:
                out_f.write(f.read())
    logging.debug('Created training data for crf, '+role+' '+dataset_filename)

    # Train the model
    crf_params = my_config_manager.get_crfsuite_params()
    logging.debug(training_msg)
    run_crfsuite(crf_params,dataset_filename,model_file)


def train_expression_classifier():
    """Create the CRF training data for opinion expressions and train the model."""
    _train_crf_classifier(role='op.exp',
                          crf_folder=my_config_manager.get_crf_expression_folder(),
                          templates=my_config_manager.get_templates_expr(),
                          possible_classes=my_config_manager.get_possible_expression_values(),
                          dataset_filename=my_config_manager.get_training_dataset_exp(),
                          model_file=my_config_manager.get_filename_model_expression(),
                          failure_msg='Failed conversion to tab-expression -> CRF: ',
                          training_msg='Training the classifier for opinion expressions (could take a while)')


def train_target_classifier():
    """Create the CRF training data for opinion targets and train the model."""
    _train_crf_classifier(role='op.target',
                          crf_folder=my_config_manager.get_crf_target_folder(),
                          templates=my_config_manager.get_templates_target(),
                          possible_classes=['target'],
                          dataset_filename=my_config_manager.get_training_dataset_target(),
                          model_file=my_config_manager.get_filename_model_target(),
                          failure_msg='Failed conversion to tab-target-> CRF: ',
                          training_msg='Training the classifier for opinion target (could take a while)')


def train_holder_classifier():
    """Create the CRF training data for opinion holders and train the model."""
    _train_crf_classifier(role='op.holder',
                          crf_folder=my_config_manager.get_crf_holder_folder(),
                          templates=my_config_manager.get_templates_holder(),
                          possible_classes=['holder'],
                          dataset_filename=my_config_manager.get_training_dataset_holder(),
                          model_file=my_config_manager.get_filename_model_holder(),
                          failure_msg='Failed conversion to tab-holder -> CRF: ',
                          training_msg='Training the classifier for opinion holder (could take a while)')
391
+
392
+
393
def run_crfsuite(crf_params,input_file,model_file):
    """
    Train a CRF model calling the crfsuite binary given in the configuration.

    crf_params  string with the command line options for 'crfsuite learn'
    input_file  training data in crfsuite format
    model_file  file where the model is stored; the standard output of
                crfsuite is saved in model_file + '.log'

    The process exits with status -1 when the binary is not found or when
    crfsuite writes anything on its standard error.
    """
    import shlex

    crfsuite = my_config_manager.get_crfsuite_binary()
    if not os.path.exists(crfsuite):
        print>>sys.stderr,'CRFsuite not found on',crfsuite
        print>>sys.stderr,'Check the config filename and make sure the path is correctly set'
        print>>sys.stderr,'[crfsuite]\npath_to_binary = yourpathtolocalcrfsuite'
        sys.exit(-1)

    # The command is a list of arguments (no shell), so paths with blanks are safe
    cmd = [crfsuite, 'learn']
    if crf_params:
        cmd.extend(shlex.split(crf_params))
    cmd.extend(['-m', model_file, input_file])

    err_file = model_file+'.log'
    with open(err_file,'w') as err_fic:
        crf_process = Popen(cmd, stdin=PIPE, stdout=err_fic, stderr=PIPE)
        # communicate() reads stderr while waiting: wait() + read() can block
        # forever when the process fills the pipe
        _, str_err = crf_process.communicate()

    if len(str_err) != 0:
        print>>sys.stderr,'CRF error!!: '+str_err
        sys.exit(-1)
    logging.debug('Crfsuite log '+err_file)
417
+
418
+
419
+
420
+
421
+ ############################################
422
+ ################ RELATION TRAINING #########
423
+ ###########################################
424
+
425
def train_classifier_relation_exp_tar():
    """
    Train the SVMlight model for the relation (expression, target).

    The human readable training file is encoded into an index based file for
    svm-light, the feature index is saved and svm learn is run on the
    encoded examples.
    """
    # Human readable training instances
    readable_training = Cfeature_file(my_config_manager.get_relation_exp_tar_training_filename())

    # Encode them into the index based format of svm-light
    index = Cfeature_index()
    with open(my_config_manager.get_rel_exp_tar_training_idx_filename(),'w') as encoded_out:
        index.encode_feature_file_to_svm(readable_training,encoded_out)

    # The feature index is stored next to the model data
    index.save_to_file(my_config_manager.get_index_features_exp_tar_filename())

    # Train the model on the encoded examples
    svm_examples = my_config_manager.get_rel_exp_tar_training_idx_filename()
    model_path = my_config_manager.get_filename_model_exp_tar()
    learner_options = my_config_manager.get_svm_params()
    logging.debug('Training SVMlight classifier for RELATION(expression,target) in '+model_path+ '(could take a while)')
    run_svmlight_learn(svm_examples,model_path,learner_options)
452
+ ###########################################
453
+
454
+
455
+
456
+
457
def train_classifier_relation_exp_hol():
    """
    Train the SVMlight model for the relation (expression, holder).

    The human readable training file is encoded into an index based file for
    svm-light, the feature index is saved and svm learn is run on the
    encoded examples.
    """
    # Human readable training instances
    readable_training = Cfeature_file(my_config_manager.get_relation_exp_hol_training_filename())

    # Encode them into the index based format of svm-light
    index = Cfeature_index()
    with open(my_config_manager.get_rel_exp_hol_training_idx_filename(),'w') as encoded_out:
        index.encode_feature_file_to_svm(readable_training,encoded_out)

    # The feature index is stored next to the model data
    index.save_to_file(my_config_manager.get_index_features_exp_hol_filename())

    # Train the model on the encoded examples
    svm_examples = my_config_manager.get_rel_exp_hol_training_idx_filename()
    model_path = my_config_manager.get_filename_model_exp_hol()
    learner_options = my_config_manager.get_svm_params()
    logging.debug('Training SVMlight classifier for RELATION(expression,holder) in '+model_path+ '(could take a while)')
    run_svmlight_learn(svm_examples,model_path,learner_options)
484
+ ###########################################
485
+
486
+
487
def run_svmlight_learn(example_file,model_file,params):
    """
    Train an SVMlight model calling the svm learn binary given in the configuration.

    example_file  training examples in svm-light format
    model_file    file where the model is stored; the standard output of the
                  learner is saved in model_file + '.log'
    params        string with the command line options for the learner

    The process exits with status -1 when the binary is not found or when the
    learner writes anything on its standard error.
    """
    import shlex

    svmlight = my_config_manager.get_svm_learn_binary()
    if not os.path.exists(svmlight):
        print>>sys.stderr,'SVMlight learn not found on',svmlight
        print>>sys.stderr,'Check the config filename and make sure the path is correctly set'
        print>>sys.stderr,'[svmlight]\npath_to_binary_learn = yourpathtolocalsvmlightlearn'
        sys.exit(-1)

    # The command is a list of arguments (no shell), so paths with blanks are safe
    cmd = [svmlight]
    if params:
        cmd.extend(shlex.split(params))
    cmd.extend([example_file, model_file])

    err_file = model_file+'.log'
    with open(err_file,'w') as err_fic:
        svm_process = Popen(cmd, stdin=PIPE, stdout=err_fic, stderr=PIPE)
        # communicate() reads stderr while waiting: wait() + read() can block
        # forever when the process fills the pipe
        _, str_err = svm_process.communicate()

    if len(str_err) != 0:
        print>>sys.stderr,'SVM light error '+str_err
        sys.exit(-1)
    logging.debug('SVMlight learn log'+err_file)
510
+
511
def write_to_flag(msg,openas='a'):
    """
    Write msg followed by ' --> ' and the current local time in the flag file.

    openas is the mode used to open the flag file: 'a' (default) appends the
    line, 'w' starts a new file.
    """
    with open(my_config_manager.get_flag_filename(),openas) as flag_file:
        stamp = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        flag_file.write(''.join((msg, ' --> ', stamp, '\n')))
516
+
517
def train_all(file_config):
    """
    Run the complete training: folders, features, CRF and relation classifiers.

    file_config is the path to the configuration file. The progress is
    recorded in the flag file: a 'START <step>' line before every step and a
    'DONE <step>' line after it.
    """
    # Create the output folder from scratch and start a new flag file
    create_folders(file_config)
    write_to_flag('Beginning\n','w')

    # Steps in execution order; the feature extraction creates the '*.feat' files
    # and the relation training files used by the classifiers
    steps = (
        ('extract features', extract_all_features),
        ('training expression classifier', train_expression_classifier),
        ('training target classifier', train_target_classifier),
        ('training holder classifier', train_holder_classifier),
        ('training relation expression - target classifier', train_classifier_relation_exp_tar),
        ('training relation expression - holder classifier', train_classifier_relation_exp_hol),
    )
    for description, run_step in steps:
        write_to_flag('START '+description)
        run_step()
        write_to_flag('DONE '+description+'\n')

    logging.debug('ALL TRAINING DONE')
    write_to_flag('FINISHED ')
559
+
560
+
561
if __name__ == '__main__':
    # Script entry point: the only argument is the path to the configuration file
    logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s\n %(message)s', level=logging.DEBUG)
    if len(sys.argv) < 2:
        print>>sys.stderr,'Usage: '+sys.argv[0]+' config_file'
        sys.exit(-1)
    file_config = sys.argv[1]
    train_all(file_config)

    sys.exit(0)