opener-opinion-detector-base 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,226 @@
1
+ #Opinion miner deluxe#
2
+
3
+ ##Introduction##
4
+
5
+
6
+ Opinion miner based on machine learning that can be trained using a list of
7
+ KAF/NAF files. It is important to notice that the opinion miner module will not call
8
+ to any external module to obtain features. It will read all the features from the input KAF/NAF file,
9
+ so you have to make sure that your input file contains all the required information in advance (tokens,
10
+ terms, polarities, constituents, entities, dependencies...)
11
+
12
+ The task is divided into 2 steps
13
+ * Detection of opinion entities (holder, target and expression): using
14
+ Conditional Random Fields
15
+ * Opinion entity linking (expression<-target and expression<-holder): using
16
+ binary Support Vector Machines
17
+
18
+ In the next subsections, a brief explanation of the 2 steps is given.
19
+
20
+ ###Opinion Entity detection###
21
+
22
+ The first step when extracting opinions from text is to determine which portions of text represent the different opinion entities:
23
+
24
+ - Opinion expressions: very nice, really ugly ...
25
+ - Opinion targets: the hotel, the rooms, the staff ...
26
+ - Opinion holders: I, our family, the manager ...
27
+
28
+ In order to do this, three different Conditional Random Fields (CRF) classifiers have been trained using by default this set of features: tokens,
29
+ lemmas, part-of-speech tags, constituent labels and polarity of words and entities. These classifiers detect portions of text representing different opinion
30
+ entities.
31
+
32
+
33
+ ###Opinion Entity linking###
34
+
35
+ This step takes as input the opinion entities detected in the previous step, and links them to create the final opinions <expression/target/holder>.
36
+ In this case we have trained two binary Support Vector Machines (SVM), one that indicates the degree of association between a given target and a given expression,
37
+ and another one that gives the degree of linkage between a holder and an opinion expression. So given a list of expressions, a list of targets and holders detected
38
+ by the CRF classifiers, the SVM models try to select the best candidate from the target list for each expression, and the best holder from the holder list, to create
39
+ the final opinion triple.
40
+
41
+ Considering a certain opinion expression and a target, these are the features by default used to represent this data for the SVM engine:
42
+
43
+ 1) Textual features: tokens and lemmas of the expression and the target
44
+ 2) Distance features: features representing the relative distance of both elements in the text (normalized to a discrete list of possible values: far/medium/close for instance),
45
+ and if both elements are in the same sentence or not
46
+ 3) Dependency features: to indicate the dependency relations between the two elements in the text (dependency path, and dependencies relations with the root of the sentence)
47
+
48
+ ##Requirements##
49
+ This is the list of required libraries:
50
+ + SVMLight: library for Support Vector Machines (http://svmlight.joachims.org/)
51
+ + CRFsuite: library for Conditional Random Fields (http://www.chokkan.org/software/crfsuite/)
52
+ + KafNafParserPy: library for parsing KAF or NAF files (https://github.com/cltl/KafNafParserPy)
53
+ + VUA_pylib: library with functions used by the system (https://github.com/cltl/VUA_pylib)
54
+
55
+ To install SVMLight and CRFsuite please visit the corresponding webpages and follow the instructions given. For the last two python libraries,
56
+ you will only need to clone the repositories and make sure that both are in the python path so Python is able to find them (the easiest way is
57
+ to modify the variable PYTHONPATH to include the path to these libraries if you don't want to modify your system files).
58
+
59
+ ##Setting the opinion miner##
60
+
61
+ You will need first to install all the requirements on your local machine and then create a configuration file like this one:
62
+
63
+ ```shell
64
+ [general]
65
+ output_folder = feat
66
+
67
+ [crfsuite]
68
+ path_to_binary = crfsuite
69
+
70
+ [svmlight]
71
+ path_to_binary_learn = /home/izquierdo/tools/svm_light/svm_learn
72
+ path_to_binary_classify = /home/izquierdo/tools/svm_light/svm_classify
73
+ ````
74
+
75
+ The `output_folder` variable is the folder where the trained models have been stored. The rest of parameters are the local paths to your installation
76
+ of CRFsuite and SVMLight. This file will be passed to the main script to detect opinions in a new KAF/NAF file:
77
+
78
+ ````shell
79
+ cat my_file.kaf | classify_kaf_naf_file.py your_config_file.cfg
80
+ ````
81
+
82
+ ##Training your own models##
83
+
84
+ You will need first to install all the requirements given and then follow these steps:
85
+
86
+ 1) Prepare the KAF/NAF files that will be used for training, with as many layers as possible (for the default configuration, preferably KAF
87
+ files with tokens, terms, polarities, entities, aspects, constituents and dependencies). A file with the complete path to each training KAF
88
+ file needs to be created (my_list_kafs.txt, for instance)
89
+
90
+ 2) Create the feature template files or modify the existing ones on the folder `my_templates`
91
+
92
+ 3) Prepare a configuration file (or modify the existing one my_training.cfg) like this one:
93
+
94
+ ````shell
95
+ [general]
96
+ output_folder = feat
97
+ filename_training_list = /home/izquierdo/data/MPQA/13jan2014/list.25
98
+
99
+ [feature_templates]
100
+ expression = my_templates/templates_exp.txt
101
+ holder = my_templates/templates_holder.txt
102
+ target = my_templates/templates_target.txt
103
+
104
+ [valid_opinions]
105
+ negative = sentiment-neg
106
+ positive = sentiment-pos
107
+
108
+ [crfsuite]
109
+ path_to_binary = /home/izquierdo/bin/crfsuite
110
+ parameters = -a lbfgs
111
+
112
+ [svmlight]
113
+ path_to_binary_learn = /home/izquierdo/tools/svm_light/svm_learn
114
+ path_to_binary_classify = /home/izquierdo/tools/svm_light/svm_classify
115
+ parameters = -c 0.1
116
+ ````
117
+
118
+ The `output_folder` variable is where you want to store your new models (will be used later for tagging new files), and the `filename_training_list` is the file
119
+ you created with the paths to all your training KAF/NAF files (my_list_kafs.txt). The section feature_templates contains pointers to the feature template files
120
+ you want to use. The section valid_opinions allows you to specify which opinions from the training KAF files you want to use, and a mapping from all the labels
121
+ used in the KAF files. So with this configuration:
122
+
123
+ ````shell
124
+ [valid_opinions]
125
+ negative = sentiment-neg
126
+ positive = sentiment-pos
127
+ ````
128
+
129
+ the opinion expressions classifier will be trained for two classes (negative and positive), and for instance all the opinion expressions with the label sentiment-neg in
130
+ your KAF files will be used as training instance for the negative classifier. This allows you to use different sets of labels for the opinion expressions, for instance
131
+ you could use KAF files with different labels for the negative expressions, like sentiment-low-negative, sentiment-medium-negative and sentiment-high-negative. To train the
132
+ system considering all these instances as training material for the negative classifier you will need to specify:
133
+
134
+ ````shell
135
+ [valid_opinions]
136
+ negative = sentiment-low-negative;sentiment-medium-negative;sentiment-high-negative
137
+ positive = sentiment-pos
138
+ ````
139
+
140
+ The rest of the sections in the config file (crfsuite and svmlight) indicate the paths to your local installation of these libraries and the parameters accepted
141
+ by these (check the webpage of the libraries for information about these parameters)
142
+
143
+ 4) Once the previous step is completed, the training can be performed by calling the script train.py:
144
+
145
+ ````shell
146
+ train.py my_modified_train.cfg
147
+ ````
148
+
149
+ This will use the config file (my_modified_train.cfg) to train the system and will store all the models and different intermediate files in the folder you set.
150
+
151
+
152
+ ##How to add new features##
153
+ This section explains how to add new features to the system
154
+
155
+ ###Adding new features to the opinion entity detection (CRF)###
156
+
157
+ 1) Modify the function that generates the features `scripts/extract_features.py-> extract_features_from_kaf_naf_file(...)`
158
+
159
+ 1.1) Modify the variable `features`, which is a list of features for each token
160
+
161
+ 1.2) Modify the variable labels, which gives a name to each feature (lengths must match)
162
+
163
+ 2) With the previous step you can extract the features for a single token only. You need to specify which features you want to use from the context,
164
+ and if you want to use bigrams/trigrams. In order to do this 3 different features templates have to be filled. These files are plain text files, and
165
+ the default files used can be found in the subfolder `my_templates`. A different feature template can be specified for each CRF classifier. The format
166
+ of these files is a set of lines like `1 token -2 -1 0`, where:
167
+
168
+ - The first 1 is the length of the template, in this case unigram
169
+ - Then 'n' labels that will be used (must match with the labels generated by the feature extractor)
170
+ - Then the positions, in case of 2grams 3grams each position must be n/m/p
171
+
172
+ An example with bigrams: `2 token token -2/-1 -1/0 0/1 1/2` which would generate these templates:
173
+
174
+ ````shell
175
+ (('token',-2),('token',-1))
176
+ (('token',-1),('token',0))
177
+ (('token',0),('token',1))
+ (('token',1),('token',2))
178
+ ````
179
+
180
+ One more example, with trigrams: `3 token lemma pos -2/0/4 9/8/3`.
181
+
182
+ ````shell
183
+ (('token',-2),('lemma',0),('pos',4))
184
+ (('token',9),('lemma',8),('pos',3))
185
+ ````
186
+
187
+
188
+
189
+
190
+ ###Adding new features to the opinion entity linking (SVM)###
191
+
192
+ You will need to modify the script `scripts/extract_feats_relations.py`. There is one function to extract the features from an opinion
193
+ expression and a target, for the SVM model expression - target, and another function with the same purpose for the SVM model expression-holder.
194
+ These functions are:
195
+
196
+ ````shell
197
+ def extract_feats_exp_tar(exp_ids,tar_ids,knaf_obj):
198
+ ...
199
+
200
+ def extract_feats_exp_hol(exp_ids,hol_ids,knaf_obj):
201
+ ...
202
+ ````
203
+
204
+ Both take as input a list of term identifiers for the expression and for the target/holder, and a kaf/naf tree object representing the input file,
205
+ so there is no need to parse it again. These functions return a list of features for the expression, a list of features for the holder/target and two
206
+ extra lists of features (for the expression and for the target/holder), that will be used later to establish features that represent a relation (like
207
+ the dependencies or whether both are in the same sentence or not.) In order to do this, there are two functions that take as input two sets of features
208
+ and generate these relation features:
209
+
210
+ ````shell
211
+ def get_extra_feats_exp_tar(extra_e, extra_t):
212
+ ...
213
+
214
+ def get_extra_feats_exp_hol(extra_e, extra_h):
215
+ ...
216
+ ````
217
+
218
+ The main reason for this is that the features for each expression, target and holder are extracted only once, but later for instance each target will act
219
+ as a positive example in one case (with its correct expression), but as negative example for the rest of possible expressions in the file. So the relation
220
+ features cannot be extracted in advance for a pair expression/target but have to be computed for each pair we consider, and in order to do this we need
221
+ the two get_extra_feats functions indicated above.
222
+
223
+ ##Contact##
224
+ * Ruben Izquierdo
225
+ * Vrije University of Amsterdam
226
+ * ruben.izquierdobevia@vu.nl
@@ -0,0 +1,499 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import os
5
+ import csv
6
+ from tempfile import NamedTemporaryFile
7
+ from subprocess import Popen, PIPE
8
+ import logging
9
+ import cPickle
10
+ import argparse
11
+
12
+ from scripts import lexicons as lexicons_manager
13
+ from scripts.config_manager import Cconfig_manager, internal_config_filename
14
+ from scripts.extract_features import extract_features_from_kaf_naf_file
15
+ from scripts.crfutils import extract_features_to_crf
16
+ from scripts.link_entities_distance import link_entities_distance
17
+ from scripts.relation_classifier import link_entities_svm
18
+ from KafNafParserPy import *
19
+
20
+
21
# Set to a true value to dump intermediate feature files and CRF outputs to stderr
DEBUG=0

# Shared configuration holder; tag_file_with_opinions points it at the model's config file
my_config_manager = Cconfig_manager()
# Folder containing this script (models.cfg is looked up here)
__this_folder = os.path.dirname(os.path.realpath(__file__))
# Field separator handed to extract_features_to_crf when converting the feature file
separator = '\t'
# Name and version written into the linguistic-processor header of the output
__desc = 'Deluxe opinion miner (CRF+SVM)'
__last_edited = '10jan2014'
__version = '2.0'

# Level is CRITICAL, so the logging.debug calls in this module are silent by default
logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s\n + %(message)s', level=logging.CRITICAL)

# Lazily built cache used by map_tokens_to_terms: token id -> list of term ids
terms_for_token = None
33
+
34
+
35
def load_obj_from_file(filename):
    """Load and return the object pickled in the file `filename`.

    The file is opened in binary mode and is always closed before returning,
    also when unpickling raises.
    """
    # NOTE(security): unpickling can execute arbitrary code, so this must only
    # be used on trusted files (e.g. models produced by the training scripts).
    with open(filename,'rb') as fic:
        return cPickle.load(fic)
39
+
40
+ # Gets the output of crf and a list of token ids, and parses the B- or I- ...
41
+ # Output: [(['id0', 'id1', 'id2', 'id3'], 'holder'), (['id4', 'id5', 'id6'], 'target')]
42
def match_crfsuite_out(crfout,list_token_ids):
    """Group the BIO tags printed by crfsuite into labelled token-id spans.

    crfout holds one tag per line ('O', 'B-<label>' or 'I-<label>'); an empty
    line marks a sentence boundary and does not consume a token id. The i-th
    non-empty line is paired with list_token_ids[i].

    Returns a list of (token_ids, label) tuples in order of appearance, e.g.
    [(['id0', 'id1'], 'holder'), (['id4'], 'target')].
    """
    spans = []
    open_ids = None      # ids of the span being built; None while outside a span
    open_label = None
    position = 0         # index into list_token_ids of the current tag line

    for tag in crfout.splitlines():
        if not tag:
            # Sentence boundary: closes any open span, no token is consumed
            if open_ids is not None:
                spans.append((open_ids, open_label))
                open_ids = None
            continue

        if tag == 'O':
            if open_ids is not None:
                spans.append((open_ids, open_label))
                open_ids = None
        else:
            prefix, label = tag[0], tag[2:]
            if prefix == 'B':
                # A B- tag always starts a new span, closing the previous one
                if open_ids is not None:
                    spans.append((open_ids, open_label))
                open_ids = [list_token_ids[position]]
                open_label = label
            elif prefix == 'I':
                if open_ids is not None:
                    # Continuation: the label of the I- tag is not compared
                    open_ids.append(list_token_ids[position])
                else:
                    # An I- tag without a preceding B- opens a span as well
                    open_ids = [list_token_ids[position]]
                    open_label = label
        position += 1

    if open_ids is not None:
        spans.append((open_ids, open_label))
    return spans
80
+
81
+
82
+
83
def extract_features(kaf_naf_obj):
    """Write the token features of a parsed KAF/NAF object to a temporary file.

    Returns (features_path, log_path). log_path is features_path + '.log' and
    is passed to the extractor as its log file. The temporary file is created
    with delete=False, so the caller is responsible for removing both paths.
    """
    # Reserve a temporary file name; only the name is needed here
    tmp_handle = NamedTemporaryFile(delete=False)
    tmp_handle.close()
    features_path = tmp_handle.name
    log_path = features_path+'.log'

    # Lexicons are only loaded when the configuration asks for them
    exp_lex = tar_lex = None
    if my_config_manager.get_use_training_lexicons():
        exp_lex_path = my_config_manager.get_expression_lexicon_filename()
        tar_lex_path = my_config_manager.get_target_lexicon_filename()
        exp_lex = lexicons_manager.load_lexicon(exp_lex_path)
        tar_lex = lexicons_manager.load_lexicon(tar_lex_path)

    # The extractor returns three values; none of them is used by this function
    _labels, _sep, _skipped = extract_features_from_kaf_naf_file(kaf_naf_obj,features_path,log_path,include_class=False, exp_lex=exp_lex,tar_lex=tar_lex)
    return features_path, log_path
102
+
103
+
104
def convert_to_crf(input_file,templates):
    """Convert a tabular feature file into a crfsuite input file.

    input_file is the path of the feature file and templates is the template
    definition forwarded to extract_features_to_crf. The field description is
    read from the file given by the configuration.

    Returns the path of a new temporary file (created with delete=False); the
    caller is responsible for removing it.
    """
    out_desc = NamedTemporaryFile(delete=False)
    out_desc.close()
    out_crf = out_desc.name

    ##Load description of features
    path_feat_desc = my_config_manager.get_feature_desc_filename()
    # Context manager so the handle is closed even if reading fails
    with open(path_feat_desc) as fic:
        fields = fic.read().strip()

    extract_features_to_crf(input_file,out_crf,fields,separator,templates,possible_classes=None)
    return out_crf
119
+
120
+
121
+
122
def run_crfsuite_tag(input_file,model_file):
    """Run `crfsuite tag -m model_file input_file` and capture its output.

    The path of the crfsuite binary is taken from the configuration; if it
    does not exist a hint is printed to stderr and the process exits with -1.

    Returns a tuple (stdout, stderr) with everything the command printed.
    """
    crfsuite = my_config_manager.get_crfsuite_binary()
    if not os.path.exists(crfsuite):
        print>>sys.stderr,'CRFsuite not found on',crfsuite
        print>>sys.stderr,'Check the config filename and make sure the path is correctly set'
        print>>sys.stderr,'[crfsuite]\npath_to_binary = yourpathtolocalcrfsuite'
        sys.exit(-1)

    # Argument list without a shell: paths containing spaces or shell
    # metacharacters are passed to crfsuite untouched.
    cmd = [crfsuite, 'tag', '-m', model_file, input_file]
    crf_process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes while waiting; calling wait() first can
    # deadlock once the child fills a pipe buffer.
    output, error = crf_process.communicate()
    return output,error
140
+
141
+
142
def _detect_with_crf(tab_feat_file, list_token_ids, get_templates, get_model_file, labels, show_temp_file=False):
    """Shared implementation of detect_expressions/detect_targets/detect_holders.

    get_templates and get_model_file are zero-argument callables returning the
    CRF templates and the model filename. labels is a 4-tuple with the names
    used in the log/debug messages: (name in the 'File with crf format'
    message, name in the debug headers, name in the 'crf error' message, name
    in the 'Detector ... out' message).

    Returns the list of (token_ids, label) spans parsed from the crfsuite
    output. The temporary CRF file is removed even when tagging fails.
    """
    file_label, debug_label, error_label, out_label = labels

    #1) Convert to the correct CRF
    templates = get_templates()
    crf_file = convert_to_crf(tab_feat_file,templates)
    try:
        logging.debug('File with crf format for '+file_label+' '+crf_file)
        if DEBUG:
            print>>sys.stderr,'#'*50
            print>>sys.stderr,'CRF FEATURES '+debug_label
            f = open(crf_file)
            print>>sys.stderr,f.read()
            f.close()
            print>>sys.stderr,'#'*50

        #2) Tag with the trained model and parse the BIO output
        model_file = get_model_file()
        output_crf,error_crf = run_crfsuite_tag(crf_file,model_file)
        logging.debug(error_label+' crf error: '+error_crf)
        matches = match_crfsuite_out(output_crf, list_token_ids)

        if DEBUG:
            print>>sys.stderr,'#'*50
            print>>sys.stderr,'CRF output for '+debug_label
            print>>sys.stderr,'Raw output CRF:', output_crf
            print>>sys.stderr,'List token ids:',str(list_token_ids)
            print>>sys.stderr,'MATCHES:',str(matches)
            if show_temp_file:
                print>>sys.stderr,'TEMP FILE:',crf_file
            print>>sys.stderr,'#'*50

        logging.debug('Detector '+out_label+' out: '+str(matches))
    finally:
        # Previously the temporary file was leaked whenever a step above raised
        os.remove(crf_file)
    return matches


def detect_expressions(tab_feat_file,list_token_ids):
    """Detect opinion expressions; returns a list of (token_ids, label) spans."""
    return _detect_with_crf(tab_feat_file, list_token_ids,
                            my_config_manager.get_templates_expr,
                            my_config_manager.get_filename_model_expression,
                            ('EXPRESSIONS', 'EXPRESSION', 'Expressions', 'expressions'),
                            show_temp_file=True)


def detect_targets(tab_feat_file, list_token_ids):
    """Detect opinion targets; returns a list of (token_ids, label) spans."""
    return _detect_with_crf(tab_feat_file, list_token_ids,
                            my_config_manager.get_templates_target,
                            my_config_manager.get_filename_model_target,
                            ('TARGETS', 'TARGETS', 'TARGETS', 'targets'))


def detect_holders(tab_feat_file, list_token_ids):
    """Detect opinion holders; returns a list of (token_ids, label) spans."""
    return _detect_with_crf(tab_feat_file, list_token_ids,
                            my_config_manager.get_templates_holder,
                            my_config_manager.get_filename_model_holder,
                            ('HOLDERS', 'HOLDERS', 'HOLDERS', 'HOLDERS'))
244
+
245
+
246
+
247
+
248
+
249
def map_tokens_to_terms(list_tokens,knaf_obj):
    """Return the sorted list of term ids whose span covers any of the given token ids.

    The token -> terms index is built once per document and kept in the
    module-level `terms_for_token`; it is rebuilt when a different knaf_obj is
    passed, so several documents can be processed in the same process.

    Raises KeyError if a token id is not part of the span of any term.
    """
    global terms_for_token
    # The function keeps a reference to the object the cache was built for;
    # without this check a second document would be resolved against the
    # mapping of the first one.
    cached_for = getattr(map_tokens_to_terms, '_cached_for', None)
    if terms_for_token is None or cached_for is not knaf_obj:
        terms_for_token = {}
        for term in knaf_obj.get_terms():
            termid = term.get_id()
            for tokid in term.get_span().get_span_ids():
                terms_for_token.setdefault(tokid, []).append(termid)
        map_tokens_to_terms._cached_for = knaf_obj

    ret = set()
    for my_id in list_tokens:
        ret.update(terms_for_token[my_id])
    return sorted(ret)
267
+
268
+
269
+
270
def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=True,include_polarity_strength=True):
    """Add one opinion element to knaf_obj for every linked triple.

    triples: iterable of (polarity, expression_ids, target_ids, holder_ids).
    knaf_obj: parsed KAF/NAF object that receives the opinions.
    text_for_tid: dict from term id to its text, used for the comments.
    ids_used: set of opinion ids already taken; the ids created here are added
        to it.
    map_to_terms: when true the ids in the triples are token ids and are
        converted to term ids first; otherwise they are used as they are.
    include_polarity_strength: when true the expression gets strength "1".
    """
    num_opinion = 0
    for type_exp, span_exp, span_tar, span_hol in triples:
        #Map tokens to terms
        if map_to_terms:
            # Fixed: this branch used the undefined name kaf_obj (NameError)
            span_exp_terms = map_tokens_to_terms(span_exp,knaf_obj)
            span_tar_terms = map_tokens_to_terms(span_tar,knaf_obj)
            span_hol_terms = map_tokens_to_terms(span_hol,knaf_obj)
        else:
            span_hol_terms = span_hol
            span_tar_terms = span_tar
            span_exp_terms = span_exp

        ##Creating holder
        hol_span_obj = Cspan()
        hol_span_obj.create_from_ids(span_hol_terms)
        my_hol = Cholder()
        my_hol.set_span(hol_span_obj)
        hol_text = ' '.join(text_for_tid[tid] for tid in span_hol_terms)
        my_hol.set_comment(hol_text)

        #Creating target
        tar_span_obj = Cspan()
        tar_span_obj.create_from_ids(span_tar_terms)
        my_tar = opinion_data.Ctarget()
        my_tar.set_span(tar_span_obj)
        tar_text = ' '.join(text_for_tid[tid] for tid in span_tar_terms)
        my_tar.set_comment(tar_text)

        ##Creating expression
        exp_span_obj = Cspan()
        exp_span_obj.create_from_ids(span_exp_terms)
        my_exp = Cexpression()
        my_exp.set_span(exp_span_obj)
        my_exp.set_polarity(type_exp)
        if include_polarity_strength:
            my_exp.set_strength("1")
        exp_text = ' '.join(text_for_tid[tid] for tid in span_exp_terms)
        my_exp.set_comment(exp_text)

        #To get the first possible ID not already used
        new_id = None
        while True:
            new_id = 'o'+str(num_opinion+1)
            if new_id not in ids_used:
                ids_used.add(new_id)
                break
            else:
                num_opinion += 1

        new_opinion = Copinion(type=knaf_obj.get_type())
        new_opinion.set_id(new_id)
        if len(span_hol_terms) != 0:    #To avoid empty holders
            new_opinion.set_holder(my_hol)

        if len(span_tar_terms) != 0:    #To avoid empty targets
            new_opinion.set_target(my_tar)

        new_opinion.set_expression(my_exp)

        knaf_obj.add_opinion(new_opinion)
333
+
334
+ ##
335
+ # Input_file_stream can be a filename or a stream
336
+ # Output_file_stream can be a filename or a stream
337
+ #Config file must be a string filename
338
def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,kaf_obj=None, remove_existing_opinions=True,include_polarity_strength=True,timestamp=True):
    """Detect opinions in a KAF/NAF document and dump the annotated result.

    input_file_stream: parsed with KafNafParser, only used when kaf_obj is None.
    output_file_stream: destination handed to the dump method of the document.
    model_folder: folder that must contain the internal config file of the
        trained models; if it is missing the process exits with -1.
    kaf_obj: already parsed document to use instead of input_file_stream.
    remove_existing_opinions: when true the existing opinion layer is removed,
        otherwise the existing opinion ids are kept and not reused.
    include_polarity_strength: forwarded to add_opinions_to_knaf.
    timestamp: when false the linguistic-processor timestamp is set to '*'.
    """
    config_filename = os.path.join(model_folder,internal_config_filename)
    if not os.path.exists(config_filename):
        print>>sys.stderr,'Config file not found on:',config_filename
        sys.exit(-1)

    my_config_manager.set_current_folder(__this_folder)
    my_config_manager.set_config(config_filename)

    if kaf_obj is not None:
        knaf_obj = kaf_obj
    else:
        knaf_obj = KafNafParser(input_file_stream)

    #Create a temporary file
    out_feat_file, err_feat_file = extract_features(knaf_obj)
    try:
        if DEBUG:
            print>>sys.stderr,'#'*50
            print>>sys.stderr,'FEATURE FILE'
            f = open(out_feat_file)
            print>>sys.stderr,f.read()
            f.close()
            print>>sys.stderr,'#'*50

        #get all the tokens in order
        list_token_ids = []
        text_for_wid = {}
        text_for_tid = {}
        sentence_for_token = {}    # only needed by the distance-based linker
        for token_obj in knaf_obj.get_tokens():
            token = token_obj.get_text()
            s_id = token_obj.get_sent()
            w_id = token_obj.get_id()
            text_for_wid[w_id] = token

            list_token_ids.append(w_id)
            sentence_for_token[w_id] = s_id

        for term in knaf_obj.get_terms():
            tid = term.get_id()
            toks = [text_for_wid.get(wid,'') for wid in term.get_span().get_span_ids()]
            text_for_tid[tid] = ' '.join(toks)

        expressions = detect_expressions(out_feat_file,list_token_ids)
        targets = detect_targets(out_feat_file, list_token_ids)
        holders = detect_holders(out_feat_file, list_token_ids)
    finally:
        # Remove the temporary files also when a detector fails (they used to
        # be left behind in that case)
        for tmp_file in (out_feat_file, err_feat_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    if DEBUG:
        print>>sys.stderr,"Expressions detected:"
        for e in expressions:
            print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
        print>>sys.stderr

        print>>sys.stderr,'Targets detected'
        for t in targets:
            print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
        print>>sys.stderr

        print>>sys.stderr,'Holders',holders
        for h in holders:
            print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
        print>>sys.stderr

    # Entity linker based on distances
    ####triples = link_entities_distance(expressions,targets,holders,sentence_for_token)

    triples = link_entities_svm(expressions, targets, holders, knaf_obj, my_config_manager)

    ids_used = set()
    if remove_existing_opinions:
        knaf_obj.remove_opinion_layer()
    else:
        for opi in knaf_obj.get_opinions():
            ids_used.add(opi.get_id())

    add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)

    #Adding linguistic processor
    my_lp = Clp()
    my_lp.set_name(__desc)
    my_lp.set_version(__last_edited+'_'+__version)
    if timestamp:
        my_lp.set_timestamp()   ##Set to the current date and time
    else:
        my_lp.set_timestamp('*')
    knaf_obj.add_linguistic_processor('opinions',my_lp)
    knaf_obj.dump(output_file_stream)
432
+
433
+
434
+
435
+ def obtain_predefined_model(lang,domain,just_show=False):
436
+ #This function will read the models from the file models.cfg and will return
437
+ #The model folder for the lang and domain
438
+ # format of the file: 1 model per line: lang|domain|path_to_folder
439
+ model_file = os.path.join(__this_folder,'models.cfg')
440
+ fic = open(model_file)
441
+ use_this_model = None
442
+ if just_show:
443
+ print '#'*25
444
+ print 'Models available'
445
+ print '#'*25
446
+
447
+ nm = 0
448
+ for line in fic:
449
+ if line[0]!='#':
450
+ this_lang, this_domain, this_model,this_desc = line.strip().split('|')
451
+ if just_show:
452
+ print ' Model',nm
453
+ print ' Lang:',this_lang
454
+ print ' Domain:', this_domain
455
+ print ' Folder:',this_model
456
+ print ' Desc:',this_desc
457
+ nm+= 1
458
+ else:
459
+ if this_lang == lang and this_domain == domain:
460
+ use_this_model = this_model
461
+ break
462
+ fic.close()
463
+ if just_show:
464
+ print '#'*25
465
+ return use_this_model
466
+
467
if __name__ == '__main__':
    # Command-line entry point: reads a KAF/NAF document from stdin and writes
    # the document with the detected opinions to stdout.

    argument_parser = argparse.ArgumentParser(description='Detect opinion triples in a KAF/NAF file')
    # Exactly one of -m, -d or -show-models must be given
    group = argument_parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-m',dest='model_folder',help='Folder storing the trained models')
    group.add_argument('-d', dest='domain',help='The domain where the models were trained')
    group.add_argument('-show-models', dest='show_models', action='store_true',help='Show the models available and finish')

    argument_parser.add_argument('-keep-opinions',dest='keep_opinions',action='store_true',help='Keep the opinions from the input (by default will be deleted)')
    argument_parser.add_argument('-no-time',dest='timestamp',action='store_false',help='No include time in timestamp (for testing)')
    arguments = argument_parser.parse_args()

    if arguments.show_models:
        obtain_predefined_model(None,None,just_show=True)
        sys.exit(0)

    knaf_obj = KafNafParser(sys.stdin)
    model_folder = None
    if arguments.model_folder is not None:
        model_folder = arguments.model_folder
    else:
        #Obtain the language
        lang = knaf_obj.get_language()
        model_folder = obtain_predefined_model(lang,arguments.domain)
        if model_folder is None:
            # Without this check the failure surfaced later as an opaque error
            # when the folder was joined with the config filename
            print>>sys.stderr,'No predefined model found for language',lang,'and domain',arguments.domain
            sys.exit(-1)

    tag_file_with_opinions(None, sys.stdout,model_folder,kaf_obj=knaf_obj,remove_existing_opinions=(not arguments.keep_opinions),timestamp=arguments.timestamp)
    sys.exit(0)
495
+
496
+
497
+
498
+
499
+