opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261):
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,376 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import codecs
5
+ import csv
6
+ import os
7
+ from operator import itemgetter
8
+
9
+ #from VUA_pylib.lexicon import MPQA_subjectivity_lexicon
10
+
11
+
12
def get_first_term_id(token_data,term_data,this_ids):
    """Return the id of the term from this_ids that starts earliest in the text.

    token_data maps a token id to a tuple whose third item is the token position.
    term_data maps a term id to a tuple whose third item is the list of token ids
    the term spans. When two terms start at the same position, the one listed
    first in this_ids wins (the sort is stable).
    """
    ranked = sorted(
        ((term_id, min(token_data[tok_id][2] for tok_id in term_data[term_id][2]))
         for term_id in this_ids),
        key=itemgetter(1)
    )
    return ranked[0][0]
20
+
21
+
22
def get_mapping_from_lexicon(token_ids,lexicon):
    """Map token ids to the lexicon value of the expression they belong to.

    token_ids is a sequence of (token_text, token_id) pairs in text order.
    lexicon maps an expression (one or more space-separated tokens) to a value
    such as a polarity. Expressions only match on whole-token boundaries.

    Returns a dict token_id -> value. Shorter matches are applied first, and a
    match is dropped entirely when any of its tokens was already assigned.
    """
    # Character offset -> token id, over the tokens joined by single spaces.
    # The searched text carries one extra leading space, so the position where
    # ' ' + expression + ' ' is found equals the offset of the expression's
    # first character in this map.
    my_map = {}
    pieces = [' ']
    idx = 0
    for token, tid in token_ids:
        for _ in token:
            my_map[idx] = tid
            idx += 1
        pieces.append(token+' ')
        idx += 1  # the separating space belongs to no token
    text = ''.join(pieces)

    all_extracted = []  # list of (set of token ids, value)
    for substring, polarity in lexicon.items():
        if substring == '':
            # An empty expression can never cover a token, and searching for it
            # would never advance past a double space in the text.
            continue
        needle = ' '+substring+' '
        current_found = 0
        while True:
            start = text.find(needle,current_found)
            if start == -1:
                break
            end = start + len(substring)
            # Resume on the last matched character so that the trailing space
            # can serve as the leading space of an adjacent occurrence.
            current_found = end
            ids = set(my_map[myidx] for myidx in range(start,end) if myidx in my_map)
            if len(ids) != 0:
                all_extracted.append((ids,polarity))

    final_selected = {}
    # If w15 has been selected first, for instance (w14,w15,w16) will not be selected later
    for ids,polarity in sorted(all_extracted, key=lambda t: len(t[0])):
        if any(this_id in final_selected for this_id in ids):
            continue
        for this_id in ids:
            final_selected[this_id] = polarity
    return final_selected
62
+
63
+
64
def load_propagation_lexicon(propagation_lex_filename):
    """Load the propagated lexicon as a dict lemma -> polarity.

    Every line of the file is UTF-8 text made of ';'-separated fields; the
    lemma is read from the fifth field and the polarity from the third one.
    Lines with fewer than five fields (blank lines included) are skipped.

    When the file does not exist a message is written to stderr and an empty
    dict is returned.
    """
    propagated_lexicon = {}
    if not os.path.exists(propagation_lex_filename):
        sys.stderr.write('The propagated lexicon on %s does not exist\n' % propagation_lex_filename)
        return propagated_lexicon

    # Read bytes and decode explicitly so the result is unicode text on
    # Python 2 and 3 alike; the with-block guarantees the handle is closed.
    with open(propagation_lex_filename,'rb') as fic:
        for raw_line in fic:
            tokens = raw_line.decode('utf-8').rstrip().split(';')
            if len(tokens) < 5:
                continue
            lemma = tokens[4]
            polarity = tokens[2]
            propagated_lexicon[lemma] = polarity
    return propagated_lexicon
78
+
79
+
80
+
81
def _span_ids_of(opinion_part):
    """Return the term ids spanned by an opinion expression/holder/target, or [] when absent."""
    if opinion_part is None:
        return []
    span = opinion_part.get_span()
    if span is None:
        return []
    return span.get_span_ids()


def _type_for_term(annotations):
    """Map every term id referenced by the given entity/property objects to that object's type."""
    type_for_term = {}
    for annotation in annotations:
        annotation_type = annotation.get_type()
        for reference_obj in annotation.get_references():
            for span_obj in reference_obj:
                for t_id in span_obj.get_span_ids():
                    type_for_term[t_id] = annotation_type
    return type_for_term


def _tag_span(class_for_term_id,token_data,term_data,span_ids,label):
    """Tag the terms of a span with 'B-'+label (first term in the text) or 'I-'+label."""
    if len(span_ids) == 0:
        return
    first_term_id = get_first_term_id(token_data,term_data,span_ids)
    for t_id in span_ids:
        if t_id == first_term_id:
            prefix = 'B-'
        else:
            prefix = 'I-'
        class_for_term_id[t_id] = prefix+label


def extract_features_from_kaf_naf_file(knaf_obj,out_file=None,log_file=None,include_class=True,accepted_opinions=None, exp_lex= None, tar_lex=None, propagation_lex_filename=None):
    """Write one tab-separated feature line per token of a KAF/NAF document.

    Parameters:
      knaf_obj: parsed KAF/NAF object; its tokens, terms, entities, properties,
        opinions and constituency extractor are read through its getters.
      out_file: if given, feature lines are appended to this file; otherwise
        they go to the current sys.stdout.
      log_file: if given, a UTF-8 log of the extraction is written there.
      include_class: when true, the gold opinions are turned into B-/I- labels;
        otherwise every token gets the class 'O'.
      accepted_opinions: optional dict expression polarity -> label; opinions
        whose polarity is not a key are skipped and reported in the result.
      exp_lex, tar_lex: optional dicts expression -> value, matched against the
        token sequence with get_mapping_from_lexicon.
      propagation_lex_filename: optional file read by load_propagation_lexicon.

    Tokens without a term are skipped. An empty line is written between
    sentences and after the last token.

    Returns (labels, separator, polarities_found_and_skipped); labels holds the
    column names, or [] when no feature line was written.
    """
    labels = []
    polarities_found_and_skipped = []
    separator = '\t'
    restore_out = None
    log_desc = None

    if log_file is not None:
        log_desc = codecs.open(log_file, 'w', encoding='UTF-8')
    log_on = log_desc is not None

    try:
        if out_file is not None:
            # Open first so that a failing open() leaves sys.stdout untouched.
            out_desc = open(out_file,'a')
            restore_out = sys.stdout
            sys.stdout = out_desc

        if log_on:
            print>>log_desc,'Extracting features from ',knaf_obj.get_filename()

        ###########################
        ## EXTRACTING TOKENS #######
        token_data = {}  ## token_data['w_1'] = ('house','s_1',position)
        tokens_in_order = []
        tokens_ids = []
        for num_token, token_obj in enumerate(knaf_obj.get_tokens()):
            token = token_obj.get_text()
            s_id = token_obj.get_sent()
            w_id = token_obj.get_id()
            tokens_ids.append((token,w_id))
            token_data[w_id] = (token,s_id,num_token)
            tokens_in_order.append(w_id)
        if log_on:
            print>>log_desc,' Number of tokens: ',len(tokens_in_order)

        #Lexicons from the training data
        mapping_wid_polarity = {}
        if exp_lex is not None:
            mapping_wid_polarity = get_mapping_from_lexicon(tokens_ids,exp_lex)

        mapping_wid_aspect = {}
        if tar_lex is not None:
            mapping_wid_aspect = get_mapping_from_lexicon(tokens_ids, tar_lex)

        propagated_lex = {}
        if propagation_lex_filename is not None:
            #Lexicon of [lemma] ==> polarity
            propagated_lex = load_propagation_lexicon(propagation_lex_filename)

        ###########################
        ## EXTRACTING TERMS #######
        term_data = {}  #(term_lemma,term_pos,term_span,polarity)
        term_for_token = {}
        for term_obj in knaf_obj.get_terms():
            term_id = term_obj.get_id()
            term_lemma = term_obj.get_lemma()
            term_pos = term_obj.get_morphofeat()
            # if there is no morphofeat feature, we try to get the pos from the 'pos' attrib
            if term_pos is None:
                term_pos = term_obj.get_pos()
            # NOTE(review): whichever attribute supplied the value, only the part
            # before the first space is kept -- confirm against the upstream file.
            if term_pos is not None:
                term_pos = term_pos.split(' ')[0]
            else:
                term_pos = 'unknown'

            term_span = term_obj.get_span().get_span_ids()

            # Polarity of the term, falling back to its sentiment modifier.
            sentiment = term_obj.get_sentiment()
            polarity = None
            if sentiment is not None:
                polarity = sentiment.get_polarity()
                if polarity is None:
                    polarity = sentiment.get_modifier()
            if polarity is None:
                polarity = '-'

            term_data[term_id] = (term_lemma,term_pos,term_span,polarity)
            for tok_id in term_span:
                term_for_token[tok_id] = term_id

        if log_on:
            print>>log_desc,' Number of terms loaded: '+str(len(term_data))

        ###########################
        # EXTRACTING ENTITIES AND PROPERTIES FOR EACH TERM
        ###########################
        entity_for_term = _type_for_term(knaf_obj.get_entities())
        if log_on:
            print>>log_desc,'Entities:'+str(entity_for_term)

        property_for_term = _type_for_term(knaf_obj.get_properties())
        if log_on:
            print>>log_desc,'Properties:'+str(property_for_term)

        ###########################
        # EXTRACTING CLASS FOR EACH TERM
        ###########################
        class_for_term_id = {}
        if include_class:
            for opinion in knaf_obj.get_opinions():
                opinion_id = opinion.get_id()
                opinion_exp = opinion.get_expression()
                exp_type = ''
                if opinion_exp is not None:
                    exp_type = opinion_exp.get_polarity()
                exp_ids = _span_ids_of(opinion_exp)
                hol_ids = _span_ids_of(opinion.get_holder())
                tar_ids = _span_ids_of(opinion.get_target())

                if accepted_opinions is not None:
                    if exp_type not in accepted_opinions:
                        # This opinion wont be considered
                        polarities_found_and_skipped.append(exp_type)
                        continue
                    #Get the mapping label
                    mapped_type = accepted_opinions[exp_type]
                else:
                    mapped_type = exp_type

                if log_on:
                    print>>log_desc,' Opinion',opinion_id
                    print>>log_desc,' Expression:'
                    print>>log_desc,' ids:',exp_ids
                    print>>log_desc,' terms:',[term_data[i][0] for i in exp_ids]
                _tag_span(class_for_term_id,token_data,term_data,exp_ids,mapped_type)

                if log_on:
                    print>>log_desc,' Target:'
                    print>>log_desc,' ids:',tar_ids
                    print>>log_desc,' terms:',[term_data[i][0] for i in tar_ids]
                _tag_span(class_for_term_id,token_data,term_data,tar_ids,'target')

                if log_on:
                    print>>log_desc,' Holder:'
                    print>>log_desc,' ids:',hol_ids
                    print>>log_desc,' terms:',[term_data[i][0] for i in hol_ids]
                _tag_span(class_for_term_id,token_data,term_data,hol_ids,'holder')

        ###########################
        ## WRITE TO THE OUTPUT
        ###########################
        prev_sent = None
        for token_id in tokens_in_order:
            token,sentence_id,_ = token_data[token_id]

            #This is required for wrong KAF files that contain missing terms (tokens not linked with terms)
            term_id = term_for_token.get(token_id,None)
            if term_id is None:
                continue
            data = term_data.get(term_id,None)
            if data is None:
                continue

            term_lemma,term_pos,term_span,polarity = data
            entity = entity_for_term.get(term_id,'-')
            prop = property_for_term.get(term_id,'-')
            this_class = class_for_term_id.get(term_id,'O')

            ## Constituency features
            constituency_extractor = knaf_obj.get_constituency_extractor()
            feature_phrase = 'XXX'
            if constituency_extractor is not None:
                this_phrase, _ = constituency_extractor.get_deepest_phrase_for_termid(term_id)
                if this_phrase is not None:
                    feature_phrase = this_phrase

            ### Expression from the domain lexicon
            polarity_from_domain = mapping_wid_polarity.get(token_id,'-')

            ## Polarity from the propagated lexicon
            polarity_from_propagation = propagated_lex.get(term_lemma,'-')

            ## Target from the training lexicon
            aspect_from_domain = mapping_wid_aspect.get(token_id,'-')

            ## FEATURE GENERATION: labels and features are parallel lists
            labels = ['sentence_id','token_id','token','lemma', 'pos', 'term_id', 'pol/mod', 'poldomain', 'aspect_training',
                      'entity','property','phrase_type','propagation_polarity','y']
            features = [sentence_id, token_id, token, term_lemma, term_pos, term_id, polarity, polarity_from_domain, aspect_from_domain,
                        entity, prop, feature_phrase, polarity_from_propagation, this_class]

            if prev_sent is not None and sentence_id != prev_sent:
                print>>sys.stdout  #breakline between sentences
            print>>sys.stdout,(separator.join(features)).encode('utf-8')
            prev_sent = sentence_id

        print>>sys.stdout  #Last breakline required for crfsuite

        if log_on:
            print>>log_desc
    finally:
        ## Restoring, even when the extraction failed half way
        if log_on:
            log_desc.close()
        if restore_out is not None:
            sys.stdout.close()
            sys.stdout = restore_out

    return labels, separator, polarities_found_and_skipped
376
+
@@ -0,0 +1,105 @@
1
+ #!/usr/bin/env python
2
+
3
# Separator of field values.
separator = '\t'

# Field names of the input data, in column order.
# An earlier experiment used the wider layout
#   'sentence_id token_id tok term_id lem pos polope polmpqa poltra y'
fields = ' '.join((
    'sentence_id', 'token_id', 'tok', 'term_id', 'lem', 'pos', 'polmpqa', 'y',
))

# Attribute templates. Every template is a tuple of (field, offset) pairs,
# the offset being relative to the current token. The tables below are
# generated row by row: one row per offset, one template per field.

# Active templates: unigrams over a window of one token on each side.
templates = tuple(
    ((name, offset),)
    for offset in (-1, 0, 1)
    for name in ('tok', 'lem', 'pos', 'polmpqa')
)

templates1234 = tuple(
    ((name, offset),)
    for offset in (-1, 0, 1)
    for name in ('tok', 'pos', 'lem', 'train_pol', 'pol')
)

# Unigrams over [-1, 1] followed by the (-1, 0) and (0, 1) bigrams.
templates_default = tuple(
    ((name, offset),)
    for offset in (-1, 0, 1)
    for name in ('tok', 'pos', 'lem', 'pol')
) + tuple(
    ((name, left), (name, left + 1))
    for left in (-1, 0)
    for name in ('tok', 'pos', 'lem', 'pol')
)

# Window [-4, 5]; the part of speech is only used for the current token.
templates2222 = tuple(
    ((name, offset),)
    for offset in range(-4, 6)
    for name in (('tok', 'lem', 'pol', 'pos', 'train_pol') if offset == 0
                 else ('tok', 'lem', 'pol', 'train_pol'))
)

# Window [-5, 5]; the part of speech is only used within [-2, 2].
templates22 = tuple(
    ((name, offset),)
    for offset in range(-5, 6)
    for name in (('tok', 'lem', 'pol', 'pos') if -2 <= offset <= 2
                 else ('tok', 'lem', 'pol'))
)
81
+
82
+ import crfutils
83
+
84
def feature_extractor(X):
    """Apply the module-level attribute templates to the item sequence X, in place.

    After the templates are applied, '__BOS__' is appended to the 'F' list of
    the first item and '__EOS__' to the 'F' list of the last one. An empty
    sequence gets no markers.
    """
    # Apply attribute templates to obtain features (in fact, attributes)
    crfutils.apply_templates(X, templates)
    if not X:
        return
    # Append the sequence boundary features manually
    first_item, last_item = X[0], X[-1]
    first_item['F'].append('__BOS__')
    last_item['F'].append('__EOS__')
91
+
92
+
93
+
94
def extract_features(inputfile,outputfile):
    """Run the crfutils driver with feature_extractor, reading inputfile and writing outputfile.

    inputfile is opened for reading before outputfile is opened for writing
    (the output file is truncated). Both handles are closed even when the
    conversion raises.
    """
    with open(inputfile,'r') as fi, open(outputfile,'w') as fo:
        crfutils.main(feature_extractor,fields=fields,sep=separator,fi=fi,fo=fo)
100
+
101
+
102
if __name__ == '__main__':
    # Script entry point: no file arguments are passed, so crfutils.main
    # falls back to its own defaults for input and output.
    crfutils.main(
        feature_extractor,
        fields=fields,
        sep=separator,
    )
104
+
105
+