opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,215 @@
1
+ import optparse
2
+ import sys
3
+
4
+
5
+
6
def feature_extractor(X, templates):
    """
    Fill the 'F' field of every item in a sequence with its attributes.

    The attribute templates are applied first; afterwards the first and
    last items of a non-empty sequence are marked with the sentence
    boundary attributes '__BOS__' and '__EOS__'.

    @type   X:          list of mapping objects
    @param  X:          The item sequence (modified in place).
    @type   templates:  sequence of templates
    @param  templates:  The templates handed over to apply_templates().
    """
    apply_templates(X, templates)

    # Nothing to mark on an empty sequence.
    if not X:
        return

    first_item, last_item = X[0], X[-1]
    first_item['F'].append('__BOS__')   # beginning of sequence
    last_item['F'].append('__EOS__')    # end of sequence
13
+
14
+
15
+
16
def extract_features_to_crf(inputfile, outputfile, fields, separator, templates, possible_classes=None):
    """
    Convert a column-formatted file into a CRFSuite attribute file.

    Every sequence read from L{inputfile} is passed through
    feature_extractor() and then written to L{outputfile} by
    output_features(), using the field named 'y' as reference label.

    @type   inputfile:          str
    @param  inputfile:          Path of the file to read.
    @type   outputfile:         str
    @param  outputfile:         Path of the file to write (overwritten).
    @type   fields:             str
    @param  fields:             Field names separated by single spaces.
    @type   separator:          str
    @param  separator:          The column separator of the input file.
    @param  templates:          The templates handed over to feature_extractor().
    @param  possible_classes:   Forwarded unchanged to output_features().
    """
    # The context managers guarantee that both handles are closed even
    # when reading or feature extraction raises halfway through.
    with open(inputfile, 'r') as fi, open(outputfile, 'w') as fo:
        field_names = fields.split(' ')
        for X in readiter(fi, field_names, separator):
            feature_extractor(X, templates)
            output_features(fo, X, 'y', possible_classes)
28
+
29
def apply_templates(X, templates):
    """
    Generate features for an item sequence by applying feature templates.
    A feature template consists of a tuple of (name, offset) pairs,
    where name and offset specify a field name and offset from which
    the template extracts a feature value. Generated features are stored
    in the 'F' field of each item in the sequence.

    A template produces no feature for a position when any of its offsets
    falls outside the sequence. Field values equal to '' or '-' are left
    out of the generated value; if nothing remains, no feature is added.

    @type   X:          list of mapping objects
    @param  X:          The item sequence (modified in place).
    @type   templates:  sequence of tuples of (str, int)
    @param  templates:  The feature templates.
    """
    length = len(X)
    for template in templates:
        name = '|'.join(['%s[%d]' % (f, o) for f, o in template])
        for t in range(length):
            values = []
            for field, offset in template:
                p = t + offset
                # Plain bounds comparison: a membership test against
                # range(len(X)) builds a whole list per lookup on Python 2.
                if p < 0 or p >= length:
                    values = []
                    break
                value = X[p][field]
                if value != '' and value != '-':
                    values.append(value)
            if values:
                X[t]['F'].append('%s=%s' % (name, '|'.join(values)))
66
+
67
def readiter(fi, names, sep=' '):
    """
    Return an iterator for item sequences read from a file object.
    This function reads a sequence from a file object L{fi}, and
    yields the sequence as a list of mapping objects. Each line
    (item) from the file object is split by the separator character
    L{sep}. Separated values of the item are named by L{names},
    and stored in a mapping object. Every item has a field 'F' that
    is reserved for storing features.

    An empty line terminates the current sequence. A last sequence that
    is not followed by an empty line is yielded as well, so input without
    a trailing blank line does not lose its final sequence.

    @type   fi:     file
    @param  fi:     The file object.
    @type   names:  tuple
    @param  names:  The list of field names.
    @type   sep:    str
    @param  sep:    The separator character.
    @rtype          list of mapping objects
    @return         An iterator for sequences.
    @raise ValueError:  If a line has fewer fields than L{names}.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            yield X
            X = []
            continue

        fields = line.split(sep)
        if len(fields) < len(names):
            raise ValueError(
                'Too few fields (%d) for %r\n%s' % (len(fields), names, line))
        item = {'F': []}    # 'F' is reserved for features.
        # Surplus columns beyond the named ones are ignored.
        for name, value in zip(names, fields):
            item[name] = value
        X.append(item)

    # Flush the pending sequence when the input does not end with a blank line.
    if X:
        yield X
101
+
102
def escape(src):
    """
    Make a feature name safe by replacing every colon with the
    placeholder '__COLON__'.

    @type   src:    str
    @param  src:    A feature name
    @rtype          str
    @return         The feature name without colon characters.
    """
    pieces = src.split(':')
    return '__COLON__'.join(pieces)
112
+
113
def output_features(fo, X, field='', possible_classes=None):
    """
    Write one sequence to L{fo} in CRFSuite format: one line per item,
    holding the reference label (only when L{field} is non-empty)
    followed by the tab-separated features, and a blank line after the
    last item.

    When L{possible_classes} is given, a label whose text after the first
    two characters (e.g. 'positive' for 'B-positive') is not among those
    classes is written as 'O'.

    @type   fo:                 file
    @param  fo:                 The file object.
    @type   X:                  list of mapping objects
    @param  X:                  The sequence.
    @type   field:              str
    @param  field:              The field name of reference labels.
    @type   possible_classes:   container of str, or None
    @param  possible_classes:   The classes to keep; None keeps all labels.
    """
    for item in X:
        if field:
            label = item[field]
            # Drop the two-character prefix, e.g. 'B-positive' -> 'positive'.
            base_label = label[2:]
            if possible_classes is not None:
                if base_label not in possible_classes:
                    label = 'O'
            fo.write('%s' % label)

        for attribute in item['F']:
            if not isinstance(attribute, str):
                # A (name, weight) pair.
                fo.write('\t%s:%f' % (escape(attribute[0]), attribute[1]))
            else:
                fo.write('\t%s' % escape(attribute))
        fo.write('\n')

    # An empty line separates consecutive sequences.
    fo.write('\n')
140
+
141
def to_crfsuite(X):
    """
    Build a crfsuite.ItemSequence out of an item sequence.

    Each entry of an item's 'F' field becomes one crfsuite.Attribute:
    a plain string yields an attribute with just the (escaped) name,
    anything else is treated as a (name, value) pair.

    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @rtype          crfsuite.ItemSequence
    @return         The same sequence in crfsuite.ItemSequence type.
    """
    # Imported lazily so that the module only needs crfsuite when tagging.
    import crfsuite

    sequence = crfsuite.ItemSequence()
    for entry in X:
        converted = crfsuite.Item()
        for feature in entry['F']:
            if isinstance(feature, str):
                attribute = crfsuite.Attribute(escape(feature))
            else:
                attribute = crfsuite.Attribute(escape(feature[0]), feature[1])
            converted.append(attribute)
        sequence.append(converted)
    return sequence
162
+
163
+ '''
164
+ def this_main(feature_extractor, templates,fields='w pos y', sep=' ',fi = sys.stdin,fo = sys.stdout,possible_classes=None):
165
+
166
+ # Parse the command-line arguments.
167
+ parser = optparse.OptionParser(usage="""usage: %prog [options]
168
+ This utility reads a data set from STDIN, and outputs attributes to STDOUT.
169
+ Each line of a data set must consist of field values separated by SEPARATOR
170
+ characters. The names and order of field values can be specified by -f option.
171
+ The separator character can be specified with -s option. Instead of outputting
172
+ attributes, this utility tags the input data when a model file is specified by
173
+ -t option (CRFsuite Python module must be installed)."""
174
+ )
175
+ parser.add_option(
176
+ '-t', dest='model',
177
+ help='tag the input using the model (requires "crfsuite" module)'
178
+ )
179
+ parser.add_option(
180
+ '-f', dest='fields', default=fields,
181
+ help='specify field names of input data [default: "%default"]'
182
+ )
183
+ parser.add_option(
184
+ '-s', dest='separator', default=sep,
185
+ help='specify the separator of columns of input data [default: "%default"]'
186
+ )
187
+ (options, args) = parser.parse_args()
188
+
189
+ # The fields of input: ('w', 'pos', 'y) by default.
190
+ F = options.fields.split(' ')
191
+
192
+ if not options.model:
193
+ # The generator function readiter() reads a sequence from a
194
+ for X in readiter(fi, F, options.separator):
195
+ feature_extractor(X,templates)
196
+ output_features(fo, X, 'y',possible_classes)
197
+
198
+ else:
199
+ # Create a tagger with an existing model.
200
+ import crfsuite
201
+ tagger = crfsuite.Tagger()
202
+ tagger.open(options.model)
203
+
204
+ # For each sequence from STDIN.
205
+ for X in readiter(fi, F, options.separator):
206
+ # Obtain features.
207
+ feature_extractor(X,templates)
208
+ xseq = to_crfsuite(X)
209
+ yseq = tagger.tag(xseq)
210
+ for t in range(len(X)):
211
+ v = X[t]
212
+ fo.write('\t'.join([v[f] for f in F]))
213
+ fo.write('\t%s\n' % yseq[t])
214
+ fo.write('\n')
215
+ '''
@@ -0,0 +1,295 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+
5
def write_to_output(my_class, feats, output):
    """
    Write one training/classification instance as a single line.

    The line consists of the class label followed by one 'name=value'
    column per feature, separated by tabs, encoded as UTF-8 and ended
    with a newline.

    @type   my_class:   str
    @param  my_class:   The class label (e.g. '+1' or '-1').
    @type   feats:      iterable of (str, str)
    @param  feats:      The (name, value) feature pairs.
    @type   output:     file
    @param  output:     The stream to write to; it receives encoded bytes.
    """
    columns = [my_class]
    columns.extend(name + '=' + value for name, value in feats)
    line = '\t'.join(columns)
    # b'\n' equals '\n' on Python 2 and keeps the concatenation valid on
    # Python 3, where bytes and str cannot be mixed.
    output.write(line.encode('utf-8') + b'\n')
10
+
11
+
12
+
13
+ #########################################################################
14
+ # EXTRACTION OF FEATURES FOR TRAINING THE RELATION CLASSIFIER EXP --> TARGET
15
+ #########################################################################
16
+ # This function extracts features for the relation between expression and target
17
+ # for the svm classifier
18
def extract_feats_exp_tar(exp_ids, tar_ids, knaf_obj, use_lemmas=True, use_tokens=True, use_dependencies=True):
    """
    Extract the features describing the relation between an opinion
    expression and a candidate target.

    The returned list contains, in this order: the lemma/token features
    of the expression ('lemmaExp', 'tokenExp'), those of the target
    ('lemmaTar', 'tokenTar'), the dependency path ('deps-exp-tar', labels
    joined with '#') when one is available, 'same_sentence' ('yes'/'no')
    and the bucketed distance 'distExpTar'.

    The distance is the difference between the average token positions of
    both spans. When one of the spans covers no token its position is
    unknown, and the 'distExpTar' feature is left out instead of failing
    with a division by zero.

    @param  exp_ids:            Term identifiers of the expression.
    @param  tar_ids:            Term identifiers of the target.
    @param  knaf_obj:           Parsed document; must offer get_tokens(),
                                get_terms() and get_dependency_extractor().
    @param  use_lemmas:         Emit the lemma features.
    @param  use_tokens:         Emit the token features.
    @param  use_dependencies:   Emit the dependency path feature.
    @rtype                      list of (str, str)
    @return                     The (name, value) feature pairs.
    """
    # [token_id] -> (word, sentence_id, position in the document)
    data_for_token = {}
    for num_token, token_obj in enumerate(knaf_obj.get_tokens()):
        data_for_token[token_obj.get_id()] = (token_obj.get_text(), token_obj.get_sent(), num_token)

    # [term_id] -> (lemma, span_token_ids)
    data_for_term = {}
    for term in knaf_obj.get_terms():
        span = term.get_span()
        span_token_ids = span.get_span_ids() if span is not None else []
        data_for_term[term.get_id()] = (term.get_lemma(), span_token_ids)

    all_feats = []

    def collect(term_ids, lemma_feat, token_feat):
        # Adds the lemma/token features of one span and returns the
        # sentence of its first token plus its average token position
        # (both None when the span covers no token).
        sentence = None
        position_sum = 0
        n_toks = 0
        for term_id in term_ids:
            lemma, span_tok_ids = data_for_term[term_id]
            if use_lemmas:
                all_feats.append((lemma_feat, lemma))
            for tok_id in span_tok_ids:
                token, sent_id, num_token = data_for_token[tok_id]
                position_sum += num_token
                n_toks += 1
                if use_tokens:
                    all_feats.append((token_feat, token))
                if sentence is None:
                    sentence = sent_id
        if n_toks == 0:
            return sentence, None
        return sentence, position_sum * 1.0 / n_toks

    sentence_for_exp, avg_position_exp = collect(exp_ids, 'lemmaExp', 'tokenExp')
    sentence_for_tar, avg_position_tar = collect(tar_ids, 'lemmaTar', 'tokenTar')

    if use_dependencies:
        dependency_extractor = knaf_obj.get_dependency_extractor()
        if dependency_extractor is not None:
            deps = dependency_extractor.get_shortest_path_spans(exp_ids, tar_ids)
            if deps is not None:
                all_feats.append(('deps-exp-tar', '#'.join(deps)))

    if sentence_for_exp is not None and sentence_for_exp == sentence_for_tar:
        all_feats.append(('same_sentence', 'yes'))
    else:
        all_feats.append(('same_sentence', 'no'))

    # Distance, bucketed into four classes.
    if avg_position_exp is not None and avg_position_tar is not None:
        dist = abs(avg_position_exp - avg_position_tar)
        if dist <= 10:
            my_dist = 'veryclose'
        elif dist <= 20:
            my_dist = 'close'
        elif dist <= 25:
            my_dist = 'far'
        else:
            my_dist = 'veryfar'
        all_feats.append(('distExpTar', my_dist))

    return all_feats
108
+
109
+
110
+
111
+
112
+
113
+
114
def create_rel_exp_tar_training(knaf_obj, output=sys.stdout, valid_opinions=None, use_dependencies=True, use_tokens=True, use_lemmas=True):
    """
    Write the training instances for the expression --> target relation
    classifier found in one document.

    Every opinion with a non-empty expression and a non-empty target
    yields one positive instance ('+1'). Its expression combined with the
    target of every other such opinion yields a negative instance ('-1').

    @param  knaf_obj:           Parsed document; must offer get_opinions().
    @type   output:             file
    @param  output:             The stream handed over to write_to_output().
    @param  valid_opinions:     Polarities to keep; opinions whose expression
                                polarity is not listed are skipped. None
                                keeps every opinion.
    @param  use_dependencies:   Forwarded to extract_feats_exp_tar().
    @param  use_tokens:         Forwarded to extract_feats_exp_tar().
    @param  use_lemmas:         Forwarded to extract_feats_exp_tar().
    """
    def span_ids_of(element):
        # Term ids covered by an opinion element; [] when the element or
        # its span is missing.
        if element is None:
            return []
        span = element.get_span()
        if span is None:
            return []
        return span.get_span_ids()

    # Obtain the (expression ids, target ids) pair of every usable opinion.
    pairs = []  # [(Exp,Tar), (E,T), (E,T)....]
    for opinion in knaf_obj.get_opinions():
        opi_exp = opinion.get_expression()
        exp_type = opi_exp.get_polarity() if opi_exp is not None else ''
        exp_ids = span_ids_of(opi_exp)
        tar_ids = span_ids_of(opinion.get_target())

        if valid_opinions is not None and exp_type not in valid_opinions:
            continue    # This opinion will not be used

        if len(tar_ids) != 0 and len(exp_ids) != 0:
            pairs.append((exp_ids, tar_ids))

    for idx1, (exp1, tar1) in enumerate(pairs):
        feats_positive = extract_feats_exp_tar(exp1, tar1, knaf_obj, use_dependencies=use_dependencies, use_tokens=use_tokens, use_lemmas=use_lemmas)
        write_to_output('+1', feats_positive, output)
        # The same expression with the targets of all other opinions.
        for idx2, (_, tar2) in enumerate(pairs):
            if idx1 != idx2:
                feats_negative = extract_feats_exp_tar(exp1, tar2, knaf_obj, use_dependencies=use_dependencies, use_tokens=use_tokens, use_lemmas=use_lemmas)
                write_to_output('-1', feats_negative, output)
153
+
154
+
155
+
156
+
157
+
158
+
159
def extract_feats_exp_hol(exp_ids, hol_ids, knaf_obj, use_lemmas=True, use_tokens=True, use_dependencies=True):
    """
    Extract the features describing the relation between an opinion
    expression and a candidate holder.

    The returned list contains, in this order: the lemma/token features
    of the expression ('lemmaExp', 'tokenExp'), those of the holder
    ('lemmaHol', 'tokenHol'), the dependency path ('deps-exp-hol', labels
    joined with '#') when one is available, 'same_sentence' ('yes'/'no')
    and the bucketed distance 'distExpHol'.

    The distance is the difference between the average token positions of
    both spans. When one of the spans covers no token its position is
    unknown, and the 'distExpHol' feature is left out instead of failing
    with a division by zero.

    @param  exp_ids:            Term identifiers of the expression.
    @param  hol_ids:            Term identifiers of the holder.
    @param  knaf_obj:           Parsed document; must offer get_tokens(),
                                get_terms() and get_dependency_extractor().
    @param  use_lemmas:         Emit the lemma features.
    @param  use_tokens:         Emit the token features.
    @param  use_dependencies:   Emit the dependency path feature.
    @rtype                      list of (str, str)
    @return                     The (name, value) feature pairs.
    """
    # [token_id] -> (word, sentence_id, position in the document)
    data_for_token = {}
    for num_token, token_obj in enumerate(knaf_obj.get_tokens()):
        data_for_token[token_obj.get_id()] = (token_obj.get_text(), token_obj.get_sent(), num_token)

    # [term_id] -> (lemma, span_token_ids)
    data_for_term = {}
    for term in knaf_obj.get_terms():
        span = term.get_span()
        span_token_ids = span.get_span_ids() if span is not None else []
        data_for_term[term.get_id()] = (term.get_lemma(), span_token_ids)

    all_feats = []

    def collect(term_ids, lemma_feat, token_feat):
        # Adds the lemma/token features of one span and returns the
        # sentence of its first token plus its average token position
        # (both None when the span covers no token).
        sentence = None
        position_sum = 0
        n_toks = 0
        for term_id in term_ids:
            lemma, span_tok_ids = data_for_term[term_id]
            if use_lemmas:
                all_feats.append((lemma_feat, lemma))
            for tok_id in span_tok_ids:
                token, sent_id, num_token = data_for_token[tok_id]
                position_sum += num_token
                n_toks += 1
                if use_tokens:
                    all_feats.append((token_feat, token))
                if sentence is None:
                    sentence = sent_id
        if n_toks == 0:
            return sentence, None
        return sentence, position_sum * 1.0 / n_toks

    sentence_for_exp, avg_position_exp = collect(exp_ids, 'lemmaExp', 'tokenExp')
    sentence_for_hol, avg_position_hol = collect(hol_ids, 'lemmaHol', 'tokenHol')

    if use_dependencies:
        dependency_extractor = knaf_obj.get_dependency_extractor()
        if dependency_extractor is not None:
            deps = dependency_extractor.get_shortest_path_spans(exp_ids, hol_ids)
            if deps is not None:
                all_feats.append(('deps-exp-hol', '#'.join(deps)))

    if sentence_for_exp is not None and sentence_for_exp == sentence_for_hol:
        all_feats.append(('same_sentence', 'yes'))
    else:
        all_feats.append(('same_sentence', 'no'))

    # Distance, bucketed into four classes.
    if avg_position_exp is not None and avg_position_hol is not None:
        dist = abs(avg_position_exp - avg_position_hol)
        if dist <= 10:
            my_dist = 'veryclose'
        elif dist <= 20:
            my_dist = 'close'
        elif dist <= 25:
            my_dist = 'far'
        else:
            my_dist = 'veryfar'
        all_feats.append(('distExpHol', my_dist))

    return all_feats
250
+
251
+
252
+
253
def create_rel_exp_hol_training(knaf_obj, output=sys.stdout, valid_opinions=None, use_dependencies=True, use_tokens=True, use_lemmas=True):
    """
    Write the training instances for the expression --> holder relation
    classifier found in one document.

    Every opinion with a non-empty expression and a non-empty holder
    yields one positive instance ('+1'). Its expression combined with the
    holder of every other such opinion yields a negative instance ('-1').

    @param  knaf_obj:           Parsed document; must offer get_opinions().
    @type   output:             file
    @param  output:             The stream handed over to write_to_output().
    @param  valid_opinions:     Polarities to keep; opinions whose expression
                                polarity is not listed are skipped. None
                                keeps every opinion.
    @param  use_dependencies:   Forwarded to extract_feats_exp_hol().
    @param  use_tokens:         Forwarded to extract_feats_exp_hol().
    @param  use_lemmas:         Forwarded to extract_feats_exp_hol().
    """
    # Collect the (expression ids, holder ids) pair of every usable opinion.
    pairs = []
    for opinion in knaf_obj.get_opinions():
        expression = opinion.get_expression()
        polarity, expression_ids = '', []
        if expression is not None:
            polarity = expression.get_polarity()
            expression_span = expression.get_span()
            if expression_span is not None:
                expression_ids = expression_span.get_span_ids()

        holder = opinion.get_holder()
        holder_ids = []
        if holder is not None:
            holder_span = holder.get_span()
            if holder_span is not None:
                holder_ids = holder_span.get_span_ids()

        # Opinions with an unwanted polarity are not used.
        if valid_opinions is not None and exp_type_rejected(polarity, valid_opinions):
            continue
        if len(expression_ids) == 0 or len(holder_ids) == 0:
            continue
        pairs.append((expression_ids, holder_ids))

    options = dict(use_dependencies=use_dependencies, use_tokens=use_tokens, use_lemmas=use_lemmas)
    for positive_idx, (expression_ids, holder_ids) in enumerate(pairs):
        positive = extract_feats_exp_hol(expression_ids, holder_ids, knaf_obj, **options)
        write_to_output('+1', positive, output)

        # The same expression with the holders of all other opinions.
        for negative_idx, (_, other_holder_ids) in enumerate(pairs):
            if negative_idx == positive_idx:
                continue
            negative = extract_feats_exp_hol(expression_ids, other_holder_ids, knaf_obj, **options)
            write_to_output('-1', negative, output)


def exp_type_rejected(polarity, valid_opinions):
    # True when the polarity is not among the accepted ones.
    return polarity not in valid_opinions
294
+
295
+