opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,99 @@
1
+ # included code for NAF/KAF
2
+
3
+ from lxml import etree
4
+
5
+
6
class Cwf:
    """Thin wrapper around a single ``wf`` XML element.

    The ``type`` given at construction ('NAF' or 'KAF') selects which XML
    attribute carries the identifier: ``id`` for NAF, ``wid`` for KAF.
    """

    def __init__(self, node=None, type='NAF'):
        """Wrap ``node``; when no node is given a fresh empty ``wf`` element is created."""
        self.type = type
        self.node = etree.Element('wf') if node is None else node

    def get_node(self):
        """Return the wrapped XML element."""
        return self.node

    def set_id(self, this_id):
        """Store ``this_id`` in the identifier attribute of the current type.

        Nothing is written when the type is neither 'NAF' nor 'KAF'.
        """
        if self.type == 'NAF':
            return self.node.set('id', this_id)
        if self.type == 'KAF':
            return self.node.set('wid', this_id)

    def get_id(self):
        """Return the identifier for the current type (None for an unknown type)."""
        if self.type == 'NAF':
            return self.node.get('id')
        if self.type == 'KAF':
            return self.node.get('wid')

    def set_text(self, this_text):
        """Set the text content of the element."""
        self.node.text = this_text

    def get_text(self):
        """Return the text content of the element."""
        return self.node.text

    def set_sent(self, this_sent):
        """Set the ``sent`` attribute."""
        self.node.set('sent', this_sent)

    def get_sent(self):
        """Return the ``sent`` attribute (None when absent)."""
        return self.node.get('sent')
41
+
42
+
43
class Ctext:
    """Wrapper around a ``text`` XML element and its ``wf`` children.

    ``self.idx`` maps each token identifier to its ``wf`` element so that
    ``get_wf`` is a dictionary lookup.  The index is kept in sync by
    ``add_wf`` and ``remove_tokens_of_sentence``.
    """

    def __init__(self, node=None, type='NAF'):
        """Wrap ``node`` (or create an empty ``text`` element) and index its tokens.

        Raises:
            ValueError: if ``node`` contains ``wf`` children and ``type`` is
                neither 'NAF' nor 'KAF'.
        """
        self.idx = {}
        self.type = type
        if node is None:
            self.node = etree.Element('text')
        else:
            self.node = node
            for wf_node in self.__get_wf_nodes():
                self.idx[wf_node.get(self.__id_attribute())] = wf_node

    def __id_attribute(self):
        """Return the attribute name that carries the token id for ``self.type``."""
        if self.type == 'NAF':
            return 'id'
        if self.type == 'KAF':
            return 'wid'
        raise ValueError('Unsupported text layer type: %r' % (self.type,))

    def get_node(self):
        """Return the wrapped ``text`` element."""
        return self.node

    def to_kaf(self):
        """Convert a NAF layer to KAF by renaming every ``id`` attribute to ``wid``."""
        if self.type == 'NAF':
            self.type = 'KAF'
            for node in self.__get_wf_nodes():
                node.set('wid', node.get('id'))
                del node.attrib['id']

    def to_naf(self):
        """Convert a KAF layer to NAF by renaming every ``wid`` attribute to ``id``."""
        if self.type == 'KAF':
            self.type = 'NAF'
            for node in self.__get_wf_nodes():
                node.set('id', node.get('wid'))
                del node.attrib['wid']

    def __get_wf_nodes(self):
        """Yield the direct ``wf`` children of the wrapped element."""
        for wf_node in self.node.findall('wf'):
            yield wf_node

    def __iter__(self):
        """Yield a ``Cwf`` wrapper for every ``wf`` child."""
        for wf_node in self.__get_wf_nodes():
            yield Cwf(node=wf_node, type=self.type)

    def get_wf(self, token_id):
        """Return the ``Cwf`` for ``token_id``, or None when the id is not indexed."""
        wf_node = self.idx.get(token_id)
        if wf_node is not None:
            return Cwf(node=wf_node, type=self.type)
        else:
            return None

    def add_wf(self, wf_obj):
        """Append the token's element to the layer and register it in the index.

        The id is taken from ``wf_obj.get_id()``; a token without an id is
        appended but not indexed.
        """
        wf_node = wf_obj.get_node()
        self.node.append(wf_node)
        token_id = wf_obj.get_id()
        if token_id is not None:
            self.idx[token_id] = wf_node

    def remove_tokens_of_sentence(self, sentence_id):
        """Remove every ``wf`` child whose ``sent`` attribute equals ``sentence_id``.

        The removed elements are also dropped from the index so that
        ``get_wf`` no longer returns them.
        """
        nodes_to_remove = [wf_node for wf_node in self.__get_wf_nodes()
                           if wf_node.get('sent') == sentence_id]
        if not nodes_to_remove:
            return

        for node in nodes_to_remove:
            self.node.remove(node)

        # Drop index entries by element identity; this works regardless of
        # which attribute currently holds the id.
        removed = set(nodes_to_remove)
        stale_ids = [token_id for token_id, node in self.idx.items() if node in removed]
        for token_id in stale_ids:
            del self.idx[token_id]
99
+
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 1.0
2
+ Name: VUA-pylib
3
+ Version: 1.5
4
+ Summary: Various KAF / NAF python helpers
5
+ Home-page: https://github.com/cltl/VUA_pylib
6
+ Author: Ruben Izquierdo
7
+ Author-email: r.izquierdobevia@vu.nl
8
+ License: UNKNOWN
9
+ Description: UNKNOWN
10
+ Platform: UNKNOWN
@@ -0,0 +1,14 @@
1
+ README
2
+ VUA_pylib/__init__.py
3
+ VUA_pylib.egg-info/PKG-INFO
4
+ VUA_pylib.egg-info/SOURCES.txt
5
+ VUA_pylib.egg-info/dependency_links.txt
6
+ VUA_pylib.egg-info/top_level.txt
7
+ VUA_pylib/common/__init__.py
8
+ VUA_pylib/common/common.py
9
+ VUA_pylib/corpus_reader/__init__.py
10
+ VUA_pylib/corpus_reader/google_web_nl.py
11
+ VUA_pylib/io_utils/__init__.py
12
+ VUA_pylib/io_utils/feature_file.py
13
+ VUA_pylib/lexicon/__init__.py
14
+ VUA_pylib/lexicon/lexicon.py
@@ -0,0 +1,23 @@
1
+ ../VUA_pylib/__init__.py
2
+ ../VUA_pylib/lexicon/lexicon.py
3
+ ../VUA_pylib/lexicon/__init__.py
4
+ ../VUA_pylib/common/common.py
5
+ ../VUA_pylib/common/__init__.py
6
+ ../VUA_pylib/io_utils/feature_file.py
7
+ ../VUA_pylib/io_utils/__init__.py
8
+ ../VUA_pylib/corpus_reader/google_web_nl.py
9
+ ../VUA_pylib/corpus_reader/__init__.py
10
+ ../VUA_pylib/__init__.pyc
11
+ ../VUA_pylib/lexicon/lexicon.pyc
12
+ ../VUA_pylib/lexicon/__init__.pyc
13
+ ../VUA_pylib/common/common.pyc
14
+ ../VUA_pylib/common/__init__.pyc
15
+ ../VUA_pylib/io_utils/feature_file.pyc
16
+ ../VUA_pylib/io_utils/__init__.pyc
17
+ ../VUA_pylib/corpus_reader/google_web_nl.pyc
18
+ ../VUA_pylib/corpus_reader/__init__.pyc
19
+ ./
20
+ SOURCES.txt
21
+ dependency_links.txt
22
+ top_level.txt
23
+ PKG-INFO
@@ -0,0 +1 @@
1
+ from common import *
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env python
2
+
3
+ from operator import itemgetter
4
+
5
+ # Get the max (key,count) from a dict like my_dict = {'a':20,'b':1,'c':50}
6
+ # It will return --> (c,50)
7
def get_max_distr_dict(my_dict):
    """Return the ``(key, count)`` pair with the highest count in ``my_dict``.

    Example: ``{'a': 20, 'b': 1, 'c': 50}`` gives ``('c', 50)``.
    When several keys share the highest count, the first one in the
    dictionary's iteration order is returned.  An empty dictionary gives None.
    """
    if len(my_dict) == 0:
        return None
    # max() is a single pass and works on both list and view results of items().
    return max(my_dict.items(), key=itemgetter(1))
13
+
14
def normalize_pos(pos):
    """Map a part-of-speech label to a one-character code.

    The label is lower-cased first.  Mapping:
      * 'adj', 'a' or anything starting with 'jj'           -> 'a'
      * 'adverb', 'r' or anything starting with 'rb'        -> 'r'
      * 'anypos'                                            -> '*'
      * 'noun', 'n' or anything starting with 'nn' or 'np'  -> 'n'
      * anything starting with 'v' (includes 'verb')        -> 'v'
    Any other label, including the empty string, is returned lower-cased
    and otherwise unchanged.
    """
    pos = pos.lower()
    if pos in ('adj', 'a') or pos.startswith('jj'):
        return 'a'
    if pos in ('adverb', 'r') or pos.startswith('rb'):
        return 'r'
    if pos == 'anypos':
        return '*'
    if pos in ('noun', 'n') or pos.startswith(('nn', 'np')):
        return 'n'
    # startswith() is safe on an empty label, unlike indexing pos[0].
    if pos.startswith('v'):
        return 'v'
    return pos
28
+
@@ -0,0 +1 @@
1
+ from google_web_nl import *
@@ -0,0 +1,156 @@
1
+ import urllib2
2
+ import urllib
3
+ import sys
4
+ import time
5
+
6
+ try:
7
+ from lxml import etree
8
+ except:
9
+ import xml.etree.cElementTree as etree
10
+
11
class Citem:
    """One n-gram result: its hit count, the n-gram string and its tokens."""

    def __init__(self, item=None):
        """Build an item from a CSV line (``str``) or from an XML ``item`` node.

        With ``item=None`` all fields stay None.
        """
        self.hits = None
        self.word = None
        self.tokens = None
        if item is not None:
            if isinstance(item, str):
                self.load_from_string(item)
            else:
                self.load_from_item_node(item)

    def load_from_string(self, line):
        """Parse a CSV line of the form ``22865,"de server van"``.

        Everything before the first comma is the hit count; the remainder is
        expected to be wrapped in one quote character on each side, which is
        sliced off.
        """
        line = line.strip()
        pos = line.find(',')
        self.hits = int(line[:pos])
        self.word = line[pos+2:-1]
        self.tokens = self.word.split(' ')

    def load_from_item_node(self, item_node):
        """Read the ``hits`` and ``word`` children of an XML item node.

        A missing or empty child leaves the corresponding field as None.
        """
        hits_node = item_node.find('hits')
        if hits_node is not None and hits_node.text is not None:
            self.hits = int(hits_node.text)

        word_node = item_node.find('word')
        if word_node is not None and word_node.text is not None:
            # Keep the parser's text object as is: wrapping it in str() turned
            # an empty element into the literal 'None' and fails on non-ASCII
            # unicode text under Python 2.
            self.word = word_node.text
            self.tokens = self.word.split(' ')

    def __str__(self):
        if self.word is not None and self.hits is not None:
            s = str(self.tokens)+' ->'+str(self.hits)+' hits'
        else:
            s = 'None'
        return s

    def __repr__(self):
        return self.__str__()

    def get_hits(self):
        """Return the hit count (int) or None."""
        return self.hits

    def get_word(self):
        """Return the n-gram string or None."""
        return self.word

    def get_tokens(self):
        """Return the n-gram split on single spaces, or None."""
        return self.tokens
58
+
59
+
60
class Cgoogle_web_nl:
    """Client for the n-gram frequency CGI script configured in ``self.url``.

    Results of every ``query`` call are appended to ``self.items`` as
    ``Citem`` objects; the list is never cleared by this class.
    """

    def __init__(self):
        self.url='http://www.let.rug.nl/gosse/bin/Web1T5_freq.perl'
        self.sleep_this_time = 5 #First time to sleep in case of error
        self.max_trials = 20
        self.limit = 1000
        self.min_freq = 100
        self.items = []

    def __require_int(self, value, setter_name):
        """Exit the process with status -1 unless ``value`` is an int."""
        if not isinstance(value, int):
            sys.stderr.write('Parameter for %s must be an integer and not %s\n'
                             % (setter_name, type(value)))
            sys.exit(-1)

    def set_limit(self, l):
        """Set the ``limit`` request parameter; exits with -1 on a non-int."""
        self.__require_int(l, 'set_limit')
        self.limit = l

    def set_min_freq(self, m):
        """Set the ``threshold`` request parameter; exits with -1 on a non-int."""
        self.__require_int(m, 'set_min_freq')
        self.min_freq = m

    def __open_with_retries(self, full_url, this_query):
        """Open ``full_url``, retrying up to ``self.max_trials`` times.

        The wait starts at ``self.sleep_this_time`` seconds and grows by one
        second per failed attempt, restarting from the configured value on
        every call.  Returns the open response on HTTP 200, otherwise None.
        """
        delay = self.sleep_this_time
        for trial in range(self.max_trials):
            response = None
            code = -1
            try:
                response = urllib2.urlopen(full_url)
                code = response.getcode()
            except Exception as e:
                sys.stderr.write(str(e)+'\n')

            if code == 200:
                return response

            # Do not leak a connection that answered with an unexpected status.
            if response is not None:
                response.close()
            sys.stderr.write('Got an error (code '+str(code)+') querying google web nl, with "'+this_query+'", retrying...\n')
            sys.stderr.write('Trial %s waiting %s seconds\n' % (trial, delay))
            time.sleep(delay)
            delay += 1

        sys.stderr.write('Maximum number of trials reached. Giving up...\n')
        return None

    def query(self, this_query, fixed='shown'):
        """Send ``this_query`` to the service and append the results to ``self.items``.

        Example request:
          http://www.let.rug.nl/gosse/bin/Web1T5_freq.perl?
            query=interessante%20*&mode=XML&limit=10000&threshold=40
            &optimize=on&wildcards=listed+normally&fixed=shown
            &.cgifields=debug&.cgifields=optimize

        Nothing is appended when the service cannot be reached.
        """
        mode = 'XML'   # the parsing below also understands 'csv'
        # A list of pairs (not a dict) so that '.cgifields' can be sent twice,
        # as in the example request above.
        params = urllib.urlencode([
            ('query', this_query),
            ('mode', mode),
            ('limit', self.limit),
            ('threshold', self.min_freq),
            ('optimize', 'on'),
            ('wildcards', 'listed normally'),
            ('fixed', fixed),
            ('.cgifields', 'debug'),
            ('.cgifields', 'optimize'),
        ])

        this_url = self.__open_with_retries(self.url+'?%s' % params, this_query)
        if this_url is None:
            return

        try:
            if mode == 'XML':
                xml_obj = etree.parse(this_url)
                for item_node in xml_obj.findall('item'):
                    self.items.append(Citem(item_node))
            else: #CSV
                ## The first line is the header: frequency,"N-gram"
                first_line = True
                for line in this_url:
                    if not first_line:
                        self.items.append(Citem(line))
                    first_line = False
        finally:
            this_url.close()

    def get_items(self):
        """Yield the collected items one by one."""
        for item in self.items:
            yield item

    def get_all_items(self):
        """Return the list of collected items itself (not a copy)."""
        return self.items

    def len(self):
        """Return the number of collected items."""
        return len(self.items)

    def __iter__(self):
        for item in self.items:
            yield item
@@ -0,0 +1 @@
1
+ from feature_file import *
@@ -0,0 +1,121 @@
1
+ from operator import itemgetter
2
+ import sys
3
+ import cPickle
4
+
5
+
6
+
7
class Cexample:
    """One labelled example: a class label plus a list of (name, value) features."""

    def __init__(self, str_line=None):
        """Start empty and, if ``str_line`` is given, parse it immediately."""
        self.label = ''
        self.features = []
        if str_line is not None:
            self.load_from_line(str_line)

    def load_from_line(self, str_line):
        """Parse a tab-separated line ``label<TAB>name=value<TAB>...``.

        The first column becomes the label.  Every later column is split at
        its first '='; columns without '=' are ignored.  Parsed features are
        appended to the ones already stored.
        """
        columns = str_line.strip().split('\t')
        self.label = columns[0]
        for column in columns[1:]:
            name, separator, value = column.partition('=')
            if separator:
                self.features.append((name, value))

    def __str__(self):
        pieces = ['Label: ', self.label, '\n', 'Feats: ', str(self.features)]
        return ''.join(pieces)

    def get_label(self):
        """Return the class label."""
        return self.label

    def get_features(self):
        """Yield each stored feature as a (name, value) pair."""
        for feat_name, feat_value in self.features:
            yield feat_name, feat_value

    def get_all_features(self):
        """Return the internal feature list itself (not a copy)."""
        return self.features
38
+
39
+
40
class Cfeature_index:
    """Maps composed feature strings to 1-based integer ids and writes
    examples as ``label id:count ... #features`` lines."""

    def __init__(self):
        self.idx = {}

    def get_number_feat(self, feat):
        """Return the id of ``feat`` or None when it is not in the index."""
        return self.idx.get(feat, None)

    def add_feat(self, feat):
        """Return the id of ``feat``, registering it first if it is new.

        New ids are ``len(index) + 1``.  An already known feature keeps its
        id; re-numbering it would make the next new feature receive the same
        number.
        """
        num_feat = self.idx.get(feat)
        if num_feat is None:
            num_feat = len(self.idx)+1
            self.idx[feat] = num_feat
        return num_feat

    def compose_feat(self, name, value):
        """Join a feature name and value into the string used as index key."""
        return name+'###'+value

    def __encode_features(self, feats, modify_index=True):
        """Translate (name, value) pairs into sorted (id, count) pairs.

        Unknown features are added to the index when ``modify_index`` is true
        and skipped otherwise.  Also returns all composed feature strings,
        each followed by one space, for use as a trailing comment.
        """
        feats_for_example = {}
        composed = []
        for name, value in feats:
            my_feat = self.compose_feat(name, value)
            composed.append(my_feat)
            num_feat = self.get_number_feat(my_feat)
            if num_feat is None and modify_index:
                num_feat = self.add_feat(my_feat)

            if num_feat is not None:
                feats_for_example[num_feat] = feats_for_example.get(num_feat, 0) + 1
        clean_feats = ''.join(my_feat+' ' for my_feat in composed)
        return sorted(feats_for_example.items(), key=itemgetter(0)), clean_feats

    def __write_example(self, out_fic, label, feats_for_example, clean_feats):
        """Write one ``label id:count ... #features`` line to ``out_fic``."""
        out_fic.write(label)
        for feat, freq_feat in feats_for_example:
            out_fic.write(' %d:%d' % (feat, freq_feat))
        comment = ' #'+clean_feats+'\n'
        # Python 2: a unicode comment is written as UTF-8 bytes; a byte string
        # (and any Python 3 str) is written unchanged.
        if not isinstance(comment, str):
            comment = comment.encode('utf-8')
        out_fic.write(comment)

    def encode_feature_file_to_svm(self, feat_file_obj, out_fic=sys.stdout):
        """Write every example of ``feat_file_obj`` to ``out_fic``, adding
        unseen features to the index."""
        for example in feat_file_obj:
            feats_for_example, clean_feats = self.__encode_features(example.get_all_features())
            self.__write_example(out_fic, example.get_label(), feats_for_example, clean_feats)

    def encode_example_for_classification(self, feats, out_fic, my_class='0'):
        """Write one example with label ``my_class``; features missing from
        the index are left out and the index is not modified."""
        feats_for_example, clean_feats = self.__encode_features(feats, modify_index=False)
        self.__write_example(out_fic, my_class, feats_for_example, clean_feats)

    def save_to_file(self, filename):
        """Pickle the index dictionary to ``filename`` (protocol 0)."""
        with open(filename, 'wb') as fic:
            cPickle.dump(self.idx, fic, protocol=0)

    def load_from_file(self, filename):
        """Replace the index with the dictionary pickled in ``filename``."""
        # NOTE(review): unpickling can execute arbitrary code; only load
        # index files from a trusted source.
        with open(filename, 'rb') as fic:
            self.idx = cPickle.load(fic)
104
+
105
+
106
+
107
class Cfeature_file:
    """Iterable over the examples stored in a UTF-8 feature file."""

    def __init__(self, filename=None):
        self.filename = filename

    def __iter__(self):
        """Yield one ``Cexample`` per line that does not start with '#'.

        Yields nothing when no filename was given.  The file is read as bytes
        and each kept line is decoded as UTF-8.  The ``with`` block closes the
        file even when iteration stops early or a line fails to parse.
        """
        if self.filename is not None:
            with open(self.filename, 'rb') as fic:
                for line in fic:
                    if not line.startswith(b'#'):
                        yield Cexample(line.decode('utf-8'))
118
+
119
+
120
+
121
+