opener-opinion-detector-base 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +101 -0
  3. data/bin/opinion-detector-base +19 -0
  4. data/core/annotation.cfg.erb +9 -0
  5. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  6. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  7. data/core/python-scripts/LICENSE +339 -0
  8. data/core/python-scripts/README.md +226 -0
  9. data/core/python-scripts/classify_kaf_naf_file.py +499 -0
  10. data/core/python-scripts/cross_validation.py +634 -0
  11. data/core/python-scripts/generate_folds.py +134 -0
  12. data/core/python-scripts/models.cfg +10 -0
  13. data/core/python-scripts/my_templates/README +33 -0
  14. data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
  15. data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
  16. data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
  17. data/core/python-scripts/my_templates/templates_exp.txt +10 -0
  18. data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
  19. data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
  20. data/core/python-scripts/my_templates/templates_holder.txt +10 -0
  21. data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
  22. data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
  23. data/core/python-scripts/my_templates/templates_target.txt +10 -0
  24. data/core/python-scripts/run_all_experiments.sh +49 -0
  25. data/core/python-scripts/run_basic.py +20 -0
  26. data/core/python-scripts/run_experiment.sh +42 -0
  27. data/core/python-scripts/scripts/__init__.py +1 -0
  28. data/core/python-scripts/scripts/config_manager.py +314 -0
  29. data/core/python-scripts/scripts/crfutils.py +215 -0
  30. data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
  31. data/core/python-scripts/scripts/extract_features.py +376 -0
  32. data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
  33. data/core/python-scripts/scripts/lexicons.py +44 -0
  34. data/core/python-scripts/scripts/link_entities_distance.py +77 -0
  35. data/core/python-scripts/scripts/relation_classifier.py +250 -0
  36. data/core/python-scripts/train.py +566 -0
  37. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
  38. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
  39. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
  40. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
  41. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
  42. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
  43. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
  44. data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
  45. data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
  46. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
  47. data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
  48. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
  49. data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
  50. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
  51. data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
  52. data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
  53. data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
  54. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
  55. data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
  56. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
  57. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
  58. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
  59. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
  60. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
  61. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
  62. data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
  63. data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
  64. data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
  65. data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
  66. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
  67. data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
  68. data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
  69. data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
  70. data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
  71. data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
  72. data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
  73. data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
  74. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
  75. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
  76. data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
  77. data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
  78. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
  79. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
  80. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
  81. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
  82. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
  83. data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
  84. data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
  85. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
  86. data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
  87. data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
  88. data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
  89. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
  90. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
  91. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
  92. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
  93. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
  94. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
  95. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
  96. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
  97. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
  98. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
  99. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
  100. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
  101. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  102. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  103. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  104. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  105. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  106. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  107. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  108. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  109. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  110. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  111. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  112. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  113. data/core/vendor/src/crfsuite/COPYING +27 -0
  114. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  115. data/core/vendor/src/crfsuite/INSTALL +236 -0
  116. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  117. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  118. data/core/vendor/src/crfsuite/README +183 -0
  119. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  120. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  121. data/core/vendor/src/crfsuite/compile +143 -0
  122. data/core/vendor/src/crfsuite/config.guess +1502 -0
  123. data/core/vendor/src/crfsuite/config.h.in +198 -0
  124. data/core/vendor/src/crfsuite/config.sub +1714 -0
  125. data/core/vendor/src/crfsuite/configure +14273 -0
  126. data/core/vendor/src/crfsuite/configure.in +149 -0
  127. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  128. data/core/vendor/src/crfsuite/depcomp +630 -0
  129. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  130. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  131. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  132. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  133. data/core/vendor/src/crfsuite/example/template.py +88 -0
  134. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  135. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  136. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  137. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  138. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  139. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  140. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  141. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  142. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  143. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  144. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  145. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  146. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  147. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  148. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  149. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  150. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  151. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  152. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  153. data/core/vendor/src/crfsuite/include/os.h +61 -0
  154. data/core/vendor/src/crfsuite/install-sh +520 -0
  155. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  156. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  157. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  158. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  159. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  160. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  161. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  162. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  163. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  164. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  165. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  166. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  167. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  168. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  169. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  170. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  171. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  172. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  173. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  174. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  175. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  176. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  177. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  178. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  179. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  180. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  181. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  182. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  183. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  184. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  185. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  186. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  187. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  188. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  189. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  190. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  191. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  192. data/core/vendor/src/crfsuite/missing +376 -0
  193. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  194. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  195. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  196. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  197. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  198. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  199. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  200. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  201. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  202. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  203. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  204. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  205. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  206. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  207. data/core/vendor/src/liblbfgs/COPYING +22 -0
  208. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  209. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  210. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  211. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  212. data/core/vendor/src/liblbfgs/NEWS +0 -0
  213. data/core/vendor/src/liblbfgs/README +71 -0
  214. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  215. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  216. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  217. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  218. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  219. data/core/vendor/src/liblbfgs/configure +21146 -0
  220. data/core/vendor/src/liblbfgs/configure.in +107 -0
  221. data/core/vendor/src/liblbfgs/depcomp +522 -0
  222. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  223. data/core/vendor/src/liblbfgs/install-sh +322 -0
  224. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  225. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  226. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  227. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  228. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  229. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  230. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  231. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  232. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  233. data/core/vendor/src/liblbfgs/missing +353 -0
  234. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  235. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  236. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  237. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  238. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  239. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  240. data/core/vendor/src/svm_light/Makefile +105 -0
  241. data/core/vendor/src/svm_light/kernel.h +40 -0
  242. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  243. data/core/vendor/src/svm_light/svm_common.c +985 -0
  244. data/core/vendor/src/svm_light/svm_common.h +301 -0
  245. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  246. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  247. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  248. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  249. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  250. data/ext/hack/Rakefile +17 -0
  251. data/ext/hack/support.rb +88 -0
  252. data/lib/opener/opinion_detectors/base.rb +112 -0
  253. data/lib/opener/opinion_detectors/base/version.rb +7 -0
  254. data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
  255. data/lib/opener/opinion_detectors/de.rb +7 -0
  256. data/lib/opener/opinion_detectors/en.rb +7 -0
  257. data/lib/opener/opinion_detectors/it.rb +7 -0
  258. data/lib/opener/opinion_detectors/nl.rb +6 -0
  259. data/opener-opinion-detector-base.gemspec +35 -0
  260. data/pre_build_requirements.txt +3 -0
  261. metadata +374 -0
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ A feature extractor for chunking.
5
+ Copyright 2010,2011 Naoaki Okazaki.
6
+ """
7
+
8
# Character that separates the field values on each input line.
separator = ' '

# Names of the fields on each input line, in order.
fields = 'w pos y'


def _ngrams(name, first, last, n):
    """Return every n-gram template over offsets first..last of field `name`.

    Each template is a tuple of (name, offset) pairs with n consecutive
    offsets; templates are ordered by their starting offset.
    """
    return tuple(
        tuple((name, start + k) for k in range(n))
        for start in range(first, last - n + 2)
    )


# Attribute templates: word uni-/bigrams and part-of-speech uni-/bi-/trigrams
# inside a window of two tokens on either side of the current position.
templates = (
    _ngrams('w', -2, 2, 1)
    + _ngrams('w', -1, 1, 2)
    + _ngrams('pos', -2, 2, 1)
    + _ngrams('pos', -2, 2, 2)
    + _ngrams('pos', -2, 2, 3)
)
36
+
37
+
38
+ import crfutils
39
+
40
def feature_extractor(X):
    """Generate chunking attributes for the item sequence X in place.

    Applies the module-level attribute templates, then marks the first
    and last items of a non-empty sequence with sentence-boundary
    attributes.
    """
    # Template-based features (strictly speaking, attributes).
    crfutils.apply_templates(X, templates)
    if not X:
        return
    # Sentence boundary markers are not expressible as templates.
    X[0]['F'].append('__BOS__')     # beginning of sequence
    X[-1]['F'].append('__EOS__')    # end of sequence
47
+
48
+ if __name__ == '__main__':
49
+ crfutils.main(feature_extractor, fields=fields, sep=separator)
@@ -0,0 +1,179 @@
1
+ """
2
+ A miscellaneous utility for sequential labeling.
3
+ Copyright 2010,2011 Naoaki Okazaki.
4
+ """
5
+
6
+ import optparse
7
+ import sys
8
+
9
def apply_templates(X, templates):
    """
    Generate features for an item sequence by applying feature templates.
    A feature template consists of a tuple of (name, offset) pairs,
    where name and offset specify a field name and offset from which
    the template extracts a feature value. Generated features are stored
    in the 'F' field of each item in the sequence. A template is skipped
    at positions where any of its offsets falls outside the sequence.

    @type   X:          list of mapping objects
    @param  X:          The item sequence.
    @type   templates:  iterable of tuples of (str, int)
    @param  templates:  The feature templates.
    """
    n = len(X)
    for template in templates:
        name = '|'.join(['%s[%d]' % (f, o) for f, o in template])
        for t in range(n):
            values = []
            for field, offset in template:
                p = t + offset
                # Plain integer comparison; `p in range(n)` builds a whole
                # list on every check under Python 2.
                if p < 0 or p >= n:
                    values = []
                    break
                values.append(X[p][field])
            # An empty list means the template does not apply here.
            if values:
                X[t]['F'].append('%s=%s' % (name, '|'.join(values)))
34
+
35
def readiter(fi, names, sep=' '):
    """
    Return an iterator for item sequences read from a file object.
    This function reads a sequence from a file object L{fi}, and
    yields the sequence as a list of mapping objects. Each line
    (item) from the file object is split by the separator character
    L{sep}. Separated values of the item are named by L{names},
    and stored in a mapping object. Every item has a field 'F' that
    is reserved for storing features.

    A sequence is terminated by an empty line. A final sequence that is
    not followed by an empty line is yielded as well, so the last
    sentence of an input without a trailing blank line is not lost.

    @type   fi:     file
    @param  fi:     The file object (any iterable of lines).
    @type   names:  tuple
    @param  names:  The list of field names.
    @type   sep:    str
    @param  sep:    The separator character.
    @rtype          iterator of lists of mapping objects
    @return         An iterator for sequences.
    @raise ValueError:  If a line has fewer fields than L{names}.
    """
    X = []
    for line in fi:
        line = line.strip('\n')
        if not line:
            # Blank line: the current sequence is complete.
            yield X
            X = []
            continue
        fields = line.split(sep)
        if len(fields) < len(names):
            raise ValueError(
                'Too few fields (%d) for %r\n%s' % (len(fields), names, line))
        item = {'F': []}    # 'F' is reserved for features.
        # Extra trailing fields beyond len(names) are ignored.
        for name, value in zip(names, fields):
            item[name] = value
        X.append(item)
    # Input ended without a terminating blank line: flush what was read.
    if X:
        yield X
69
+
70
def escape(src):
    """
    Replace every colon in a feature name with the token '__COLON__'.

    @type   src:    str
    @param  src:    A feature name
    @rtype          str
    @return         The feature name with colons replaced.
    """
    pieces = src.split(':')
    return '__COLON__'.join(pieces)
80
+
81
def output_features(fo, X, field=''):
    """
    Write a sequence to L{fo}, one line per item followed by an empty
    line. Each line starts with the item's reference label (only when
    L{field} is a non-empty string) and continues with the item's
    features, each preceded by a tab. A string feature is written as its
    escaped name; any other feature is treated as a (name, value) pair
    and written as 'name:value'.

    @type   fo:     file
    @param  fo:     The file object.
    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @type   field:  str
    @param  field:  The field name of reference labels.
    """
    for item in X:
        if field:
            fo.write('%s' % item[field])
        for attr in item['F']:
            if not isinstance(attr, str):
                # Weighted attribute given as a (name, value) pair.
                fo.write('\t%s:%f' % (escape(attr[0]), attr[1]))
            else:
                fo.write('\t%s' % escape(attr))
        fo.write('\n')
    # An empty line terminates the sequence.
    fo.write('\n')
104
+
105
def to_crfsuite(X):
    """
    Build a crfsuite.ItemSequence from an item sequence. Every feature
    in an item's 'F' field becomes a crfsuite.Attribute: a string feature
    supplies only the (escaped) name, any other feature is treated as a
    (name, value) pair.

    @type   X:      list of mapping objects
    @param  X:      The sequence.
    @rtype          crfsuite.ItemSequence
    @return         The same sequence in crfsuite.ItemSequence type.
    """
    # Imported lazily so the module works without crfsuite installed.
    import crfsuite

    def as_attribute(feature):
        if isinstance(feature, str):
            return crfsuite.Attribute(escape(feature))
        return crfsuite.Attribute(escape(feature[0]), feature[1])

    sequence = crfsuite.ItemSequence()
    for entry in X:
        item = crfsuite.Item()
        for feature in entry['F']:
            item.append(as_attribute(feature))
        sequence.append(item)
    return sequence
126
+
127
def main(feature_extractor, fields='w pos y', sep=' '):
    """
    Command-line driver: read item sequences from STDIN and write to
    STDOUT either their attributes (default) or, when a model is given
    with -t, the input fields followed by the predicted label.

    @type   feature_extractor:  callable
    @param  feature_extractor:  Called with each sequence; expected to
                                fill the 'F' field of its items in place.
    @type   fields:             str
    @param  fields:             Default space-separated field names (-f).
    @type   sep:                str
    @param  sep:                Default column separator (-s).
    """
    source = sys.stdin
    sink = sys.stdout

    # Parse the command-line arguments.
    parser = optparse.OptionParser(usage="""usage: %prog [options]
This utility reads a data set from STDIN, and outputs attributes to STDOUT.
Each line of a data set must consist of field values separated by SEPARATOR
characters. The names and order of field values can be specified by -f option.
The separator character can be specified with -s option. Instead of outputting
attributes, this utility tags the input data when a model file is specified by
-t option (CRFsuite Python module must be installed)."""
        )
    parser.add_option(
        '-t', dest='model',
        help='tag the input using the model (requires "crfsuite" module)')
    parser.add_option(
        '-f', dest='fields', default=fields,
        help='specify field names of input data [default: "%default"]')
    parser.add_option(
        '-s', dest='separator', default=sep,
        help='specify the separator of columns of input data [default: "%default"]')
    options, _ = parser.parse_args()

    # The fields of input: ('w', 'pos', 'y') by default.
    F = options.fields.split(' ')

    if options.model:
        # Tagging mode: create a tagger with an existing model.
        import crfsuite
        tagger = crfsuite.Tagger()
        tagger.open(options.model)

        # For each sequence from STDIN.
        for X in readiter(source, F, options.separator):
            # Obtain features and label the sequence.
            feature_extractor(X)
            labels = tagger.tag(to_crfsuite(X))
            for t, item in enumerate(X):
                sink.write('\t'.join([item[f] for f in F]))
                sink.write('\t%s\n' % labels[t])
            sink.write('\n')
    else:
        # Attribute mode: the generator function readiter() reads one
        # sequence at a time from STDIN; the reference label is taken
        # from the 'y' field.
        for X in readiter(source, F, options.separator):
            feature_extractor(X)
            output_features(sink, X, 'y')
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ A feature extractor for named entity recognition (NER).
5
+ Copyright 2010,2011 Naoaki Okazaki.
6
+ """
7
+
8
+ # Separator of field values.
9
+ separator = ' '
10
+
11
+ # Field names of the input data.
12
+ fields = 'y w pos chk'
13
+
14
+
15
+ import crfutils
16
+
17
def get_shape(token):
    """Return the shape string of a token.

    Each character maps to one symbol: 'U' for uppercase, 'L' for
    lowercase, 'D' for a digit, and a representative character for each
    punctuation class ('.', ';', '-', '(', ')'). Any other character is
    kept unchanged.
    """
    punctuation_class = {}
    for symbol, members in (
            ('.', '.,'),
            (';', ';:?!'),
            ('-', '+-*/=|_'),
            ('(', '({[<'),
            (')', ')}]>')):
        for member in members:
            punctuation_class[member] = symbol

    shape = []
    for ch in token:
        if ch.isupper():
            shape.append('U')
        elif ch.islower():
            shape.append('L')
        elif ch.isdigit():
            shape.append('D')
        else:
            # Unknown characters stand for themselves.
            shape.append(punctuation_class.get(ch, ch))
    return ''.join(shape)
39
+
40
def degenerate(src):
    """Collapse each run of identical consecutive characters to one."""
    kept = []
    previous = None
    for ch in src:
        if ch != previous:
            kept.append(ch)
            previous = ch
    return ''.join(kept)
46
+
47
def get_type(token):
    """Classify a token by the kinds of characters it contains.

    Starts from the full list of candidate types and rules candidates
    out character by character; the first surviving candidate, in the
    priority order below, is returned. Returns 'EMPTY' for an empty token
    and 'NO' when no candidate survives.
    """
    # Candidate types in priority order.
    priority = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
        'InitUpper',
        'AllLetter',
        'AllAlnum',
    )
    # Candidates ruled out by seeing one character of each class.
    ruled_out_by_upper = ('AllDigit', 'AllSymbol', 'AllDigitSymbol')
    ruled_out_by_digit = (
        'AllUpper', 'AllSymbol', 'AllUpperSymbol', 'AllLetter')
    ruled_out_by_lower = (
        'AllUpper', 'AllDigit', 'AllSymbol', 'AllUpperDigit',
        'AllUpperSymbol', 'AllDigitSymbol', 'AllUpperDigitSymbol')
    ruled_out_by_other = (
        'AllUpper', 'AllDigit', 'AllUpperDigit', 'AllLetter', 'AllAlnum')

    if not token:
        return 'EMPTY'

    alive = set(priority)
    for position, ch in enumerate(token):
        if ch.isupper():
            alive.difference_update(ruled_out_by_upper)
        elif ch.isdigit() or ch in (',', '.'):
            # ',' and '.' count as part of a number.
            alive.difference_update(ruled_out_by_digit)
        elif ch.islower():
            alive.difference_update(ruled_out_by_lower)
        else:
            alive.difference_update(ruled_out_by_other)

        if position == 0 and not ch.isupper():
            alive.discard('InitUpper')

    for candidate in priority:
        if candidate in alive:
            return candidate
    return 'NO'
93
+
94
def get_2d(token):
    """Return True if the token consists of exactly two digits."""
    if len(token) != 2:
        return False
    return token.isdigit()
96
+
97
def get_4d(token):
    """Return True if the token consists of exactly four digits."""
    if len(token) != 4:
        return False
    return token.isdigit()
99
+
100
def get_da(token):
    """Return True if the token mixes digits and letters and nothing else.

    Any character that is neither a digit nor a letter makes the result
    False; so does a token lacking either a digit or a letter.
    """
    has_digit = False
    has_alpha = False
    for ch in token:
        if ch.isdigit():
            has_digit = True
            continue
        if not ch.isalpha():
            return False
        has_alpha = True
    return has_digit and has_alpha
111
+
112
def get_dand(token, p):
    """Return True if the token is made only of digits and the character p,
    and contains at least one of each."""
    has_digit = False
    has_mark = False
    for ch in token:
        if ch.isdigit():
            has_digit = True
            continue
        if ch != p:
            return False
        has_mark = True
    return has_digit and has_mark
123
+
124
def get_all_other(token):
    """Return True if no character of the token is alphanumeric
    (vacuously True for an empty token)."""
    return not any(ch.isalnum() for ch in token)
129
+
130
def get_capperiod(token):
    """Return True if the token is one uppercase letter followed by '.'."""
    if len(token) != 2:
        return False
    first, second = token[0], token[1]
    return first.isupper() and second == '.'
132
+
133
def contains_upper(token):
    """Return True if at least one character of the token is uppercase."""
    return any(ch.isupper() for ch in token)
138
+
139
def contains_lower(token):
    """Return True if at least one character of the token is lowercase."""
    return any(ch.islower() for ch in token)
144
+
145
def contains_alpha(token):
    """Return True if at least one character of the token is a letter."""
    return any(ch.isalpha() for ch in token)
150
+
151
def contains_digit(token):
    """Return True if at least one character of the token is a digit."""
    return any(ch.isdigit() for ch in token)
156
+
157
def contains_symbol(token):
    """Return True if at least one character of the token is a symbol,
    i.e. not alphanumeric; False for an empty token.

    The previous implementation used the bitwise operator `~` on the
    result of isalnum(). `~True` is -2 and `~False` is -1, both truthy,
    so every non-empty token was reported as containing a symbol. Logical
    negation is what is intended.

    NOTE(review): the 'cs' attribute therefore changes from a constant
    'yes' to a meaningful value; models trained with the old extractor
    were trained on the constant -- retrain if they must match.
    """
    return any(not ch.isalnum() for ch in token)
162
+
163
def b(v):
    """Map a truth value to the attribute string 'yes' or 'no'."""
    if v:
        return 'yes'
    return 'no'
165
+
166
def observation(v, defval=''):
    """Add token-level observation fields to the item v, in place.

    All fields are derived from the token text v['w']. Prefix and suffix
    fields take the value defval when the token is shorter than the
    requested length. Boolean observations are stored as 'yes' / 'no'.
    """
    w = v['w']
    length = len(w)

    # Lowercased token, token shape, collapsed shape, and token type.
    v['wl'] = w.lower()
    v['shape'] = get_shape(w)
    v['shaped'] = degenerate(v['shape'])
    v['type'] = get_type(w)

    # Prefixes p1..p4 and suffixes s1..s4.
    for k in (1, 2, 3, 4):
        v['p%d' % k] = w[:k] if length >= k else defval
    for k in (1, 2, 3, 4):
        v['s%d' % k] = w[-k:] if length >= k else defval

    # Numeric patterns.
    v['2d'] = b(get_2d(w))              # two digits
    v['4d'] = b(get_4d(w))              # four digits
    v['d&a'] = b(get_da(w))             # digits and letters
    v['d&-'] = b(get_dand(w, '-'))      # digits and '-'
    v['d&/'] = b(get_dand(w, '/'))      # digits and '/'
    v['d&,'] = b(get_dand(w, ','))      # digits and ','
    v['d&.'] = b(get_dand(w, '.'))      # digits and '.'
    v['up'] = b(get_capperiod(w))       # an uppercase letter followed by '.'

    # Whole-token character classes.
    v['iu'] = b(w and w[0].isupper())   # initial uppercase letter
    v['au'] = b(w.isupper())            # all uppercase
    v['al'] = b(w.islower())            # all lowercase
    v['ad'] = b(w.isdigit())            # all digits
    v['ao'] = b(get_all_other(w))       # all non-alphanumeric

    # "Contains at least one ..." flags.
    v['cu'] = b(contains_upper(w))      # uppercase letter
    v['cl'] = b(contains_lower(w))      # lowercase letter
    v['ca'] = b(contains_alpha(w))      # letter
    v['cd'] = b(contains_digit(w))      # digit
    v['cs'] = b(contains_symbol(w))     # symbol
226
+
227
def disjunctive(X, t, field, begin, end):
    """Append disjunctive ("occurs somewhere in the window") features.

    For every position t+begin .. t+end that lies inside the sequence X,
    appends '<field>[<begin>..<end>]=<value at that position>' to the 'F'
    list of item t. Positions outside the sequence are skipped.

    @type   X:      list of mapping objects
    @param  X:      The item sequence.
    @type   t:      int
    @param  t:      Index of the item that receives the features.
    @type   field:  str
    @param  field:  The field whose values are collected.
    @type   begin:  int
    @param  begin:  First offset of the window (inclusive).
    @type   end:    int
    @param  end:    Last offset of the window (inclusive).
    """
    name = '%s[%d..%d]' % (field, begin, end)
    # Clamp the window to the sequence once, instead of testing each
    # position with `p in range(...)` (a list scan under Python 2).
    first = max(0, t + begin)
    stop = min(len(X), t + end + 1)
    for p in range(first, stop):
        X[t]['F'].append('%s=%s' % (name, X[p][field]))
234
+
235
# Fields used as unigram attributes at offsets -2..2.
U = [
    # token, lowercased token, tags, shapes and type
    'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type',
    # prefixes and suffixes
    'p1', 'p2', 'p3', 'p4',
    's1', 's2', 's3', 's4',
    # numeric patterns
    '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up',
    # whole-token character classes
    'iu', 'au', 'al', 'ad', 'ao',
    # "contains" flags
    'cu', 'cl', 'ca', 'cd', 'cs',
]
# Fields used as bigram attributes over adjacent offsets in -2..2.
B = ['w', 'pos', 'chk', 'shaped', 'type']

templates = []
for name in U:
    templates.extend(((name, i),) for i in range(-2, 3))
for name in B:
    templates.extend(((name, i), (name, i + 1)) for i in range(-2, 2))
250
+
251
def feature_extractor(X):
    """Generate NER attributes for the item sequence X in place.

    Adds the per-token observations, applies the module-level attribute
    templates, appends disjunctive word features for the four tokens on
    either side, and marks the boundaries of a non-empty sequence.
    """
    # Per-token observations must exist before templates refer to them.
    for item in X:
        observation(item)

    # Template-based attributes.
    crfutils.apply_templates(X, templates)

    # Words appearing anywhere in the preceding / following four tokens.
    for position in range(len(X)):
        for window in ((-4, -1), (1, 4)):
            disjunctive(X, position, 'w', window[0], window[1])

    # Sequence boundary markers.
    if not X:
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
268
+
269
+ if __name__ == '__main__':
270
+ crfutils.main(feature_extractor, fields=fields, sep=separator)