opener-opinion-detector-basic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +30 -0
  3. data/bin/opinion-detector-basic +19 -0
  4. data/bin/opinion-detector-basic-server +10 -0
  5. data/config.ru +4 -0
  6. data/core/opinion_detector_basic_multi.py +499 -0
  7. data/core/packages/KafNafParser-1.3.tar.gz +0 -0
  8. data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
  9. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  10. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  11. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  14. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  15. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  16. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  20. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  21. data/core/vendor/src/crfsuite/COPYING +27 -0
  22. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  23. data/core/vendor/src/crfsuite/INSTALL +236 -0
  24. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  25. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  26. data/core/vendor/src/crfsuite/README +183 -0
  27. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  28. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  29. data/core/vendor/src/crfsuite/compile +143 -0
  30. data/core/vendor/src/crfsuite/config.guess +1502 -0
  31. data/core/vendor/src/crfsuite/config.h.in +198 -0
  32. data/core/vendor/src/crfsuite/config.sub +1714 -0
  33. data/core/vendor/src/crfsuite/configure +14273 -0
  34. data/core/vendor/src/crfsuite/configure.in +149 -0
  35. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  36. data/core/vendor/src/crfsuite/depcomp +630 -0
  37. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  38. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  39. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  40. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  41. data/core/vendor/src/crfsuite/example/template.py +88 -0
  42. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  43. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  44. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  45. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  46. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  47. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  48. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  49. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  50. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  51. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  52. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  53. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  54. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  55. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  56. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  57. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  58. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  59. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  60. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  61. data/core/vendor/src/crfsuite/include/os.h +61 -0
  62. data/core/vendor/src/crfsuite/install-sh +520 -0
  63. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  64. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  65. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  66. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  67. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  68. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  69. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  70. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  71. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  72. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  73. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  74. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  75. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  76. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  77. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  78. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  79. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  80. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  81. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  82. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  83. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  84. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  85. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  86. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  87. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  88. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  89. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  90. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  91. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  92. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  93. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  94. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  95. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  96. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  97. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  98. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  99. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  100. data/core/vendor/src/crfsuite/missing +376 -0
  101. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  102. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  103. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  104. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  105. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  106. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  107. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  108. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  109. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  110. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  111. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  112. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  113. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  114. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  115. data/core/vendor/src/liblbfgs/COPYING +22 -0
  116. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  117. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  118. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  119. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  120. data/core/vendor/src/liblbfgs/NEWS +0 -0
  121. data/core/vendor/src/liblbfgs/README +71 -0
  122. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  123. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  124. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  125. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  126. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  127. data/core/vendor/src/liblbfgs/configure +21146 -0
  128. data/core/vendor/src/liblbfgs/configure.in +107 -0
  129. data/core/vendor/src/liblbfgs/depcomp +522 -0
  130. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  131. data/core/vendor/src/liblbfgs/install-sh +322 -0
  132. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  133. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  134. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  135. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  136. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  137. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  138. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  139. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  140. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  141. data/core/vendor/src/liblbfgs/missing +353 -0
  142. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  143. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  144. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  145. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  146. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  147. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  148. data/core/vendor/src/svm_light/Makefile +105 -0
  149. data/core/vendor/src/svm_light/kernel.h +40 -0
  150. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  151. data/core/vendor/src/svm_light/svm_common.c +985 -0
  152. data/core/vendor/src/svm_light/svm_common.h +301 -0
  153. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  154. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  155. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  156. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  157. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  158. data/ext/hack/Rakefile +17 -0
  159. data/ext/hack/support.rb +88 -0
  160. data/lib/opener/opinion_detector_basic.rb +91 -0
  161. data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
  162. data/lib/opener/opinion_detector_basic/server.rb +16 -0
  163. data/lib/opener/opinion_detector_basic/version.rb +5 -0
  164. data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
  165. data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
  166. data/opener-opinion-detector-basic.gemspec +36 -0
  167. data/pre_build_requirements.txt +1 -0
  168. metadata +309 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cc6dd6c71396853ddb39ff898599ad4c3f466401
4
+ data.tar.gz: c820abad54167482947c7a56d78cba5331482957
5
+ SHA512:
6
+ metadata.gz: ee6555c6dee3540ed4b7a7817e106cfb459281f159c60d73b2af548b9bc19b967ce70bc52e8bac991eabdab6b74a6d30606f7672c339d75c999bf2666a8a66ec
7
+ data.tar.gz: 72fc4d27ce5b98fab608880492a2c7b135e24542eb9b2cb26b3bd4c32008095f966ec944588d30aaeb278c265bdb65bd22688dd43649df7cd16d9ea0e99ece55
data/README.md ADDED
@@ -0,0 +1,30 @@
1
+ Opinion Detector Basic
2
+ ======================
3
+
4
+ This module implements an opinion detector for English (also works for Dutch and
5
+ German). The language is determined by the "xml:lang" attribute in the input KAF
6
+ file. Depending on the value of this attribute, the corresponding lexicon will
7
+ be loaded. This module detects three elements of the opinions:
8
+
9
+ * Expression: the actual opinion expression
10
+ * Target: about what is the previous expression
11
+ * Holder: who is stating that expression
12
+
13
+ Requirements
14
+ -----------
15
+ * VUKafParserPy: parser in python for KAF files
16
+ * lxml: library for processing xml in python
17
+
18
+ Usage
19
+ ----
20
+
21
+ The input KAF file has to be annotated with at least the term layer, with
22
+ polarity information. Correct input files for this module are the output KAF
23
+ files from the polarity tagger module
24
+
25
+ To tag an input KAF file example.kaf with opinions you can run:
26
+
27
+ $ cat example.with.polaritieskaf | core/opinion_detector_basic_multi.py > output.with.opinions.kaf
28
+
29
+ The output will be the input KAF file extended with the opinion layer.
30
+
@@ -0,0 +1,19 @@
1
#!/usr/bin/env ruby

# CLI entry point: reads a KAF document from STDIN (when piped), runs the
# opinion detector kernel, and relays its output and exit status.

require_relative '../lib/opener/opinion_detector_basic'

# STDIN.tty? returns `false` if data is being piped into the current process.
if STDIN.tty?
  input = nil
else
  input = STDIN.read
end

# Forward any CLI arguments to the kernel and capture the child process state.
kernel = Opener::OpinionDetectorBasic.new(:args => ARGV)
stdout, stderr, process = kernel.run(input)

# Mirror the kernel's outcome: its stdout on success, abort with its stderr
# (non-zero exit) on failure.
if process.success?
  puts stdout
else
  abort stderr
end
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby

# Launches the HTTP server for the opinion detector using the bundled
# config.ru, while still honoring Rack's normal CLI options.

require 'rack'

# Without calling `Rack::Server#options` manually the CLI arguments will never
# be passed, thus the application can't be specified as a constructor argument.
server = Rack::Server.new
# Point Rack at the config.ru that ships one directory above bin/.
server.options[:config] = File.expand_path('../../config.ru', __FILE__)

server.start
data/config.ru ADDED
@@ -0,0 +1,4 @@
1
# Rack entry point: load the library and its Sinatra-style server class, then
# hand that class to Rack as the application to run.
require File.expand_path('../lib/opener/opinion_detector_basic', __FILE__)
require File.expand_path('../lib/opener/opinion_detector_basic/server', __FILE__)

run Opener::OpinionDetectorBasic::Server
@@ -0,0 +1,499 @@
1
+ #!/usr/bin/env python
2
+
3
+ import sys
4
+ import getopt
5
+ import os
6
+
7
+ this_folder = os.path.dirname(os.path.realpath(__file__))
8
+
9
+ # This updates the load path to ensure that the local site-packages directory
10
+ # can be used to load packages (e.g. a locally installed copy of lxml).
11
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
12
+
13
+ from VUKafParserPy import KafParser
14
+ from collections import defaultdict
15
+ import operator
16
+ import pprint
17
+ import lxml
18
+ from lxml import etree
19
+ import logging
20
+
21
+
22
+
23
def mix_lists(l1, l2):
    """Interleave two lists element by element.

    Returns [l1[0], l2[0], l1[1], l2[1], ...]; whatever remains of the
    longer list is appended, in order, at the end.  Neither input is
    modified.
    """
    shared = min(len(l1), len(l2))
    interleaved = []
    for a, b in zip(l1[:shared], l2[:shared]):
        interleaved.append(a)
        interleaved.append(b)
    # At most one of these tails is non-empty.
    interleaved.extend(l1[shared:])
    interleaved.extend(l2[shared:])
    return interleaved
35
+
36
+
37
class OpinionExpression:
    """One detected opinion: its expression span plus slots that the target
    and holder detection passes fill in afterwards."""

    def __init__(self, spans, sentence, value):
        self.ids = spans            # term ids forming the expression
        self.sentence = sentence    # sentence number (string) the span lives in
        self.value = value          # signed polarity strength
        self.target_ids = []        # filled by the target-selection pass
        self.candidates_r = []      # noun candidates to the right of the span
        self.candidates_l = []      # noun candidates to the left of the span
        self.holder = []            # filled by the holder-selection pass

    def __repr__(self):
        parts = [
            'Ids:' + '#'.join(self.ids) + ' Sent:' + self.sentence +
            ' Value:' + str(self.value) + ' Target:' + '#'.join(self.target_ids),
            'Right cand: ' + str(self.candidates_r),
            'Left cand: ' + str(self.candidates_l),
        ]
        return '\n'.join(parts) + '\n'
52
+
53
class MyToken:
    """A term from the KAF term layer together with its sentiment data.

    ``value`` encodes the token's contribution to an opinion expression:
    +1 / -1 for positive / negative polarity, overridden to 2 for
    intensifiers and -1 for shifters (negators).  ``use_it`` and
    ``list_ids`` are mutated while expressions are being accumulated.
    """

    def __init__(self, id, lemma, pos, polarity, sent_mod, sent):
        self.id = id
        self.lemma = lemma
        self.pos = pos
        self.polarity = polarity
        self.sent_mod = sent_mod
        self.sentence = sent
        self.use_it = True      # cleared once this token is merged into another
        self.list_ids = [id]    # ids accumulated into this token so far
        self.value = 0

        # Base value derives from polarity ...
        if polarity == 'positive':
            self.value = 1
        elif polarity == 'negative':
            self.value = -1
        # ... but a sentiment modifier takes precedence.
        if sent_mod == 'intensifier':
            self.value = 2
        elif sent_mod == 'shifter':
            self.value = -1

    def isNegator(self):
        return self.sent_mod == 'shifter'

    def isIntensifier(self):
        return self.sent_mod == 'intensifier'

    def is_opinion_expression(self):
        # A token can start an expression only if still active and polar.
        return self.use_it and self.polarity is not None

    def __repr__(self):
        # NOTE(review): written for Python 2 — str + bytes concatenation;
        # under Python 3 the encode() calls would raise TypeError.
        text = (self.id + ' lemma:' + self.lemma.encode('utf-8') + '.' +
                self.pos.encode('utf-8') + ' pol:' + str(self.polarity) +
                ' sentmod:' + str(self.sent_mod) + ' sent:' + self.sentence +
                ' use:' + str(self.use_it) + ' list:' + '#'.join(self.list_ids) +
                ' val:' + str(self.value))
        if self.use_it:
            return text
        return '\t' + text
95
+
96
+
97
+
98
def obtain_opinion_expressions(tokens,lang='nl'):
    """Collapse per-term sentiment annotations into OpinionExpression objects.

    tokens -- list of MyToken built from the KAF term layer.  Only the list
              itself is copied (shallow), so the MyToken objects are mutated
              in place (use_it / value / list_ids).
    lang   -- language code used to pick conjunction lemmas.  NOTE(review):
              only nl/en/es/it/de/fr set ``concat``; any other value leaves
              it unbound and the conjunction pass raises NameError — confirm
              callers only pass supported languages.

    Three passes:
      1. merge runs of adjacent negators/intensifiers into one modifier,
      2. apply each remaining modifier to the token that follows it,
      3. join polar tokens linked by commas/conjunctions into one expression.
    """
    logging.debug(' Obtaining opinion expressions')
    my_tokens = tokens[:]

    accumulate_several_modifiers = True
    apply_modifiers = True
    apply_conjunctions = True

    ## Accumulate double/triple intensifiers or negators
    if accumulate_several_modifiers:
        logging.debug(' Accumulating modifiers')
        t = 0
        while t < len(my_tokens):
            if my_tokens[t].isNegator() or my_tokens[t].isIntensifier():
                if t+1 < len(my_tokens) and ( my_tokens[t+1].isNegator() or my_tokens[t+1].isIntensifier()):
                    ## There are 2 negators/intensifiers next to each other
                    ## The first one is deactivated and the second one is modified
                    my_tokens[t].use_it = False
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    logging.debug(' Accucumating '+'-'.join(my_tokens[t+1].list_ids))
            t+=1
        ###########################################

    ##Apply intensifiers/negators over the next elements
    if apply_modifiers:
        logging.debug(' Applying modifiers')
        t = 0
        while t < len(my_tokens):
            # Only still-active modifiers: merged ones were deactivated above.
            if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
                ## Try to modify the next token:
                if t+1<len(my_tokens):
                    my_tokens[t+1].value *= my_tokens[t].value
                    my_tokens[t+1].list_ids += my_tokens[t].list_ids
                    my_tokens[t].use_it = False
                    logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
            t += 1
        ###########################################

    if apply_conjunctions:
        # Lemmas that may link two opinion expressions into one.
        if lang=='nl':
            concat = [',','en']
        elif lang=='en':
            concat = [',','and']
        elif lang=='es':
            concat = [',','y','e']
        elif lang=='it':
            concat = [',','e','ed']
        elif lang=='de':
            concat = [',','und']
        elif lang == 'fr':
            concat=[',','et']
        logging.debug(' Applying conjunctions:'+str(concat))

        t = 0
        while t < len(my_tokens):
            if my_tokens[t].use_it and my_tokens[t].value!=0: ## Find the first one
                logging.debug(' Found token '+str(my_tokens[t]))
                list_aux = my_tokens[t].list_ids
                used = [t]
                value_aux = my_tokens[t].value
                my_tokens[t].use_it = False

                # Walk right over conjunction lemmas and further polar tokens,
                # accumulating their ids and summing their values.
                x = t+1
                while True:
                    if x>=len(my_tokens):
                        break

                    if my_tokens[x].lemma in concat:
                        ## list_aux += my_tokens[x].list_ids Dont use it as part of the OE
                        my_tokens[x].use_it = False
                        x+=1
                    elif (my_tokens[x].use_it and my_tokens[x].value!=0):
                        logging.debug(' Found token '+str(my_tokens[x]))
                        list_aux += my_tokens[x].list_ids

                        used.append(x)
                        my_tokens[x].use_it = False
                        value_aux += my_tokens[x].value
                        x += 1
                    else:
                        break
                ## The last one in the list `used` is the one accumulating all
                last_pos = used[-1]
                my_tokens[last_pos].value = value_aux
                my_tokens[last_pos].list_ids = list_aux
                my_tokens[last_pos].use_it = True
                logging.debug(' Regenerating '+str(my_tokens[last_pos]))
                t = x ## resume scanning after the chain just consumed
            t += 1

    ## Create one OpinionExpression per surviving polar token.
    my_opinion_exps = []
    logging.debug(' Generating output')
    for token in my_tokens:
        if token.use_it and token.value != 0:
            op_exp = OpinionExpression(token.list_ids,token.sentence,token.value)
            my_opinion_exps.append(op_exp)
    return my_opinion_exps
206
+
207
+
208
+ '''
209
+ def get_distance(id1, id2):
210
+ pos1 = int(id1[id1.find('_')+1:])
211
+ pos2 = int(id2[id2.find('_')+1:])
212
+ if pos1>pos2:
213
+ return pos1-pos2
214
+ else:
215
+ return pos2-pos1
216
+ '''
217
+
218
+
219
def obtain_holders(ops_exps, sentences, lang):
    """Assign an opinion holder to each OpinionExpression, in place.

    ops_exps  -- list of OpinionExpression objects; each gets at most one
                 term id appended to ``oe.holder``.
    sentences -- dict mapping str(sentence-number) to the list of
                 (lemma, pos, term_id) tuples of that sentence.
    lang      -- language code selecting the pronoun lexicon.

    For every expression, the first term in its sentence whose lemma is a
    personal pronoun for ``lang`` becomes the holder.  Returns None.
    """
    pronouns_per_lang = {
        'nl': ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun'],
        'en': ['i','we','he','she','they','it','you'],
        'es': ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras'],
        'it': ['io','tu','noi','voi','loro','lei','lui'],
        'de': ['ich','du','wir','ihr','sie','er'],
        'fr': ['je','tu','lui','elle','nous','vous','ils','elles'],
    }
    # Bug fix: the original if/elif chain left `holders` unbound for any
    # unsupported language code, raising NameError below.  Fall back to an
    # empty pronoun list instead (no holders detected, no crash).
    holders = pronouns_per_lang.get(lang, [])

    logging.debug('Obtaining holders with list: '+str(holders))

    for oe in ops_exps:
        sent = oe.sentence
        list_terms = sentences[str(sent)]
        for lemma, pos, term_id in list_terms:
            if lemma in holders:
                # Keep only the first pronoun of the sentence.
                oe.holder.append(term_id)
                logging.debug(' Selected for '+str(oe)+' holder'+lemma+' '+term_id)
                break
243
+
244
+
245
+
246
+
247
#This is specific for the basic version
def filter_candidates(candidates, ids_oe):
    """Keep only noun-like target candidates.

    candidates -- list of (lemma, pos, term_id) tuples.
    ids_oe     -- accepted for interface compatibility; this basic version
                  does not use it (a lemma-length/id filter is disabled).

    Returns the tuples whose POS tag is 'N' or 'R', preserving order.
    """
    noun_tags = ('N', 'R')
    kept = []
    for candidate in candidates:
        if candidate[1] in noun_tags:
            kept.append(candidate)
    return kept
252
+
253
def obtain_targets_improved(ops_exps,sentences):
    """Select target term ids for each OpinionExpression, in place.

    sentences -- dict mapping str(sentence-number) to the list of
                 (lemma, pos, term_id) tuples of that sentence.

    For every expression two candidate windows are collected from its own
    sentence: up to 3 terms to the right of the span's last term and up to 4
    terms to the left of its first term, both filtered to nouns via
    filter_candidates().  Then two rules fill ``oe.target_ids``:
      1. the first right-hand candidate, if any;
      2. otherwise the closest remaining candidate, alternating right/left
         via mix_lists().
    """
    logging.debug(' Obtaining targets improved')

    ## sentences --> dict [str(numsent)] ==> list of (lemma, pos, term_id)

    # All term ids that belong to any expression (kept for the disabled
    # "don't reuse a target" filters below).
    all_ids_in_oe = []
    for oe in ops_exps:
        all_ids_in_oe.extend(oe.ids)

    for oe in ops_exps:
        logging.debug(' OpExp: '+str(oe))

        ids_in_oe = oe.ids
        sent = oe.sentence
        list_terms = sentences[str(sent)]

        ###########################################
        # First rule: noun to the right within max_distance_right tokens.
        max_distance_right = 3
        # Index (within the sentence) of the LAST term of the expression.
        biggest_index = -1
        for idx, (lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                biggest_index = idx

        if biggest_index+1 >= len(list_terms): ## is the last element and we shall skip it
            pass
        else:
            candidates=list_terms[biggest_index+1:min(biggest_index+1+max_distance_right,len(list_terms))]
            ## Filter candidates down to nouns.
            oe.candidates_r = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered right'+str(oe.candidates_r))
        ######################################################################################

        ###########################################
        # Second window: nouns to the left of the expression's FIRST term.
        max_distance_left = 3
        smallest_index = 0
        for idx,(lemma,pos,term_id) in enumerate(list_terms):
            if term_id in ids_in_oe:
                smallest_index = idx
                break
        if smallest_index == 0:
            # Expression starts the sentence: no left window.
            pass
        else:
            candidates = list_terms[max(0,smallest_index-1-max_distance_left):smallest_index]
            ## Filter candidates down to nouns.
            oe.candidates_l = filter_candidates(candidates,all_ids_in_oe)
            logging.debug(' Candidates filtered left: '+str(oe.candidates_l))
        ######################################################################################

    ## Filling oe.target_ids
    # NOTE: stays empty because the appends below are deliberately commented
    # out, so the same term may serve as target of several expressions.
    assigned_as_targets = []

    # First we assign to each expression the first right candidate, if any.
    logging.debug(' Applying first to the right rule')
    for oe in ops_exps:
        if len(oe.candidates_r) !=0:
            lemma, pos, id = oe.candidates_r[0]
            if id not in assigned_as_targets:
                oe.target_ids.append(id)
                ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target more than once
                logging.debug(' OpExp '+str(oe)+' selected '+id)

    # Fallback: closest candidate, alternating right/left.
    logging.debug(' Applying most close rule')
    for oe in ops_exps:
        if len(oe.target_ids) == 0: # otherwise it's solved
            intercalados_list = mix_lists([id for _,_,id in oe.candidates_r],[id for _,_,id in oe.candidates_l])
            for id in intercalados_list:
                if id not in assigned_as_targets:
                    oe.target_ids.append(id)
                    ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target more than once
                    logging.debug(' OpExp '+str(oe)+' selected '+id)
                    break
348
+
349
######## MAIN ROUTINE ############
# Script entry point (Python 2): reads a KAF document from stdin, detects
# opinion expressions / targets / holders, and writes the KAF document with
# an added opinion layer to stdout.  Debug logging goes to stderr.

## Refuse to run interactively: a piped KAF stream is required.
if sys.stdin.isatty():
    print>>sys.stderr,'Input stream required.'
    print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
    sys.exit(-1)
########################################

logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)

## Processing the parameters (all features on by default; flags disable them).
my_time_stamp = True
remove_opinions = True
opinion_strength = True
try:
    opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
    for opt, arg in opts:
        if opt == "--no-time":
            my_time_stamp = False
        elif opt == "--no-remove-opinions":
            remove_opinions = False
        elif opt == "--no-opinion-strength":
            opinion_strength = False
except getopt.GetoptError:
    # Unknown options are silently ignored; defaults stay in effect.
    pass
#########################################

logging.debug('Include timestamp: '+str(my_time_stamp))

# Parsing the KAF file from stdin.
try:
    my_kaf_tree = KafParser(sys.stdin)
except Exception as e:
    print>>sys.stderr,'Error parsing input'
    print>>sys.stderr,'Stream input must be a valid KAF file'
    print>>sys.stderr,'Error: ',str(e)
    sys.exit(-1)


lang = my_kaf_tree.getLanguage()
## Data structures: per-sentence term lists and the flat token list.
sentences = defaultdict(list)
my_tokens = []


# Build one MyToken per term in the KAF term layer.
n=0
lemma_for_tid = {}   # term id -> lemma, used later for the XML comments
for term in my_kaf_tree.getTerms():
    n+=1
    term_id = term.getId()
    lemma = term.getLemma()
    lemma_for_tid[term_id] = lemma
    kaf_pos = term.getPos()
    list_span = term.get_list_span() ## List of token ids in the span layer of the term
    sentiment = term.getSentiment()
    polarity = sent_mod = None
    if sentiment is not None:
        polarity = sentiment.getPolarity()
        sent_mod = sentiment.getSentimentModifier()
    sentence = my_kaf_tree.getToken(list_span[0]).get('sent') ## The sentence of the first token element in span
    my_tokens.append(MyToken(term_id,lemma,kaf_pos,polarity,sent_mod,sentence))

    sentences[str(sentence)].append((lemma,kaf_pos,term_id))
#############################

logging.debug('Num terms loaded: '+str(n))
logging.debug('Num sentences: '+str(len(sentences)))


logging.debug('Obtaining opinion expressions')
my_ops_exps = obtain_opinion_expressions(my_tokens,lang)
print>>sys.stderr,my_ops_exps

logging.debug('Obtaining targets')
obtain_targets_improved(my_ops_exps,sentences)


logging.debug('Obtaining holders')
obtain_holders(my_ops_exps,sentences,lang)


## Serialize each OpinionExpression as an <opinion> element.
logging.debug('Generating KAF output')

if remove_opinions:
    # Drop any pre-existing opinion layer before adding ours.
    my_kaf_tree.remove_opinion_layer()

for oe in my_ops_exps:
    op_ele = etree.Element('opinion')

    ## Holder (optional): span over the holder term ids, with the lemmas as
    ## an XML comment for readability.
    if len(oe.holder)!=0:
        oe.holder.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
        op_hol = etree.Element('opinion_holder')
        op_hol.append(etree.Comment(c))
        op_ele.append(op_hol)
        span_op_hol = etree.Element('span')
        op_hol.append(span_op_hol)
        for id in oe.holder:
            span_op_hol.append(etree.Element('target',attrib={'id':id}))

    ## Target element is always emitted, even when empty.
    op_tar = etree.Element('opinion_target')
    op_ele.append(op_tar)


    if len(oe.target_ids)!=0: ## span only present when targets were found
        oe.target_ids.sort()
        c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
        op_tar.append(etree.Comment(c))
        span_op_tar = etree.Element('span')
        op_tar.append(span_op_tar)
        for id in oe.target_ids:
            span_op_tar.append(etree.Element('target',attrib={'id':id}))

    # Expression: polarity from the sign of the accumulated value.
    if oe.value > 0: pol = 'positive'
    elif oe.value < 0: pol = 'negative'
    else: pol = 'neutral'

    op_exp = etree.Element('opinion_expression')
    op_exp.set('polarity',pol)
    if opinion_strength:
        op_exp.set('strength',str(oe.value))

    op_ele.append(op_exp)
    oe.ids.sort()
    c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
    op_exp.append(etree.Comment(c))
    span_exp = etree.Element('span')
    op_exp.append(span_exp)
    for id in oe.ids:
        span_exp.append(etree.Element('target',attrib={'id':id}))

    ## Append the op_ele to the opinions layer
    my_kaf_tree.addElementToLayer('opinions', op_ele)


# Record this processor in the KAF header and emit the document.
my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
my_kaf_tree.saveToFile(sys.stdout)
logging.debug('Process finished')
496
+
497
+
498
+
499
+