opener-opinion-detector-basic 1.0.0

Files changed (168)
  1. checksums.yaml +7 -0
  2. data/README.md +30 -0
  3. data/bin/opinion-detector-basic +19 -0
  4. data/bin/opinion-detector-basic-server +10 -0
  5. data/config.ru +4 -0
  6. data/core/opinion_detector_basic_multi.py +499 -0
  7. data/core/packages/KafNafParser-1.3.tar.gz +0 -0
  8. data/core/packages/VUA_pylib-1.4.tar.gz +0 -0
  9. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
  10. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
  11. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
  12. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
  13. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
  14. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
  15. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
  16. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
  17. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
  18. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
  19. data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
  20. data/core/vendor/src/crfsuite/AUTHORS +1 -0
  21. data/core/vendor/src/crfsuite/COPYING +27 -0
  22. data/core/vendor/src/crfsuite/ChangeLog +103 -0
  23. data/core/vendor/src/crfsuite/INSTALL +236 -0
  24. data/core/vendor/src/crfsuite/Makefile.am +19 -0
  25. data/core/vendor/src/crfsuite/Makefile.in +783 -0
  26. data/core/vendor/src/crfsuite/README +183 -0
  27. data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
  28. data/core/vendor/src/crfsuite/autogen.sh +38 -0
  29. data/core/vendor/src/crfsuite/compile +143 -0
  30. data/core/vendor/src/crfsuite/config.guess +1502 -0
  31. data/core/vendor/src/crfsuite/config.h.in +198 -0
  32. data/core/vendor/src/crfsuite/config.sub +1714 -0
  33. data/core/vendor/src/crfsuite/configure +14273 -0
  34. data/core/vendor/src/crfsuite/configure.in +149 -0
  35. data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
  36. data/core/vendor/src/crfsuite/depcomp +630 -0
  37. data/core/vendor/src/crfsuite/example/chunking.py +49 -0
  38. data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
  39. data/core/vendor/src/crfsuite/example/ner.py +270 -0
  40. data/core/vendor/src/crfsuite/example/pos.py +78 -0
  41. data/core/vendor/src/crfsuite/example/template.py +88 -0
  42. data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
  43. data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
  44. data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
  45. data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
  46. data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
  47. data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
  48. data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
  49. data/core/vendor/src/crfsuite/frontend/main.c +137 -0
  50. data/core/vendor/src/crfsuite/frontend/option.c +93 -0
  51. data/core/vendor/src/crfsuite/frontend/option.h +86 -0
  52. data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
  53. data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
  54. data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
  55. data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
  56. data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
  57. data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
  58. data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
  59. data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
  60. data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
  61. data/core/vendor/src/crfsuite/include/os.h +61 -0
  62. data/core/vendor/src/crfsuite/install-sh +520 -0
  63. data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
  64. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
  65. data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
  66. data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
  67. data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
  68. data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
  69. data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
  70. data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
  71. data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
  72. data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
  73. data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
  74. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
  75. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
  76. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
  77. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
  78. data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
  79. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
  80. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
  81. data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
  82. data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
  83. data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
  84. data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
  85. data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
  86. data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
  87. data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
  88. data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
  89. data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
  90. data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
  91. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
  92. data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
  93. data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
  94. data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
  95. data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
  96. data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
  97. data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
  98. data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
  99. data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
  100. data/core/vendor/src/crfsuite/missing +376 -0
  101. data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
  102. data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
  103. data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
  104. data/core/vendor/src/crfsuite/swig/export.i +32 -0
  105. data/core/vendor/src/crfsuite/swig/python/README +92 -0
  106. data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
  107. data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
  108. data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
  109. data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
  110. data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
  111. data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
  112. data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
  113. data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
  114. data/core/vendor/src/liblbfgs/AUTHORS +1 -0
  115. data/core/vendor/src/liblbfgs/COPYING +22 -0
  116. data/core/vendor/src/liblbfgs/ChangeLog +120 -0
  117. data/core/vendor/src/liblbfgs/INSTALL +231 -0
  118. data/core/vendor/src/liblbfgs/Makefile.am +10 -0
  119. data/core/vendor/src/liblbfgs/Makefile.in +638 -0
  120. data/core/vendor/src/liblbfgs/NEWS +0 -0
  121. data/core/vendor/src/liblbfgs/README +71 -0
  122. data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
  123. data/core/vendor/src/liblbfgs/autogen.sh +38 -0
  124. data/core/vendor/src/liblbfgs/config.guess +1411 -0
  125. data/core/vendor/src/liblbfgs/config.h.in +64 -0
  126. data/core/vendor/src/liblbfgs/config.sub +1500 -0
  127. data/core/vendor/src/liblbfgs/configure +21146 -0
  128. data/core/vendor/src/liblbfgs/configure.in +107 -0
  129. data/core/vendor/src/liblbfgs/depcomp +522 -0
  130. data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
  131. data/core/vendor/src/liblbfgs/install-sh +322 -0
  132. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
  133. data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
  134. data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
  135. data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
  136. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
  137. data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
  138. data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
  139. data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
  140. data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
  141. data/core/vendor/src/liblbfgs/missing +353 -0
  142. data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
  143. data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
  144. data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
  145. data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
  146. data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
  147. data/core/vendor/src/svm_light/LICENSE.txt +59 -0
  148. data/core/vendor/src/svm_light/Makefile +105 -0
  149. data/core/vendor/src/svm_light/kernel.h +40 -0
  150. data/core/vendor/src/svm_light/svm_classify.c +197 -0
  151. data/core/vendor/src/svm_light/svm_common.c +985 -0
  152. data/core/vendor/src/svm_light/svm_common.h +301 -0
  153. data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
  154. data/core/vendor/src/svm_light/svm_learn.c +4147 -0
  155. data/core/vendor/src/svm_light/svm_learn.h +169 -0
  156. data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
  157. data/core/vendor/src/svm_light/svm_loqo.c +211 -0
  158. data/ext/hack/Rakefile +17 -0
  159. data/ext/hack/support.rb +88 -0
  160. data/lib/opener/opinion_detector_basic.rb +91 -0
  161. data/lib/opener/opinion_detector_basic/public/markdown.css +284 -0
  162. data/lib/opener/opinion_detector_basic/server.rb +16 -0
  163. data/lib/opener/opinion_detector_basic/version.rb +5 -0
  164. data/lib/opener/opinion_detector_basic/views/index.erb +97 -0
  165. data/lib/opener/opinion_detector_basic/views/result.erb +15 -0
  166. data/opener-opinion-detector-basic.gemspec +36 -0
  167. data/pre_build_requirements.txt +1 -0
  168. metadata +309 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: cc6dd6c71396853ddb39ff898599ad4c3f466401
+   data.tar.gz: c820abad54167482947c7a56d78cba5331482957
+ SHA512:
+   metadata.gz: ee6555c6dee3540ed4b7a7817e106cfb459281f159c60d73b2af548b9bc19b967ce70bc52e8bac991eabdab6b74a6d30606f7672c339d75c999bf2666a8a66ec
+   data.tar.gz: 72fc4d27ce5b98fab608880492a2c7b135e24542eb9b2cb26b3bd4c32008095f966ec944588d30aaeb278c265bdb65bd22688dd43649df7cd16d9ea0e99ece55
data/README.md ADDED
@@ -0,0 +1,30 @@
+ Opinion Detector Basic
+ ======================
+
+ This module implements an opinion detector for English (it also works for Dutch
+ and German). The language is determined by the "xml:lang" attribute in the input
+ KAF file. Depending on the value of this attribute, the corresponding lexicon is
+ loaded. This module detects three elements of an opinion:
+
+ * Expression: the actual opinion expression
+ * Target: what the expression is about
+ * Holder: who is stating the expression
+
+ Requirements
+ ------------
+ * VUKafParserPy: parser in Python for KAF files
+ * lxml: library for processing XML in Python
+
+ Usage
+ -----
+
+ The input KAF file has to be annotated with at least the term layer, including
+ polarity information. Correct input files for this module are the output KAF
+ files of the polarity tagger module.
+
+ To tag an input KAF file example.with.polarities.kaf with opinions you can run:
+
+     $ cat example.with.polarities.kaf | core/opinion_detector_basic_multi.py > output.with.opinions.kaf
+
+ The output will be the input KAF file extended with the opinion layer.
+
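The opinion layer added by this module has the structure sketched below. The snippet is a minimal illustration built with lxml, using the element and attribute names from core/opinion_detector_basic_multi.py further down in this changeset (opinion, opinion_holder, opinion_target, opinion_expression, span, target, polarity, strength); the term ids t7, t8 and t9 and the lemmas in the comments are placeholders, not output of the module.

    from lxml import etree

    # Skeleton of one <opinion> element as the script assembles it; ids are placeholders.
    opinion = etree.Element('opinion')

    holder = etree.SubElement(opinion, 'opinion_holder')
    holder_span = etree.SubElement(holder, 'span')
    etree.SubElement(holder_span, 'target', attrib={'id': 't7'})     # e.g. "I"

    target = etree.SubElement(opinion, 'opinion_target')
    target_span = etree.SubElement(target, 'span')
    etree.SubElement(target_span, 'target', attrib={'id': 't9'})     # e.g. "hotel"

    expression = etree.SubElement(opinion, 'opinion_expression')
    expression.set('polarity', 'positive')    # positive, negative or neutral
    expression.set('strength', '1')           # omitted when --no-opinion-strength is given
    expression_span = etree.SubElement(expression, 'span')
    etree.SubElement(expression_span, 'target', attrib={'id': 't8'})  # e.g. "nice"

    print(etree.tostring(opinion, pretty_print=True))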
data/bin/opinion-detector-basic ADDED
@@ -0,0 +1,19 @@
+ #!/usr/bin/env ruby
+
+ require_relative '../lib/opener/opinion_detector_basic'
+
+ # STDIN.tty? returns `false` if data is being piped into the current process.
+ if STDIN.tty?
+   input = nil
+ else
+   input = STDIN.read
+ end
+
+ kernel = Opener::OpinionDetectorBasic.new(:args => ARGV)
+ stdout, stderr, process = kernel.run(input)
+
+ if process.success?
+   puts stdout
+ else
+   abort stderr
+ end
data/bin/opinion-detector-basic-server ADDED
@@ -0,0 +1,10 @@
+ #!/usr/bin/env ruby
+
+ require 'rack'
+
+ # Without calling `Rack::Server#options` manually the CLI arguments will never
+ # be passed, thus the application can't be specified as a constructor argument.
+ server = Rack::Server.new
+ server.options[:config] = File.expand_path('../../config.ru', __FILE__)
+
+ server.start
data/config.ru ADDED
@@ -0,0 +1,4 @@
+ require File.expand_path('../lib/opener/opinion_detector_basic', __FILE__)
+ require File.expand_path('../lib/opener/opinion_detector_basic/server', __FILE__)
+
+ run Opener::OpinionDetectorBasic::Server
data/core/opinion_detector_basic_multi.py ADDED
@@ -0,0 +1,499 @@
+ #!/usr/bin/env python
+
+ import sys
+ import getopt
+ import os
+
+ this_folder = os.path.dirname(os.path.realpath(__file__))
+
+ # This updates the load path to ensure that the local site-packages directory
+ # can be used to load packages (e.g. a locally installed copy of lxml).
+ sys.path.append(os.path.join(this_folder, 'site-packages/pre_build'))
+
+ from VUKafParserPy import KafParser
+ from collections import defaultdict
+ import operator
+ import pprint
+ import lxml
+ from lxml import etree
+ import logging
+
+
+ def mix_lists(l1, l2):
+     # Interleave two lists element by element; the tail of the longer one is appended at the end.
+     newl = []
+     min_l = min(len(l1), len(l2))
+     for x in range(min_l):
+         newl.append(l1[x])
+         newl.append(l2[x])
+
+     if len(l1) > len(l2):
+         newl.extend(l1[min_l:])
+     elif len(l2) > len(l1):
+         newl.extend(l2[min_l:])
+     return newl
+
+
+ class OpinionExpression:
+     def __init__(self, spans, sentence, value):
+         self.ids = spans
+         self.sentence = sentence
+         self.value = value
+         self.target_ids = []
+         self.candidates_r = []
+         self.candidates_l = []
+         self.holder = []
+
+     def __repr__(self):
+         r = 'Ids:'+'#'.join(self.ids)+' Sent:'+self.sentence+' Value:'+str(self.value)+' Target:'+'#'.join(self.target_ids)+'\n'
+         r += 'Right cand: '+str(self.candidates_r)+'\n'
+         r += 'Left cand: '+str(self.candidates_l)+'\n'
+         return r
+
+
+ class MyToken:
+     def __init__(self, id, lemma, pos, polarity, sent_mod, sent):
+         self.id = id
+         self.lemma = lemma
+         self.pos = pos
+         self.polarity = polarity
+         self.sent_mod = sent_mod
+         self.sentence = sent
+         self.use_it = True
+         self.list_ids = [id]
+         self.value = 0
+
+         if polarity == 'positive':
+             self.value = 1
+         elif polarity == 'negative':
+             self.value = -1
+
+         if sent_mod == 'intensifier':
+             self.value = 2
+         elif sent_mod == 'shifter':
+             self.value = -1
+
+     def isNegator(self):
+         return self.sent_mod == 'shifter'
+
+     def isIntensifier(self):
+         return self.sent_mod == 'intensifier'
+
+     def is_opinion_expression(self):
+         return self.use_it and self.polarity is not None
+
+     def __repr__(self):
+         if self.use_it:
+             return self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
+         else:
+             return '\t'+self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
+
+
+ def obtain_opinion_expressions(tokens, lang='nl'):
+     logging.debug(' Obtaining opinion expressions')
+     my_tokens = tokens[:]
+
+     accumulate_several_modifiers = True
+     apply_modifiers = True
+     apply_conjunctions = True
+
+     ## Accumulate double/triple intensifiers or negators
+     if accumulate_several_modifiers:
+         logging.debug(' Accumulating modifiers')
+         t = 0
+         while t < len(my_tokens):
+             if my_tokens[t].isNegator() or my_tokens[t].isIntensifier():
+                 if t+1 < len(my_tokens) and (my_tokens[t+1].isNegator() or my_tokens[t+1].isIntensifier()):
+                     ## There are 2 negators/intensifiers next to each other:
+                     ## the first one is deactivated and the second one is modified.
+                     my_tokens[t].use_it = False
+                     my_tokens[t+1].value *= my_tokens[t].value
+                     my_tokens[t+1].list_ids += my_tokens[t].list_ids
+                     logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
+             t += 1
+     ###########################################
+
+     ## Apply intensifiers/negators over the next elements
+     if apply_modifiers:
+         logging.debug(' Applying modifiers')
+         t = 0
+         while t < len(my_tokens):
+             if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
+                 ## Try to modify the next token:
+                 if t+1 < len(my_tokens):
+                     my_tokens[t+1].value *= my_tokens[t].value
+                     my_tokens[t+1].list_ids += my_tokens[t].list_ids
+                     my_tokens[t].use_it = False
+                     logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
+             t += 1
+     ###########################################
+
+     if apply_conjunctions:
+         if lang == 'nl':
+             concat = [',','en']
+         elif lang == 'en':
+             concat = [',','and']
+         elif lang == 'es':
+             concat = [',','y','e']
+         elif lang == 'it':
+             concat = [',','e','ed']
+         elif lang == 'de':
+             concat = [',','und']
+         elif lang == 'fr':
+             concat = [',','et']
+         else:
+             concat = [',']  ## fallback so that `concat` is defined for unsupported languages
+         logging.debug(' Applying conjunctions: '+str(concat))
+
+         t = 0
+         while t < len(my_tokens):
+             if my_tokens[t].use_it and my_tokens[t].value != 0:  ## Find the first one
+                 logging.debug(' Found token '+str(my_tokens[t]))
+                 list_aux = my_tokens[t].list_ids
+                 used = [t]
+                 value_aux = my_tokens[t].value
+                 my_tokens[t].use_it = False
+
+                 x = t+1
+                 while True:
+                     if x >= len(my_tokens):
+                         break
+
+                     if my_tokens[x].lemma in concat:
+                         ## list_aux += my_tokens[x].list_ids  Don't use the conjunction as part of the OE
+                         my_tokens[x].use_it = False
+                         x += 1
+                     elif my_tokens[x].use_it and my_tokens[x].value != 0:
+                         logging.debug(' Found token '+str(my_tokens[x]))
+                         list_aux += my_tokens[x].list_ids
+
+                         used.append(x)
+                         my_tokens[x].use_it = False
+                         value_aux += my_tokens[x].value
+                         x += 1
+                     else:
+                         break
+
+                 ## The last one in the `used` list is the one accumulating everything
+                 last_pos = used[-1]
+                 my_tokens[last_pos].value = value_aux
+                 my_tokens[last_pos].list_ids = list_aux
+                 my_tokens[last_pos].use_it = True
+                 logging.debug(' Regenerating '+str(my_tokens[last_pos]))
+                 t = x  ## next token
+             t += 1
+
+     ## Create one OpinionExpression per remaining active token with a non-zero value
+     my_opinion_exps = []
+     logging.debug(' Generating output')
+     for token in my_tokens:
+         if token.use_it and token.value != 0:
+             op_exp = OpinionExpression(token.list_ids, token.sentence, token.value)
+             my_opinion_exps.append(op_exp)
+     return my_opinion_exps
+
+
+ '''
+ def get_distance(id1, id2):
+     pos1 = int(id1[id1.find('_')+1:])
+     pos2 = int(id2[id2.find('_')+1:])
+     if pos1 > pos2:
+         return pos1-pos2
+     else:
+         return pos2-pos1
+ '''
+
+
+ def obtain_holders(ops_exps, sentences, lang):
+     ## Per-language lists of pronoun lemmas that can act as opinion holders
+     if lang == 'nl':
+         holders = ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun']
+     elif lang == 'en':
+         holders = ['i','we','he','she','they','it','you']
+     elif lang == 'es':
+         holders = ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras']
+     elif lang == 'it':
+         holders = ['io','tu','noi','voi','loro','lei','lui']
+     elif lang == 'de':
+         holders = ['ich','du','wir','ihr','sie','er']
+     elif lang == 'fr':
+         holders = ['je','tu','lui','elle','nous','vous','ils','elles']
+     else:
+         holders = []  ## fallback so that `holders` is defined for unsupported languages
+
+     logging.debug('Obtaining holders with list: '+str(holders))
+
+     for oe in ops_exps:
+         sent = oe.sentence
+         list_terms = sentences[str(sent)]
+         for lemma, pos, term_id in list_terms:
+             if lemma in holders:
+                 oe.holder.append(term_id)
+                 logging.debug(' Selected for '+str(oe)+' holder '+lemma+' '+term_id)
+                 break
+
+
+ # This filter is specific to the basic version
+ def filter_candidates(candidates, ids_oe):
+     ##filtered = [(lemma, pos, term_id) for (lemma, pos, term_id) in candidates if len(lemma) >= 4 and term_id not in ids_oe]
+     filtered = [(lemma, pos, id) for (lemma, pos, id) in candidates if pos in ['N','R']]
+     return filtered
+
+
+ def obtain_targets_improved(ops_exps, sentences):
+     logging.debug(' Obtaining targets improved')
+     ## sentences --> dict[str(num_sent)] ==> list of (lemma, pos, term_id) tuples
+
+     all_ids_in_oe = []
+     for oe in ops_exps:
+         all_ids_in_oe.extend(oe.ids)
+
+     for oe in ops_exps:
+         logging.debug(' OpExp: '+str(oe))
+
+         ids_in_oe = oe.ids
+         sent = oe.sentence
+         list_terms = sentences[str(sent)]
+
+         ###########################################
+         # First rule: noun to the right within max_distance_right tokens
+         max_distance_right = 3
+         biggest_index = -1
+         for idx, (lemma, pos, term_id) in enumerate(list_terms):
+             if term_id in ids_in_oe:
+                 biggest_index = idx
+
+         if biggest_index+1 >= len(list_terms):  ## the expression is the last element, so the rule cannot apply
+             pass
+         else:
+             candidates = list_terms[biggest_index+1:min(biggest_index+1+max_distance_right, len(list_terms))]
+             ## Filter candidates
+             oe.candidates_r = filter_candidates(candidates, all_ids_in_oe)
+             logging.debug(' Candidates filtered right: '+str(oe.candidates_r))
+
+         ######################################################################################
+
+         ###########################################
+         # Second rule: noun to the left within max_distance_left tokens
+         max_distance_left = 3
+         smallest_index = 0
+         for idx, (lemma, pos, term_id) in enumerate(list_terms):
+             if term_id in ids_in_oe:
+                 smallest_index = idx
+                 break
+
+         if smallest_index == 0:  ## the expression starts the sentence, so the rule cannot apply
+             pass
+         else:
+             candidates = list_terms[max(0, smallest_index-1-max_distance_left):smallest_index]
+             ## Filter candidates
+             oe.candidates_l = filter_candidates(candidates, all_ids_in_oe)
+             logging.debug(' Candidates filtered left: '+str(oe.candidates_l))
+
+         ######################################################################################
+
+     ## Filling oe.target_ids
+     assigned_as_targets = []
+
+     # First we assign to every opinion expression the first candidate on the right, if any and not already assigned
+     logging.debug(' Applying first to the right rule')
+     for oe in ops_exps:
+         if len(oe.candidates_r) != 0:
+             lemma, pos, id = oe.candidates_r[0]
+             if id not in assigned_as_targets:
+                 oe.target_ids.append(id)
+                 ###assigned_as_targets.append(id)  # Uncomment to avoid selecting the same target more than once
+                 logging.debug(' OpExp '+str(oe)+' selected '+id)
+
+     # Then take the closest remaining candidate, alternating right and left
+     logging.debug(' Applying closest candidate rule')
+     for oe in ops_exps:
+         if len(oe.target_ids) == 0:  # otherwise it is already solved
+             intercalados_list = mix_lists([id for _, _, id in oe.candidates_r], [id for _, _, id in oe.candidates_l])
+             for id in intercalados_list:
+                 if id not in assigned_as_targets:
+                     oe.target_ids.append(id)
+                     ###assigned_as_targets.append(id)  # Uncomment to avoid selecting the same target more than once
+                     logging.debug(' OpExp '+str(oe)+' selected '+id)
+                     break
+
+
+ ######## MAIN ROUTINE ############
+
+ ## Check if we are reading from a pipeline
+ if sys.stdin.isatty():
+     print>>sys.stderr,'Input stream required.'
+     print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
+     sys.exit(-1)
+ ########################################
+
+ logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)
+
+ ## Processing the parameters
+ my_time_stamp = True
+ remove_opinions = True
+ opinion_strength = True
+ try:
+     opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
+     for opt, arg in opts:
+         if opt == "--no-time":
+             my_time_stamp = False
+         elif opt == "--no-remove-opinions":
+             remove_opinions = False
+         elif opt == "--no-opinion-strength":
+             opinion_strength = False
+ except getopt.GetoptError:
+     pass
+ #########################################
+
+ logging.debug('Include timestamp: '+str(my_time_stamp))
+
+ # Parsing the KAF file
+ try:
+     my_kaf_tree = KafParser(sys.stdin)
+ except Exception as e:
+     print>>sys.stderr,'Error parsing input'
+     print>>sys.stderr,'Stream input must be a valid KAF file'
+     print>>sys.stderr,'Error: ',str(e)
+     sys.exit(-1)
+
+
+ lang = my_kaf_tree.getLanguage()
+
+ ## Creating the data structures
+ sentences = defaultdict(list)
+ my_tokens = []
+
+ # CREATE the data structure for the tokens
+ n = 0
+ lemma_for_tid = {}
+ for term in my_kaf_tree.getTerms():
+     n += 1
+     term_id = term.getId()
+     lemma = term.getLemma()
+     lemma_for_tid[term_id] = lemma
+     kaf_pos = term.getPos()
+     list_span = term.get_list_span()  ## List of token ids in the span layer of the term
+     sentiment = term.getSentiment()
+     polarity = sent_mod = None
+     if sentiment is not None:
+         polarity = sentiment.getPolarity()
+         sent_mod = sentiment.getSentimentModifier()
+     sentence = my_kaf_tree.getToken(list_span[0]).get('sent')  ## The sentence of the first token element in the span
+     my_tokens.append(MyToken(term_id, lemma, kaf_pos, polarity, sent_mod, sentence))
+
+     sentences[str(sentence)].append((lemma, kaf_pos, term_id))
+ #############################
+
+ logging.debug('Num terms loaded: '+str(n))
+ logging.debug('Num sentences: '+str(len(sentences)))
+
+ logging.debug('Obtaining opinion expressions')
+ my_ops_exps = obtain_opinion_expressions(my_tokens, lang)
+ print>>sys.stderr,my_ops_exps
+
+ logging.debug('Obtaining targets')
+ obtain_targets_improved(my_ops_exps, sentences)
+
+ logging.debug('Obtaining holders')
+ obtain_holders(my_ops_exps, sentences, lang)
+
+
+ ## Create the opinion elements and add them to the KAF tree
+ logging.debug('Generating KAF output')
+
+ if remove_opinions:
+     my_kaf_tree.remove_opinion_layer()
+
+ for oe in my_ops_exps:
+     op_ele = etree.Element('opinion')
+
+     ## Holder
+     if len(oe.holder) != 0:
+         oe.holder.sort()
+         c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
+         op_hol = etree.Element('opinion_holder')
+         op_hol.append(etree.Comment(c))
+         op_ele.append(op_hol)
+         span_op_hol = etree.Element('span')
+         op_hol.append(span_op_hol)
+         for id in oe.holder:
+             span_op_hol.append(etree.Element('target', attrib={'id': id}))
+
+     ## Target
+     op_tar = etree.Element('opinion_target')
+     op_ele.append(op_tar)
+
+     if len(oe.target_ids) != 0:  ## the target span is only filled when target candidates were found
+         oe.target_ids.sort()
+         c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
+         op_tar.append(etree.Comment(c))
+         span_op_tar = etree.Element('span')
+         op_tar.append(span_op_tar)
+         for id in oe.target_ids:
+             span_op_tar.append(etree.Element('target', attrib={'id': id}))
+
+     ## Expression
+     if oe.value > 0: pol = 'positive'
+     elif oe.value < 0: pol = 'negative'
+     else: pol = 'neutral'
+
+     op_exp = etree.Element('opinion_expression')
+     op_exp.set('polarity', pol)
+     if opinion_strength:
+         op_exp.set('strength', str(oe.value))
+
+     op_ele.append(op_exp)
+     oe.ids.sort()
+     c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
+     op_exp.append(etree.Comment(c))
+     span_exp = etree.Element('span')
+     op_exp.append(span_exp)
+     for id in oe.ids:
+         span_exp.append(etree.Element('target', attrib={'id': id}))
+
+     ## Append the op_ele to the opinions layer
+     my_kaf_tree.addElementToLayer('opinions', op_ele)
+
+
+ my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
+ my_kaf_tree.saveToFile(sys.stdout)
+ logging.debug('Process finished')
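The strength attribute written above comes from the value arithmetic in obtain_opinion_expressions: a positive lemma counts +1, a negative one -1, an intensifier 2 and a shifter -1; consecutive modifiers are multiplied together, the result multiplies the next token, and tokens linked by a comma or a conjunction are summed. The snippet below is a minimal, self-contained sketch of that arithmetic for illustration only; it re-implements the rules and is not part of the module.

    def token_value(polarity=None, sent_mod=None):
        # Same initial values as MyToken.__init__ in the script above.
        value = {'positive': 1, 'negative': -1}.get(polarity, 0)
        if sent_mod == 'intensifier':
            value = 2
        elif sent_mod == 'shifter':
            value = -1
        return value

    # "not very good": shifter (-1) and intensifier (2) collapse into -2,
    # which then multiplies the +1 of "good" -> strength -2, polarity negative.
    modifier = token_value(sent_mod='shifter') * token_value(sent_mod='intensifier')
    print(modifier * token_value(polarity='positive'))    # -2

    # "good and nice": two positive tokens joined by a conjunction add up -> strength 2.
    print(token_value(polarity='positive') + token_value(polarity='positive'))    # 2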