opener-opinion-detector-basic 2.0.7 → 3.0.0

lib/opener/opinion_detector_basic/term.rb ADDED

@@ -0,0 +1,160 @@
+ module Opener
+   class OpinionDetectorBasic
+     class Term
+       attr_reader :node, :sentence, :is_conjunction
+       attr_accessor :use, :accumulated_strength, :list_ids
+
+       def initialize(node, document, language)
+         @node = node
+         @sentence = get_sentence(document)
+         @use = true
+         @accumulated_strength = strength
+         @list_ids = [id]
+         @is_conjunction = is_conjunction?(language)
+       end
+
+       ##
+       # Returns the term id.
+       #
+       # @return [String]
+       #
+       def id
+         @id ||= node.get('tid')
+       end
+
+       ##
+       # Returns the lemma of the term.
+       #
+       # @return [String]
+       #
+       def lemma
+         @lemma ||= node.get('lemma')
+       end
+
+       ##
+       # Returns the part of speech of the term.
+       #
+       # @return [String]
+       #
+       def pos
+         @pos ||= node.get('pos')
+       end
+
+       ##
+       # Returns the sentiment modifier type if it exists.
+       #
+       # @return [String|NilClass]
+       #
+       def sentiment_modifier
+         @sentiment_modifier ||= if sentiment = node.xpath('sentiment').first
+           sentiment.get('sentiment_modifier')
+         end
+       end
+
+       ##
+       # Returns the polarity of the term if it exists.
+       #
+       # @return [String|NilClass]
+       #
+       def polarity
+         @polarity ||= if sentiment = node.xpath('sentiment').first
+           sentiment.get('polarity')
+         end
+       end
+
+       ##
+       # Returns the actual word ids that construct the lemma.
+       #
+       # @return [Array]
+       #
+       def target_ids
+         @target_ids ||= node.xpath('span/target').map { |target| target.get('id') }
+       end
+
+       ##
+       # Returns the strength of the term depending on its type.
+       #
+       # @return [Integer]
+       #
+       def strength
+         if polarity == "positive"
+           return 1
+         elsif polarity == "negative"
+           return -1
+         end
+
+         if is_intensifier?
+           return 2
+         elsif is_shifter?
+           return -1
+         end
+
+         return 0
+       end
+
+       ##
+       # Returns the sentence id that the term belongs to in the document.
+       #
+       # @return [String]
+       #
+       def get_sentence(document)
+         document
+           .xpath("KAF/text/wf[@wid='#{target_ids.first}']")
+           .first
+           .get('sent')
+       end
+
+       ##
+       # Checks if a term is an intensifier.
+       #
+       # @return [TrueClass|FalseClass]
+       #
+       def is_intensifier?
+         sentiment_modifier == "intensifier"
+       end
+
+       ##
+       # Checks if a term is a shifter.
+       #
+       # @return [TrueClass|FalseClass]
+       #
+       def is_shifter?
+         sentiment_modifier == "shifter"
+       end
+
+       ##
+       # Checks if a term is an expression.
+       #
+       # @return [TrueClass|FalseClass]
+       #
+       def is_expression?
+         use && !!polarity
+       end
+
+       ##
+       # Checks if a term is a conjunction.
+       #
+       # @return [TrueClass|FalseClass]
+       #
+       def is_conjunction?(language)
+         conjunctions[language].include?(lemma)
+       end
+
+       ##
+       # Map of conjunctions per language code.
+       #
+       # @return [Hash]
+       #
+       def conjunctions
+         {
+           'nl' => [',', 'en'],
+           'en' => [',', 'and'],
+           'es' => [',', 'y', 'e'],
+           'it' => [',', 'e', 'ed'],
+           'de' => [',', 'und'],
+           'fr' => [',', 'et']
+         }
+       end
+     end # Term
+   end # OpinionDetectorBasic
+ end # Opener
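
For orientation, here is a minimal sketch of how this Term class can be driven once the KAF input is parsed with Oga (the gem's new XML dependency, see the gemspec below). The KAF fragment and the driver lines are illustrative assumptions; only the Term API itself comes from the file above.

require 'oga'
require 'opener/opinion_detector_basic/term'

# Hypothetical KAF fragment: one word form plus one term that carries a
# sentiment annotation and spans that word form.
kaf = <<-XML
<KAF xml:lang="en">
  <text>
    <wf wid="w1" sent="1">good</wf>
  </text>
  <terms>
    <term tid="t1" lemma="good" pos="G">
      <sentiment polarity="positive"/>
      <span><target id="w1"/></span>
    </term>
  </terms>
</KAF>
XML

document = Oga.parse_xml(kaf)
node     = document.xpath('KAF/terms/term').first

term = Opener::OpinionDetectorBasic::Term.new(node, document, 'en')

term.lemma          # => "good"
term.polarity       # => "positive"
term.strength       # => 1 (positive polarity)
term.sentence       # => "1" (sent attribute of the first spanned <wf>)
term.is_expression? # => true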
lib/opener/opinion_detector_basic/version.rb CHANGED

@@ -1,5 +1,5 @@
  module Opener
    class OpinionDetectorBasic
-     VERSION = '2.0.7'
+     VERSION = '3.0.0'
    end
  end
opener-opinion-detector-basic.gemspec CHANGED

@@ -7,18 +7,14 @@ Gem::Specification.new do |gem|
    gem.summary = 'Basic Opinion Detector.'
    gem.description = gem.summary
    gem.homepage = 'http://opener-project.github.com/'
-   gem.extensions = ['ext/hack/Rakefile']
    gem.license = 'Apache 2.0'
 
    gem.required_ruby_version = '>= 1.9.2'
-
+
    gem.files = Dir.glob([
-     'core/*',
-     'ext/**/*',
      'lib/**/*',
      'config.ru',
      '*.gemspec',
-     '*_requirements.txt',
      'README.md',
      'LICENSE.txt',
      'exec/**/*',
@@ -30,12 +26,11 @@ Gem::Specification.new do |gem|
    gem.add_dependency 'opener-daemons', '~> 2.2'
    gem.add_dependency 'opener-webservice', '~> 2.1'
    gem.add_dependency 'opener-core', '~> 2.2'
-
-   gem.add_dependency 'rake'
-   gem.add_dependency 'nokogiri'
-   gem.add_dependency 'cliver'
-   gem.add_dependency 'slop', '~> 3.5'
+
+   gem.add_dependency 'oga'
 
    gem.add_development_dependency 'rspec', '~> 3.0'
    gem.add_development_dependency 'cucumber'
+   gem.add_development_dependency 'rake'
+   gem.add_development_dependency 'benchmark-ips', '~> 2.0'
  end
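
The dependency changes above are the heart of the release: the Python core goes away (and with it the ext/hack/Rakefile pre-install hook and the rake, nokogiri, cliver, and slop runtime dependencies), leaving oga as the sole XML dependency. For anyone porting code that followed this gem, a small sketch of the attribute-access difference between the two XML libraries; the Nokogiri line reflects its usual API and is kept as a comment so the snippet only needs oga installed.

require 'oga'

raw = %q{<term tid="t1" lemma="good"/>}

# Old style (Nokogiri):
# Nokogiri::XML(raw).at_xpath('term')['tid']     # => "t1"

# New style (Oga), as used throughout Term above:
Oga.parse_xml(raw).xpath('term').first.get('tid') # => "t1"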
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-opinion-detector-basic
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.7
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-17 00:00:00.000000000 Z
11
+ date: 2015-06-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -53,7 +53,7 @@ dependencies:
53
53
  prerelease: false
54
54
  type: :runtime
55
55
  - !ruby/object:Gem::Dependency
56
- name: rake
56
+ name: oga
57
57
  version_requirements: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - '>='
@@ -67,21 +67,21 @@ dependencies:
67
67
  prerelease: false
68
68
  type: :runtime
69
69
  - !ruby/object:Gem::Dependency
70
- name: nokogiri
70
+ name: rspec
71
71
  version_requirements: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ~>
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: '3.0'
76
76
  requirement: !ruby/object:Gem::Requirement
77
77
  requirements:
78
- - - '>='
78
+ - - ~>
79
79
  - !ruby/object:Gem::Version
80
- version: '0'
80
+ version: '3.0'
81
81
  prerelease: false
82
- type: :runtime
82
+ type: :development
83
83
  - !ruby/object:Gem::Dependency
84
- name: cliver
84
+ name: cucumber
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - '>='
@@ -93,47 +93,33 @@ dependencies:
93
93
  - !ruby/object:Gem::Version
94
94
  version: '0'
95
95
  prerelease: false
96
- type: :runtime
96
+ type: :development
97
97
  - !ruby/object:Gem::Dependency
98
- name: slop
98
+ name: rake
99
99
  version_requirements: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - ~>
101
+ - - '>='
102
102
  - !ruby/object:Gem::Version
103
- version: '3.5'
103
+ version: '0'
104
104
  requirement: !ruby/object:Gem::Requirement
105
105
  requirements:
106
- - - ~>
106
+ - - '>='
107
107
  - !ruby/object:Gem::Version
108
- version: '3.5'
108
+ version: '0'
109
109
  prerelease: false
110
- type: :runtime
110
+ type: :development
111
111
  - !ruby/object:Gem::Dependency
112
- name: rspec
112
+ name: benchmark-ips
113
113
  version_requirements: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - ~>
116
116
  - !ruby/object:Gem::Version
117
- version: '3.0'
117
+ version: '2.0'
118
118
  requirement: !ruby/object:Gem::Requirement
119
119
  requirements:
120
120
  - - ~>
121
121
  - !ruby/object:Gem::Version
122
- version: '3.0'
123
- prerelease: false
124
- type: :development
125
- - !ruby/object:Gem::Dependency
126
- name: cucumber
127
- version_requirements: !ruby/object:Gem::Requirement
128
- requirements:
129
- - - '>='
130
- - !ruby/object:Gem::Version
131
- version: '0'
132
- requirement: !ruby/object:Gem::Requirement
133
- requirements:
134
- - - '>='
135
- - !ruby/object:Gem::Version
136
- version: '0'
122
+ version: '2.0'
137
123
  prerelease: false
138
124
  type: :development
139
125
  description: Basic Opinion Detector.
@@ -142,21 +128,20 @@ executables:
142
128
  - opinion-detector-basic
143
129
  - opinion-detector-basic-daemon
144
130
  - opinion-detector-basic-server
145
- extensions:
146
- - ext/hack/Rakefile
131
+ extensions: []
147
132
  extra_rdoc_files: []
148
133
  files:
149
- - core/opinion_detector_basic_multi.py
150
- - ext/hack/Rakefile
151
134
  - lib/opener/opinion_detector_basic.rb
152
135
  - lib/opener/opinion_detector_basic/cli.rb
136
+ - lib/opener/opinion_detector_basic/opinion.rb
137
+ - lib/opener/opinion_detector_basic/processor.rb
153
138
  - lib/opener/opinion_detector_basic/server.rb
139
+ - lib/opener/opinion_detector_basic/term.rb
154
140
  - lib/opener/opinion_detector_basic/version.rb
155
141
  - lib/opener/opinion_detector_basic/public/markdown.css
156
142
  - lib/opener/opinion_detector_basic/views/index.erb
157
143
  - config.ru
158
144
  - opener-opinion-detector-basic.gemspec
159
- - pre_install_requirements.txt
160
145
  - README.md
161
146
  - LICENSE.txt
162
147
  - exec/opinion-detector-basic.rb
core/opinion_detector_basic_multi.py DELETED

@@ -1,512 +0,0 @@
- #!/usr/bin/env python
-
- import sys
- import getopt
- import os
-
- this_folder = os.path.dirname(os.path.realpath(__file__))
-
- # This updates the load path to ensure that the local site-packages directory
- # can be used to load packages (e.g. a locally installed copy of lxml).
- sys.path.append(os.path.join(this_folder, 'site-packages/pre_install'))
-
- from VUKafParserPy import KafParser
- from collections import defaultdict
- import operator
- import pprint
- import lxml
- from lxml import etree
- import logging
-
-
-
- def mix_lists(l1,l2):
-     newl=[]
-     min_l = min(len(l1),len(l2))
-     for x in range(min_l):
-         newl.append(l1[x])
-         newl.append(l2[x])
-
-     if len(l1)>len(l2):
-         newl.extend(l1[min_l:])
-     elif len(l2)>len(l1):
-         newl.extend(l2[min_l:])
-     return newl
-
-
- class OpinionExpression:
-     def __init__(self,spans,sentence,value):
-         self.ids = spans
-         self.sentence = sentence
-         self.value = value
-         self.target_ids = []
-         self.candidates_r=[]
-         self.candidates_l=[]
-         self.holder = []
-
-     def __repr__(self):
-         r='Ids:'+'#'.join(self.ids)+' Sent:'+self.sentence+' Value:'+str(self.value)+ ' Target:'+'#'.join(self.target_ids)+'\n'
-         r+='Right cand: '+str(self.candidates_r)+'\n'
-         r+='Left cand: '+str(self.candidates_l)+'\n'
-         return r
-
- class MyToken:
-     def __init__(self,id,lemma,pos,polarity,sent_mod,sent):
-         self.id = id
-         self.lemma = lemma
-         self.pos = pos
-         self.polarity = polarity
-         self.sent_mod = sent_mod
-         self.sentence = sent
-         self.use_it = True
-         self.list_ids = [id]
-         self.value = 0
-
-
-         if polarity == 'positive':
-             self.value = 1
-         elif polarity == 'negative':
-             self.value = -1
-
-         if sent_mod == 'intensifier':
-             self.value = 2
-         elif sent_mod == 'shifter':
-             self.value = -1
-
-
-     def isNegator(self):
-         return self.sent_mod == 'shifter'
-
-
-
-     def isIntensifier(self):
-         return self.sent_mod == 'intensifier'
-
-
-     def is_opinion_expression(self):
-         return self.use_it and self.polarity is not None
-
-
-     def __repr__(self):
-         if self.use_it:
-             return self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
-         else:
-             return '\t'+self.id+' lemma:'+self.lemma.encode('utf-8')+'.'+self.pos.encode('utf-8')+' pol:'+str(self.polarity)+' sentmod:'+str(self.sent_mod)+' sent:'+self.sentence+' use:'+str(self.use_it)+' list:'+'#'.join(self.list_ids)+' val:'+str(self.value)
-
-
-
- def obtain_opinion_expressions(tokens,lang='nl'):
-     logging.debug(' Obtaining opinion expressions')
-     my_tokens = tokens[:]
-
-     accumulate_several_modifiers = True
-     apply_modifiers = True
-     apply_conjunctions = True
-
-     ## Acumulate doble/triple intensifiers or negators
-     if accumulate_several_modifiers:
-         logging.debug(' Accumulating modifiers')
-         t = 0
-         while t < len(my_tokens):
-             if t+1 < len(my_tokens):
-                 if (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()) and my_tokens[t+1].isNegator():
-                     my_tokens[t+1].value *= my_tokens[t].value
-                     my_tokens[t].use_it = False
-                     my_tokens[t+1].list_ids += my_tokens[t].list_ids
-                     logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
-                 elif my_tokens[t].isNegator() and my_tokens[t+1].isIntensifier():
-                     my_tokens[t+1].value *= -1
-                     my_tokens[t].use_it = False
-                     my_tokens[t+1].list_ids += my_tokens[t].list_ids
-                     logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
-                 elif my_tokens[t].isIntensifier() and my_tokens[t+1].isIntensifier():
-                     if my_tokens[t].value >= 0:
-                         my_tokens[t+1].value = my_tokens[t].value + my_tokens[t+1].value
-                     else:
-                         my_tokens[t+1].value = my_tokens[t].value - my_tokens[t+1].value
-                     my_tokens[t].use_it = False
-                     my_tokens[t+1].list_ids += my_tokens[t].list_ids
-                     logging.debug(' Accumulating '+'-'.join(my_tokens[t+1].list_ids))
-
-             t+=1
-     ###########################################
-
-     ##Apply intensifiers/negators over the next elements
-     if apply_modifiers:
-         logging.debug(' Applying modifiers')
-         t = 0
-         while t < len(my_tokens):
-             if my_tokens[t].use_it and (my_tokens[t].isNegator() or my_tokens[t].isIntensifier()):
-                 ## Try to modify the next token:
-                 if t+1<len(my_tokens):
-                     #print 'Score: ',my_tokens[t]
-                     my_tokens[t+1].value *= my_tokens[t].value
-                     my_tokens[t+1].list_ids += my_tokens[t].list_ids
-                     my_tokens[t].use_it = False
-                     logging.debug(' Applied modifier over '+'-'.join(my_tokens[t+1].list_ids))
-             t += 1
-     ###########################################
-
-     if apply_conjunctions:
-         if lang=='nl':
-             concat = [',','en']
-         elif lang=='en':
-             concat = [',','and']
-         elif lang=='es':
-             concat = [',','y','e']
-         elif lang=='it':
-             concat = [',','e','ed']
-         elif lang=='de':
-             concat = [',','und']
-         elif lang == 'fr':
-             concat=[',','et']
-         logging.debug(' Applying conjunctions:'+str(concat))
-
-
-         t = 0
-         while t < len(my_tokens):
-             if my_tokens[t].use_it and my_tokens[t].value!=0: ## Find the first one
-                 #print 'FOUND ',my_tokens[t]
-                 logging.debug(' Found token '+str(my_tokens[t]))
-                 list_aux = my_tokens[t].list_ids
-                 used = [t]
-                 value_aux = my_tokens[t].value
-                 my_tokens[t].use_it = False
-                 #print 'Modified',my_tokens[t]
-
-                 x = t+1
-                 while True:
-                     if x>=len(my_tokens):
-                         break
-
-                     if my_tokens[x].lemma in concat:
-                         ## list_aux += my_tokens[x].list_ids Dont use it as part of the OE
-                         my_tokens[x].use_it = False
-                         x+=1
-                     elif (my_tokens[x].use_it and my_tokens[x].value!=0):
-                         #print '\Also ',my_tokens[x]
-                         logging.debug(' Found token '+str(my_tokens[x]))
-                         list_aux += my_tokens[x].list_ids
-
-                         used.append(x)
-                         my_tokens[x].use_it = False
-                         value_aux += my_tokens[x].value
-                         x += 1
-                     else:
-                         break
-                 #print 'OUT OF THE WHILE'
-                 ##The last one in the list used is the one accumulating all
-
-                 last_pos = used[-1]
-                 my_tokens[last_pos].value = value_aux
-                 my_tokens[last_pos].list_ids = list_aux
-                 my_tokens[last_pos].use_it = True
-                 logging.debug(' Regenerating '+str(my_tokens[last_pos]))
-                 t = x ## next token
-                 #print
-                 #print
-             t += 1
-
-
-     ## Create OpinionExpression
-     my_opinion_exps = []
-     logging.debug(' Generating output')
-     for token in my_tokens:
-         if token.use_it and token.value != 0:
-             op_exp = OpinionExpression(token.list_ids,token.sentence,token.value)
-             my_opinion_exps.append(op_exp)
-     return my_opinion_exps
-
-
- '''
- def get_distance(id1, id2):
-     pos1 = int(id1[id1.find('_')+1:])
-     pos2 = int(id2[id2.find('_')+1:])
-     if pos1>pos2:
-         return pos1-pos2
-     else:
-         return pos2-pos1
- '''
-
-
- def obtain_holders(ops_exps,sentences,lang):
-     if lang=='nl':
-         holders = ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun']
-     elif lang=='en':
-         holders = ['i','we','he','she','they','it','you']
-     elif lang =='es':
-         holders = ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras']
-     elif lang =='it':
-         holders = ['io','tu','noi','voi','loro','lei','lui']
-     elif lang == 'de':
-         holders = ['ich','du','wir','ihr','sie','er']
-     elif lang == 'fr':
-         holders = ['je','tu','lui','elle','nous','vous','ils','elles']
-
-     logging.debug('Obtaining holders with list: '+str(holders))
-
-     for oe in ops_exps:
-         sent = oe.sentence
-         list_terms = sentences[str(sent)]
-         for lemma, pos, term_id in list_terms:
-             if lemma in holders:
-                 oe.holder.append(term_id)
-                 logging.debug(' Selected for '+str(oe)+' holder'+lemma+' '+term_id)
-                 break
-
-
-
-
- #This is specific for the basic version
- def filter_candidates(candidates,ids_oe):
-     ##filtered = [(lemma, pos,term_id) for (lemma,pos, term_id) in candidates if len(lemma)>=4 and term_id not in ids_oe]
-     filtered = [(lemma,pos,id) for (lemma,pos,id) in candidates if pos in ['N','R']]
-     return filtered
-
- def obtain_targets_improved(ops_exps,sentences):
-     logging.debug(' Obtaining targets improved')
-     #print>>sys.stderr,'#'*40
-     #print>>sys.stderr,'#'*40
-
-     #print>>sys.stderr,'Beginning with obtain targets'
-     ##sentences --> dict [str(numsent)] ==> list of (lemma, term)id
-
-     all_ids_in_oe = []
-     for oe in ops_exps:
-         all_ids_in_oe.extend(oe.ids)
-     #print>>sys.stderr,'All list of ids in oe',all_ids_in_oe
-
-     for oe in ops_exps:
-         #print>>sys.stderr,'\tOE:',oe
-         logging.debug(' OpExp: '+str(oe))
-
-         ids_in_oe = oe.ids
-         sent = oe.sentence
-         list_terms = sentences[str(sent)]
-         #print>>sys.stderr,'\t\tTerms in sent:',list_terms
-
-         ###########################################
-         #First rule: noun to the right within maxdistance tokens
-         max_distance_right = 3
-         biggest_index = -1
-         for idx, (lemma,pos,term_id) in enumerate(list_terms):
-             if term_id in ids_in_oe:
-                 biggest_index = idx
-
-         #print>>sys.stderr,'\t\tBI',biggest_index
-         if biggest_index+1 >= len(list_terms): ## is the last element and we shall skip it
-             #print>>sys.stderr,'\t\tNot possible to apply 1st rule'
-             pass
-         else:
-             candidates=list_terms[biggest_index+1:min(biggest_index+1+max_distance_right,len(list_terms))]
-             ##Filter candidates
-             #print>>sys.stderr,'\t\tCandidates for right rule no filter',candidates
-             #oe.__candidates_right = [(lemma, term_id) for (lemma, term_id) in candidates if len(lemma)>=4 and term_id not in all_ids_in_oe]
-             oe.candidates_r = filter_candidates(candidates,all_ids_in_oe)
-             logging.debug(' Candidates filtered right'+str(oe.candidates_r))
-             #print>>sys.stderr,'\t\tCandidates for right rule no filter',oe.__candidates_right
-
-         ######################################################################################
-
-
-         ###########################################
-         max_distance_left = 3
-         smallest_index = 0
-         for idx,(lemma,pos,term_id) in enumerate(list_terms):
-             if term_id in ids_in_oe:
-                 smallest_index = idx
-                 break
-         #print>>sys.stderr,'Smalles index:',smallest_index
-         if smallest_index == 0:
-             #print>>sys.stderr,'\t\tNot possible to apply left rule'
-             pass
-         else:
-             candidates = list_terms[max(0,smallest_index-1-max_distance_left):smallest_index]
-             ##Filter candidates
-             #print>>sys.stderr,'\t\tCandidates for left rule no filter',candidates
-
-             oe.candidates_l = filter_candidates(candidates,all_ids_in_oe)
-             logging.debug(' Candidates filtered left: '+str(oe.candidates_l))
-
-     ######################################################################################
-
-     #print>>sys.stderr,'#'*40
-     #print>>sys.stderr,'#'*40
-
-     ## filling or.target_ids
-     assigned_as_targets = []
-
-     # First we assing to all the first in the right, if any, and not assigned
-     logging.debug(' Applying first to the right rule')
-     for oe in ops_exps:
-         #print>>sys.stderr,'A ver ',oe
-         if len(oe.candidates_r) !=0:
-             lemma, pos, id = oe.candidates_r[0]
-             if id not in assigned_as_targets:
-                 oe.target_ids.append(id)
-                 ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target moe than once
-                 logging.debug(' OpExp '+str(oe)+' selected '+id)
-                 #print>>sys.stderr,'Asignamos',id
-
-     logging.debug(' Applying most close rule')
-     for oe in ops_exps:
-         if len(oe.target_ids) == 0: # otherwise it's solved
-             intercalados_list = mix_lists([id for _,_,id in oe.candidates_r],[id for _,_,id in oe.candidates_l])
-             for id in intercalados_list:
-                 if id not in assigned_as_targets:
-                     oe.target_ids.append(id)
-                     ###assigned_as_targets.append(id) #Uncomment to avoid selection of the same target moe than once
-                     logging.debug(' OpExp '+str(oe)+' selected '+id)
-                     break
-
- ######## MAIN ROUTINE ############
-
- ## Check if we are reading from a pipeline
- if sys.stdin.isatty():
-     print>>sys.stderr,'Input stream required.'
-     print>>sys.stderr,'Example usage: cat myUTF8file.kaf.xml |',sys.argv[0]
-     sys.exit(-1)
- ########################################
-
- logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s - %(message)s',level=logging.DEBUG)
-
- ## Processing the parameters
- my_time_stamp = True
- remove_opinions = True
- opinion_strength = True
- try:
-     opts, args = getopt.getopt(sys.argv[1:],"",["no-time","no-remove-opinions","no-opinion-strength"])
-     for opt, arg in opts:
-         if opt == "--no-time":
-             my_time_stamp = False
-         elif opt == "--no-remove-opinions":
-             remove_opinions = False
-         elif opt == "--no-opinion-strength":
-             opinion_strength = False
- except getopt.GetoptError:
-     pass
- #########################################
-
- logging.debug('Include timestamp: '+str(my_time_stamp))
-
- # Parsing the KAF file
- try:
-     my_kaf_tree = KafParser(sys.stdin)
- except Exception as e:
-     print>>sys.stderr,'Error parsing input'
-     print>>sys.stderr,'Stream input must be a valid KAF file'
-     print>>sys.stderr,'Error: ',str(e)
-     sys.exit(-1)
-
-
- lang = my_kaf_tree.getLanguage()
- ## Creating data structure
- sentences = defaultdict(list)
- my_tokens = []
-
-
- # CREATE the datastructure for the tokens
- n=0
- lemma_for_tid = {}
- for term in my_kaf_tree.getTerms():
-     n+=1
-     term_id = term.getId()
-     lemma = term.getLemma()
-     lemma_for_tid[term_id] = lemma
-     kaf_pos = term.getPos()
-     #print>>sys.stderr,kaf_pos
-     list_span = term.get_list_span() ## List of token ids in the span layer of the term
-     sentiment = term.getSentiment()
-     polarity = sent_mod = None
-     if sentiment is not None:
-         polarity = sentiment.getPolarity()
-         sent_mod = sentiment.getSentimentModifier()
-     sentence = my_kaf_tree.getToken(list_span[0]).get('sent') ## The sentence of the first token element in span
-     my_tokens.append(MyToken(term_id,lemma,kaf_pos,polarity,sent_mod,sentence))
-
-     sentences[str(sentence)].append((lemma,kaf_pos,term_id))
- #############################
-
- logging.debug('Num terms loaded: '+str(n))
- logging.debug('Num sentences: '+str(len(sentences)))
-
-
- logging.debug('Obtaining opinion expressions')
- my_ops_exps = obtain_opinion_expressions(my_tokens,lang)
- print>>sys.stderr,my_ops_exps
-
- logging.debug('Obtaining targets')
- obtain_targets_improved(my_ops_exps,sentences)
-
-
- logging.debug('Obtaining holders')
- obtain_holders(my_ops_exps,sentences,lang)
-
-
-
-
- ## Create the elements
- logging.debug('Generating KAF output')
-
- if remove_opinions:
-     my_kaf_tree.remove_opinion_layer()
-
- for oe in my_ops_exps:
-     op_ele = etree.Element('opinion')
-
-     ## Holder
-     if len(oe.holder)!=0:
-         oe.holder.sort()
-         c = ' '.join(lemma_for_tid[tid] for tid in oe.holder)
-         op_hol = etree.Element('opinion_holder')
-         op_hol.append(etree.Comment(c))
-         op_ele.append(op_hol)
-         span_op_hol = etree.Element('span')
-         op_hol.append(span_op_hol)
-         for id in oe.holder:
-             span_op_hol.append(etree.Element('target',attrib={'id':id}))
-
-     ## Target
-     op_tar = etree.Element('opinion_target')
-     op_ele.append(op_tar)
-
-
-     if len(oe.target_ids)!=0: ## if there are no targets, there is no opinion eleemnt
-         oe.target_ids.sort()
-         c = ' '.join(lemma_for_tid[tid] for tid in oe.target_ids)
-         op_tar.append(etree.Comment(c))
-         span_op_tar = etree.Element('span')
-         op_tar.append(span_op_tar)
-         for id in oe.target_ids:
-             span_op_tar.append(etree.Element('target',attrib={'id':id}))
-
-     #Expression
-     if oe.value > 0: pol = 'positive'
-     elif oe.value < 0: pol = 'negative'
-     else: pol = 'neutral'
-
-     op_exp = etree.Element('opinion_expression')
-     op_exp.set('polarity',pol)
-     if opinion_strength:
-         op_exp.set('strength',str(oe.value))
-
-     op_ele.append(op_exp)
-     oe.ids.sort()
-     c = ' '.join(lemma_for_tid[tid] for tid in oe.ids)
-     op_exp.append(etree.Comment(c))
-     span_exp = etree.Element('span')
-     op_exp.append(span_exp)
-     for id in oe.ids:
-         span_exp.append(etree.Element('target',attrib={'id':id}))
-
-     ##Append the op_ele to the opinions layer
-     my_kaf_tree.addElementToLayer('opinions', op_ele)
-
-
- my_kaf_tree.addLinguisticProcessor('Basic opinion detector with Pos','1.0','opinions', my_time_stamp)
- my_kaf_tree.saveToFile(sys.stdout)
- logging.debug('Process finished')
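
The deleted script's pipeline (accumulate stacked modifiers, apply each modifier to the following token, merge conjunction-joined expressions, then pick holders and targets) presumably moves into the new lib/opener/opinion_detector_basic/processor.rb, which this comparison does not display. As a rough illustration only, here is the modifier-application pass rendered against the new Term API; the Processor class and apply_modifiers method names are hypothetical, not taken from the gem.

module Opener
  class OpinionDetectorBasic
    # Hypothetical sketch of processor logic; the shipped processor.rb is
    # not shown in this diff.
    class Processor
      # `terms` is assumed to be an Array of Term objects in document order.
      def apply_modifiers(terms)
        terms.each_cons(2) do |current, following|
          next unless current.use && (current.is_shifter? || current.is_intensifier?)

          # Mirrors the Python pass: fold the modifier's value into the next
          # token (my_tokens[t+1].value *= my_tokens[t].value) and merge the
          # id lists, then retire the modifier term.
          following.accumulated_strength *= current.accumulated_strength
          following.list_ids += current.list_ids
          current.use = false
        end
      end
    end
  end
end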