opener-opinion-detector-basic 3.2.2 → 3.2.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,136 @@
module Opener
  class OpinionDetectorBasic
    ##
    # Opinion detector that derives term strength from the order of the
    # terms: each term is combined with the term that directly follows it.
    #
    class LegacyProcessor < BaseProcessor

      ##
      # Builds the opinions on first use and returns the cached list on
      # every later call.
      #
      def opinions
        return @opinions if @opinions

        set_accumulated_strength
        apply_modifiers
        apply_conjunctions

        ##
        # One opinion per expression term that still carries a non-zero
        # accumulated strength.
        #
        @opinions = document.terms.map do |term|
          Kaf::Opinion.new term if term.is_expression? && term.accumulated_strength != 0
        end.compact

        ##
        # Targets are resolved for every opinion before any holder is.
        #
        @opinions.each { |opinion| opinion.obtain_targets sentences }
        @opinions.each { |opinion| opinion.obtain_holders sentences, document.language }

        @opinions
      end

      protected

      ##
      # Walks over every pair of neighbouring terms and folds the strength of
      # the first one into the second one when the second one is a shifter
      # (multiplication) or an intensifier (the operator chosen by the
      # previous step). The operator for the next step is updated on every
      # pair.
      #
      def set_accumulated_strength
        operator    = :+
        final_index = terms.count - 1

        terms.each_with_index do |current, index|
          # The last term has no successor to combine with.
          next if index >= final_index

          following = terms[index + 1]

          if following.is_shifter?
            if current.accumulated_strength == 0
              operator = :*
            else
              following.accumulated_strength *= current.accumulated_strength
              following.list_ids += current.list_ids
              current.use = false
              operator = following.accumulated_strength > 0 ? :+ : :-
            end
          elsif following.is_intensifier?
            following.accumulated_strength =
              current.accumulated_strength.send(operator, following.accumulated_strength)
            current.use = false
            operator = following.accumulated_strength > 0 ? :+ : :-
            following.list_ids += current.list_ids unless current.accumulated_strength == 0
          else
            operator = following.accumulated_strength >= 0 ? :+ : :-
          end
        end
      end

      ##
      # Multiplies the strength of every still-used shifter or intensifier
      # into the term that follows it, and marks the modifier as consumed.
      #
      def apply_modifiers
        final_index = terms.count - 1

        terms.each_with_index do |modifier, index|
          next if index >= final_index
          next unless modifier.use && (modifier.is_shifter? || modifier.is_intensifier?)

          following = terms[index + 1]
          following.accumulated_strength *= modifier.accumulated_strength
          following.list_ids += modifier.list_ids
          modifier.use = false
        end
      end

      ##
      # Merges runs of used, non-zero terms that are separated only by
      # conjunctions. The summed strength and the combined ids end up on the
      # last term of the run, which is the only one left marked as used.
      #
      def apply_conjunctions
        total  = terms.count
        cursor = 0

        while cursor < total
          first = terms[cursor]

          if first.use && first.accumulated_strength != 0
            merged_ids      = first.list_ids
            merged_strength = first.accumulated_strength
            last_member     = cursor
            first.use       = false

            scan = cursor + 1
            while scan < total
              candidate = terms[scan]

              if candidate.is_conjunction
                candidate.use = false
              elsif candidate.use && candidate.accumulated_strength != 0
                merged_ids      += candidate.list_ids
                merged_strength += candidate.accumulated_strength
                last_member      = scan
                candidate.use    = false
              else
                break
              end

              scan += 1
            end

            carrier = terms[last_member]
            carrier.accumulated_strength = merged_strength
            carrier.list_ids = merged_ids
            carrier.use = true

            # Resume after the term that ended the run (the increment below
            # steps past it).
            cursor = scan
          end

          cursor += 1
        end
      end

    end
  end
end
@@ -1,326 +1,38 @@
module Opener
  class OpinionDetectorBasic
    ##
    # Class that detects opinions in a given input KAF file.
    #
    # Term strength is adjusted from each term's `head_term`.
    #
    class Processor < BaseProcessor

      ##
      # Builds the opinions on first use and returns the cached list on
      # every later call.
      #
      # @return [Array] the opinions created from the document's terms.
      #
      def opinions
        return @opinions if @opinions

        ##
        # Initialize opinions with their expressions.
        #
        @opinions = document.terms.map do |term|
          next unless term.is_expression? && term.accumulated_strength != 0
          Kaf::Opinion.new term
        end.compact

        # NOTE(review): strengths are adjusted only after the non-zero filter
        # above has run; confirm that this ordering is intended.
        set_accumulated_strength

        # Return the opinions explicitly: without this line the first call
        # would return the result of set_accumulated_strength (the terms),
        # while later calls return the memoized opinions.
        @opinions
      end

      ##
      # Adjusts the accumulated strength of every term based on its head
      # term: a shifter head flips the sign, an intensifier head adds its own
      # strength. In both cases the head's ids are merged into the term.
      #
      def set_accumulated_strength
        terms.each do |term|
          head = term.head_term
          # Assumes head_term can be nil for a term without a head -- TODO
          # confirm against the term implementation.
          next unless head

          if head.is_shifter?
            term.accumulated_strength *= -1
            term.list_ids += head.list_ids
          elsif head.is_intensifier?
            term.accumulated_strength += head.accumulated_strength
            term.list_ids += head.list_ids
          end
        end
      end

    end
  end
end