opener-opinion-detector-basic 3.2.2 → 3.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/opener/opinion_detector_basic.rb +13 -2
- data/lib/opener/opinion_detector_basic/base_processor.rb +56 -0
- data/lib/opener/opinion_detector_basic/kaf/document.rb +146 -0
- data/lib/opener/opinion_detector_basic/kaf/opinion.rb +152 -0
- data/lib/opener/opinion_detector_basic/kaf/term.rb +185 -0
- data/lib/opener/opinion_detector_basic/legacy_processor.rb +136 -0
- data/lib/opener/opinion_detector_basic/processor.rb +22 -310
- data/lib/opener/opinion_detector_basic/version.rb +1 -1
- data/opener-opinion-detector-basic.gemspec +3 -1
- metadata +37 -13
- data/lib/opener/opinion_detector_basic/opinion.rb +0 -170
- data/lib/opener/opinion_detector_basic/term.rb +0 -159
@@ -0,0 +1,136 @@
|
|
1
|
+
module Opener
|
2
|
+
class OpinionDetectorBasic
|
3
|
+
##
|
4
|
+
# Class that detects opinions in a given input KAF file.
|
5
|
+
#
|
6
|
+
class LegacyProcessor < BaseProcessor
|
7
|
+
|
8
|
+
##
# Builds (once) and returns the opinions found in the document.
#
# On the first call the term strengths are prepared by running
# `set_accumulated_strength`, `apply_modifiers` and `apply_conjunctions`
# in that order. An opinion is then created for every term that is an
# expression and has a non-zero accumulated strength, after which the
# targets and holders of every opinion are resolved. The result is
# memoized in `@opinions`.
#
# @return [Array<Kaf::Opinion>]
#
def opinions
  return @opinions if @opinions

  set_accumulated_strength
  apply_modifiers
  apply_conjunctions

  # Terms that are not expressions or that carry no strength yield nil
  # and are dropped by `compact`.
  @opinions = document.terms.map do |term|
    next unless term.is_expression? && term.accumulated_strength != 0

    Kaf::Opinion.new term
  end.compact

  # Targets are resolved for every opinion before any holder is looked
  # up, hence the two separate passes.
  @opinions.each do |opinion|
    opinion.obtain_targets(sentences)
  end

  @opinions.each do |opinion|
    opinion.obtain_holders(sentences, document.language)
  end

  @opinions
end
|
39
|
+
|
40
|
+
protected
|
41
|
+
|
42
|
+
##
# Walks over every pair of adjacent terms and, when the second term of
# the pair is a shifter or an intensifier, folds the strength of the
# first term into it:
#
# * next term is a shifter: its strength is multiplied by the strength
#   of the current term, unless the current term has no strength, in
#   which case only the pending operator becomes `:*`.
# * next term is an intensifier: the current strength is combined with
#   the next strength using the pending operator (`:+`, `:-` or `:*`).
#
# A term whose strength was folded into its successor is flagged with
# `use = false`. The pending operator is recomputed after every pair
# from the sign of the successor's strength.
#
# @return [Array] the term list that was processed
#
def set_accumulated_strength
  # Operator applied the next time an intensifier is encountered.
  symbol = :+

  terms.each_cons(2) do |term, successor|
    if successor.is_shifter?
      if term.accumulated_strength != 0
        successor.accumulated_strength *= term.accumulated_strength
        successor.list_ids += term.list_ids
        term.use = false
        symbol = successor.accumulated_strength > 0 ? :+ : :-
      else
        symbol = :*
      end
    elsif successor.is_intensifier?
      successor.accumulated_strength =
        term.accumulated_strength.send(symbol, successor.accumulated_strength)
      term.use = false
      symbol = successor.accumulated_strength > 0 ? :+ : :-

      # Only terms that actually contributed strength pass on their ids.
      successor.list_ids += term.list_ids if term.accumulated_strength != 0
    else
      symbol = successor.accumulated_strength >= 0 ? :+ : :-
    end
  end

  # Keep returning the term list, as the index-based loop did.
  terms
end
|
76
|
+
|
77
|
+
##
# Applies the strength of every still-active shifter or intensifier to
# the term that directly follows it.
#
# For each pair of adjacent terms, when the first one is flagged as in
# use and is a shifter or an intensifier, the strength of the following
# term is multiplied by the modifier's strength, the modifier's ids are
# appended to the following term's ids and the modifier is flagged with
# `use = false`. A modifier in the last position has no successor and
# is left untouched.
#
# @return [Array] the term list that was processed
#
def apply_modifiers
  terms.each_cons(2) do |term, successor|
    next unless term.use && (term.is_shifter? || term.is_intensifier?)

    successor.accumulated_strength *= term.accumulated_strength
    successor.list_ids += term.list_ids
    term.use = false
  end

  # Keep returning the term list, as the index-based loop did.
  terms
end
|
92
|
+
|
93
|
+
##
# Merges runs of active, non-zero-strength terms that are only separated
# by conjunctions into a single term.
#
# Starting from an active term with a non-zero strength, the following
# terms are consumed as long as they are either conjunctions or active
# terms with a non-zero strength. Every consumed term is flagged with
# `use = false`; the summed strength and the concatenated ids of the run
# are stored on the last non-conjunction term of the run, which is
# flagged with `use = true` again.
#
# @return [nil]
#
def apply_conjunctions
  terms_count = terms.count
  i = 0

  while i < terms_count
    first = terms[i]

    if first.use && first.accumulated_strength != 0
      last_used = i
      list_ids  = first.list_ids
      strength  = first.accumulated_strength
      first.use = false

      j = i + 1

      while j < terms_count
        candidate = terms[j]

        # NOTE(review): `is_conjunction` is called without a trailing `?`,
        # unlike the other predicates; presumably a plain attribute on the
        # term - confirm against the term class.
        if candidate.is_conjunction
          candidate.use = false
        elsif candidate.use && candidate.accumulated_strength != 0
          list_ids += candidate.list_ids
          strength += candidate.accumulated_strength
          candidate.use = false
          last_used = j
        else
          break
        end

        j += 1
      end

      merged = terms[last_used]
      merged.accumulated_strength = strength
      merged.list_ids = list_ids
      merged.use = true

      # The term at `j` (if any) ended the run, so it cannot start a new
      # one; together with the increment below it is skipped.
      i = j
    end

    i += 1
  end
end
|
133
|
+
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
@@ -1,326 +1,38 @@
|
|
1
|
-
require_relative 'term'
|
2
|
-
require_relative 'opinion'
|
3
|
-
|
4
1
|
module Opener
|
5
2
|
class OpinionDetectorBasic
|
6
3
|
##
|
7
4
|
# Class that detects opinions in a given input KAF file.
|
8
5
|
#
|
9
|
-
class Processor
|
10
|
-
attr_accessor :document, :timestamp, :opinion_strength, :pretty
|
11
|
-
|
12
|
-
##
|
13
|
-
# @param [String|IO] file The KAF file/input to process.
|
14
|
-
# @param [Hash] options. Options for timestamp and including strength to
|
15
|
-
# opinions.
|
16
|
-
# @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
|
17
|
-
# by default due to the performance overhead.
|
18
|
-
#
|
19
|
-
def initialize(file, options = {})
|
20
|
-
@document = Oga.parse_xml(file)
|
21
|
-
|
22
|
-
@timestamp = options[:timestamp]
|
23
|
-
@opinion_strength = options[:opinion_strength]
|
24
|
-
@pretty = options[:pretty] || false
|
25
|
-
|
26
|
-
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
27
|
-
end
|
28
|
-
|
29
|
-
##
|
30
|
-
# Processes the input and returns the new KAF output.
|
31
|
-
# @return [String]
|
32
|
-
#
|
33
|
-
def process
|
34
|
-
add_opinions_layer
|
35
|
-
|
36
|
-
index = 1
|
37
|
-
opinions.each do |opinion|
|
38
|
-
add_opinion(opinion, index)
|
39
|
-
index += 1
|
40
|
-
end
|
41
|
-
|
42
|
-
add_linguistic_processor
|
6
|
+
class Processor < BaseProcessor
|
43
7
|
|
44
|
-
pretty ? pretty_print(document) : document.to_xml
|
45
|
-
end
|
46
|
-
|
47
|
-
##
|
48
|
-
# Get the language of the input file.
|
49
|
-
#
|
50
|
-
# @return [String]
|
51
|
-
#
|
52
|
-
def language
|
53
|
-
@language ||= document.at_xpath('KAF').get('xml:lang')
|
54
|
-
end
|
55
|
-
|
56
|
-
##
|
57
|
-
# Get the terms from the input file
|
58
|
-
# @return [Hash]
|
59
|
-
#
|
60
|
-
def terms
|
61
|
-
@terms ||= document.xpath('KAF/terms/term').map do |term|
|
62
|
-
Term.new(term, document, language)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
##
|
67
|
-
# Get the opinions.
|
68
|
-
#
|
69
|
-
# @return [Hash]
|
70
|
-
#
|
71
8
|
##
# Builds (once) and returns the opinions found in the document.
#
# An opinion is created for every document term that is an expression
# and has a non-zero accumulated strength; `set_accumulated_strength` is
# then run. The result is memoized in `@opinions`.
#
# The first call used to return the value of `set_accumulated_strength`
# (the term list) instead of the opinions; every call now returns the
# opinions.
#
# @return [Array<Kaf::Opinion>]
#
def opinions
  return @opinions if @opinions

  # Terms that are not expressions or that carry no strength yield nil
  # and are dropped by `compact`.
  @opinions = document.terms.map do |term|
    next unless term.is_expression? && term.accumulated_strength != 0

    Kaf::Opinion.new term
  end.compact

  # NOTE(review): strengths are adjusted after the terms were filtered on
  # a non-zero strength; the order is kept as it was - confirm that this
  # is intended.
  set_accumulated_strength

  @opinions
end
|
139
21
|
|
140
|
-
##
|
141
|
-
# Method for adding opinion holders, targets and expressions.
|
142
|
-
#
|
143
|
-
def add_opinion_element(node, ids)
|
144
|
-
lemmas = terms.select{|t| ids.include?(t.id)}.map(&:lemma).join(" ")
|
145
|
-
comment = Oga::XML::Comment.new(:text => "#{lemmas}")
|
146
|
-
node.children << comment
|
147
|
-
span_node = new_node("span", node)
|
148
|
-
|
149
|
-
ids.each do |id|
|
150
|
-
target_node = new_node("target", span_node)
|
151
|
-
target_node.set('id', id.to_s)
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
##
|
156
|
-
# Add linguistic processor layer with basic information
|
157
|
-
# (version, timestamp, description etc) in the KAF file.
|
158
|
-
#
|
159
|
-
def add_linguistic_processor
|
160
|
-
description = 'Basic opinion detector with Pos'
|
161
|
-
last_edited = '13may2015'
|
162
|
-
version = '2.0'
|
163
|
-
|
164
|
-
node = new_node('linguisticProcessors', 'KAF/kafHeader')
|
165
|
-
node.set('layer', 'opinions')
|
166
|
-
|
167
|
-
lp_node = new_node('lp', node)
|
168
|
-
|
169
|
-
lp_node.set('version', "#{last_edited}-#{version}")
|
170
|
-
lp_node.set('name', description)
|
171
|
-
|
172
|
-
if timestamp
|
173
|
-
format = '%Y-%m-%dT%H:%M:%S%Z'
|
174
|
-
|
175
|
-
lp_node.set('timestamp', Time.now.strftime(format))
|
176
|
-
else
|
177
|
-
lp_node.set('timestamp', '*')
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
##
|
182
|
-
# Format the output document properly.
|
183
|
-
#
|
184
|
-
# TODO: this should be handled by Oga in a nice way.
|
185
|
-
#
|
186
|
-
# @return [String]
|
187
|
-
#
|
188
|
-
def pretty_print(document)
|
189
|
-
doc = REXML::Document.new document.to_xml
|
190
|
-
doc.context[:attribute_quote] = :quote
|
191
|
-
out = ""
|
192
|
-
formatter = REXML::Formatters::Pretty.new
|
193
|
-
formatter.compact = true
|
194
|
-
formatter.write(doc, out)
|
195
|
-
|
196
|
-
out.strip
|
197
|
-
end
|
198
|
-
|
199
|
-
##
|
200
|
-
# Get terms grouped by sentence.
|
201
|
-
#
|
202
|
-
def sentences
|
203
|
-
@sentences ||= terms.group_by{|t| t.sentence}
|
204
|
-
end
|
205
|
-
|
206
|
-
protected
|
207
|
-
|
208
|
-
##
|
209
|
-
# The strength of a term depends heavily on the type of the previous
|
210
|
-
# one. For example if the previous one is a shifter, it needs
|
211
|
-
# to be multiplied. If it's an intensifier, it needs to be
|
212
|
-
# added (or subtracted depending on the strength of the previous
|
213
|
-
# term) etc.
|
214
|
-
#
|
215
22
|
##
# Adjusts the strength of every term according to its head term:
#
# * head is a shifter: the term's strength is negated.
# * head is an intensifier: the head's strength is added to the term's
#   strength.
#
# In both cases the ids of the head are appended to the term's ids, so
# that the modifier is tracked together with the term it modifies. The
# previous code appended the term's own ids to itself, which only
# duplicated them.
#
# @return [Array] the term list that was processed
#
def set_accumulated_strength
  terms.each do |term|
    head = term.head_term

    # NOTE(review): presumably a term without a head yields nil here;
    # such terms are left untouched - confirm against the term class.
    next unless head

    if head.is_shifter?
      term.accumulated_strength *= -1
      term.list_ids += head.list_ids
    elsif head.is_intensifier?
      term.accumulated_strength += head.accumulated_strength
      term.list_ids += head.list_ids
    end
  end
end
|
258
35
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
def apply_conjunctions
|
263
|
-
terms_count = terms.count
|
264
|
-
i = 0
|
265
|
-
while i < terms_count
|
266
|
-
if terms[i].use && terms[i].accumulated_strength != 0
|
267
|
-
used = [i]
|
268
|
-
list_ids = terms[i].list_ids
|
269
|
-
strength = terms[i].accumulated_strength
|
270
|
-
terms[i].use = false
|
271
|
-
j = i+1
|
272
|
-
while true
|
273
|
-
if j >= terms_count
|
274
|
-
break
|
275
|
-
end
|
276
|
-
|
277
|
-
if terms[j].is_conjunction
|
278
|
-
terms[j].use = false
|
279
|
-
j += 1
|
280
|
-
elsif terms[j].use && terms[j].accumulated_strength != 0
|
281
|
-
list_ids += terms[j].list_ids
|
282
|
-
used << j
|
283
|
-
terms[j].use = false
|
284
|
-
strength += terms[j].accumulated_strength
|
285
|
-
j += 1
|
286
|
-
else
|
287
|
-
break
|
288
|
-
end
|
289
|
-
end
|
290
|
-
last_used = used.last
|
291
|
-
terms[last_used].accumulated_strength = strength
|
292
|
-
terms[last_used].list_ids = list_ids
|
293
|
-
terms[last_used].use = true
|
294
|
-
i = j
|
295
|
-
end
|
296
|
-
i += 1
|
297
|
-
end
|
298
|
-
end
|
299
|
-
|
300
|
-
##
|
301
|
-
# Creates a new node in the KAF file.
|
302
|
-
#
|
303
|
-
def new_node(tag, parent)
|
304
|
-
if parent.is_a?(String)
|
305
|
-
parent_node = document.at_xpath(parent)
|
306
|
-
else
|
307
|
-
parent_node = parent
|
308
|
-
end
|
309
|
-
|
310
|
-
node = Oga::XML::Element.new(:name => tag)
|
311
|
-
|
312
|
-
parent_node.children << node
|
313
|
-
|
314
|
-
node
|
315
|
-
end
|
316
|
-
|
317
|
-
##
|
318
|
-
# Check if input is a KAF file.
|
319
|
-
# @return [Boolean]
|
320
|
-
#
|
321
|
-
def is_kaf?
|
322
|
-
!!document.at_xpath('KAF')
|
323
|
-
end
|
324
|
-
end # Processor
|
325
|
-
end # OpinionDetectorBasic
|
326
|
-
end # Opener
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|