opener-opinion-detector-basic 3.2.2 → 3.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/opener/opinion_detector_basic.rb +13 -2
- data/lib/opener/opinion_detector_basic/base_processor.rb +56 -0
- data/lib/opener/opinion_detector_basic/kaf/document.rb +146 -0
- data/lib/opener/opinion_detector_basic/kaf/opinion.rb +152 -0
- data/lib/opener/opinion_detector_basic/kaf/term.rb +185 -0
- data/lib/opener/opinion_detector_basic/legacy_processor.rb +136 -0
- data/lib/opener/opinion_detector_basic/processor.rb +22 -310
- data/lib/opener/opinion_detector_basic/version.rb +1 -1
- data/opener-opinion-detector-basic.gemspec +3 -1
- metadata +37 -13
- data/lib/opener/opinion_detector_basic/opinion.rb +0 -170
- data/lib/opener/opinion_detector_basic/term.rb +0 -159
@@ -0,0 +1,136 @@
|
|
1
|
+
module Opener
|
2
|
+
class OpinionDetectorBasic
|
3
|
+
##
|
4
|
+
# Class that detects opinions in a given input KAF file.
|
5
|
+
#
|
6
|
+
class LegacyProcessor < BaseProcessor
|
7
|
+
|
8
|
+
##
# Builds the list of opinions on first use and returns the memoized list
# on every later call.
#
# On the first call the term strengths are accumulated, modifiers and
# conjunctions are applied, one Kaf::Opinion is created for every term that
# is an expression with a non-zero accumulated strength, and finally the
# targets and holders of each opinion are obtained.
#
# @return [Array<Kaf::Opinion>]
#
def opinions
  return @opinions if @opinions

  set_accumulated_strength
  apply_modifiers
  apply_conjunctions

  # Initialize opinions with their expressions.
  @opinions = document.terms.map do |term|
    next unless term.is_expression? && term.accumulated_strength != 0

    Kaf::Opinion.new term
  end.compact

  # Obtain targets for each opinion (all targets before any holder, as the
  # two passes are kept separate on purpose).
  @opinions.each { |opinion| opinion.obtain_targets(sentences) }

  # Obtain holders for each opinion.
  @opinions.each { |opinion| opinion.obtain_holders(sentences, document.language) }

  @opinions
end
|
39
|
+
|
40
|
+
protected
|
41
|
+
|
42
|
+
##
# Walks over every pair of adjacent terms and folds the strength of the
# first term of the pair into the second one when the second term is a
# shifter or an intensifier:
#
# * second term is a shifter: its strength is multiplied by the strength of
#   the first term (only when that strength is non-zero);
# * second term is an intensifier: the two strengths are combined with the
#   operator currently held in `symbol` (`:+`, `:-` or `:*`);
# * otherwise nothing is changed and only `symbol` is updated from the sign
#   of the second term.
#
# A term whose strength has been folded into its neighbour is flagged with
# `use = false`, and its `list_ids` are appended to the neighbour's.
#
def set_accumulated_strength
  # Operator used the next time an intensifier is combined with the term
  # in front of it; it is carried over from one pair to the next.
  symbol = :+

  terms.each_cons(2) do |term, following|
    if following.is_shifter?
      if term.accumulated_strength != 0
        following.accumulated_strength *= term.accumulated_strength
        following.list_ids += term.list_ids
        term.use = false
        symbol = following.accumulated_strength > 0 ? :+ : :-
      else
        # A zero-strength term in front of a shifter: the shifter is left
        # untouched and will multiply into whatever comes next.
        symbol = :*
      end
    elsif following.is_intensifier?
      following.accumulated_strength =
        term.accumulated_strength.send(symbol, following.accumulated_strength)
      term.use = false
      symbol = following.accumulated_strength > 0 ? :+ : :-
      following.list_ids += term.list_ids if term.accumulated_strength != 0
    else
      symbol = following.accumulated_strength >= 0 ? :+ : :-
    end
  end
end
|
76
|
+
|
77
|
+
##
# Applies the strength of every still-usable shifter or intensifier to the
# term that directly follows it: the following term's strength is
# multiplied by the modifier's strength, the modifier's `list_ids` are
# appended to the following term's, and the modifier is flagged with
# `use = false`. The last term has no follower and is never applied.
#
def apply_modifiers
  terms.each_cons(2) do |term, following|
    next unless term.use && (term.is_shifter? || term.is_intensifier?)

    following.accumulated_strength *= term.accumulated_strength
    following.list_ids += term.list_ids
    term.use = false
  end
end
|
92
|
+
|
93
|
+
##
# Merges runs of usable, non-zero-strength terms that are separated only by
# conjunctions into a single term.
#
# Starting at each usable term with a non-zero strength, the scan moves
# forward while it meets conjunctions (which are flagged `use = false`) or
# further usable non-zero terms (whose strength and `list_ids` are added to
# the running totals). The scan stops at the first other term or at the end
# of the list. The totals are then stored on the last merged term, which is
# the only one of the run left with `use = true`.
#
def apply_conjunctions
  terms_count = terms.count
  i = 0

  while i < terms_count
    if terms[i].use && terms[i].accumulated_strength != 0
      last_used = i
      list_ids = terms[i].list_ids
      strength = terms[i].accumulated_strength
      terms[i].use = false
      j = i + 1

      while j < terms_count
        candidate = terms[j]

        if candidate.is_conjunction
          candidate.use = false
        elsif candidate.use && candidate.accumulated_strength != 0
          list_ids += candidate.list_ids
          strength += candidate.accumulated_strength
          candidate.use = false
          last_used = j
        else
          break
        end

        j += 1
      end

      terms[last_used].accumulated_strength = strength
      terms[last_used].list_ids = list_ids
      terms[last_used].use = true

      # terms[j] (if any) ended the run, so it cannot start a new one;
      # together with the increment below the scan resumes at j + 1.
      i = j
    end

    i += 1
  end
end
|
133
|
+
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
@@ -1,326 +1,38 @@
|
|
1
|
-
require_relative 'term'
|
2
|
-
require_relative 'opinion'
|
3
|
-
|
4
1
|
module Opener
|
5
2
|
class OpinionDetectorBasic
|
6
3
|
##
|
7
4
|
# Class that detects opinions in a given input KAF file.
|
8
5
|
#
|
9
|
-
class Processor
|
10
|
-
attr_accessor :document, :timestamp, :opinion_strength, :pretty
|
11
|
-
|
12
|
-
##
|
13
|
-
# @param [String|IO] file The KAF file/input to process.
|
14
|
-
# @param [Hash] options. Options for timestamp and including strength to
|
15
|
-
# opinions.
|
16
|
-
# @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
|
17
|
-
# by default due to the performance overhead.
|
18
|
-
#
|
19
|
-
def initialize(file, options = {})
|
20
|
-
@document = Oga.parse_xml(file)
|
21
|
-
|
22
|
-
@timestamp = options[:timestamp]
|
23
|
-
@opinion_strength = options[:opinion_strength]
|
24
|
-
@pretty = options[:pretty] || false
|
25
|
-
|
26
|
-
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
27
|
-
end
|
28
|
-
|
29
|
-
##
|
30
|
-
# Processes the input and returns the new KAF output.
|
31
|
-
# @return [String]
|
32
|
-
#
|
33
|
-
def process
|
34
|
-
add_opinions_layer
|
35
|
-
|
36
|
-
index = 1
|
37
|
-
opinions.each do |opinion|
|
38
|
-
add_opinion(opinion, index)
|
39
|
-
index += 1
|
40
|
-
end
|
41
|
-
|
42
|
-
add_linguistic_processor
|
6
|
+
class Processor < BaseProcessor
|
43
7
|
|
44
|
-
pretty ? pretty_print(document) : document.to_xml
|
45
|
-
end
|
46
|
-
|
47
|
-
##
|
48
|
-
# Get the language of the input file.
|
49
|
-
#
|
50
|
-
# @return [String]
|
51
|
-
#
|
52
|
-
def language
|
53
|
-
@language ||= document.at_xpath('KAF').get('xml:lang')
|
54
|
-
end
|
55
|
-
|
56
|
-
##
|
57
|
-
# Get the terms from the input file
|
58
|
-
# @return [Hash]
|
59
|
-
#
|
60
|
-
def terms
|
61
|
-
@terms ||= document.xpath('KAF/terms/term').map do |term|
|
62
|
-
Term.new(term, document, language)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
##
|
67
|
-
# Get the opinions.
|
68
|
-
#
|
69
|
-
# @return [Hash]
|
70
|
-
#
|
71
8
|
##
# Builds the list of opinions on first use and returns the memoized list
# on every later call.
#
# One Kaf::Opinion is created for every term that is an expression with a
# non-zero accumulated strength; afterwards the term strengths are adjusted
# by #set_accumulated_strength.
#
# @return [Array<Kaf::Opinion>]
#
def opinions
  return @opinions if @opinions

  # Initialize opinions with their expressions.
  @opinions = document.terms.map do |term|
    next unless term.is_expression? && term.accumulated_strength != 0

    Kaf::Opinion.new term
  end.compact

  set_accumulated_strength

  # Return the opinions explicitly: without this line the first call would
  # return the value of #set_accumulated_strength while later calls return
  # the memoized list.
  @opinions
end
|
139
21
|
|
140
|
-
##
|
141
|
-
# Method for adding opinion holders, targets and expressions.
|
142
|
-
#
|
143
|
-
def add_opinion_element(node, ids)
|
144
|
-
lemmas = terms.select{|t| ids.include?(t.id)}.map(&:lemma).join(" ")
|
145
|
-
comment = Oga::XML::Comment.new(:text => "#{lemmas}")
|
146
|
-
node.children << comment
|
147
|
-
span_node = new_node("span", node)
|
148
|
-
|
149
|
-
ids.each do |id|
|
150
|
-
target_node = new_node("target", span_node)
|
151
|
-
target_node.set('id', id.to_s)
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
##
|
156
|
-
# Add linguistic processor layer with basic information
|
157
|
-
# (version, timestamp, description etc) in the KAF file.
|
158
|
-
#
|
159
|
-
def add_linguistic_processor
|
160
|
-
description = 'Basic opinion detector with Pos'
|
161
|
-
last_edited = '13may2015'
|
162
|
-
version = '2.0'
|
163
|
-
|
164
|
-
node = new_node('linguisticProcessors', 'KAF/kafHeader')
|
165
|
-
node.set('layer', 'opinions')
|
166
|
-
|
167
|
-
lp_node = new_node('lp', node)
|
168
|
-
|
169
|
-
lp_node.set('version', "#{last_edited}-#{version}")
|
170
|
-
lp_node.set('name', description)
|
171
|
-
|
172
|
-
if timestamp
|
173
|
-
format = '%Y-%m-%dT%H:%M:%S%Z'
|
174
|
-
|
175
|
-
lp_node.set('timestamp', Time.now.strftime(format))
|
176
|
-
else
|
177
|
-
lp_node.set('timestamp', '*')
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
##
|
182
|
-
# Format the output document properly.
|
183
|
-
#
|
184
|
-
# TODO: this should be handled by Oga in a nice way.
|
185
|
-
#
|
186
|
-
# @return [String]
|
187
|
-
#
|
188
|
-
def pretty_print(document)
|
189
|
-
doc = REXML::Document.new document.to_xml
|
190
|
-
doc.context[:attribute_quote] = :quote
|
191
|
-
out = ""
|
192
|
-
formatter = REXML::Formatters::Pretty.new
|
193
|
-
formatter.compact = true
|
194
|
-
formatter.write(doc, out)
|
195
|
-
|
196
|
-
out.strip
|
197
|
-
end
|
198
|
-
|
199
|
-
##
|
200
|
-
# Get terms grouped by sentence.
|
201
|
-
#
|
202
|
-
def sentences
|
203
|
-
@sentences ||= terms.group_by{|t| t.sentence}
|
204
|
-
end
|
205
|
-
|
206
|
-
protected
|
207
|
-
|
208
|
-
##
|
209
|
-
# The strength of a term depends heavily on the type of the previous
|
210
|
-
# one. For example if the previous one is a shifter, it needs
|
211
|
-
# to be multiplied. If it's an intensifier, it needs to be
|
212
|
-
# added (or subtracted depending on the strength of the previous
|
213
|
-
# term) etc.
|
214
|
-
#
|
215
22
|
##
# Adjusts the accumulated strength of every term according to its head
# term (as returned by `term.head_term`):
#
# * head is a shifter: the term's strength is negated;
# * head is an intensifier: the head's strength is added to the term's.
#
# In both cases the head's `list_ids` are appended to the term's, so the
# modifier is recorded as part of the expression.
#
def set_accumulated_strength
  terms.each do |term|
    head = term.head_term

    # NOTE(review): guards against head_term returning nil (presumably for
    # terms without a head, e.g. a root) - confirm against Kaf::Term.
    next unless head

    if head.is_shifter?
      term.accumulated_strength *= -1
      term.list_ids += head.list_ids
    elsif head.is_intensifier?
      term.accumulated_strength += head.accumulated_strength
      term.list_ids += head.list_ids
    end
  end
end
|
258
35
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
def apply_conjunctions
|
263
|
-
terms_count = terms.count
|
264
|
-
i = 0
|
265
|
-
while i < terms_count
|
266
|
-
if terms[i].use && terms[i].accumulated_strength != 0
|
267
|
-
used = [i]
|
268
|
-
list_ids = terms[i].list_ids
|
269
|
-
strength = terms[i].accumulated_strength
|
270
|
-
terms[i].use = false
|
271
|
-
j = i+1
|
272
|
-
while true
|
273
|
-
if j >= terms_count
|
274
|
-
break
|
275
|
-
end
|
276
|
-
|
277
|
-
if terms[j].is_conjunction
|
278
|
-
terms[j].use = false
|
279
|
-
j += 1
|
280
|
-
elsif terms[j].use && terms[j].accumulated_strength != 0
|
281
|
-
list_ids += terms[j].list_ids
|
282
|
-
used << j
|
283
|
-
terms[j].use = false
|
284
|
-
strength += terms[j].accumulated_strength
|
285
|
-
j += 1
|
286
|
-
else
|
287
|
-
break
|
288
|
-
end
|
289
|
-
end
|
290
|
-
last_used = used.last
|
291
|
-
terms[last_used].accumulated_strength = strength
|
292
|
-
terms[last_used].list_ids = list_ids
|
293
|
-
terms[last_used].use = true
|
294
|
-
i = j
|
295
|
-
end
|
296
|
-
i += 1
|
297
|
-
end
|
298
|
-
end
|
299
|
-
|
300
|
-
##
|
301
|
-
# Creates a new node in the KAF file.
|
302
|
-
#
|
303
|
-
def new_node(tag, parent)
|
304
|
-
if parent.is_a?(String)
|
305
|
-
parent_node = document.at_xpath(parent)
|
306
|
-
else
|
307
|
-
parent_node = parent
|
308
|
-
end
|
309
|
-
|
310
|
-
node = Oga::XML::Element.new(:name => tag)
|
311
|
-
|
312
|
-
parent_node.children << node
|
313
|
-
|
314
|
-
node
|
315
|
-
end
|
316
|
-
|
317
|
-
##
|
318
|
-
# Check if input is a KAF file.
|
319
|
-
# @return [Boolean]
|
320
|
-
#
|
321
|
-
def is_kaf?
|
322
|
-
!!document.at_xpath('KAF')
|
323
|
-
end
|
324
|
-
end # Processor
|
325
|
-
end # OpinionDetectorBasic
|
326
|
-
end # Opener
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|