opener-opinion-detector-basic 2.0.7 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 765314f86b29243ff3c007100653a20198d3b890
4
- data.tar.gz: 1c8767a8dc9ccc48680d0fca3364ed7d0ef08a80
3
+ metadata.gz: d07d2a2eb88245eca143655a2fc8b5d301b632dd
4
+ data.tar.gz: 1c67e6b59421ef2ab4e33f5c3260699c202eab0e
5
5
  SHA512:
6
- metadata.gz: 9972bff4b61846eac50d946c1350ba65fef31fd15e6876a58b7c85c627efb11fd4427543a25ccb20310e94a3a3ab965a57199a4344f07227434688a75aa610ce
7
- data.tar.gz: b454c68ab7ed948db7e39e879646f8e07c64f65c90b2ec1713a0a9aeb8736cb516ba28ed2175f30ea52cfb7f7342585849340b5a756ae934715bc921545cf648
6
+ metadata.gz: cf26709cea362f73901df7184f2c562ac5b9d597c5386c1bb4845a843a667b8e59a301d6e36e3ed5759fd7a7b904b82a390665c0dd916f803e4dcfdefe3ca7f3
7
+ data.tar.gz: f978e9dc22837f78a758e28d4612e07732d5c12f7c429711d46716e1869c0fb640e7397b47b9c388c12273b440b7b282d49bcf87899070af80831373789294be
data/README.md CHANGED
@@ -103,11 +103,7 @@ At least you need the following system setup:
103
103
 
104
104
  ### Dependencies for normal use:
105
105
 
106
- * Ruby 1.9.3 or newer
107
- * Python 2.6
108
- * lxml: library for processing xml in python
109
- * libarchive, on Debian/Ubuntu based systems this can be installed using
110
- `sudo apt-get install libarchive-dev`
106
+ * Tested on Ruby 2.1.5, 2.2.2, Rubinius 2.4.0, jruby-1.7.8
111
107
 
112
108
  ## Domain Adaption
113
109
 
@@ -1,8 +1,13 @@
1
- require 'open3'
2
1
  require 'slop'
2
+ require 'oga'
3
+ require 'monitor'
4
+
5
+ require 'rexml/document'
6
+ require 'rexml/formatters/pretty'
3
7
 
4
8
  require_relative 'opinion_detector_basic/version'
5
9
  require_relative 'opinion_detector_basic/cli'
10
+ require_relative 'opinion_detector_basic/processor'
6
11
 
7
12
  module Opener
8
13
  ##
@@ -27,77 +32,19 @@ module Opener
27
32
  @args = options.delete(:args) || []
28
33
  @options = options
29
34
  end
30
-
35
+
31
36
  ##
32
- # Builds the command used to execute the kernel.
33
- #
34
- # @param [Array] args Commandline arguments passed to the command.
35
- #
36
- def command
37
- return "#{adjust_python_path} python -E #{kernel} #{args.join(' ')}"
38
- end
39
-
40
- ##
41
- # Processes an input KAF document and returns the results as a new KAF
42
- # document.
37
+ # Processes the input KAF document.
43
38
  #
44
39
  # @param [String] input
45
40
  # @return [String]
46
41
  #
47
42
  def run(input)
48
- stdout, stderr, process = capture(input)
49
-
50
- raise stderr unless process.success?
51
-
52
- return stdout
53
- end
54
-
55
- protected
56
-
57
- ##
58
- # @return [String]
59
- #
60
- def adjust_python_path
61
- site_packages = File.join(core_dir, 'site-packages')
62
-
63
- return "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
64
- end
65
-
66
- ##
67
- # capture3 method doesn't work properly with Jruby, so
68
- # this is a workaround
69
- #
70
- def capture(input)
71
- Open3.popen3(*command.split(" ")) {|i, o, e, t|
72
- out_reader = Thread.new { o.read }
73
- err_reader = Thread.new { e.read }
74
- i.write input
75
- i.close
76
- [out_reader.value, err_reader.value, t.value]
77
- }
78
- end
79
-
80
- ##
81
- # @return [String]
82
- #
83
- def core_dir
84
- return File.expand_path('../../../core', __FILE__)
85
- end
86
-
87
- ##
88
- # @return [String]
89
- #
90
- def kernel
91
- return File.join(core_dir, 'opinion_detector_basic_multi.py')
92
- end
43
+ options[:timestamp] = !options.delete(:no_time)
93
44
 
94
- ##
95
- # @return the language from the KAF
96
- #
97
- def language(input)
98
- document = Nokogiri::XML(input)
99
-
100
- return document.at('KAF').attr('xml:lang')
45
+ return Processor.new(input, options).process
101
46
  end
102
47
  end # OpinionDetectorBasic
103
48
  end # Opener
49
+
50
+
@@ -0,0 +1,171 @@
1
module Opener
  class OpinionDetectorBasic
    ##
    # A single opinion built around one expression term. Besides the
    # expression itself it collects the ids of the terms chosen as opinion
    # holders and opinion targets.
    #
    class Opinion
      ##
      # Pronoun lemmas treated as opinion holders, keyed by language code.
      #
      OPINION_HOLDERS = {
        'nl' => ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun'],
        'en' => ['i','we','he','she','they','it','you'],
        'es' => ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras'],
        'it' => ['io','tu','noi','voi','loro','lei','lui'],
        'de' => ['ich','du','wir','ihr','sie','er'],
        'fr' => ['je','tu','lui','elle','nous','vous','ils','elles']
      }.freeze

      ##
      # Part-of-speech tags a term must carry to be considered as a target.
      #
      TARGET_POS = ['N', 'R'].freeze

      attr_reader :term
      attr_accessor :left_candidates, :right_candidates, :target_ids, :holders

      ##
      # @param [Object] term The expression term; it must respond to
      #  `list_ids`, `sentence` and `accumulated_strength`.
      #
      def initialize(term)
        @term             = term
        @left_candidates  = []
        @right_candidates = []
        @holders          = []
        @target_ids       = []
      end

      ##
      # Returns the term ids of the opinion expression, sorted.
      #
      # @return [Array]
      #
      def ids
        @ids ||= term.list_ids.sort
      end

      ##
      # Returns the sentence id of the opinion.
      #
      # @return [String]
      #
      def sentence
        @sentence ||= term.sentence
      end

      ##
      # Returns the strength of the opinion.
      #
      # @return [Integer]
      #
      def strength
        @strength ||= term.accumulated_strength
      end

      ##
      # Returns the polarity of the opinion, derived from the sign of the
      # strength.
      #
      # @return [String] "positive", "negative" or "neutral"
      #
      def polarity
        @polarity ||= if strength > 0
          "positive"
        elsif strength < 0
          "negative"
        else
          "neutral"
        end
      end

      ##
      # Obtain the opinion holder from the terms that belong to the same
      # sentence: the first term whose lemma is a known holder for the given
      # language. Nothing is added when the language has no holder list or
      # when the sentence is unknown.
      #
      # @param [Hash] sentences Terms grouped by sentence id.
      # @param [String] language Language code, e.g. "en".
      #
      def obtain_holders(sentences, language)
        known_holders  = opinion_holders.fetch(language, [])
        sentence_terms = sentences[sentence] || []

        holder = sentence_terms.find { |t| known_holders.include?(t.lemma) }

        @holders << holder.id if holder
      end

      ##
      # Get the potential right and left candidates of the sentence and
      # decide which one is the actual target of the opinion. The first
      # right candidate wins; when there is none, the first candidate of the
      # mixed right/left list is used.
      #
      # The right window is the inclusive slice from the term after the
      # expression up to `max_distance` positions further; the left window
      # is the inclusive slice starting `max_distance + 1` positions before
      # the first expression term. The expression's own terms are dropped by
      # the candidate filter.
      #
      # @param [Hash] sentences Terms grouped by sentence id.
      # @param [Integer] max_distance Size parameter of the candidate windows.
      #
      def obtain_targets(sentences, max_distance = 3)
        sentence_terms = sentences[sentence] || []
        terms_count    = sentence_terms.count

        # Right window starts after the LAST term of the expression. When no
        # term matches, -1 makes the window start at the sentence start.
        last_index = sentence_terms.rindex { |t| ids.include?(t.id) } || -1

        if last_index + 1 < terms_count
          from = last_index + 1
          to   = [from + max_distance, terms_count].min

          @right_candidates = filter_candidates(sentence_terms[from..to])
        end

        # Left window ends at the FIRST term of the expression.
        first_index = sentence_terms.index { |t| ids.include?(t.id) } || 0

        unless first_index == 0
          from = [0, first_index - 1 - max_distance].max

          @left_candidates = filter_candidates(sentence_terms[from..first_index])
        end

        @target_ids << right_candidates.first.id unless right_candidates.empty?

        return unless target_ids.empty?

        fallback = mix_lists(right_candidates, left_candidates).first

        @target_ids << fallback.id if fallback
      end

      protected

      ##
      # Interleaves two lists (a1, b1, a2, b2, ...) and appends whatever is
      # left of the longer one. `nil` entries are removed from the result.
      #
      # @param [Array] lista
      # @param [Array] listb
      # @return [Array]
      #
      def mix_lists(lista, listb)
        shared      = [lista.count, listb.count].min
        interleaved = lista.first(shared).zip(listb.first(shared)).flatten(1)

        # At most one of the two tails is non-empty.
        return (interleaved + lista.drop(shared) + listb.drop(shared)).compact
      end

      ##
      # Filters candidate terms depending on their part of speech and if
      # they are already part of the expression.
      #
      # @param [Array] sentence_terms
      # @return [Array]
      #
      def filter_candidates(sentence_terms)
        sentence_terms.select do |t|
          TARGET_POS.include?(t.pos) && !ids.include?(t.id)
        end
      end

      ##
      # Opinion holders for each language code.
      #
      # @return [Hash]
      #
      def opinion_holders
        OPINION_HOLDERS
      end
    end # Opinion
  end # OpinionDetectorBasic
end # Opener
@@ -0,0 +1,329 @@
1
+ require_relative 'term'
2
+ require_relative 'opinion'
3
+
4
module Opener
  class OpinionDetectorBasic
    ##
    # Class that detects opinions in a given input KAF file.
    #
    class Processor
      attr_accessor :document, :timestamp, :opinion_strength, :pretty

      ##
      # @param [String|IO] file The KAF file/input to process.
      # @param [Hash] options
      #
      # @option options [TrueClass|FalseClass] :timestamp Write the current
      #  time in the linguistic processor entry instead of "*".
      # @option options [TrueClass|FalseClass] :opinion_strength Stored on
      #  the processor. NOTE(review): nothing in this class reads it, the
      #  strength attribute is always written.
      # @option options [TrueClass|FalseClass] :pretty Enable pretty
      #  formatting, disabled by default due to the performance overhead.
      #
      # @raise [RuntimeError] When the input has no KAF root element.
      #
      def initialize(file, options = {})
        @document = Oga.parse_xml(file)

        @timestamp        = !!options[:timestamp]
        @opinion_strength = !!options[:opinion_strength]
        @pretty           = options[:pretty] || false

        raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
      end

      ##
      # Processes the input and returns the new KAF output.
      #
      # @return [String]
      #
      def process
        add_opinions_layer

        # Opinion ids are 1-based: o1, o2, ...
        opinions.each.with_index(1) do |opinion, index|
          add_opinion(opinion, index)
        end

        add_linguistic_processor

        return pretty ? pretty_print(document) : document.to_xml
      end

      ##
      # Get the language of the input file.
      #
      # @return [String]
      #
      def language
        return @language ||= document.at_xpath('KAF').get('xml:lang')
      end

      ##
      # Get the terms from the input file, in document order.
      #
      # @return [Array]
      #
      def terms
        @terms ||= document.xpath('KAF/terms/term').map do |term|
          Term.new(term, document, language)
        end

        return @terms
      end

      ##
      # Get the opinions. On first use this updates the strengths of the
      # terms in place and then builds one opinion per expression term with
      # a non-zero strength, including its targets and holders.
      #
      # @return [Array]
      #
      def opinions
        return @opinions if @opinions

        set_accumulated_strength
        apply_modifiers
        apply_conjunctions

        expressions = terms.select do |term|
          term.is_expression? && term.accumulated_strength != 0
        end

        @opinions = expressions.map { |term| Opinion.new(term) }

        @opinions.each do |opinion|
          opinion.obtain_targets(sentences)
          opinion.obtain_holders(sentences, language)
        end

        return @opinions
      end

      ##
      # Remove the opinions layer from the KAF file if it exists and add a new
      # one.
      #
      def add_opinions_layer
        existing = document.at_xpath('KAF/opinions')

        existing.remove if existing

        new_node('opinions', 'KAF')
      end

      ##
      # Adds the entire opinion in the KAF file.
      #
      # @param [Opinion] opinion
      # @param [Integer] index Number used to build the opinion id.
      #
      def add_opinion(opinion, index)
        opinion_node = new_node("opinion", "KAF/opinions")
        opinion_node.set('oid', "o#{index}")

        unless opinion.holders.empty?
          opinion_holder_node = new_node("opinion_holder", opinion_node)
          add_opinion_element(opinion_holder_node, opinion.holders)
        end

        # The target element is always written, even without any targets.
        opinion_target_node = new_node("opinion_target", opinion_node)
        unless opinion.target_ids.empty?
          add_opinion_element(opinion_target_node, opinion.target_ids)
        end

        expression_node = new_node("opinion_expression", opinion_node)
        expression_node.set('polarity', opinion.polarity)
        expression_node.set('strength', opinion.strength.to_s)
        add_opinion_element(expression_node, opinion.ids)
      end

      ##
      # Method for adding opinion holders, targets and expressions: a comment
      # with the lemmas of the referenced terms followed by a span with one
      # target per id.
      #
      # @param [Oga::XML::Element] node
      # @param [Array] ids
      #
      def add_opinion_element(node, ids)
        lemmas  = terms.select { |t| ids.include?(t.id) }.map(&:lemma).join(" ")
        comment = Oga::XML::Comment.new(:text => lemmas)
        node.children << comment

        span_node = new_node("span", node)
        ids.each do |id|
          target_node = new_node("target", span_node)
          target_node.set('id', id.to_s)
        end
      end

      ##
      # Add linguistic processor layer with basic information
      # (version, timestamp, description etc) in the KAF file.
      #
      def add_linguistic_processor
        description = 'Basic opinion detector with Pos'
        last_edited = '13may2015'
        version     = '2.0'

        node = new_node('linguisticProcessors', 'KAF/kafHeader')
        node.set('layer', 'opinions')

        lp_node = new_node('lp', node)

        lp_node.set('version', "#{last_edited}-#{version}")
        lp_node.set('name', description)

        if timestamp
          format = '%Y-%m-%dT%H:%M:%S%Z'

          lp_node.set('timestamp', Time.now.strftime(format))
        else
          lp_node.set('timestamp', '*')
        end
      end

      ##
      # Format the output document properly.
      #
      # TODO: this should be handled by Oga in a nice way.
      #
      # @return [String]
      #
      def pretty_print(document)
        doc = REXML::Document.new document.to_xml
        doc.context[:attribute_quote] = :quote
        out = ""
        formatter = REXML::Formatters::Pretty.new
        formatter.compact = true
        formatter.write(doc, out)

        return out.strip
      end

      ##
      # Get terms grouped by sentence.
      #
      # @return [Hash]
      #
      def sentences
        @sentences ||= terms.group_by(&:sentence)
      end

      protected

      ##
      # The strength of a term depends heavily on the type of the previous
      # one. For example if the previous one is a shifter, it needs
      # to be multiplied. If it's an intensifier, it needs to be
      # added (or subtracted depending on the strength of the previous
      # term) etc.
      #
      # Walks over every pair of adjacent terms and updates the second term
      # of the pair in place. `symbol` is the operator carried over from the
      # previous pair and is used when the second term is an intensifier.
      #
      def set_accumulated_strength
        symbol = :+

        terms.each_cons(2) do |term, following|
          if following.is_shifter?
            if term.accumulated_strength != 0
              following.accumulated_strength *= term.accumulated_strength
              following.list_ids += term.list_ids
              term.use = false
              symbol = following.accumulated_strength > 0 ? :+ : :-
            else
              symbol = :*
            end
          elsif following.is_intensifier?
            following.accumulated_strength =
              term.accumulated_strength.public_send(symbol, following.accumulated_strength)
            term.use = false
            symbol = following.accumulated_strength > 0 ? :+ : :-

            if term.accumulated_strength != 0
              following.list_ids += term.list_ids
            end
          else
            symbol = following.accumulated_strength >= 0 ? :+ : :-
          end
        end
      end

      ##
      # Apply strength to the next term after a shifter or intensifier that
      # is still in use; the modifier itself is then marked as unused.
      #
      def apply_modifiers
        terms.each_cons(2) do |term, following|
          next unless term.use && (term.is_shifter? || term.is_intensifier?)

          following.accumulated_strength *= term.accumulated_strength
          following.list_ids += term.list_ids
          term.use = false
        end
      end

      ##
      # Ignore conjunctions when applying strength: a run of used, non-zero
      # terms that are only separated by conjunctions is folded into the last
      # term of the run, which receives the summed strength and all ids.
      #
      def apply_conjunctions
        terms_count = terms.count
        i = 0

        while i < terms_count
          head = terms[i]

          if head.use && head.accumulated_strength != 0
            last_used = i
            list_ids  = head.list_ids
            strength  = head.accumulated_strength
            head.use  = false
            j = i + 1

            while j < terms_count
              current = terms[j]

              if current.is_conjunction
                current.use = false
              elsif current.use && current.accumulated_strength != 0
                list_ids += current.list_ids
                strength += current.accumulated_strength
                last_used = j
                current.use = false
              else
                break
              end

              j += 1
            end

            merged = terms[last_used]
            merged.accumulated_strength = strength
            merged.list_ids = list_ids
            merged.use = true

            # terms[j] ended the run, so it can never start a new one and is
            # skipped together with the run by the increment below.
            i = j
          end

          i += 1
        end
      end

      ##
      # Creates a new node in the KAF file and appends it to the parent.
      #
      # @param [String] tag Name of the new element.
      # @param [String|Oga::XML::Element] parent The parent element, or an
      #  XPath expression that selects it.
      # @return [Oga::XML::Element]
      # @raise [ArgumentError] When the parent cannot be found.
      #
      def new_node(tag, parent)
        parent_node = parent.is_a?(String) ? document.at_xpath(parent) : parent

        unless parent_node
          raise ArgumentError,
            "Cannot add <#{tag}>: no element matches #{parent.inspect}"
        end

        node = Oga::XML::Element.new(:name => tag)

        parent_node.children << node

        return node
      end

      ##
      # Check if input is a KAF file.
      #
      # @return [Boolean]
      #
      def is_kaf?
        return !!document.at_xpath('KAF')
      end
    end # Processor
  end # OpinionDetectorBasic
end # Opener