opener-opinion-detector-basic 2.0.7 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 765314f86b29243ff3c007100653a20198d3b890
4
- data.tar.gz: 1c8767a8dc9ccc48680d0fca3364ed7d0ef08a80
3
+ metadata.gz: d07d2a2eb88245eca143655a2fc8b5d301b632dd
4
+ data.tar.gz: 1c67e6b59421ef2ab4e33f5c3260699c202eab0e
5
5
  SHA512:
6
- metadata.gz: 9972bff4b61846eac50d946c1350ba65fef31fd15e6876a58b7c85c627efb11fd4427543a25ccb20310e94a3a3ab965a57199a4344f07227434688a75aa610ce
7
- data.tar.gz: b454c68ab7ed948db7e39e879646f8e07c64f65c90b2ec1713a0a9aeb8736cb516ba28ed2175f30ea52cfb7f7342585849340b5a756ae934715bc921545cf648
6
+ metadata.gz: cf26709cea362f73901df7184f2c562ac5b9d597c5386c1bb4845a843a667b8e59a301d6e36e3ed5759fd7a7b904b82a390665c0dd916f803e4dcfdefe3ca7f3
7
+ data.tar.gz: f978e9dc22837f78a758e28d4612e07732d5c12f7c429711d46716e1869c0fb640e7397b47b9c388c12273b440b7b282d49bcf87899070af80831373789294be
data/README.md CHANGED
@@ -103,11 +103,7 @@ At least you need the following system setup:
103
103
 
104
104
  ### Dependencies for normal use:
105
105
 
106
- * Ruby 1.9.3 or newer
107
- * Python 2.6
108
- * lxml: library for processing xml in python
109
- * libarchive, on Debian/Ubuntu based systems this can be installed using
110
- `sudo apt-get install libarchive-dev`
106
+ * Tested on Ruby 2.1.5, 2.2.2, Rubinius 2.4.0, jruby-1.7.8
111
107
 
112
108
  ## Domain Adaption
113
109
 
@@ -1,8 +1,13 @@
1
- require 'open3'
2
1
  require 'slop'
2
+ require 'oga'
3
+ require 'monitor'
4
+
5
+ require 'rexml/document'
6
+ require 'rexml/formatters/pretty'
3
7
 
4
8
  require_relative 'opinion_detector_basic/version'
5
9
  require_relative 'opinion_detector_basic/cli'
10
+ require_relative 'opinion_detector_basic/processor'
6
11
 
7
12
  module Opener
8
13
  ##
@@ -27,77 +32,19 @@ module Opener
27
32
  @args = options.delete(:args) || []
28
33
  @options = options
29
34
  end
30
-
35
+
31
36
  ##
32
- # Builds the command used to execute the kernel.
33
- #
34
- # @param [Array] args Commandline arguments passed to the command.
35
- #
36
- def command
37
- return "#{adjust_python_path} python -E #{kernel} #{args.join(' ')}"
38
- end
39
-
40
- ##
41
- # Processes an input KAF document and returns the results as a new KAF
42
- # document.
37
+ # Processes the input KAF document.
43
38
  #
44
39
  # @param [String] input
45
40
  # @return [String]
46
41
  #
47
42
  def run(input)
48
- stdout, stderr, process = capture(input)
49
-
50
- raise stderr unless process.success?
51
-
52
- return stdout
53
- end
54
-
55
- protected
56
-
57
- ##
58
- # @return [String]
59
- #
60
- def adjust_python_path
61
- site_packages = File.join(core_dir, 'site-packages')
62
-
63
- return "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
64
- end
65
-
66
- ##
67
- # capture3 method doesn't work properly with Jruby, so
68
- # this is a workaround
69
- #
70
- def capture(input)
71
- Open3.popen3(*command.split(" ")) {|i, o, e, t|
72
- out_reader = Thread.new { o.read }
73
- err_reader = Thread.new { e.read }
74
- i.write input
75
- i.close
76
- [out_reader.value, err_reader.value, t.value]
77
- }
78
- end
79
-
80
- ##
81
- # @return [String]
82
- #
83
- def core_dir
84
- return File.expand_path('../../../core', __FILE__)
85
- end
86
-
87
- ##
88
- # @return [String]
89
- #
90
- def kernel
91
- return File.join(core_dir, 'opinion_detector_basic_multi.py')
92
- end
43
+ options[:timestamp] = !options.delete(:no_time)
93
44
 
94
- ##
95
- # @return the language from the KAF
96
- #
97
- def language(input)
98
- document = Nokogiri::XML(input)
99
-
100
- return document.at('KAF').attr('xml:lang')
45
+ return Processor.new(input, options).process
101
46
  end
102
47
  end # OpinionDetectorBasic
103
48
  end # Opener
49
+
50
+
@@ -0,0 +1,171 @@
1
+ module Opener
2
+ class OpinionDetectorBasic
3
+ class Opinion
4
+ attr_reader :term
5
+ attr_accessor :left_candidates, :right_candidates, :target_ids, :holders
6
+
7
+ def initialize(term)
8
+ @term = term
9
+ @left_candidates = []
10
+ @right_candidates = []
11
+ @holders = []
12
+ @target_ids = []
13
+ end
14
+
15
+ ##
16
+ # Returns the term ids of the opinion expression.
17
+ #
18
+ # @return [Array]
19
+ #
20
+ def ids
21
+ @ids ||= term.list_ids.sort
22
+ end
23
+
24
+ ##
25
+ # Returns the sentence id of the opinion.
26
+ #
27
+ # @return [String]
28
+ #
29
+ def sentence
30
+ @sentence ||= term.sentence
31
+ end
32
+
33
+ ##
34
+ # Returns the strength of the opinion.
35
+ #
36
+ # @return [Integer]
37
+ #
38
+ def strength
39
+ @strength ||= term.accumulated_strength
40
+ end
41
+
42
+ ##
43
+ # Returns the polarity of the opinion.
44
+ #
45
+ # @return [String]
46
+ #
47
+ def polarity
48
+ @polarity ||= if strength > 0
49
+ "positive"
50
+ elsif strength < 0
51
+ "negative"
52
+ else
53
+ "neutral"
54
+ end
55
+ end
56
+
57
+ ##
58
+ # Obtain the opinion holders from the terms that belong to the same
59
+ # sentence.
60
+ #
61
+ def obtain_holders(sentences, language)
62
+ sentence_terms = sentences[sentence]
63
+ sentence_terms.each do |term|
64
+ if opinion_holders[language].include?(term.lemma)
65
+ @holders << term.id
66
+ break
67
+ end
68
+ end
69
+ end
70
+
71
+ ##
72
+ # Get the potential right and left candidates of the sentence and
73
+ # decide which ones are the actual targets of the opinion
74
+ #
75
+ def obtain_targets(sentences)
76
+ sentence_terms = sentences[sentence]
77
+ max_distance = 3
78
+ terms_count = sentence_terms.count
79
+
80
+ index = -1
81
+ sentence_terms.each_with_index do |term, i|
82
+ if ids.include?(term.id)
83
+ index = i
84
+ end
85
+ end
86
+
87
+ unless index+1 >= terms_count
88
+ min = index+1
89
+ max = [index+1+max_distance,terms_count].min
90
+ @right_candidates = filter_candidates(sentence_terms[min..max])
91
+ end
92
+
93
+ index = 0
94
+ sentence_terms.each_with_index do |term, i|
95
+ if ids.include?(term.id)
96
+ index = i
97
+ break # needed for left_candidates
98
+ end
99
+ end
100
+
101
+ unless index == 0
102
+ min = [0, index-1-max_distance].max
103
+ max = index
104
+ @left_candidates = filter_candidates(sentence_terms[min..max])
105
+ end
106
+
107
+ unless right_candidates.empty?
108
+ candidate = right_candidates.first
109
+ @target_ids << candidate.id
110
+ end
111
+
112
+ if target_ids.empty?
113
+ list = mix_lists(right_candidates, left_candidates)
114
+ list.each do |l|
115
+ @target_ids << l.id
116
+ break
117
+ end
118
+ end
119
+ end
120
+
121
+ protected
122
+
123
+ ##
124
+ # If there are no opinion targets, right and left candidates
125
+ # are mixed into one list and the first one is picked as the target.
126
+ #
127
+ # @return [Array]
128
+ #
129
+ def mix_lists(lista, listb)
130
+ list = []
131
+ min = [lista.count, listb.count].min
132
+ (0..min).each do |i|
133
+ list << lista[i]
134
+ list << listb[i]
135
+ if lista.count > listb.count
136
+ list << lista[min]
137
+ elsif listb.count > lista.count
138
+ list << listb[min]
139
+ end
140
+ end
141
+ return list.compact
142
+ end
143
+
144
+ ##
145
+ # Filters candidate terms depending on their part of speech and if
146
+ # they are already part of the expression.
147
+ #
148
+ # @return [Array]
149
+ #
150
+ def filter_candidates(sentence_terms)
151
+ sentence_terms.select{|t| (t.pos == "N" || t.pos == "R") && !ids.include?(t.id)}
152
+ end
153
+
154
+ ##
155
+ # Opinion holders for each language code.
156
+ #
157
+ # @return [Hash]
158
+ #
159
+ def opinion_holders
160
+ {
161
+ 'nl' => ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun'],
162
+ 'en' => ['i','we','he','she','they','it','you'],
163
+ 'es' => ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras'],
164
+ 'it' => ['io','tu','noi','voi','loro','lei','lui'],
165
+ 'de' => ['ich','du','wir','ihr','sie','er'],
166
+ 'fr' => ['je','tu','lui','elle','nous','vous','ils','elles']
167
+ }
168
+ end
169
+ end # Opinion
170
+ end # OpinionDetectorBasic
171
+ end # Opener
@@ -0,0 +1,329 @@
1
+ require_relative 'term'
2
+ require_relative 'opinion'
3
+
4
+ module Opener
5
+ class OpinionDetectorBasic
6
+ ##
7
+ # Class that detects opinions in a given input KAF file.
8
+ #
9
+ class Processor
10
+ attr_accessor :document, :timestamp, :opinion_strength, :pretty
11
+
12
+ ##
13
+ # @param [String|IO] file The KAF file/input to process.
14
+ # @param [Hash] options Options for the timestamp, for including the
15
+ # opinion strength, and for pretty formatting.
16
+ # @option options [TrueClass|FalseClass] :pretty Enable pretty formatting,
17
+ # disabled by default due to the performance overhead.
18
+ #
19
+ def initialize(file, options = {})
20
+ @document = Oga.parse_xml(file)
21
+
22
+ @timestamp = !!options[:timestamp]
23
+ @opinion_strength = !!options[:opinion_strength]
24
+ @pretty = options[:pretty] || false
25
+
26
+ raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
27
+ end
28
+
29
+ ##
30
+ # Processes the input and returns the new KAF output.
31
+ # @return [String]
32
+ #
33
+ def process
34
+ add_opinions_layer
35
+
36
+ index = 1
37
+ opinions.each do |opinion|
38
+ add_opinion(opinion, index)
39
+ index += 1
40
+ end
41
+
42
+ add_linguistic_processor
43
+
44
+ return pretty ? pretty_print(document) : document.to_xml
45
+ end
46
+
47
+ ##
48
+ # Get the language of the input file.
49
+ #
50
+ # @return [String]
51
+ #
52
+ def language
53
+ return @language ||= document.at_xpath('KAF').get('xml:lang')
54
+ end
55
+
56
+ ##
57
+ # Get the terms from the input file
58
+ # @return [Array]
59
+ #
60
+ def terms
61
+ unless @terms
62
+ @terms = []
63
+
64
+ document.xpath('KAF/terms/term').each do |term|
65
+ @terms << Term.new(term, document, language)
66
+ end
67
+ end
68
+
69
+ return @terms
70
+ end
71
+
72
+ ##
73
+ # Get the opinions.
74
+ #
75
+ # @return [Array]
76
+ #
77
+ def opinions
78
+ unless @opinions
79
+ set_accumulated_strength
80
+ apply_modifiers
81
+ apply_conjunctions
82
+
83
+ ##
84
+ # Initialize opinions with their expressions.
85
+ #
86
+ @opinions = terms.map do |term|
87
+ if term.is_expression? && term.accumulated_strength != 0
88
+ o = Opinion.new(term)
89
+ end
90
+ end.compact
91
+
92
+ ##
93
+ # Obtain targets for each opinion.
94
+ #
95
+ @opinions.each do |opinion|
96
+ opinion.obtain_targets(sentences)
97
+ end
98
+
99
+ ##
100
+ # Obtain holders for each opinion.
101
+ #
102
+ @opinions.each do |opinion|
103
+ opinion.obtain_holders(sentences, language)
104
+ end
105
+ end
106
+
107
+ return @opinions
108
+ end
109
+
110
+ ##
111
+ # Remove the opinions layer from the KAF file if it exists and add a new
112
+ # one.
113
+ def add_opinions_layer
114
+ existing = document.at_xpath('KAF/opinions')
115
+
116
+ existing.remove if existing
117
+
118
+ new_node('opinions', 'KAF')
119
+ end
120
+
121
+ ##
122
+ # Adds the entire opinion to the KAF file.
123
+ #
124
+ def add_opinion(opinion, index)
125
+ opinion_node = new_node("opinion", "KAF/opinions")
126
+ opinion_node.set('oid', "o#{index.to_s}")
127
+
128
+ unless opinion.holders.empty?
129
+ opinion_holder_node = new_node("opinion_holder", opinion_node)
130
+ add_opinion_element(opinion_holder_node, opinion.holders)
131
+ end
132
+
133
+ opinion_target_node = new_node("opinion_target", opinion_node)
134
+ unless opinion.target_ids.empty?
135
+ add_opinion_element(opinion_target_node, opinion.target_ids)
136
+ end
137
+
138
+ expression_node = new_node("opinion_expression", opinion_node)
139
+ expression_node.set('polarity', opinion.polarity)
140
+ expression_node.set('strength', opinion.strength.to_s)
141
+ add_opinion_element(expression_node, opinion.ids)
142
+ end
143
+
144
+ ##
145
+ # Method for adding opinion holders, targets and expressions.
146
+ #
147
+ def add_opinion_element(node, ids)
148
+ lemmas = terms.select{|t| ids.include?(t.id)}.map(&:lemma).join(" ")
149
+ comment = Oga::XML::Comment.new(:text => "#{lemmas}")
150
+ node.children << comment
151
+ span_node = new_node("span", node)
152
+ ids.each do |id|
153
+ target_node = new_node("target", span_node)
154
+ target_node.set('id', id.to_s)
155
+ end
156
+ end
157
+
158
+ ##
159
+ # Add linguistic processor layer with basic information
160
+ # (version, timestamp, description etc) in the KAF file.
161
+ #
162
+ def add_linguistic_processor
163
+ description = 'Basic opinion detector with Pos'
164
+ last_edited = '13may2015'
165
+ version = '2.0'
166
+
167
+ node = new_node('linguisticProcessors', 'KAF/kafHeader')
168
+ node.set('layer', 'opinions')
169
+
170
+ lp_node = new_node('lp', node)
171
+
172
+ lp_node.set('version', "#{last_edited}-#{version}")
173
+ lp_node.set('name', description)
174
+
175
+ if timestamp
176
+ format = '%Y-%m-%dT%H:%M:%S%Z'
177
+
178
+ lp_node.set('timestamp', Time.now.strftime(format))
179
+ else
180
+ lp_node.set('timestamp', '*')
181
+ end
182
+ end
183
+
184
+ ##
185
+ # Format the output document properly.
186
+ #
187
+ # TODO: this should be handled by Oga in a nice way.
188
+ #
189
+ # @return [String]
190
+ #
191
+ def pretty_print(document)
192
+ doc = REXML::Document.new document.to_xml
193
+ doc.context[:attribute_quote] = :quote
194
+ out = ""
195
+ formatter = REXML::Formatters::Pretty.new
196
+ formatter.compact = true
197
+ formatter.write(doc, out)
198
+
199
+ return out.strip
200
+ end
201
+
202
+ ##
203
+ # Get terms grouped by sentence.
204
+ #
205
+ def sentences
206
+ @sentences ||= terms.group_by{|t| t.sentence}
207
+ end
208
+
209
+ protected
210
+
211
+ ##
212
+ # The strength of a term depends heavily on the type of the previous
213
+ # one. For example if the previous one is a shifter, it needs
214
+ # to be multiplied. If it's an intensifier, it needs to be
215
+ # added (or subtracted depending on the strength of the previous
216
+ # term) etc.
217
+ #
218
+ def set_accumulated_strength
219
+ symbol = :+
220
+ terms_count = terms.count
221
+ terms.each_with_index do |term, i|
222
+ if i+1 < terms_count
223
+ if terms[i+1].is_shifter?
224
+ if term.accumulated_strength != 0
225
+ terms[i+1].accumulated_strength *= term.accumulated_strength
226
+ terms[i+1].list_ids += term.list_ids
227
+ term.use = false
228
+ symbol = terms[i+1].accumulated_strength > 0 ? :+ : :-
229
+ else
230
+ symbol = :*
231
+ end
232
+ elsif terms[i+1].is_intensifier?
233
+ terms[i+1].accumulated_strength = term.accumulated_strength.send(symbol, terms[i+1].accumulated_strength)
234
+ term.use = false
235
+ symbol = terms[i+1].accumulated_strength > 0 ? :+ : :-
236
+ if term.accumulated_strength != 0
237
+ terms[i+1].list_ids += term.list_ids
238
+ end
239
+ else
240
+ symbol = terms[i+1].accumulated_strength >= 0 ? :+ : :-
241
+ end
242
+ end
243
+ end
244
+ end
245
+
246
+ ##
247
+ # Apply strength to the next term after a shifter or intensifier.
248
+ #
249
+ def apply_modifiers
250
+ terms_count = terms.count
251
+ terms.each_with_index do |term, i|
252
+ if i+1 < terms_count
253
+ if term.use && (term.is_shifter? || term.is_intensifier?)
254
+ terms[i+1].accumulated_strength *= term.accumulated_strength
255
+ terms[i+1].list_ids += term.list_ids
256
+ term.use = false
257
+ end
258
+ end
259
+ end
260
+ end
261
+
262
+ ##
263
+ # Ignore conjunctions when applying strength.
264
+ #
265
+ def apply_conjunctions
266
+ terms_count = terms.count
267
+ i = 0
268
+ while i < terms_count
269
+ if terms[i].use && terms[i].accumulated_strength != 0
270
+ used = [i]
271
+ list_ids = terms[i].list_ids
272
+ strength = terms[i].accumulated_strength
273
+ terms[i].use = false
274
+ j = i+1
275
+ while true
276
+ if j >= terms_count
277
+ break
278
+ end
279
+
280
+ if terms[j].is_conjunction
281
+ terms[j].use = false
282
+ j += 1
283
+ elsif terms[j].use && terms[j].accumulated_strength != 0
284
+ list_ids += terms[j].list_ids
285
+ used << j
286
+ terms[j].use = false
287
+ strength += terms[j].accumulated_strength
288
+ j += 1
289
+ else
290
+ break
291
+ end
292
+ end
293
+ last_used = used.last
294
+ terms[last_used].accumulated_strength = strength
295
+ terms[last_used].list_ids = list_ids
296
+ terms[last_used].use = true
297
+ i = j
298
+ end
299
+ i += 1
300
+ end
301
+ end
302
+
303
+ ##
304
+ # Creates a new node in the KAF file.
305
+ #
306
+ def new_node(tag, parent)
307
+ if parent.is_a?(String)
308
+ parent_node = document.at_xpath(parent)
309
+ else
310
+ parent_node = parent
311
+ end
312
+
313
+ node = Oga::XML::Element.new(:name => tag)
314
+
315
+ parent_node.children << node
316
+
317
+ return node
318
+ end
319
+
320
+ ##
321
+ # Check if input is a KAF file.
322
+ # @return [Boolean]
323
+ #
324
+ def is_kaf?
325
+ return !!document.at_xpath('KAF')
326
+ end
327
+ end # Processor
328
+ end # OpinionDetectorBasic
329
+ end # Opener