opener-opinion-detector-basic 3.2.0 → 3.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14f63b17cb26086742f4618eef4ad61e9435d00f271f2a2e7984ca9ef0f68a3e
4
- data.tar.gz: 257e711a1e2aee0764b4d8de9092e116d47a50e2d6de7717e8668e692d4f1270
3
+ metadata.gz: fa1aba5cb9ba31f6e2205af1499f866f9e998883701b303f2229ecc855348293
4
+ data.tar.gz: db3a5d5021a0013757ba68252ccaed4c185a0960aaa9ca26e47681e0b3300d11
5
5
  SHA512:
6
- metadata.gz: 54374bd46b28f4065f26899a042a1caeedafeff43a50de0642c164928dd55b1164a8ad076b19a3504fd2a4748348c19725e1cc448abbc2de12a0aaf4eb540df1
7
- data.tar.gz: e1ef4640b783bf2de26072553ecb874ec08c21a51e7286340ce402d6a0359d94adbc6efe03ed62af82c5c25a47354f6fe33b56d070b7a4d9488515dbb90074e6
6
+ metadata.gz: 5e6e4ae440580e6ed2974c4a75b46f544212c8557ae3fe43fcbb4e4c3a7d7a6d71a058451f3abdae352766914182dc40237f42df54a565c7c866054828c758c8
7
+ data.tar.gz: 3db868535c5f43814b4b883ecd9d5b0bd02fb59de5ed290c6c1b3face4d65993d84c2971a2c7dc021b607b6c228735dd82f8931c909911302577dd3bcb4558f5
@@ -1,14 +1,23 @@
1
1
  gem 'slop', '~> 3.0'
2
2
 
3
+ require 'active_support/all'
4
+
3
5
  require 'slop'
4
- require 'oga'
6
+ require 'hashie'
7
+ require 'nokogiri'
5
8
 
6
9
  require 'rexml/document'
7
10
  require 'rexml/formatters/pretty'
8
11
 
12
+ require_relative 'opinion_detector_basic/kaf/document'
13
+ require_relative 'opinion_detector_basic/kaf/term'
14
+ require_relative 'opinion_detector_basic/kaf/opinion'
15
+
9
16
  require_relative 'opinion_detector_basic/version'
10
17
  require_relative 'opinion_detector_basic/cli'
18
+ require_relative 'opinion_detector_basic/base_processor'
11
19
  require_relative 'opinion_detector_basic/processor'
20
+ require_relative 'opinion_detector_basic/legacy_processor'
12
21
 
13
22
  module Opener
14
23
  ##
@@ -32,6 +41,7 @@ module Opener
32
41
  def initialize(options = {})
33
42
  @args = options.delete(:args) || []
34
43
  @options = options
44
+ @klass = if ENV['OPINION_LEGACY'] then LegacyProcessor else Processor end
35
45
  end
36
46
 
37
47
  ##
@@ -41,7 +51,7 @@ module Opener
41
51
  # @return [String]
42
52
  #
43
53
  def run input, params = {}
44
- return Processor.new(input, options).process
54
+ @klass.new(input, options).process
45
55
  end
46
56
 
47
57
  end
@@ -0,0 +1,56 @@
1
module Opener
  class OpinionDetectorBasic
    ##
    # Shared pipeline of the opinion processors: wraps the input in a
    # Kaf::Document, writes the detected opinions into it and serialises the
    # result. The detection itself is supplied by subclasses through
    # {#opinions}.
    #
    class BaseProcessor

      attr_accessor :document
      attr_reader   :terms, :sentences

      ##
      # @param [String|IO] file The KAF file/input to process.
      # @param [Hash] options Handed unchanged to Kaf::Document, which reads
      #  `:timestamp`, `:opinion_strength` and `:pretty` from it.
      #
      def initialize file, options = {}
        @document  = Kaf::Document.new file, options
        @terms     = @document.terms
        @sentences = @document.sentences
      end

      ##
      # Processes the input and returns the new KAF output.
      #
      # Replaces the opinions layer, adds one entry per detected opinion
      # (numbered from 1) and appends the linguistic processor header.
      #
      # @return [String]
      #
      def process
        document.add_opinions_layer

        opinions.each.with_index(1) do |opinion, number|
          document.add_opinion opinion, number
        end

        document.add_linguistic_processor

        return pretty_print(document) if document.pretty

        document.to_xml
      end

      ##
      # The opinions to write into the document. This is the hook concrete
      # processors must override; the base class cannot detect anything.
      #
      # @return [Array] objects accepted by Kaf::Document#add_opinion
      # @raise [NotImplementedError] when the subclass does not override it.
      #
      def opinions
        raise NotImplementedError, "#{self.class} must implement #opinions"
      end

      ##
      # Formats the output document with REXML's pretty formatter.
      #
      # The XML is serialised and re-parsed by REXML, which is the performance
      # overhead that makes pretty printing opt-in.
      #
      # @param [#to_xml] document
      # @return [String]
      #
      def pretty_print document
        rexml_doc = REXML::Document.new document.to_xml
        rexml_doc.context[:attribute_quote] = :quote

        formatter         = REXML::Formatters::Pretty.new
        formatter.compact = true

        buffer = String.new
        formatter.write rexml_doc, buffer
        buffer.strip
      end

    end
  end
end
@@ -0,0 +1,146 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # Wraps the parsed KAF input (a Nokogiri document) together with the
      # output options, and knows how to write the opinions layer and the
      # linguistic processor header back into it. Messages it does not
      # understand are forwarded to the underlying Nokogiri document.
      #
      class Document

        attr_accessor :document, :timestamp, :opinion_strength, :pretty

        ##
        # @param [String|IO] file The KAF input, parsed with Nokogiri.
        # @param [Hash] options
        # @option options [Object] :timestamp When truthy, the current time is
        #  written in the linguistic processor entry, otherwise `*`.
        # @option options [Object] :opinion_strength Stored as-is.
        # @option options [Boolean] :pretty Defaults to false.
        # @raise [RuntimeError] when the input has no KAF root element.
        #
        def initialize file, options = {}
          @document         = Nokogiri.XML file
          @timestamp        = options[:timestamp]
          @opinion_strength = options[:opinion_strength]
          @pretty           = options[:pretty] || false

          raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
        end

        ##
        # All terms of the document, wrapped and memoized.
        #
        # @return [Array<Term>]
        #
        def terms
          @terms ||= document.xpath('KAF/terms/term').map do |node|
            Term.new node, self, language
          end
        end

        ##
        # The `xml:lang` attribute of the KAF root element.
        #
        def language
          @language ||= document.at_xpath('KAF').attr('xml:lang')
        end

        ##
        # Get terms grouped by sentence.
        #
        # @return [Hash] sentence id => terms of that sentence
        #
        def sentences
          @sentences ||= terms.group_by(&:sentence)
        end

        ##
        # Adds the entire opinion in the KAF file.
        #
        # The holder element is only written when the opinion has holders; the
        # target element is always written but only filled when there are
        # target ids.
        #
        # @param [Opinion] opinion
        # @param [Integer] index Number used to build the `oid` attribute.
        #
        def add_opinion opinion, index
          opinion_node = new_node 'opinion', 'KAF/opinions'
          opinion_node['oid'] = "o#{index}"

          if opinion.holders.present?
            holder_node = new_node 'opinion_holder', opinion_node
            add_opinion_element holder_node, opinion.holders
          end

          target_node = new_node 'opinion_target', opinion_node
          add_opinion_element target_node, opinion.target_ids if opinion.target_ids.present?

          expression_node = new_node 'opinion_expression', opinion_node
          expression_node['polarity']   = opinion.polarity
          expression_node['strength']   = opinion.strength.to_s
          expression_node['lexicon-id'] = opinion.lexicon_id if opinion.lexicon_id

          add_opinion_element expression_node, opinion.ids
        end

        ##
        # Remove the opinions layer from the KAF file if it exists and add a new
        # one.
        #
        def add_opinions_layer
          existing = document.at_xpath('KAF/opinions')
          existing.remove if existing

          new_node 'opinions', 'KAF'
        end

        ##
        # Method for adding opinion holders, targets and expressions.
        #
        # Writes a comment holding the lemmas of the referenced terms, followed
        # by a span with one target element per id.
        #
        # @param [Nokogiri::XML::Node] node Element to fill.
        # @param [Array] ids Term ids to reference.
        #
        def add_opinion_element node, ids
          lemmas = terms.select { |t| ids.include?(t.id) }.map(&:lemma).join(' ')

          # Lemmas come straight from the input and may contain dashes, which
          # are not allowed verbatim inside an XML comment.
          node.add_child Nokogiri::XML::Comment.new(document, comment_safe(lemmas))

          span_node = new_node('span', node)

          ids.each do |id|
            new_node('target', span_node)['id'] = id.to_s
          end
        end

        ##
        # Add linguistic processor layer with basic information
        # (version, timestamp, description etc) in the KAF file.
        #
        def add_linguistic_processor
          description = 'Basic opinion detector with Pos'
          last_edited = '13may2015'
          version     = '2.0'

          node = new_node('linguisticProcessors', 'KAF/kafHeader')
          node['layer'] = 'opinions'

          lp_node = new_node('lp', node)
          lp_node['version']   = "#{last_edited}-#{version}"
          lp_node['name']      = description
          lp_node['timestamp'] =
            timestamp ? Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z') : '*'
        end

        ##
        # Creates a new node in the KAF file.
        #
        # @param [String] tag Name of the new element.
        # @param [String|Nokogiri::XML::Node] parent Either the parent node
        #  itself or an XPath expression locating it.
        # @return [Nokogiri::XML::Element] the appended element.
        #
        def new_node tag, parent
          parent_node = parent.is_a?(String) ? document.at_xpath(parent) : parent

          node = Nokogiri::XML::Element.new(tag, document)
          parent_node.add_child node

          node
        end

        ##
        # Check if input is a KAF file.
        # @return [Boolean]
        #
        def is_kaf?
          !!document.at_xpath('KAF')
        end

        ##
        # Forwards unknown messages to the wrapped Nokogiri document.
        #
        def method_missing method, *args, &block
          @document.send method, *args, &block
        end

        private

        ##
        # Keeps `respond_to?` truthful about the messages that
        # `method_missing` forwards to the wrapped document.
        #
        def respond_to_missing? method, include_private = false
          @document.respond_to?(method, include_private) || super
        end

        ##
        # Makes text legal as XML comment content: a comment may neither
        # contain `--` nor end with `-`, so a space is inserted after every
        # dash that would otherwise violate that.
        #
        # @param [String] text
        # @return [String]
        #
        def comment_safe text
          safe = text.gsub(/-(?=-)/, '- ')
          safe.end_with?('-') ? "#{safe} " : safe
        end

      end
    end
  end
end
@@ -0,0 +1,179 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # An opinion expression built around a single term, together with the
      # holder and target terms found for it in the same sentence.
      #
      class Opinion

        attr_reader   :term
        attr_accessor :left_candidates, :right_candidates, :target_ids, :holders

        # Opinion holders for each language code.
        OPINION_HOLDERS = {
          'nl' => %w[
            ik we wij ze zij jullie u hij het jij je mij
            me hem haar ons hen hun
          ],
          'en' => %w[i we he she they it you],
          'es' => %w[
            yo tu nosotros vosotros ellos ellas nosotras vosotras
          ],
          'it' => %w[io tu noi voi loro lei lui],
          'de' => %w[ich du wir ihr sie er],
          'fr' => %w[je tu lui elle nous vous ils elles],
        }.freeze

        ##
        # @param [Term] term The term carrying the opinion expression.
        #
        def initialize term
          @term             = term
          @holders          = []
          @target_ids       = []
          @left_candidates  = []
          @right_candidates = []
        end

        ##
        # Returns the term ids of the opinion expression.
        #
        # @return [Array]
        #
        def ids
          @ids ||= term.list_ids.sort
        end

        ##
        # Returns the sentence id of the opinion.
        #
        # @return [String]
        #
        def sentence
          @sentence ||= term.sentence
        end

        ##
        # Returns the strength of the opinion.
        #
        # @return [Integer]
        #
        def strength
          @strength ||= term.accumulated_strength
        end

        ##
        # Returns the lexicon id of the underlying term, if any.
        #
        def lexicon_id
          @lexicon_id ||= term.lexicon_id
        end

        ##
        # Returns the polarity of the opinion, derived from the sign of its
        # strength.
        #
        # @return [String] 'positive', 'negative' or 'neutral'
        #
        def polarity
          @polarity ||=
            if strength > 0
              'positive'
            elsif strength < 0
              'negative'
            else
              'neutral'
            end
        end

        ##
        # Obtain the opinion holders from the terms that belong to the same
        # sentence: the first term whose lemma is a known holder word for the
        # language becomes the holder. Nothing is added for languages without
        # a holder list.
        #
        # @param [Hash] sentences sentence id => terms of that sentence
        # @param [String] language Language code, e.g. 'en'.
        #
        def obtain_holders(sentences, language)
          holder_words = OPINION_HOLDERS[language]
          return unless holder_words

          holder = sentences[sentence].find { |t| holder_words.include?(t.lemma) }
          @holders << holder.id if holder
        end

        ##
        # Get the potential right and left candidates of the sentence and
        # decide which ones are the actual targets of the opinion.
        #
        # Candidates are looked up in a window after the last and before the
        # first term of the expression. The nearest right candidate wins; only
        # when there is none is the first entry of the mixed list used.
        #
        # @param [Hash] sentences sentence id => terms of that sentence
        #
        def obtain_targets(sentences)
          sentence_terms = sentences[sentence]
          max_distance   = 3
          terms_count    = sentence_terms.count

          # The right window starts after the LAST term of the expression.
          last_index = sentence_terms.rindex { |t| ids.include?(t.id) } || -1

          unless last_index + 1 >= terms_count
            from = last_index + 1
            to   = [last_index + 1 + max_distance, terms_count].min
            @right_candidates = filter_candidates(sentence_terms[from..to])
          end

          # The left window ends at the FIRST term of the expression; that
          # term itself is dropped again by filter_candidates.
          first_index = sentence_terms.index { |t| ids.include?(t.id) } || 0

          unless first_index == 0
            from = [0, first_index - 1 - max_distance].max
            @left_candidates = filter_candidates(sentence_terms[from..first_index])
          end

          @target_ids << right_candidates.first.id unless right_candidates.empty?

          if target_ids.empty?
            fallback = mix_lists(right_candidates, left_candidates).first
            @target_ids << fallback.id if fallback
          end
        end

        protected

        ##
        # If there are no opinion targets, right and left candidates
        # are mixed into one list and the first one is picked as the target.
        #
        # The lists are interleaved pairwise (lista first) and whatever is
        # left of the longer list is appended in its original order.
        #
        # @param [Array] lista
        # @param [Array] listb
        # @return [Array]
        #
        def mix_lists(lista, listb)
          shared = [lista.count, listb.count].min

          interleaved = lista.first(shared).zip(listb.first(shared)).flatten(1)

          (interleaved + lista.drop(shared) + listb.drop(shared)).compact
        end

        ##
        # Filters candidate terms depending on their part of speech and if
        # they are already part of the expression: only terms with pos 'N' or
        # 'R' that are not in the expression are kept.
        #
        # @param [Array] sentence_terms
        # @return [Array]
        #
        def filter_candidates sentence_terms
          sentence_terms.select do |t|
            %w[N R].include?(t.pos) && !ids.include?(t.id)
          end
        end

      end
    end
  end
end
@@ -0,0 +1,181 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # Wrapper around a single KAF term node, exposing its attributes, the
      # sentiment information attached to it and the sentence it belongs to.
      #
      class Term

        attr_reader :document
        attr_reader :node, :sentence, :is_conjunction

        attr_accessor :use, :accumulated_strength, :list_ids

        # Map of conjunctions per language code
        # Deprecated
        CONJUNCTIONS = {
          'nl' => %w{, en},
          'en' => %w{, and},
          'es' => %w{, y e},
          'it' => %w{, e ed},
          'de' => %w{, und},
          'fr' => %w{, et}
        }.freeze

        ##
        # @param [Nokogiri::XML::Node] node The term node.
        # @param [Document] document The document the node belongs to.
        # @param [String] language Language code used for the conjunction
        #  lookup.
        #
        def initialize node, document, language
          @document             = document
          @node                 = node
          @sentence             = get_sentence document
          @use                  = true
          @accumulated_strength = strength
          @list_ids             = [id]
          @is_conjunction       = is_conjunction? language
        end

        ##
        # Returns the term id.
        #
        # @return [String]
        #
        def id
          @id ||= node.attr :tid
        end

        ##
        # Returns the lemma of the term.
        #
        # @return [String]
        #
        def lemma
          @lemma ||= node.attr :lemma
        end

        ##
        # Returns the head of the term as a number; a missing `head`
        # attribute yields 0.
        #
        # @return [Integer]
        #
        def head
          @head ||= node.attr(:head).to_i
        end

        ##
        # Returns the term the head points at, or nil for a root term.
        #
        # NOTE(review): treats `head` as a 1-based position in the document's
        # term list - confirm against the producer of the attribute.
        #
        # @return [Term|NilClass]
        #
        def head_term
          return if root?

          document.terms[head - 1]
        end

        ##
        # @return [TrueClass|FalseClass] true when the head is 0.
        #
        def root?
          head == 0
        end

        ##
        # Returns the part of speech of the term.
        #
        # @return [String]
        #
        def pos
          @pos ||= node.attr('pos')
        end

        ##
        # Returns the `lexicon-id` attribute of the term, if present.
        #
        # @return [String|NilClass]
        #
        def lexicon_id
          @lexicon_id ||= node.attr('lexicon-id')
        end

        ##
        # Returns the sentiment modifier type if it exists.
        #
        # @return [String|NilClass]
        #
        def sentiment_modifier
          @sentiment_modifier ||= first_sentiment&.attr('sentiment_modifier')
        end

        ##
        # Returns the polarity of the term if it exists.
        #
        # @return [String|NilClass]
        #
        def polarity
          @polarity ||= first_sentiment&.attr('polarity')
        end

        ##
        # Returns the actual word ids that construct the lemma.
        #
        # @return [Array]
        #
        def target_ids
          @target_ids ||= node.xpath('span/target').map { |target| target.attr('id') }
        end

        ##
        # Returns the strength of the term depending on its type. Polarity
        # takes precedence over the sentiment modifier.
        #
        # @return [Integer]
        #
        def strength
          case polarity
          when 'positive' then return 1
          when 'negative' then return -1
          end

          return 2  if is_intensifier?
          return -1 if is_shifter?

          0
        end

        ##
        # Returns the sentence id that the term belongs to in the document,
        # read from the word form referenced by the first target of the term.
        #
        # The lookup expects that word form to exist; there is no fallback
        # when the query finds nothing.
        #
        # @param [Document] document
        # @return [String]
        #
        def get_sentence(document)
          word_form = document.xpath("KAF/text/wf[@wid='#{target_ids.first}']").first

          word_form.attr('sent')
        end

        ##
        # Checks if a term is an intensifier.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_intensifier?
          sentiment_modifier == 'intensifier'
        end

        ##
        # Checks if a term is a shifter.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_shifter?
          sentiment_modifier == 'shifter'
        end

        ##
        # Checks if a term is an expression.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_expression?
          use && !!polarity
        end

        ##
        # Checks if a term is a conjunction: either its part of speech is 'J'
        # or its lemma is listed for the language. Languages without a list
        # simply have no lemma-based conjunctions.
        #
        # @param [String] language
        # @return [TrueClass|FalseClass]
        #
        def is_conjunction?(language)
          pos == 'J' || CONJUNCTIONS.fetch(language, []).include?(lemma)
        end

        private

        ##
        # The first sentiment node found for the term, if any.
        #
        # @return [Nokogiri::XML::Node|NilClass]
        #
        def first_sentiment
          @first_sentiment ||= node.at :sentiment
        end

      end
    end
  end
end