opener-opinion-detector-basic 3.2.0 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14f63b17cb26086742f4618eef4ad61e9435d00f271f2a2e7984ca9ef0f68a3e
4
- data.tar.gz: 257e711a1e2aee0764b4d8de9092e116d47a50e2d6de7717e8668e692d4f1270
3
+ metadata.gz: fa1aba5cb9ba31f6e2205af1499f866f9e998883701b303f2229ecc855348293
4
+ data.tar.gz: db3a5d5021a0013757ba68252ccaed4c185a0960aaa9ca26e47681e0b3300d11
5
5
  SHA512:
6
- metadata.gz: 54374bd46b28f4065f26899a042a1caeedafeff43a50de0642c164928dd55b1164a8ad076b19a3504fd2a4748348c19725e1cc448abbc2de12a0aaf4eb540df1
7
- data.tar.gz: e1ef4640b783bf2de26072553ecb874ec08c21a51e7286340ce402d6a0359d94adbc6efe03ed62af82c5c25a47354f6fe33b56d070b7a4d9488515dbb90074e6
6
+ metadata.gz: 5e6e4ae440580e6ed2974c4a75b46f544212c8557ae3fe43fcbb4e4c3a7d7a6d71a058451f3abdae352766914182dc40237f42df54a565c7c866054828c758c8
7
+ data.tar.gz: 3db868535c5f43814b4b883ecd9d5b0bd02fb59de5ed290c6c1b3face4d65993d84c2971a2c7dc021b607b6c228735dd82f8931c909911302577dd3bcb4558f5
@@ -1,14 +1,23 @@
1
1
  gem 'slop', '~> 3.0'
2
2
 
3
+ require 'active_support/all'
4
+
3
5
  require 'slop'
4
- require 'oga'
6
+ require 'hashie'
7
+ require 'nokogiri'
5
8
 
6
9
  require 'rexml/document'
7
10
  require 'rexml/formatters/pretty'
8
11
 
12
+ require_relative 'opinion_detector_basic/kaf/document'
13
+ require_relative 'opinion_detector_basic/kaf/term'
14
+ require_relative 'opinion_detector_basic/kaf/opinion'
15
+
9
16
  require_relative 'opinion_detector_basic/version'
10
17
  require_relative 'opinion_detector_basic/cli'
18
+ require_relative 'opinion_detector_basic/base_processor'
11
19
  require_relative 'opinion_detector_basic/processor'
20
+ require_relative 'opinion_detector_basic/legacy_processor'
12
21
 
13
22
  module Opener
14
23
  ##
@@ -32,6 +41,7 @@ module Opener
32
41
  def initialize(options = {})
33
42
  @args = options.delete(:args) || []
34
43
  @options = options
44
+ @klass = if ENV['OPINION_LEGACY'] then LegacyProcessor else Processor end
35
45
  end
36
46
 
37
47
  ##
@@ -41,7 +51,7 @@ module Opener
41
51
  # @return [String]
42
52
  #
43
53
  # NOTE: `params` is accepted but not used inside this method.
  def run input, params = {}
44
  # 3.2.0 always instantiated Processor directly (removed line below).
- return Processor.new(input, options).process
54
  # 3.2.4 instantiates whichever class the constructor stored in @klass and
  # returns the result of its #process call.
+ @klass.new(input, options).process
45
55
  end
46
56
 
47
57
  end
@@ -0,0 +1,56 @@
1
module Opener
  class OpinionDetectorBasic
    ##
    # Shared driver for the opinion processors: wraps the input in a
    # Kaf::Document, writes one opinion element per detected opinion, appends
    # the linguistic processor entry and serialises the result.
    #
    # The `opinions` collection iterated by #process is not defined in this
    # class; presumably every concrete processor supplies it (TODO confirm
    # against Processor / LegacyProcessor).
    #
    class BaseProcessor

      attr_accessor :document
      attr_reader   :terms, :sentences

      ##
      # @param [String|IO] file The KAF file/input to process.
      # @param [Hash] options Handed unchanged to Kaf::Document.new, which
      #  reads the timestamp, opinion strength and pretty-printing settings.
      #
      def initialize file, options = {}
        @document  = Kaf::Document.new file, options
        @terms     = @document.terms
        @sentences = @document.sentences
      end

      ##
      # Processes the input and returns the new KAF output.
      #
      # @return [String] pretty-printed XML when the document's `pretty` flag
      #  is set, the document's plain `to_xml` output otherwise.
      #
      def process
        document.add_opinions_layer

        # Opinions are numbered starting at 1.
        opinions.each.with_index(1) do |opinion, number|
          document.add_opinion opinion, number
        end

        document.add_linguistic_processor

        document.pretty ? pretty_print(document) : document.to_xml
      end

      ##
      # Formats the output document by re-parsing its XML with REXML and
      # writing it through the compact pretty formatter.
      #
      # @param [#to_xml] document
      # @return [String] the formatted XML with surrounding whitespace
      #  stripped.
      #
      def pretty_print document
        doc = REXML::Document.new document.to_xml
        # Emit attributes with double quotes.
        doc.context[:attribute_quote] = :quote

        out       = String.new
        formatter = REXML::Formatters::Pretty.new
        formatter.compact = true
        formatter.write doc, out

        out.strip
      end

    end
  end
end
@@ -0,0 +1,146 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # Wraps the Nokogiri document parsed from the KAF input and offers the
      # helpers used to read terms and to write the opinions layer. Any method
      # not defined here is forwarded to the wrapped Nokogiri document.
      #
      class Document

        attr_accessor :document, :timestamp, :opinion_strength, :pretty

        ##
        # @param [String|IO] file Input handed to Nokogiri.XML.
        # @param [Hash] options
        # @option options [Object] :timestamp When truthy, the linguistic
        #  processor entry gets the current time; otherwise it gets '*'.
        # @option options [Object] :opinion_strength Stored as given; not read
        #  inside this class.
        # @option options [Boolean] :pretty Stored for the caller; defaults to
        #  false.
        # @raise [RuntimeError] when the parsed input has no KAF element.
        #
        def initialize file, options = {}
          @document = Nokogiri.XML file

          @timestamp        = options[:timestamp]
          @opinion_strength = options[:opinion_strength]
          @pretty           = options[:pretty] || false

          raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
        end

        ##
        # Returns the terms of the document, built once and memoized.
        #
        # @return [Array<Term>]
        #
        def terms
          @terms ||= document.xpath('KAF/terms/term').map do |term|
            Term.new term, self, language
          end
        end

        ##
        # Returns the value of the xml:lang attribute of the KAF element.
        #
        def language
          @language ||= document.at_xpath('KAF').attr('xml:lang')
        end

        ##
        # Get terms grouped by sentence.
        #
        # @return [Hash] sentence id => terms of that sentence
        #
        def sentences
          @sentences ||= terms.group_by { |t| t.sentence }
        end

        ##
        # Adds the entire opinion in the KAF file: an optional holder, a
        # target element (always created, filled only when target ids exist)
        # and the expression with polarity, strength and optional lexicon id.
        #
        # @param [Opinion] opinion
        # @param [Integer] index Number used to build the "oN" opinion id.
        #
        def add_opinion opinion, index
          opinion_node = new_node 'opinion', 'KAF/opinions'
          opinion_node['oid'] = "o#{index}"

          if opinion.holders.present?
            opinion_holder_node = new_node 'opinion_holder', opinion_node
            add_opinion_element opinion_holder_node, opinion.holders
          end

          opinion_target_node = new_node 'opinion_target', opinion_node

          if opinion.target_ids.present?
            add_opinion_element opinion_target_node, opinion.target_ids
          end

          expression_node = new_node 'opinion_expression', opinion_node
          expression_node['polarity']   = opinion.polarity
          expression_node['strength']   = opinion.strength.to_s
          expression_node['lexicon-id'] = opinion.lexicon_id if opinion.lexicon_id

          add_opinion_element expression_node, opinion.ids
        end

        ##
        # Remove the opinions layer from the KAF file if it exists and add a
        # new one.
        #
        def add_opinions_layer
          existing = document.at_xpath('KAF/opinions')

          existing.remove if existing

          new_node 'opinions', 'KAF'
        end

        ##
        # Method for adding opinion holders, targets and expressions: writes a
        # comment holding the lemmas of the referenced terms followed by a
        # span with one target element per id.
        #
        # @param [Nokogiri::XML::Node] node Element that receives the children.
        # @param [Array] ids Term ids to reference.
        #
        def add_opinion_element node, ids
          lemmas  = terms.select { |t| ids.include?(t.id) }.map(&:lemma).join(' ')
          comment = Nokogiri::XML::Comment.new(document, lemmas)
          node.add_child comment

          span_node = new_node('span', node)

          ids.each do |id|
            target_node = new_node('target', span_node)
            target_node['id'] = id.to_s
          end
        end

        ##
        # Add linguistic processor layer with basic information
        # (version, timestamp, description etc) in the KAF file.
        #
        # NOTE(review): the parent is looked up as 'KAF/kafHeader'; a document
        # without that element would make the lookup return nil — confirm the
        # header is guaranteed upstream.
        #
        def add_linguistic_processor
          description = 'Basic opinion detector with Pos'
          last_edited = '13may2015'
          version     = '2.0'

          node = new_node('linguisticProcessors', 'KAF/kafHeader')
          node['layer'] = 'opinions'

          lp_node = new_node('lp', node)

          lp_node['version'] = "#{last_edited}-#{version}"
          lp_node['name']    = description

          if timestamp
            format = '%Y-%m-%dT%H:%M:%S%Z'

            lp_node['timestamp'] = Time.now.strftime(format)
          else
            lp_node['timestamp'] = '*'
          end
        end

        ##
        # Creates a new node in the KAF file and appends it to its parent.
        #
        # @param [String] tag Name of the new element.
        # @param [String|Nokogiri::XML::Node] parent Either an XPath that is
        #  resolved against the document, or the parent node itself.
        # @return [Nokogiri::XML::Element] the new element
        #
        def new_node tag, parent
          parent_node = parent.is_a?(String) ? document.at_xpath(parent) : parent

          node = Nokogiri::XML::Element.new(tag, document)

          parent_node.add_child node

          node
        end

        ##
        # Check if input is a KAF file.
        # @return [Boolean]
        #
        def is_kaf?
          !!document.at_xpath('KAF')
        end

        ##
        # Forwards every unknown message to the wrapped Nokogiri document.
        #
        def method_missing method, *args, &block
          @document.send method, *args, &block
        end

        ##
        # Keeps respond_to? truthful for the messages that method_missing
        # forwards to the wrapped document.
        #
        def respond_to_missing? method, include_private = false
          @document.respond_to?(method, include_private) || super
        end

      end
    end
  end
end
@@ -0,0 +1,179 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # An opinion built around one expression term. It exposes the
      # expression's ids, strength and polarity, and collects the ids of the
      # holder and target terms found in the same sentence.
      #
      class Opinion

        attr_reader   :term
        attr_accessor :left_candidates, :right_candidates, :target_ids, :holders

        # Opinion holders for each language code.
        OPINION_HOLDERS = {
          'nl' => %w[
            ik we wij ze zij jullie u hij het jij je mij
            me hem haar ons hen hun
          ],
          'en' => %w[i we he she they it you],
          'es' => %w[
            yo tu nosotros vosotros ellos ellas nosotras vosotras
          ],
          'it' => %w[io tu noi voi loro lei lui],
          'de' => %w[ich du wir ihr sie er],
          'fr' => %w[je tu lui elle nous vous ils elles],
        }

        ##
        # @param term The expression term; it must respond to `list_ids`,
        #  `sentence`, `accumulated_strength` and `lexicon_id`.
        #
        def initialize term
          @term       = term
          @holders    = []
          @target_ids = []

          @left_candidates  = []
          @right_candidates = []
        end

        ##
        # Returns the term ids of the opinion expression.
        #
        # @return [Array] the term's `list_ids`, sorted
        #
        def ids
          @ids ||= term.list_ids.sort
        end

        ##
        # Returns the sentence id of the opinion.
        #
        # @return [String]
        #
        def sentence
          @sentence ||= term.sentence
        end

        ##
        # Returns the strength of the opinion (the term's accumulated
        # strength).
        #
        # @return [Integer]
        #
        def strength
          @strength ||= term.accumulated_strength
        end

        ##
        # Returns the lexicon id of the term, if it has one.
        #
        def lexicon_id
          @lexicon_id ||= term.lexicon_id
        end

        ##
        # Returns the polarity of the opinion, derived from the sign of the
        # strength.
        #
        # @return [String] 'positive', 'negative' or 'neutral'
        #
        def polarity
          @polarity ||= if strength > 0
            'positive'
          elsif strength < 0
            'negative'
          else
            'neutral'
          end
        end

        ##
        # Obtain the opinion holders from the terms that belong to the same
        # sentence. Only the first term whose lemma is listed for the language
        # is recorded; unknown language codes never match.
        #
        # @param [Hash] sentences sentence id => terms
        # @param [String] language language code used to pick the holder list
        #
        def obtain_holders(sentences, language)
          sentence_terms = sentences[sentence]
          sentence_terms.each do |term|
            if OPINION_HOLDERS[language]&.include?(term.lemma)
              @holders << term.id
              break
            end
          end
        end

        ##
        # Get the potential right and left candidates of the sentence and
        # decide which ones are the actual targets of the opinion.
        #
        # The first right candidate wins. When there is none, the first entry
        # of the mixed right/left list is used instead.
        #
        # @param [Hash] sentences sentence id => terms
        #
        def obtain_targets(sentences)
          sentence_terms = sentences[sentence]
          max_distance   = 3
          terms_count    = sentence_terms.count
          in_expression  = ->(t) { ids.include?(t.id) }

          # The right window starts just after the LAST expression term and
          # spans positions last+1 .. last+1+max_distance (inclusive, clamped
          # to the sentence length).
          last = sentence_terms.rindex(&in_expression) || -1

          unless last + 1 >= terms_count
            from = last + 1
            upto = [last + 1 + max_distance, terms_count].min
            @right_candidates = filter_candidates(sentence_terms[from..upto])
          end

          # The left window ends at the FIRST expression term. The expression
          # term itself is inside the slice but is dropped by the filter.
          first = sentence_terms.index(&in_expression) || 0

          unless first == 0
            from = [0, first - 1 - max_distance].max
            @left_candidates = filter_candidates(sentence_terms[from..first])
          end

          @target_ids << right_candidates.first.id unless right_candidates.empty?

          return unless target_ids.empty?

          fallback = mix_lists(right_candidates, left_candidates).first
          @target_ids << fallback.id if fallback
        end

        protected

        ##
        # Interleaves both lists element by element (lista first) and then
        # appends whatever is left of the longer list. Nil entries are
        # dropped.
        #
        #   mix_lists([1, 2, 3], [7]) # => [1, 7, 2, 3]
        #
        # @return [Array]
        #
        def mix_lists(lista, listb)
          shared = [lista.count, listb.count].min
          mixed  = lista.first(shared).zip(listb.first(shared)).flatten(1)
          longer = lista.count > listb.count ? lista : listb

          (mixed + longer.drop(shared)).compact
        end

        ##
        # Filters candidate terms depending on their part of speech and if
        # they are already part of the expression: only terms whose pos is
        # 'N' or 'R' and whose id is not one of the expression ids are kept.
        #
        # @return [Array]
        #
        def filter_candidates sentence_terms
          sentence_terms.select do |t|
            (t.pos == 'N' || t.pos == 'R') && !ids.include?(t.id)
          end
        end

      end
    end
  end
end
@@ -0,0 +1,181 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # A KAF term: wraps the term's XML node and exposes its attributes, its
      # sentiment information and the sentence it belongs to.
      #
      class Term

        attr_reader :document
        attr_reader :node, :sentence, :is_conjunction

        attr_accessor :use, :accumulated_strength, :list_ids

        # Map of conjunctions per language code
        # Deprecated
        CONJUNCTIONS = {
          'nl' => %w{, en},
          'en' => %w{, and},
          'es' => %w{, y e},
          'it' => %w{, e ed},
          'de' => %w{, und},
          'fr' => %w{, et}
        }

        ##
        # @param node The XML node of the term.
        # @param document The owning document; it is queried for the word
        #  form of the term and for the list of all terms.
        # @param [String] language Language code used for the conjunction
        #  lookup.
        #
        def initialize node, document, language
          @document             = document
          @node                 = node
          @sentence             = get_sentence document
          @use                  = true
          @accumulated_strength = strength
          @list_ids             = [id]
          @is_conjunction       = is_conjunction? language
        end

        ##
        # Returns the term id.
        #
        # @return [String]
        #
        def id
          @id ||= node.attr :tid
        end

        ##
        # Returns the lemma of the term.
        #
        # @return [String]
        #
        def lemma
          @lemma ||= node.attr :lemma
        end

        ##
        # Returns the head of the term as an integer; a missing or
        # non-numeric head attribute yields 0.
        #
        # @return [Integer]
        #
        def head
          @head ||= node.attr(:head).to_i
        end

        ##
        # Returns the term found at position `head - 1` in the document's
        # term list, or nil for a root term.
        #
        def head_term
          return if root?
          document.terms[head-1]
        end

        ##
        # A term is a root when its head is 0.
        #
        def root?
          head == 0
        end

        ##
        # Returns the part of speech of the term.
        #
        # @return [String]
        #
        def pos
          @pos ||= node.attr('pos')
        end

        ##
        # Returns the lexicon-id attribute of the term, if present.
        #
        def lexicon_id
          @lexicon_id ||= node.attr('lexicon-id')
        end

        ##
        # Returns the sentiment modifier type if it exists.
        #
        # @return [String|NilClass]
        #
        def sentiment_modifier
          @sentiment_modifier ||= first_sentiment&.attr('sentiment_modifier')
        end

        ##
        # Returns the polarity of the term if it exists.
        #
        # @return [String|NilClass]
        #
        def polarity
          @polarity ||= first_sentiment&.attr('polarity')
        end

        ##
        # Returns the actual word ids that construct the lemma.
        #
        # @return [Array]
        #
        def target_ids
          @target_ids ||= node.xpath('span/target')
            .map { |target| target.attr('id') }
        end

        ##
        # Returns the strength of the term depending on its type. Polarity is
        # checked before the modifier type.
        #
        # @return [Integer]
        #
        def strength
          return 1  if polarity == 'positive'
          return -1 if polarity == 'negative'
          return 2  if is_intensifier?
          return -1 if is_shifter?
          return 0
        end

        ##
        # Returns the sentence id that the term belongs to in the document,
        # taken from the word form referenced by the first target id.
        #
        # @return [String|NilClass] nil when no matching word form exists
        #
        def get_sentence(document)
          word_form = document
            .xpath("KAF/text/wf[@wid='#{target_ids.first}']")
            .first

          # A term without a matching word form has no sentence; do not fail
          # with a NoMethodError on nil.
          word_form&.attr('sent')
        end

        ##
        # Checks if a term is an intensifier.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_intensifier?
          sentiment_modifier == 'intensifier'
        end

        ##
        # Checks if a term is a shifter.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_shifter?
          sentiment_modifier == 'shifter'
        end

        ##
        # Checks if a term is an expression: it is still in use and has a
        # polarity.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_expression?
          use && !!polarity
        end

        ##
        # Checks if a term is a conjunction: either its pos is 'J' or its
        # lemma is listed for the language. Unknown language codes only match
        # through the pos check.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_conjunction?(language)
          pos == 'J' || CONJUNCTIONS.fetch(language, []).include?(lemma)
        end

        private

        ##
        # First sentiment child of the term node, looked up once; a missing
        # sentiment (nil) is cached as well so the node is not searched again.
        #
        # @return the sentiment node or nil
        #
        def first_sentiment
          return @first_sentiment if defined?(@first_sentiment)

          @first_sentiment = node.at :sentiment
        end

      end
    end
  end
end