opener-opinion-detector-basic 3.2.2 → 3.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d90cb88e3b8224dc8d308e0c14aee257451869d6ae4c79db007b0247aaae6c2b
4
- data.tar.gz: 0546c8e56ce77a995e259cb1567be5b7444044944f6ee017a7497780f6ee98c5
3
+ metadata.gz: 7bf885ed5d9cd309933e3419dc00f13073e42038e102bf58ed029e9ac6762101
4
+ data.tar.gz: c9b7248d4a8dd672b6d0246994e643d845ebbc73af4e8f427bde80154755a310
5
5
  SHA512:
6
- metadata.gz: 8db793d069cec098dcc50d3732820bbf83756e2dfaff54975ff6b2ef47af116efe140416e40c92ac5a6e011d161c6f398388282e59ea60b50ec6c596520437e9
7
- data.tar.gz: 70dde9675a2b3038d9883e24f378abe597a6823ec5aa2e88d0806be5b71d20cc174c68fd0ef3da3a141931559a9c81be54ab2d1f100e3f9573d568042d2c0890
6
+ metadata.gz: 2caf7e13cb13f4574446e4deaa246212b7a54630f94ba883803a360fb81a4e43c2d77b8dd140eaa5582d25efe1793d67145385f49acec275e1399cd52c3d1785
7
+ data.tar.gz: 16346fb5631e18212bcc690353f384da813f4baa6f48e7f32263183e2f86dee39293876954dedaddd05baa3aa32d970e361e2a8ef6f7f8de7feff96991c90b2b
@@ -1,14 +1,23 @@
1
1
  gem 'slop', '~> 3.0'
2
2
 
3
+ require 'active_support/all'
4
+
3
5
  require 'slop'
4
- require 'oga'
6
+ require 'hashie'
7
+ require 'nokogiri'
5
8
 
6
9
  require 'rexml/document'
7
10
  require 'rexml/formatters/pretty'
8
11
 
12
+ require_relative 'opinion_detector_basic/kaf/document'
13
+ require_relative 'opinion_detector_basic/kaf/term'
14
+ require_relative 'opinion_detector_basic/kaf/opinion'
15
+
9
16
  require_relative 'opinion_detector_basic/version'
10
17
  require_relative 'opinion_detector_basic/cli'
18
+ require_relative 'opinion_detector_basic/base_processor'
11
19
  require_relative 'opinion_detector_basic/processor'
20
+ require_relative 'opinion_detector_basic/legacy_processor'
12
21
 
13
22
  module Opener
14
23
  ##
@@ -32,6 +41,8 @@ module Opener
32
41
  def initialize(options = {})
33
42
  @args = options.delete(:args) || []
34
43
  @options = options
44
+ ENV['OPINION_LEGACY'] = 'true' # Processor is still not working
45
+ @klass = if ENV['OPINION_LEGACY'] then LegacyProcessor else Processor end
35
46
  end
36
47
 
37
48
  ##
@@ -41,7 +52,7 @@ module Opener
41
52
  # @return [String]
42
53
  #
43
54
  def run input, params = {}
44
- return Processor.new(input, options).process
55
+ @klass.new(input, options).process
45
56
  end
46
57
 
47
58
  end
@@ -0,0 +1,56 @@
1
module Opener
  class OpinionDetectorBasic
    ##
    # Common driver for the opinion detector: wraps the input in a
    # Kaf::Document, writes the detected opinions into it and serialises the
    # result.
    #
    # `#process` iterates over an `opinions` method that this class does not
    # define itself.
    # NOTE(review): presumably provided by the concrete processors that build
    # on this class -- confirm against Processor / LegacyProcessor.
    #
    class BaseProcessor
      attr_accessor :document
      attr_reader :terms, :sentences

      ##
      # @param [String|IO] file The KAF file/input to process.
      # @param [Hash] options Passed on unchanged to Kaf::Document, which
      #  reads the timestamp, opinion strength and pretty-printing settings
      #  from it.
      #
      def initialize(file, options = {})
        @document  = Kaf::Document.new(file, options)
        @terms     = @document.terms
        @sentences = @document.sentences
      end

      ##
      # Processes the input and returns the new KAF output.
      #
      # The opinions layer is (re)created first, then every opinion is added
      # with a 1-based position, and finally the linguistic processor entry
      # is appended.
      #
      # @return [String]
      #
      def process
        document.add_opinions_layer

        opinions.each.with_index(1) do |opinion, position|
          document.add_opinion(opinion, position)
        end

        document.add_linguistic_processor

        return pretty_print(document) if document.pretty

        document.to_xml
      end

      ##
      # Format the output document properly.
      #
      # The XML is re-parsed with REXML and written through its pretty
      # formatter (compact mode, double-quoted attributes).
      #
      # @param [#to_xml] document
      # @return [String]
      #
      def pretty_print(document)
        xml = REXML::Document.new(document.to_xml)
        xml.context[:attribute_quote] = :quote

        printer = REXML::Formatters::Pretty.new
        printer.compact = true

        buffer = +''
        printer.write(xml, buffer)

        buffer.strip
      end
    end
  end
end
@@ -0,0 +1,146 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # Wrapper around a KAF document parsed with Nokogiri. It exposes the
      # terms of the document and the helpers needed to write the opinions
      # layer and the linguistic processor entry. Any method not defined here
      # is forwarded to the underlying Nokogiri document.
      #
      class Document
        attr_accessor :document, :timestamp, :opinion_strength, :pretty

        ##
        # @param [String|IO] file The KAF input, handed to Nokogiri.XML.
        # @param [Hash] options Reads :timestamp, :opinion_strength and
        #  :pretty (false when absent).
        # @raise [RuntimeError] When the parsed input has no KAF root.
        #
        def initialize(file, options = {})
          @document = Nokogiri.XML(file)

          @timestamp        = options[:timestamp]
          @opinion_strength = options[:opinion_strength]
          @pretty           = options[:pretty] || false

          raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
        end

        ##
        # Term objects for every KAF/terms/term node (memoized).
        #
        # @return [Array<Term>]
        #
        def terms
          @terms ||= document.xpath('KAF/terms/term').map do |term|
            Term.new(term, self, language)
          end
        end

        ##
        # Value of the xml:lang attribute of the KAF root (memoized).
        #
        def language
          @language ||= document.at_xpath('KAF').attr('xml:lang')
        end

        ##
        # Get terms grouped by sentence.
        #
        # @return [Hash] Sentence id => terms of that sentence.
        #
        def sentences
          @sentences ||= terms.group_by(&:sentence)
        end

        ##
        # Adds the entire opinion in the KAF file.
        #
        # The holder element is only written when the opinion has holders;
        # the target element is always written but only filled when the
        # opinion has target ids.
        #
        # @param [Opinion] opinion
        # @param [Integer] index Used to build the opinion id ("o<index>").
        #
        def add_opinion(opinion, index)
          opinion_node        = new_node('opinion', 'KAF/opinions')
          opinion_node['oid'] = "o#{index}"

          if opinion.holders.present?
            opinion_holder_node = new_node('opinion_holder', opinion_node)
            add_opinion_element(opinion_holder_node, opinion.holders)
          end

          opinion_target_node = new_node('opinion_target', opinion_node)

          if opinion.target_ids.present?
            add_opinion_element(opinion_target_node, opinion.target_ids)
          end

          expression_node               = new_node('opinion_expression', opinion_node)
          expression_node['polarity']   = opinion.polarity
          expression_node['strength']   = opinion.strength.to_s
          expression_node['lexicon-id'] = opinion.lexicon_id if opinion.lexicon_id

          add_opinion_element(expression_node, opinion.ids)
        end

        ##
        # Remove the opinions layer from the KAF file if it exists and add a
        # new one.
        #
        def add_opinions_layer
          existing = document.at_xpath('KAF/opinions')

          existing.remove if existing

          new_node('opinions', 'KAF')
        end

        ##
        # Method for adding opinion holders, targets and expressions.
        #
        # Writes an XML comment holding the lemmas of the referenced terms,
        # followed by a span with one target element per id.
        #
        # @param [Nokogiri::XML::Node] node
        # @param [Array] ids Term ids to reference.
        #
        def add_opinion_element(node, ids)
          lemmas  = terms.select { |t| ids.include?(t.id) }.map(&:lemma).join(' ')
          comment = Nokogiri::XML::Comment.new(document, lemmas)
          node.add_child(comment)

          span_node = new_node('span', node)

          ids.each do |id|
            target_node       = new_node('target', span_node)
            target_node['id'] = id.to_s
          end
        end

        ##
        # Add linguistic processor layer with basic information
        # (version, timestamp, description etc) in the KAF file.
        #
        # The timestamp attribute is the current time when timestamps are
        # enabled and the placeholder '*' otherwise.
        #
        def add_linguistic_processor
          description = 'Basic opinion detector with Pos'
          last_edited = '13may2015'
          version     = '2.0'

          node          = new_node('linguisticProcessors', 'KAF/kafHeader')
          node['layer'] = 'opinions'

          lp_node = new_node('lp', node)

          lp_node['version'] = "#{last_edited}-#{version}"
          lp_node['name']    = description

          lp_node['timestamp'] =
            if timestamp
              Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z')
            else
              '*'
            end
        end

        ##
        # Creates a new node in the KAF file.
        #
        # @param [String] tag Name of the new element.
        # @param [String|Nokogiri::XML::Node] parent Either the parent node
        #  itself or an XPath expression locating it.
        # @return [Nokogiri::XML::Element] The newly attached element.
        #
        def new_node(tag, parent)
          parent_node = parent.is_a?(String) ? document.at_xpath(parent) : parent

          node = Nokogiri::XML::Element.new(tag, document)
          parent_node.add_child(node)

          node
        end

        ##
        # Check if input is a KAF file.
        # @return [Boolean]
        #
        def is_kaf?
          !!document.at_xpath('KAF')
        end

        ##
        # Forwards every unknown message to the wrapped Nokogiri document.
        #
        def method_missing(method, *args, &block)
          @document.send(method, *args, &block)
        end

        ##
        # Keeps respond_to? in sync with the delegation done by
        # method_missing.
        #
        def respond_to_missing?(method, include_private = false)
          @document.respond_to?(method, include_private) || super
        end
      end
    end
  end
end
@@ -0,0 +1,152 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # An opinion built around one expression term. Ids, sentence, strength
      # and lexicon id are all read from that term; holders and targets are
      # filled in by #obtain_holders and #obtain_targets.
      #
      class Opinion
        attr_reader :term
        attr_accessor :left_candidates, :right_candidates, :target_ids, :holders

        # Opinion holders for each language code. Frozen (including the word
        # lists) so the shared constant cannot be modified at runtime.
        OPINION_HOLDERS = {
          'nl' => %w[
            ik we wij ze zij jullie u hij het jij je mij
            me hem haar ons hen hun
          ],
          'en' => %w[i we he she they it you],
          'es' => %w[
            yo tu nosotros vosotros ellos ellas nosotras vosotras
          ],
          'it' => %w[io tu noi voi loro lei lui],
          'de' => %w[ich du wir ihr sie er],
          'fr' => %w[je tu lui elle nous vous ils elles],
        }.each_value(&:freeze).freeze

        ##
        # @param [Term] term The expression term the opinion is built on.
        #
        def initialize(term)
          @term       = term
          @holders    = []
          @target_ids = []

          @left_candidates  = []
          @right_candidates = []
        end

        ##
        # Returns the term ids of the opinion expression.
        #
        # @return [Array]
        #
        def ids
          @ids ||= term.list_ids.sort
        end

        ##
        # Returns the sentence id of the opinion.
        #
        # @return [String]
        #
        def sentence
          @sentence ||= term.sentence
        end

        ##
        # Returns the strength of the opinion.
        #
        # @return [Integer]
        #
        def strength
          @strength ||= term.accumulated_strength
        end

        ##
        # Returns the lexicon id of the underlying term, if any.
        #
        def lexicon_id
          @lexicon_id ||= term.lexicon_id
        end

        ##
        # Returns the polarity of the opinion, derived from the sign of the
        # strength.
        #
        # @return [String] 'positive', 'negative' or 'neutral'.
        #
        def polarity
          @polarity ||=
            if strength > 0
              'positive'
            elsif strength < 0
              'negative'
            else
              'neutral'
            end
        end

        ##
        # Obtain the opinion holders from the terms that belong to the same
        # sentence. Only the first term whose lemma is a known holder for the
        # language is recorded; an unknown language yields no holder.
        #
        # @param [Hash] sentences Sentence id => terms of that sentence.
        # @param [String] language Language code used to pick the holder list.
        #
        def obtain_holders(sentences, language)
          known_holders = OPINION_HOLDERS[language]
          holder        = sentences[sentence].find do |candidate|
            known_holders&.include?(candidate.lemma)
          end

          @holders << holder.id if holder
        end

        ##
        # Get the potential right and left candidates of the sentence and
        # decide which ones are the actual targets of the opinion.
        #
        # The first right candidate (if any) is added before the first left
        # candidate (if any).
        #
        # @param [Hash] sentences Sentence id => terms of that sentence.
        #
        def obtain_targets(sentences)
          sentence_terms = sentences[sentence]
          max_distance   = 3
          terms_count    = sentence_terms.count
          in_expression  = ->(candidate) { ids.include?(candidate.id) }

          # Right window starts right after the LAST expression term; -1 when
          # no term matches, so the window then starts at the first term.
          last_index = sentence_terms.rindex(&in_expression) || -1

          if last_index + 1 < terms_count
            from = last_index + 1
            to   = [last_index + 1 + max_distance, terms_count].min
            @right_candidates = filter_candidates(sentence_terms[from..to])
          end

          # Left window ends at the FIRST expression term; nothing is
          # collected when that term opens the sentence (or none matches).
          first_index = sentence_terms.index(&in_expression) || 0

          unless first_index.zero?
            from = [0, first_index - 1 - max_distance].max
            @left_candidates = filter_candidates(sentence_terms[from..first_index])
          end

          @target_ids << right_candidates.first.id if right_candidates.any?
          @target_ids << left_candidates.first.id if left_candidates.any?
        end

        protected

        ##
        # Filters candidate terms depending on their part of speech and if
        # they are already part of the expression: only terms with pos 'N'
        # or 'R' that are not expression terms are kept.
        #
        # @return [Array]
        #
        def filter_candidates(sentence_terms)
          sentence_terms.select do |candidate|
            %w[N R].include?(candidate.pos) && !ids.include?(candidate.id)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,185 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # Wrapper around a single KAF term node. Reads the term attributes and
      # its first sentiment child, and carries the mutable state (use,
      # accumulated strength, list of ids) that is attached to the term.
      #
      class Term
        attr_reader :document
        attr_reader :node, :sentence, :is_conjunction

        attr_accessor :use, :accumulated_strength, :list_ids

        # Map of conjunctions per language code
        # Deprecated
        # Frozen (including the word lists) so the shared constant cannot be
        # modified at runtime.
        CONJUNCTIONS = {
          'nl' => %w{, en},
          'en' => %w{, and},
          'es' => %w{, y e},
          'it' => %w{, e ed},
          'de' => %w{, und},
          'fr' => %w{, et},
        }.each_value(&:freeze).freeze

        ##
        # @param [Object] node The term node; read through attr/xpath/at.
        # @param [Document] document The KAF document the term belongs to.
        # @param [String] language Language code used for the conjunction
        #  lookup.
        #
        def initialize(node, document, language)
          # document and node must be set first: the values below are
          # computed from them.
          @document             = document
          @node                 = node
          @sentence             = get_sentence(document)
          @use                  = true
          @accumulated_strength = strength
          @list_ids             = [id]
          @is_conjunction       = is_conjunction?(language)
        end

        ##
        # Returns the term id.
        #
        # @return [String]
        #
        def id
          @id ||= node.attr :tid
        end

        ##
        # Returns the lemma of the term.
        #
        # @return [String]
        #
        def lemma
          @lemma ||= node.attr :lemma
        end

        ##
        # Returns the head of the term, converted with to_i (so 0 when the
        # attribute is absent).
        #
        # @return [Integer]
        #
        def head
          @head ||= node.attr(:head).to_i
        end

        ##
        # Returns the term found at position head - 1 of the document terms,
        # or nil for a root term.
        #
        def head_term
          return if root?

          document.terms[head - 1]
        end

        ##
        # A term is a root when its head is 0.
        #
        def root?
          head == 0
        end

        ##
        # Returns the part of speech of the term.
        #
        # @return [String]
        #
        def pos
          @pos ||= node.attr('pos')
        end

        ##
        # Returns the xpos attribute of the term.
        #
        def xpos
          @xpos ||= node.attr('xpos')
        end

        ##
        # Returns the lexicon-id attribute of the term.
        #
        def lexicon_id
          @lexicon_id ||= node.attr('lexicon-id')
        end

        ##
        # Returns the sentiment modifier type if it exists.
        #
        # @return [String|NilClass]
        #
        def sentiment_modifier
          @sentiment_modifier ||= first_sentiment&.attr('sentiment_modifier')
        end

        ##
        # Returns the polarity of the term if it exists.
        #
        # @return [String|NilClass]
        #
        def polarity
          @polarity ||= first_sentiment&.attr('polarity')
        end

        ##
        # Returns the actual word ids that construct the lemma.
        #
        # @return [Array]
        #
        def target_ids
          @target_ids ||= node.xpath('span/target').map do |target|
            target.attr('id')
          end
        end

        ##
        # Returns the strength of the term depending on its type. Polarity
        # takes precedence over the sentiment modifier.
        #
        # @return [Integer]
        #
        def strength
          return 1  if polarity == 'positive'
          return -1 if polarity == 'negative'
          return 2  if is_intensifier?
          return -1 if is_shifter?

          0
        end

        ##
        # Returns the sentence id that the term belongs to in the document,
        # looked up through the word form referenced by the first target id.
        #
        # @return [String]
        #
        def get_sentence(document)
          word_forms = document.xpath("KAF/text/wf[@wid='#{target_ids.first}']")

          word_forms.first.attr('sent')
        end

        ##
        # Checks if a term is an intensifier.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_intensifier?
          sentiment_modifier == 'intensifier'
        end

        ##
        # Checks if a term is a shifter.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_shifter?
          sentiment_modifier == 'shifter'
        end

        ##
        # Checks if a term is an expression.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_expression?
          use && !!polarity
        end

        ##
        # Checks if a term is a conjunction.
        # Comma is identified as conjunction by default
        # Sometimes, comma comes with space after it
        #
        # The result is truthy/falsy rather than strictly boolean: it is nil
        # when the language has no conjunction list and nothing else matched.
        #
        def is_conjunction?(language)
          pos == 'J' || xpos == ',' || CONJUNCTIONS[language]&.include?(lemma)
        end

        private

        ##
        # First sentiment child of the term node, if any (memoized).
        #
        def first_sentiment
          @first_sentiment ||= node.at :sentiment
        end
      end
    end
  end
end