opener-opinion-detector-basic 3.2.2 → 3.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d90cb88e3b8224dc8d308e0c14aee257451869d6ae4c79db007b0247aaae6c2b
4
- data.tar.gz: 0546c8e56ce77a995e259cb1567be5b7444044944f6ee017a7497780f6ee98c5
3
+ metadata.gz: 7bf885ed5d9cd309933e3419dc00f13073e42038e102bf58ed029e9ac6762101
4
+ data.tar.gz: c9b7248d4a8dd672b6d0246994e643d845ebbc73af4e8f427bde80154755a310
5
5
  SHA512:
6
- metadata.gz: 8db793d069cec098dcc50d3732820bbf83756e2dfaff54975ff6b2ef47af116efe140416e40c92ac5a6e011d161c6f398388282e59ea60b50ec6c596520437e9
7
- data.tar.gz: 70dde9675a2b3038d9883e24f378abe597a6823ec5aa2e88d0806be5b71d20cc174c68fd0ef3da3a141931559a9c81be54ab2d1f100e3f9573d568042d2c0890
6
+ metadata.gz: 2caf7e13cb13f4574446e4deaa246212b7a54630f94ba883803a360fb81a4e43c2d77b8dd140eaa5582d25efe1793d67145385f49acec275e1399cd52c3d1785
7
+ data.tar.gz: 16346fb5631e18212bcc690353f384da813f4baa6f48e7f32263183e2f86dee39293876954dedaddd05baa3aa32d970e361e2a8ef6f7f8de7feff96991c90b2b
@@ -1,14 +1,23 @@
1
1
  gem 'slop', '~> 3.0'
2
2
 
3
+ require 'active_support/all'
4
+
3
5
  require 'slop'
4
- require 'oga'
6
+ require 'hashie'
7
+ require 'nokogiri'
5
8
 
6
9
  require 'rexml/document'
7
10
  require 'rexml/formatters/pretty'
8
11
 
12
+ require_relative 'opinion_detector_basic/kaf/document'
13
+ require_relative 'opinion_detector_basic/kaf/term'
14
+ require_relative 'opinion_detector_basic/kaf/opinion'
15
+
9
16
  require_relative 'opinion_detector_basic/version'
10
17
  require_relative 'opinion_detector_basic/cli'
18
+ require_relative 'opinion_detector_basic/base_processor'
11
19
  require_relative 'opinion_detector_basic/processor'
20
+ require_relative 'opinion_detector_basic/legacy_processor'
12
21
 
13
22
  module Opener
14
23
  ##
@@ -32,6 +41,8 @@ module Opener
32
41
  def initialize(options = {})
33
42
  @args = options.delete(:args) || []
34
43
  @options = options
44
+ ENV['OPINION_LEGACY'] = 'true' # Processor is still not working
45
+ @klass = if ENV['OPINION_LEGACY'] then LegacyProcessor else Processor end
35
46
  end
36
47
 
37
48
  ##
@@ -41,7 +52,7 @@ module Opener
41
52
  # @return [String]
42
53
  #
43
54
  def run input, params = {}
44
- return Processor.new(input, options).process
55
+ @klass.new(input, options).process
45
56
  end
46
57
 
47
58
  end
@@ -0,0 +1,56 @@
1
module Opener
  class OpinionDetectorBasic
    ##
    # Shared skeleton for the opinion processors. It wraps the input in a
    # Kaf::Document, writes the opinions into it and serializes the result.
    #
    # This class calls `opinions` but does not define it; that method has to
    # be supplied elsewhere (presumably by the concrete processor classes).
    #
    class BaseProcessor

      attr_accessor :document
      attr_reader :terms, :sentences

      ##
      # @param [String|IO] file The KAF file/input to process.
      # @param [Hash] options Passed straight to Kaf::Document, which reads
      #  its settings (e.g. pretty formatting) from it.
      #
      def initialize(file, options = {})
        @document  = Kaf::Document.new(file, options)
        @terms     = @document.terms
        @sentences = @document.sentences
      end

      ##
      # Replaces the opinions layer of the document, adds one entry per
      # opinion (numbered from 1), registers the linguistic processor and
      # serializes the document.
      #
      # @return [String] the new KAF output; pretty printed when the document
      #  has `pretty` enabled.
      #
      def process
        document.add_opinions_layer

        opinions.each_with_index do |opinion, position|
          document.add_opinion(opinion, position + 1)
        end

        document.add_linguistic_processor

        return pretty_print(document) if document.pretty

        document.to_xml
      end

      ##
      # Re-parses the serialized document with REXML and writes it back with
      # the compact pretty formatter and double-quoted attributes.
      #
      # TODO (carried over from the original note, which names Oga): the XML
      # library itself should handle this in a nice way.
      #
      # @param [#to_xml] document The document to format.
      # @return [String] the formatted XML without surrounding whitespace.
      #
      def pretty_print(document)
        xml = REXML::Document.new(document.to_xml)
        xml.context[:attribute_quote] = :quote

        printer = REXML::Formatters::Pretty.new
        printer.compact = true

        buffer = ""
        printer.write(xml, buffer)

        buffer.strip
      end

    end
  end
end
@@ -0,0 +1,146 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # Wrapper around a parsed KAF document. It exposes the terms of the
      # document and the helpers used to write the opinions layer and the
      # linguistic processor header. Any method not defined here is forwarded
      # to the underlying parsed document.
      #
      class Document

        attr_accessor :document, :timestamp, :opinion_strength, :pretty

        ##
        # @param [String|IO] file The KAF input, parsed with Nokogiri.XML.
        # @param [Hash] options Reads :timestamp, :opinion_strength and
        #  :pretty (defaults to false).
        # @raise [RuntimeError] when the parsed input has no KAF root node.
        #
        def initialize file, options = {}
          @document = Nokogiri.XML file

          @timestamp        = options[:timestamp]
          @opinion_strength = options[:opinion_strength]
          @pretty           = options[:pretty] || false

          raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
        end

        ##
        # Returns one Term per KAF/terms/term node, built once and cached.
        #
        # @return [Array<Term>]
        #
        def terms
          @terms ||= document.xpath('KAF/terms/term').map do |term|
            Term.new term, self, language
          end
        end

        ##
        # Returns the xml:lang attribute of the KAF root node.
        #
        def language
          @language ||= document.at_xpath('KAF').attr('xml:lang')
        end

        ##
        # Get terms grouped by sentence.
        #
        # @return [Hash] sentence id => terms of that sentence.
        #
        def sentences
          @sentences ||= terms.group_by { |t| t.sentence }
        end

        ##
        # Adds the entire opinion in the KAF file: an optional holder, the
        # target (the node is always written, its span only when there are
        # target ids) and the expression with polarity and strength.
        #
        # @param [Opinion] opinion
        # @param [Integer] index Number used to build the opinion id ("o<index>").
        #
        def add_opinion opinion, index
          opinion_node = new_node 'opinion', 'KAF/opinions'
          opinion_node['oid'] = "o#{index}"

          if opinion.holders.present?
            opinion_holder_node = new_node 'opinion_holder', opinion_node
            add_opinion_element opinion_holder_node, opinion.holders
          end

          opinion_target_node = new_node 'opinion_target', opinion_node

          if opinion.target_ids.present?
            add_opinion_element opinion_target_node, opinion.target_ids
          end

          expression_node = new_node 'opinion_expression', opinion_node
          expression_node['polarity']   = opinion.polarity
          expression_node['strength']   = opinion.strength.to_s
          expression_node['lexicon-id'] = opinion.lexicon_id if opinion.lexicon_id

          add_opinion_element expression_node, opinion.ids
        end

        ##
        # Remove the opinions layer from the KAF file if it exists and add a new
        # one.
        #
        # @return the freshly created opinions node.
        #
        def add_opinions_layer
          existing = document.at_xpath('KAF/opinions')

          existing.remove if existing

          new_node 'opinions', 'KAF'
        end

        ##
        # Method for adding opinion holders, targets and expressions. Writes a
        # comment holding the lemmas of the referenced terms, followed by a
        # span with one target node per id.
        #
        # @param node Parent node that receives the comment and the span.
        # @param [Array] ids Term ids to reference.
        #
        def add_opinion_element node, ids
          lemmas  = terms.select { |t| ids.include?(t.id) }.map(&:lemma).join(' ')
          # Lemmas come straight from the input text, so they are made safe for
          # use inside an XML comment before being written.
          comment = Nokogiri::XML::Comment.new(document, comment_text(lemmas))
          node.add_child comment

          span_node = new_node('span', node)

          ids.each do |id|
            target_node = new_node('target', span_node)
            target_node['id'] = id.to_s
          end
        end

        ##
        # Add linguistic processor layer with basic information
        # (version, timestamp, description etc) in the KAF file.
        #
        # The timestamp attribute holds the current time when the :timestamp
        # option is set and '*' otherwise.
        #
        def add_linguistic_processor
          description = 'Basic opinion detector with Pos'
          last_edited = '13may2015'
          version     = '2.0'

          node = new_node('linguisticProcessors', 'KAF/kafHeader')
          node['layer'] = 'opinions'

          lp_node = new_node('lp', node)

          lp_node['version'] = "#{last_edited}-#{version}"
          lp_node['name']    = description

          if timestamp
            pattern = '%Y-%m-%dT%H:%M:%S%Z'

            lp_node['timestamp'] = Time.now.strftime(pattern)
          else
            lp_node['timestamp'] = '*'
          end
        end

        ##
        # Creates a new node in the KAF file.
        #
        # @param [String] tag Name of the new element.
        # @param parent Either a node, or a String XPath that is resolved
        #  against the document. The path has to match an existing node; when
        #  nothing matches the lookup yields nil and adding the child fails.
        # @return the new node.
        #
        def new_node tag, parent
          if parent.is_a?(String)
            parent_node = document.at_xpath(parent)
          else
            parent_node = parent
          end

          node = Nokogiri::XML::Element.new(tag, document)

          parent_node.add_child node

          node
        end

        ##
        # Check if input is a KAF file.
        # @return [Boolean]
        #
        def is_kaf?
          !!document.at_xpath('KAF')
        end

        ##
        # Forwards every unknown method to the wrapped document.
        #
        def method_missing method, *args, &block
          @document.send method, *args, &block
        end

        ##
        # Keeps respond_to? in line with method_missing: a method counts as
        # supported when the wrapped document responds to it.
        #
        def respond_to_missing? method, include_private = false
          @document.respond_to?(method, include_private) || super
        end

        private

        ##
        # Makes free text safe for an XML comment, which may neither contain
        # "--" nor end in "-": a space is inserted between consecutive hyphens
        # and appended after a trailing hyphen.
        #
        # @param [String] text
        # @return [String]
        #
        def comment_text text
          safe = text.gsub(/-(?=-)/, '- ')

          safe.end_with?('-') ? "#{safe} " : safe
        end

      end
    end
  end
end
@@ -0,0 +1,152 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # An opinion built around one expression term. It exposes the ids,
      # sentence, strength and polarity taken from that term and collects the
      # holder and target term ids found in the same sentence.
      #
      class Opinion

        attr_reader   :term
        attr_accessor :left_candidates, :right_candidates, :target_ids, :holders

        # Opinion holders for each language code.
        OPINION_HOLDERS = {
          'nl' => %w[
            ik we wij ze zij jullie u hij het jij je mij
            me hem haar ons hen hun
          ],
          'en' => %w[i we he she they it you],
          'es' => %w[
            yo tu nosotros vosotros ellos ellas nosotras vosotras
          ],
          'it' => %w[io tu noi voi loro lei lui],
          'de' => %w[ich du wir ihr sie er],
          'fr' => %w[je tu lui elle nous vous ils elles],
        }.freeze

        ##
        # @param term The expression term the opinion is built on. It has to
        #  provide list_ids, sentence, accumulated_strength and lexicon_id.
        #
        def initialize term
          @term       = term
          @holders    = []
          @target_ids = []

          @left_candidates  = []
          @right_candidates = []
        end

        ##
        # Returns the term ids of the opinion expression.
        #
        # @return [Array]
        #
        def ids
          @ids ||= term.list_ids.sort
        end

        ##
        # Returns the sentence id of the opinion.
        #
        # @return [String]
        #
        def sentence
          @sentence ||= term.sentence
        end

        ##
        # Returns the strength of the opinion.
        #
        # @return [Integer]
        #
        def strength
          @strength ||= term.accumulated_strength
        end

        ##
        # Returns the lexicon id of the expression term, if it has one.
        #
        def lexicon_id
          @lexicon_id ||= term.lexicon_id
        end

        ##
        # Returns the polarity of the opinion, derived from the sign of the
        # strength.
        #
        # @return [String] 'positive', 'negative' or 'neutral'.
        #
        def polarity
          @polarity ||= if strength > 0
            'positive'
          elsif strength < 0
            'negative'
          else
            'neutral'
          end
        end

        ##
        # Obtain the opinion holders from the terms that belong to the same
        # sentence. Only the first term whose lemma is a known holder for the
        # language is recorded. Nothing is recorded for an unknown language or
        # when the sentence of the opinion is not present in `sentences`.
        #
        # @param [Hash] sentences Sentence id => terms of that sentence.
        # @param [String] language Language code used to pick the holder list.
        #
        def obtain_holders(sentences, language)
          known_holders = OPINION_HOLDERS[language]
          return unless known_holders

          holder = Array(sentences[sentence]).find do |candidate|
            known_holders.include?(candidate.lemma)
          end

          @holders << holder.id if holder
        end

        ##
        # Get the potential right and left candidates of the sentence and
        # decide which ones are the actual targets of the opinion.
        #
        # Right candidates are looked up after the last expression term, left
        # candidates up to the first expression term. The first right
        # candidate and the first left candidate become targets. A sentence
        # that is not present in `sentences` is treated as having no terms.
        #
        # NOTE(review): the inclusive ranges below cover up to
        # max_distance + 1 terms on each side - confirm this is intended.
        # NOTE(review): `left_candidates.first` is the candidate farthest from
        # the expression, not the nearest one - confirm this is intended.
        #
        # @param [Hash] sentences Sentence id => terms of that sentence.
        #
        def obtain_targets(sentences)
          sentence_terms = Array(sentences[sentence])
          max_distance   = 3
          terms_count    = sentence_terms.count

          # Position of the last expression term; -1 when none is found, so the
          # window then starts at the beginning of the sentence.
          last_index = sentence_terms.rindex { |t| ids.include?(t.id) } || -1

          unless last_index + 1 >= terms_count
            min = last_index + 1
            max = [last_index + 1 + max_distance, terms_count].min
            @right_candidates = filter_candidates(sentence_terms[min..max])
          end

          # Position of the first expression term; 0 when none is found, which
          # leaves the left candidates empty.
          first_index = sentence_terms.index { |t| ids.include?(t.id) } || 0

          unless first_index == 0
            min = [0, first_index - 1 - max_distance].max
            max = first_index
            @left_candidates = filter_candidates(sentence_terms[min..max])
          end

          @target_ids << right_candidates.first.id if right_candidates.any?
          @target_ids << left_candidates.first.id if left_candidates.any?
        end

        protected

        ##
        # Filters candidate terms depending on their part of speech and if
        # they are already part of the expression: only terms with pos 'N' or
        # 'R' that are not in `ids` are kept.
        #
        # @return [Array]
        #
        def filter_candidates sentence_terms
          sentence_terms.select do |t|
            %w[N R].include?(t.pos) && !ids.include?(t.id)
          end
        end

      end
    end
  end
end
@@ -0,0 +1,185 @@
1
module Opener
  class OpinionDetectorBasic
    module Kaf
      ##
      # A single KAF term. Wraps the term node and exposes its attributes
      # (id, lemma, pos, ...), the sentiment information attached to it and
      # the sentence it belongs to.
      #
      class Term

        attr_reader :document
        attr_reader :node, :sentence, :is_conjunction

        attr_accessor :use, :accumulated_strength, :list_ids

        # Map of conjunctions per language code
        # Deprecated
        CONJUNCTIONS = {
          'nl' => %w{, en},
          'en' => %w{, and},
          'es' => %w{, y e},
          'it' => %w{, e ed},
          'de' => %w{, und},
          'fr' => %w{, et},
        }.freeze

        ##
        # @param node The term node (presumably a Nokogiri element; it is used
        #  through attr, xpath and at).
        # @param document The owning document; queried for the word forms and
        #  the full term list.
        # @param [String] language Language code used for conjunction lookup.
        #
        def initialize node, document, language
          @document             = document
          @node                 = node
          @sentence             = get_sentence document
          @use                  = true
          @accumulated_strength = strength
          @list_ids             = [id]
          @is_conjunction       = is_conjunction? language
        end

        ##
        # Returns the term id.
        #
        # @return [String]
        #
        def id
          @id ||= node.attr :tid
        end

        ##
        # Returns the lemma of the term.
        #
        # @return [String]
        #
        def lemma
          @lemma ||= node.attr :lemma
        end

        ##
        # Returns the head of the term as a number; 0 when the attribute is
        # missing or not numeric.
        #
        # @return [Integer]
        #
        def head
          @head ||= node.attr(:head).to_i
        end

        ##
        # Returns the term at position head - 1 of the document's term list,
        # or nil for a root term.
        #
        def head_term
          return if root?
          document.terms[head - 1]
        end

        ##
        # A term is a root when its head is 0.
        #
        def root?
          head == 0
        end

        ##
        # Returns the part of speech of the term.
        #
        # @return [String]
        #
        def pos
          @pos ||= node.attr('pos')
        end

        ##
        # Returns the xpos attribute of the term.
        #
        def xpos
          @xpos ||= node.attr('xpos')
        end

        ##
        # Returns the lexicon-id attribute of the term.
        #
        def lexicon_id
          @lexicon_id ||= node.attr('lexicon-id')
        end

        ##
        # Returns the sentiment modifier type if it exists.
        #
        # @return [String|NilClass]
        #
        def sentiment_modifier
          @sentiment_modifier ||= first_sentiment&.attr('sentiment_modifier')
        end

        ##
        # Returns the polarity of the term if it exists.
        #
        # @return [String|NilClass]
        #
        def polarity
          @polarity ||= first_sentiment&.attr('polarity')
        end

        ##
        # Returns the actual word ids that construct the lemma.
        #
        # @return [Array]
        #
        def target_ids
          @target_ids ||= node.xpath('span/target').map { |target| target.attr('id') }
        end

        ##
        # Returns the strength of the term depending on its type. Polarity
        # takes precedence over the sentiment modifier.
        #
        # @return [Integer]
        #
        def strength
          return 1  if polarity == 'positive'
          return -1 if polarity == 'negative'
          return 2  if is_intensifier?
          return -1 if is_shifter?

          0
        end

        ##
        # Returns the sentence id that the term belongs to in the document,
        # taken from the word form referenced by the first target id.
        #
        # @return [String|NilClass] nil when no matching word form exists.
        #
        def get_sentence(document)
          word_form = document.xpath("KAF/text/wf[@wid='#{target_ids.first}']").first

          word_form&.attr('sent')
        end

        ##
        # Checks if a term is an intensifier.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_intensifier?
          sentiment_modifier == 'intensifier'
        end

        ##
        # Checks if a term is a shifter.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_shifter?
          sentiment_modifier == 'shifter'
        end

        ##
        # Checks if a term is an expression: it is still in use and carries a
        # polarity.
        #
        # @return [TrueClass|FalseClass]
        #
        def is_expression?
          use && !!polarity
        end

        ##
        # Checks if a term is a conjunction.
        # Comma is identified as conjunction by default
        # Sometimes, comma comes with space after it
        #
        # The result is nil when neither pos nor xpos match and the language
        # has no conjunction list.
        #
        def is_conjunction?(language)
          pos == 'J' || xpos == ',' || CONJUNCTIONS[language]&.include?(lemma)
        end

        private

        ##
        # Returns the first sentiment node below the term node. The lookup is
        # done once, also when there is no such node.
        #
        # @return the sentiment node or nil.
        #
        def first_sentiment
          return @first_sentiment if defined?(@first_sentiment)

          @first_sentiment = node.at :sentiment
        end

      end
    end
  end
end