RubyGems - opener-opinion-detector-basic - Versions diffs - 3.2.0 → 3.2.4 - Mend

opener-opinion-detector-basic 3.2.0 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/lib/opener/opinion_detector_basic.rb +12 -2
data/lib/opener/opinion_detector_basic/base_processor.rb +56 -0
data/lib/opener/opinion_detector_basic/kaf/document.rb +146 -0
data/lib/opener/opinion_detector_basic/kaf/opinion.rb +179 -0
data/lib/opener/opinion_detector_basic/kaf/term.rb +181 -0
data/lib/opener/opinion_detector_basic/legacy_processor.rb +136 -0
data/lib/opener/opinion_detector_basic/processor.rb +22 -310
data/lib/opener/opinion_detector_basic/version.rb +1 -1
data/opener-opinion-detector-basic.gemspec +3 -1
metadata +52 -28
data/lib/opener/opinion_detector_basic/opinion.rb +0 -170
data/lib/opener/opinion_detector_basic/term.rb +0 -159

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 14f63b17cb26086742f4618eef4ad61e9435d00f271f2a2e7984ca9ef0f68a3e
-  data.tar.gz: 257e711a1e2aee0764b4d8de9092e116d47a50e2d6de7717e8668e692d4f1270
+  metadata.gz: fa1aba5cb9ba31f6e2205af1499f866f9e998883701b303f2229ecc855348293
+  data.tar.gz: db3a5d5021a0013757ba68252ccaed4c185a0960aaa9ca26e47681e0b3300d11
 SHA512:
-  metadata.gz: 54374bd46b28f4065f26899a042a1caeedafeff43a50de0642c164928dd55b1164a8ad076b19a3504fd2a4748348c19725e1cc448abbc2de12a0aaf4eb540df1
-  data.tar.gz: e1ef4640b783bf2de26072553ecb874ec08c21a51e7286340ce402d6a0359d94adbc6efe03ed62af82c5c25a47354f6fe33b56d070b7a4d9488515dbb90074e6
+  metadata.gz: 5e6e4ae440580e6ed2974c4a75b46f544212c8557ae3fe43fcbb4e4c3a7d7a6d71a058451f3abdae352766914182dc40237f42df54a565c7c866054828c758c8
+  data.tar.gz: 3db868535c5f43814b4b883ecd9d5b0bd02fb59de5ed290c6c1b3face4d65993d84c2971a2c7dc021b607b6c228735dd82f8931c909911302577dd3bcb4558f5

data/lib/opener/opinion_detector_basic.rb CHANGED Viewed

@@ -1,14 +1,23 @@
 gem 'slop', '~> 3.0'
+require 'active_support/all'
 require 'slop'
-require 'oga'
+require 'hashie'
+require 'nokogiri'
 require 'rexml/document'
 require 'rexml/formatters/pretty'
+require_relative 'opinion_detector_basic/kaf/document'
+require_relative 'opinion_detector_basic/kaf/term'
+require_relative 'opinion_detector_basic/kaf/opinion'
 require_relative 'opinion_detector_basic/version'
 require_relative 'opinion_detector_basic/cli'
+require_relative 'opinion_detector_basic/base_processor'
 require_relative 'opinion_detector_basic/processor'
+require_relative 'opinion_detector_basic/legacy_processor'
 module Opener
   ##
@@ -32,6 +41,7 @@ module Opener
     def initialize(options = {})
       @args    = options.delete(:args) || []
       @options = options
+      @klass   = if ENV['OPINION_LEGACY'] then LegacyProcessor else Processor end
     end
     ##
@@ -41,7 +51,7 @@ module Opener
     # @return [String]
     #
     def run input, params = {}
-      return Processor.new(input, options).process
+      @klass.new(input, options).process
     end
   end

data/lib/opener/opinion_detector_basic/base_processor.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module Opener
+  class OpinionDetectorBasic
+    class BaseProcessor
+      attr_accessor :document
+      attr_reader :terms, :sentences
+      ##
+      # @param [String|IO] file The KAF file/input to process.
+      # @param [Hash] options. Options for timestamp and including strength to
+      # opinions.
+      # @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
+      #  by default due to the performance overhead.
+      #
+      def initialize file, options = {}
+        @document  = Kaf::Document.new file, options
+        @terms     = @document.terms
+        @sentences = @document.sentences
+      end
+      ##
+      # Processes the input and returns the new KAF output.
+      # @return [String]
+      #
+      def process
+        document.add_opinions_layer
+        opinions.each.with_index do |opinion, index|
+          document.add_opinion opinion, index+1
+        end
+        document.add_linguistic_processor
+        if document.pretty then pretty_print document else document.to_xml end
+      end
+      ##
+      # Format the output document properly.
+      #
+      # TODO: this should be handled by Oga in a nice way.
+      #
+      # @return [String]
+      #
+      def pretty_print document
+        doc = REXML::Document.new document.to_xml
+        doc.context[:attribute_quote] = :quote
+        out = ""
+        formatter = REXML::Formatters::Pretty.new
+        formatter.compact = true
+        formatter.write doc, out
+        out.strip
+      end
+    end
+  end
+end

data/lib/opener/opinion_detector_basic/kaf/document.rb ADDED Viewed

@@ -0,0 +1,146 @@
+module Opener
+  class OpinionDetectorBasic
+    module Kaf
+      ##
+      # Wrapper around the Nokogiri document parsed from the KAF input. It
+      # exposes the terms and the helpers that write the opinions layer;
+      # every message it does not define itself is forwarded to the wrapped
+      # Nokogiri document.
+      #
+      class Document
+        attr_accessor :document, :timestamp, :opinion_strength, :pretty
+        ##
+        # @param [String|IO] file The KAF input, passed to Nokogiri.XML.
+        # @param [Hash] options Reads :timestamp, :opinion_strength and
+        #  :pretty (the latter falls back to false).
+        # @raise [RuntimeError] When the parsed input has no KAF root.
+        #
+        def initialize file, options = {}
+          @document = Nokogiri.XML file
+          @timestamp        = options[:timestamp]
+          @opinion_strength = options[:opinion_strength]
+          @pretty           = options[:pretty] || false
+          raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
+        end
+        ##
+        # All terms of the document, wrapped in Term objects (memoized).
+        #
+        # @return [Array<Term>]
+        #
+        def terms
+          @terms ||= document.xpath('KAF/terms/term').map do |term|
+            Term.new term, self, language
+          end
+        end
+        ##
+        # Value of the xml:lang attribute of the KAF root element.
+        #
+        def language
+          @language ||= document.at_xpath('KAF').attr('xml:lang')
+        end
+        ##
+        # Get terms grouped by sentence.
+        #
+        # @return [Hash] Sentence id => Array of Term.
+        #
+        def sentences
+          @sentences ||= terms.group_by{ |t| t.sentence }
+        end
+        ##
+        # Adds the entire opinion in the KAF file: an optional holder, the
+        # target (always present, possibly empty) and the expression.
+        #
+        # @param [Opinion] opinion
+        # @param [Integer] index Number used to build the `oid` attribute.
+        #
+        def add_opinion opinion, index
+          opinion_node = new_node 'opinion', 'KAF/opinions'
+          opinion_node['oid'] = "o#{index}"
+          if opinion.holders.present?
+            opinion_holder_node = new_node 'opinion_holder', opinion_node
+            add_opinion_element opinion_holder_node, opinion.holders
+          end
+          opinion_target_node = new_node 'opinion_target', opinion_node
+          if opinion.target_ids.present?
+            add_opinion_element opinion_target_node, opinion.target_ids
+          end
+          expression_node = new_node 'opinion_expression', opinion_node
+          expression_node['polarity'] = opinion.polarity
+          expression_node['strength'] = opinion.strength.to_s
+          expression_node['lexicon-id'] = opinion.lexicon_id if opinion.lexicon_id
+          add_opinion_element expression_node, opinion.ids
+        end
+        ##
+        # Remove the opinions layer from the KAF file if it exists and add a new
+        # one.
+        def add_opinions_layer
+          existing = document.at_xpath('KAF/opinions')
+          existing.remove if existing
+          new_node 'opinions', 'KAF'
+        end
+        ##
+        # Method for adding opinion holders, targets and expressions: writes
+        # a comment with the lemmas of the referenced terms followed by a
+        # span holding one target per id.
+        #
+        # @param [Nokogiri::XML::Node] node Element that receives the span.
+        # @param [Array] ids Term ids to reference.
+        #
+        def add_opinion_element node, ids
+          lemmas    = terms.select{|t| ids.include?(t.id)}.map(&:lemma).join(' ')
+          comment   = Nokogiri::XML::Comment.new(document, lemmas)
+          node.add_child comment
+          span_node = new_node('span', node)
+          ids.each do |id|
+            target_node       = new_node('target', span_node)
+            target_node['id'] = id.to_s
+          end
+        end
+        ##
+        # Add linguistic processor layer with basic information
+        # (version, timestamp, description etc) in the KAF file.
+        #
+        def add_linguistic_processor
+          description = 'Basic opinion detector with Pos'
+          last_edited = '13may2015'
+          version     = '2.0'
+          node = new_node('linguisticProcessors', 'KAF/kafHeader')
+          node['layer'] = 'opinions'
+          lp_node = new_node('lp', node)
+          lp_node['version'] = "#{last_edited}-#{version}"
+          lp_node['name'] = description
+          # Without the timestamp option a literal '*' placeholder is written.
+          if timestamp
+            format = '%Y-%m-%dT%H:%M:%S%Z'
+            lp_node['timestamp'] = Time.now.strftime(format)
+          else
+            lp_node['timestamp'] = '*'
+          end
+        end
+        ##
+        # Creates a new node in the KAF file.
+        #
+        # @param [String] tag Name of the new element.
+        # @param [String|Nokogiri::XML::Node] parent Either an XPath that is
+        #  resolved against the document or the parent node itself.
+        # @return [Nokogiri::XML::Element] The node that was appended.
+        #
+        def new_node tag, parent
+          if parent.is_a?(String)
+            parent_node = document.at_xpath(parent)
+          else
+            parent_node = parent
+          end
+          node = Nokogiri::XML::Element.new(tag, document)
+          parent_node.add_child node
+          node
+        end
+        ##
+        # Check if input is a KAF file.
+        # @return [Boolean]
+        #
+        def is_kaf?
+          !!document.at_xpath('KAF')
+        end
+        ##
+        # Forwards every message this class does not define to the wrapped
+        # Nokogiri document.
+        #
+        def method_missing method, *args, &block
+          @document.send method, *args, &block
+        end
+        ##
+        # Keeps `respond_to?` and `method` in line with #method_missing by
+        # asking the wrapped document whether it understands the message.
+        #
+        def respond_to_missing? method, include_private = false
+          @document.respond_to?(method, include_private) || super
+        end
+      end
+    end
+  end
+end

data/lib/opener/opinion_detector_basic/kaf/opinion.rb ADDED Viewed

@@ -0,0 +1,179 @@
+module Opener
+  class OpinionDetectorBasic
+    module Kaf
+      class Opinion
+        attr_reader :term
+        attr_accessor :left_candidates, :right_candidates, :target_ids, :holders
+        # Opinion holders for each language code.
+        OPINION_HOLDERS = {
+          'nl' => %w[
+            ik we wij ze zij jullie u hij het jij je mij
+            me hem haar ons hen hun
+          ],
+          'en' => %w[i we he she they it you],
+          'es' => %w[
+            yo tu nosotros vosotros ellos ellas nosotras vosotras
+          ],
+          'it' => %w[io tu noi voi loro lei lui],
+          'de' => %w[ich du wir ihr sie er],
+          'fr' => %w[je tu lui elle nous vous ils elles],
+        }
+        def initialize term
+          @term       = term
+          @holders    = []
+          @target_ids = []
+          @left_candidates  = []
+          @right_candidates = []
+        end
+        ##
+        # Returns the term ids of the opinion expression.
+        #
+        # @return [Array]
+        #
+        def ids
+          @ids ||= term.list_ids.sort
+        end
+        ##
+        # Returns the sentence id of the opinion.
+        #
+        # @return [String]
+        #
+        def sentence
+          @sentence ||= term.sentence
+        end
+        ##
+        # Returns the strength of the opinion.
+        #
+        # @return [Integer]
+        #
+        def strength
+          @strength ||= term.accumulated_strength
+        end
+        def lexicon_id
+          @lexicon_id ||= term.lexicon_id
+        end
+        ##
+        # Returns the polarity of the opinion.
+        #
+        # @return [String]
+        #
+        def polarity
+          @polarity ||= if strength > 0
+            'positive'
+          elsif strength < 0
+            'negative'
+          else
+            'neutral'
+          end
+        end
+        ##
+        # Obtain the opinion holders from the terms that belong to the same
+        # sentence.
+        #
+        def obtain_holders(sentences, language)
+          sentence_terms = sentences[sentence]
+          sentence_terms.each do |term|
+            if OPINION_HOLDERS[language]&.include?(term.lemma)
+              @holders << term.id
+              break
+            end
+          end
+        end
+        ##
+        # Get the potential right and left candidates of the sentence and
+        # decide which ones are the actual targets of the opinion
+        #
+        def obtain_targets(sentences)
+          sentence_terms = sentences[sentence]
+          max_distance = 3
+          terms_count = sentence_terms.count
+          index = -1
+          sentence_terms.each_with_index do |term, i|
+            if ids.include?(term.id)
+              index = i
+            end
+          end
+          unless index+1 >= terms_count
+            min = index+1
+            max = [index+1+max_distance,terms_count].min
+            @right_candidates = filter_candidates(sentence_terms[min..max])
+          end
+          index = 0
+          sentence_terms.each_with_index do |term, i|
+            if ids.include?(term.id)
+              index = i
+              break # needed for left_candidates
+            end
+          end
+          unless index == 0
+            min = [0, index-1-max_distance].max
+            max = index
+            @left_candidates = filter_candidates(sentence_terms[min..max])
+          end
+          unless right_candidates.empty?
+            candidate = right_candidates.first
+            @target_ids << candidate.id
+          end
+          if target_ids.empty?
+            list = mix_lists(right_candidates, left_candidates)
+            list.each do |l|
+              @target_ids << l.id
+              break
+            end
+          end
+        end
+        protected
+        ##
+        # If there are no opinion targets, right and left candidates
+        # are mixed into one list and the first one is picked as the target.
+        #
+        # @return [Array]
+        #
+        def mix_lists(lista, listb)
+          list = []
+          min = [lista.count, listb.count].min
+          (0..min).each do |i|
+            list << lista[i]
+            list << listb[i]
+            if lista.count > listb.count
+              list << lista[min]
+            elsif listb.count > lista.count
+              list << listb[min]
+            end
+          end
+          return list.compact
+        end
+        ##
+        # Filters candidate terms depending on their part of speech and if
+        # they are already part of the expression.
+        #
+        # @return [Hash]
+        #
+        def filter_candidates sentence_terms
+          sentence_terms.select{|t| (t.pos == 'N' || t.pos == 'R') && !ids.include?(t.id)}
+        end
+      end
+    end
+  end
+end

data/lib/opener/opinion_detector_basic/kaf/term.rb ADDED Viewed

@@ -0,0 +1,181 @@
+module Opener
+  class OpinionDetectorBasic
+    module Kaf
+      class Term
+        attr_reader :document
+        attr_reader :node, :sentence, :is_conjunction
+        attr_accessor :use, :accumulated_strength, :list_ids
+        # Map of conjunctions per language code
+        # Deprecated
+        CONJUNCTIONS = {
+          'nl' => %w{, en},
+          'en' => %w{, and},
+          'es' => %w{, y e},
+          'it' => %w{, e ed},
+          'de' => %w{, und},
+          'fr' => %w{, et}
+        }
+        def initialize node, document, language
+          @document             = document
+          @node                 = node
+          @sentence             = get_sentence document
+          @use                  = true
+          @accumulated_strength = strength
+          @list_ids             = [id]
+          @is_conjunction       = is_conjunction? language
+        end
+        ##
+        # Returns the term id.
+        #
+        # @return [String]
+        #
+        def id
+          @id ||= node.attr :tid
+        end
+        ##
+        # Returns the lemma of the term.
+        #
+        # @return [String]
+        #
+        def lemma
+          @lemma ||= node.attr :lemma
+        end
+        ##
+        # Returns the head of the term.
+        #
+        # @return [String]
+        #
+        def head
+          @head ||= node.attr(:head).to_i
+        end
+        def head_term
+          return if root?
+          document.terms[head-1]
+        end
+        def root?
+          head == 0
+        end
+        ##
+        # Returns the part of speech of the term.
+        #
+        # @return [String]
+        #
+        def pos
+          @pos ||= node.attr('pos')
+        end
+        def lexicon_id
+          @lexicon_id ||= node.attr('lexicon-id')
+        end
+        ##
+        # Returns the sentiment modifier type if it exists.
+        #
+        # @return [String|NilClass]
+        #
+        def sentiment_modifier
+          @sentiment_modifier ||=
+            first_sentiment ? first_sentiment.attr('sentiment_modifier') : nil
+        end
+        ##
+        # Returns the polarity of the term if it exists.
+        #
+        # @return [String|NilClass]
+        #
+        def polarity
+          @polarity ||= first_sentiment ? first_sentiment.attr('polarity') : nil
+        end
+        ##
+        # Returns the actual word ids that construct the lemma.
+        #
+        # @return [Array]
+        #
+        def target_ids
+          @target_ids ||= node.xpath('span/target')
+            .map { |target| target.attr('id') }
+        end
+        ##
+        # Returns the strength of the term depending on its type.
+        #
+        # @return [Integer]
+        #
+        def strength
+          return  1 if polarity == 'positive'
+          return -1 if polarity == 'negative'
+          return  2 if is_intensifier?
+          return -1 if is_shifter?
+          return  0
+        end
+        ##
+        # Returns the sentence id that the term belongs to in the document.
+        #
+        # @return [String]
+        #
+        def get_sentence(document)
+          document
+          .xpath("KAF/text/wf[@wid='#{target_ids.first}']")
+          .first
+          .attr('sent')
+        end
+        ##
+        # Checks if a term is an intensifier.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_intensifier?
+          sentiment_modifier == 'intensifier'
+        end
+        ##
+        # Checks if a term is a shifter.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_shifter?
+          sentiment_modifier == 'shifter'
+        end
+        ##
+        # Checks if a term is an expression.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_expression?
+          use && !!polarity
+        end
+        ##
+        # Checks if a term is a conjunction.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_conjunction?(language)
+          pos == 'J' || CONJUNCTIONS[language]&.include?(lemma)
+        end
+        private
+        # @return [Oga::XML::Element]
+        def first_sentiment
+          @first_sentiment ||= node.at :sentiment
+        end
+      end
+    end
+  end
+end