RubyGems - opener-opinion-detector-basic - Versions diffs - 3.2.3 → 3.2.4 - Mend

opener-opinion-detector-basic 3.2.3 → 3.2.4

Files changed (13) hide show

checksums.yaml +4 -4
data/lib/opener/opinion_detector_basic.rb +11 -1
data/lib/opener/opinion_detector_basic/base_processor.rb +56 -0
data/lib/opener/opinion_detector_basic/kaf/document.rb +146 -0
data/lib/opener/opinion_detector_basic/kaf/opinion.rb +179 -0
data/lib/opener/opinion_detector_basic/kaf/term.rb +181 -0
data/lib/opener/opinion_detector_basic/legacy_processor.rb +136 -0
data/lib/opener/opinion_detector_basic/processor.rb +19 -294
data/lib/opener/opinion_detector_basic/version.rb +1 -1
data/opener-opinion-detector-basic.gemspec +2 -0
metadata +36 -6
data/lib/opener/opinion_detector_basic/opinion.rb +0 -170
data/lib/opener/opinion_detector_basic/term.rb +0 -159

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 37bab9088bfcdff7ddd21a5452bd9226edb14a0d11fb35b2f7bd8a183cd5b1af
-  data.tar.gz: 3605f40a5b4c1f22d343a308d6b7e4e28184bd8f0e7de000ca334842ccc3b4f7
+  metadata.gz: fa1aba5cb9ba31f6e2205af1499f866f9e998883701b303f2229ecc855348293
+  data.tar.gz: db3a5d5021a0013757ba68252ccaed4c185a0960aaa9ca26e47681e0b3300d11
 SHA512:
-  metadata.gz: ccc07cced6aee88b530fa52bfa7185855a840cced15f7008574e9493e85e9249c2af14c6b0645b0e991edfe876e51d69ef26a10ff9ab56ded4ffe59a47873cf4
-  data.tar.gz: 025e2637189339a502fefa73f4e6c44d1317e3199ea474a790d1a4421a0b50293adac691da913800f70a023c1ae190ba2473397021b0d45f7952d6470f33da09
+  metadata.gz: 5e6e4ae440580e6ed2974c4a75b46f544212c8557ae3fe43fcbb4e4c3a7d7a6d71a058451f3abdae352766914182dc40237f42df54a565c7c866054828c758c8
+  data.tar.gz: 3db868535c5f43814b4b883ecd9d5b0bd02fb59de5ed290c6c1b3face4d65993d84c2971a2c7dc021b607b6c228735dd82f8931c909911302577dd3bcb4558f5

data/lib/opener/opinion_detector_basic.rb CHANGED Viewed

@@ -1,14 +1,23 @@
 gem 'slop', '~> 3.0'
+require 'active_support/all'
 require 'slop'
+require 'hashie'
 require 'nokogiri'
 require 'rexml/document'
 require 'rexml/formatters/pretty'
+require_relative 'opinion_detector_basic/kaf/document'
+require_relative 'opinion_detector_basic/kaf/term'
+require_relative 'opinion_detector_basic/kaf/opinion'
 require_relative 'opinion_detector_basic/version'
 require_relative 'opinion_detector_basic/cli'
+require_relative 'opinion_detector_basic/base_processor'
 require_relative 'opinion_detector_basic/processor'
+require_relative 'opinion_detector_basic/legacy_processor'
 module Opener
   ##
@@ -32,6 +41,7 @@ module Opener
     def initialize(options = {})
       @args    = options.delete(:args) || [] # :args is removed from the hash passed in by the caller
       @options = options
+      @klass   = if ENV['OPINION_LEGACY'] then LegacyProcessor else Processor end # any value of OPINION_LEGACY, even an empty string, selects LegacyProcessor
     end
     ##
@@ -41,7 +51,7 @@ module Opener
     # @return [String]
     #
     def run input, params = {} # params is accepted but not used in this body
-      return Processor.new(input, options).process
+      @klass.new(input, options).process # @klass is the processor class chosen in initialize
     end
   end

data/lib/opener/opinion_detector_basic/base_processor.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module Opener
+  class OpinionDetectorBasic
+    class BaseProcessor
+      attr_accessor :document
+      attr_reader :terms, :sentences
+      ##
+      # @param [String|IO] file The KAF file/input to process.
+      # @param [Hash] options. Options for timestamp and including strength to
+      # opinions.
+      # @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
+      #  by default due to the performance overhead.
+      #
+      def initialize file, options = {}
+        @document  = Kaf::Document.new file, options
+        @terms     = @document.terms
+        @sentences = @document.sentences
+      end
+      ##
+      # Processes the input and returns the new KAF output.
+      # @return [String]
+      #
+      def process
+        document.add_opinions_layer
+        opinions.each.with_index do |opinion, index|
+          document.add_opinion opinion, index+1
+        end
+        document.add_linguistic_processor
+        if document.pretty then pretty_print document else document.to_xml end
+      end
+      ##
+      # Format the output document properly.
+      #
+      # TODO: this should be handled by Oga in a nice way.
+      #
+      # @return [String]
+      #
+      def pretty_print document
+        doc = REXML::Document.new document.to_xml
+        doc.context[:attribute_quote] = :quote
+        out = ""
+        formatter = REXML::Formatters::Pretty.new
+        formatter.compact = true
+        formatter.write doc, out
+        out.strip
+      end
+    end
+  end
+end

data/lib/opener/opinion_detector_basic/kaf/document.rb ADDED Viewed

@@ -0,0 +1,146 @@
+module Opener
+  class OpinionDetectorBasic
+    module Kaf
+      ##
+      # Wraps the Nokogiri document parsed from the KAF input and adds the
+      # opinions layer and the linguistic processor entry to it. Messages this
+      # class does not define are forwarded to the wrapped Nokogiri document.
+      #
+      class Document
+        attr_accessor :document, :timestamp, :opinion_strength, :pretty
+        ##
+        # @param [String|IO] file The KAF input, handed to Nokogiri.XML.
+        # @param [Hash] options Reads :timestamp, :opinion_strength and
+        #  :pretty (false when not given).
+        # @raise [RuntimeError] When the parsed input has no KAF element.
+        #
+        def initialize file, options = {}
+          @document = Nokogiri.XML file
+          @timestamp        = options[:timestamp]
+          @opinion_strength = options[:opinion_strength]
+          @pretty           = options[:pretty] || false
+          raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
+        end
+        ##
+        # Builds, once, a Term for every KAF/terms/term node.
+        #
+        # @return [Array<Term>]
+        #
+        def terms
+          @terms ||= document.xpath('KAF/terms/term').map do |term|
+            Term.new term, self, language
+          end
+        end
+        ##
+        # Returns the xml:lang attribute of the KAF element.
+        #
+        # @return [String|NilClass]
+        #
+        def language
+          @language ||= document.at_xpath('KAF').attr('xml:lang')
+        end
+        ##
+        # Get terms grouped by sentence.
+        #
+        # @return [Hash] Sentence id => Array of Term.
+        #
+        def sentences
+          @sentences ||= terms.group_by{ |t| t.sentence }
+        end
+        ##
+        # Adds the entire opinion in the KAF file. The opinion_holder element
+        # is only written when the opinion has holders; the opinion_target
+        # element is always written, even when it stays empty.
+        #
+        # @param [Opinion] opinion
+        # @param [Integer] index Number used for the "o<index>" opinion id.
+        #
+        def add_opinion opinion, index
+          opinion_node = new_node 'opinion', 'KAF/opinions'
+          opinion_node['oid'] = "o#{index}"
+          if opinion.holders.present?
+            opinion_holder_node = new_node 'opinion_holder', opinion_node
+            add_opinion_element opinion_holder_node, opinion.holders
+          end
+          opinion_target_node = new_node 'opinion_target', opinion_node
+          if opinion.target_ids.present?
+            add_opinion_element opinion_target_node, opinion.target_ids
+          end
+          expression_node = new_node 'opinion_expression', opinion_node
+          expression_node['polarity'] = opinion.polarity
+          expression_node['strength'] = opinion.strength.to_s
+          expression_node['lexicon-id'] = opinion.lexicon_id if opinion.lexicon_id
+          add_opinion_element expression_node, opinion.ids
+        end
+        ##
+        # Remove the opinions layer from the KAF file if it exists and add a new
+        # one.
+        #
+        # @return [Nokogiri::XML::Element] The new, empty opinions node.
+        #
+        def add_opinions_layer
+          existing = document.at_xpath('KAF/opinions')
+          existing.remove if existing
+          new_node 'opinions', 'KAF'
+        end
+        ##
+        # Method for adding opinion holders, targets and expressions: writes a
+        # comment with the lemmas of the referenced terms followed by a span
+        # holding one target element per id.
+        #
+        # @param [Nokogiri::XML::Node] node Parent element to fill.
+        # @param [Array] ids Term ids to reference.
+        #
+        def add_opinion_element node, ids
+          lemmas    = terms.select{ |t| ids.include?(t.id) }.map(&:lemma).join(' ')
+          # Lemmas come straight from the input and may contain "--", which is
+          # not allowed inside an XML comment.
+          comment   = Nokogiri::XML::Comment.new(document, comment_text(lemmas))
+          node.add_child comment
+          span_node = new_node('span', node)
+          ids.each do |id|
+            target_node       = new_node('target', span_node)
+            target_node['id'] = id.to_s
+          end
+        end
+        ##
+        # Add linguistic processor layer with basic information
+        # (version, timestamp, description etc) in the KAF file. The timestamp
+        # is the current time when the timestamp option is set and '*'
+        # otherwise. The node is appended under KAF/kafHeader, so that element
+        # has to be present in the input.
+        #
+        def add_linguistic_processor
+          description = 'Basic opinion detector with Pos'
+          last_edited = '13may2015'
+          version     = '2.0'
+          node = new_node('linguisticProcessors', 'KAF/kafHeader')
+          node['layer'] = 'opinions'
+          lp_node = new_node('lp', node)
+          lp_node['version'] = "#{last_edited}-#{version}"
+          lp_node['name'] = description
+          if timestamp
+            format = '%Y-%m-%dT%H:%M:%S%Z'
+            lp_node['timestamp'] = Time.now.strftime(format)
+          else
+            lp_node['timestamp'] = '*'
+          end
+        end
+        ##
+        # Creates a new node in the KAF file.
+        #
+        # @param [String] tag Name of the element to create.
+        # @param [String|Nokogiri::XML::Node] parent Either the parent node or
+        #  an XPath expression that is resolved against the document.
+        # @return [Nokogiri::XML::Element] The node that was appended.
+        #
+        def new_node tag, parent
+          if parent.is_a?(String)
+            parent_node = document.at_xpath(parent)
+          else
+            parent_node = parent
+          end
+          node = Nokogiri::XML::Element.new(tag, document)
+          parent_node.add_child node
+          node
+        end
+        ##
+        # Check if input is a KAF file.
+        # @return [Boolean]
+        #
+        def is_kaf?
+          !!document.at_xpath('KAF')
+        end
+        ##
+        # Forwards every message this class does not define to the wrapped
+        # Nokogiri document.
+        #
+        def method_missing method, *args, &block
+          @document.send method, *args, &block
+        end
+        ##
+        # Keeps respond_to? in line with method_missing: anything the wrapped
+        # document answers is reported as supported.
+        #
+        def respond_to_missing? method, include_private = false
+          @document.respond_to?(method, include_private) || super
+        end
+        private
+        ##
+        # Makes text safe for use as XML comment content by breaking up runs
+        # of hyphens and making sure the text does not end in a hyphen.
+        #
+        # @param [String] text
+        # @return [String]
+        #
+        def comment_text text
+          safe = text.gsub(/-(?=-)/, '- ')
+          safe.end_with?('-') ? "#{safe} " : safe
+        end
+      end
+    end
+  end
+end

data/lib/opener/opinion_detector_basic/kaf/opinion.rb ADDED Viewed

@@ -0,0 +1,179 @@
+module Opener
+  class OpinionDetectorBasic
+    module Kaf
+      class Opinion
+        attr_reader :term
+        attr_accessor :left_candidates, :right_candidates, :target_ids, :holders
+        # Opinion holders for each language code.
+        OPINION_HOLDERS = {
+          'nl' => %w[
+            ik we wij ze zij jullie u hij het jij je mij
+            me hem haar ons hen hun
+          ],
+          'en' => %w[i we he she they it you],
+          'es' => %w[
+            yo tu nosotros vosotros ellos ellas nosotras vosotras
+          ],
+          'it' => %w[io tu noi voi loro lei lui],
+          'de' => %w[ich du wir ihr sie er],
+          'fr' => %w[je tu lui elle nous vous ils elles],
+        }
+        def initialize term
+          @term       = term
+          @holders    = []
+          @target_ids = []
+          @left_candidates  = []
+          @right_candidates = []
+        end
+        ##
+        # Returns the term ids of the opinion expression.
+        #
+        # @return [Array]
+        #
+        def ids
+          @ids ||= term.list_ids.sort
+        end
+        ##
+        # Returns the sentence id of the opinion.
+        #
+        # @return [String]
+        #
+        def sentence
+          @sentence ||= term.sentence
+        end
+        ##
+        # Returns the strength of the opinion.
+        #
+        # @return [Integer]
+        #
+        def strength
+          @strength ||= term.accumulated_strength
+        end
+        def lexicon_id
+          @lexicon_id ||= term.lexicon_id
+        end
+        ##
+        # Returns the polarity of the opinion.
+        #
+        # @return [String]
+        #
+        def polarity
+          @polarity ||= if strength > 0
+            'positive'
+          elsif strength < 0
+            'negative'
+          else
+            'neutral'
+          end
+        end
+        ##
+        # Obtain the opinion holders from the terms that belong to the same
+        # sentence.
+        #
+        def obtain_holders(sentences, language)
+          sentence_terms = sentences[sentence]
+          sentence_terms.each do |term|
+            if OPINION_HOLDERS[language]&.include?(term.lemma)
+              @holders << term.id
+              break
+            end
+          end
+        end
+        ##
+        # Get the potential right and left candidates of the sentence and
+        # decide which ones are the actual targets of the opinion
+        #
+        def obtain_targets(sentences)
+          sentence_terms = sentences[sentence]
+          max_distance = 3
+          terms_count = sentence_terms.count
+          index = -1
+          sentence_terms.each_with_index do |term, i|
+            if ids.include?(term.id)
+              index = i
+            end
+          end
+          unless index+1 >= terms_count
+            min = index+1
+            max = [index+1+max_distance,terms_count].min
+            @right_candidates = filter_candidates(sentence_terms[min..max])
+          end
+          index = 0
+          sentence_terms.each_with_index do |term, i|
+            if ids.include?(term.id)
+              index = i
+              break # needed for left_candidates
+            end
+          end
+          unless index == 0
+            min = [0, index-1-max_distance].max
+            max = index
+            @left_candidates = filter_candidates(sentence_terms[min..max])
+          end
+          unless right_candidates.empty?
+            candidate = right_candidates.first
+            @target_ids << candidate.id
+          end
+          if target_ids.empty?
+            list = mix_lists(right_candidates, left_candidates)
+            list.each do |l|
+              @target_ids << l.id
+              break
+            end
+          end
+        end
+        protected
+        ##
+        # If there are no opinion targets, right and left candidates
+        # are mixed into one list and the first one is picked as the target.
+        #
+        # @return [Array]
+        #
+        def mix_lists(lista, listb)
+          list = []
+          min = [lista.count, listb.count].min
+          (0..min).each do |i|
+            list << lista[i]
+            list << listb[i]
+            if lista.count > listb.count
+              list << lista[min]
+            elsif listb.count > lista.count
+              list << listb[min]
+            end
+          end
+          return list.compact
+        end
+        ##
+        # Filters candidate terms depending on their part of speech and if
+        # they are already part of the expression.
+        #
+        # @return [Hash]
+        #
+        def filter_candidates sentence_terms
+          sentence_terms.select{|t| (t.pos == 'N' || t.pos == 'R') && !ids.include?(t.id)}
+        end
+      end
+    end
+  end
+end

data/lib/opener/opinion_detector_basic/kaf/term.rb ADDED Viewed

@@ -0,0 +1,181 @@
+module Opener
+  class OpinionDetectorBasic
+    module Kaf
+      class Term
+        attr_reader :document
+        attr_reader :node, :sentence, :is_conjunction
+        attr_accessor :use, :accumulated_strength, :list_ids
+        # Map of conjunctions per language code
+        # Deprecated
+        CONJUNCTIONS = {
+          'nl' => %w{, en},
+          'en' => %w{, and},
+          'es' => %w{, y e},
+          'it' => %w{, e ed},
+          'de' => %w{, und},
+          'fr' => %w{, et}
+        }
+        def initialize node, document, language
+          @document             = document
+          @node                 = node
+          @sentence             = get_sentence document
+          @use                  = true
+          @accumulated_strength = strength
+          @list_ids             = [id]
+          @is_conjunction       = is_conjunction? language
+        end
+        ##
+        # Returns the term id.
+        #
+        # @return [String]
+        #
+        def id
+          @id ||= node.attr :tid
+        end
+        ##
+        # Returns the lemma of the term.
+        #
+        # @return [String]
+        #
+        def lemma
+          @lemma ||= node.attr :lemma
+        end
+        ##
+        # Returns the head of the term.
+        #
+        # @return [String]
+        #
+        def head
+          @head ||= node.attr(:head).to_i
+        end
+        def head_term
+          return if root?
+          document.terms[head-1]
+        end
+        def root?
+          head == 0
+        end
+        ##
+        # Returns the part of speech of the term.
+        #
+        # @return [String]
+        #
+        def pos
+          @pos ||= node.attr('pos')
+        end
+        def lexicon_id
+          @lexicon_id ||= node.attr('lexicon-id')
+        end
+        ##
+        # Returns the sentiment modifier type if it exists.
+        #
+        # @return [String|NilClass]
+        #
+        def sentiment_modifier
+          @sentiment_modifier ||=
+            first_sentiment ? first_sentiment.attr('sentiment_modifier') : nil
+        end
+        ##
+        # Returns the polarity of the term if it exists.
+        #
+        # @return [String|NilClass]
+        #
+        def polarity
+          @polarity ||= first_sentiment ? first_sentiment.attr('polarity') : nil
+        end
+        ##
+        # Returns the actual word ids that construct the lemma.
+        #
+        # @return [Array]
+        #
+        def target_ids
+          @target_ids ||= node.xpath('span/target')
+            .map { |target| target.attr('id') }
+        end
+        ##
+        # Returns the strength of the term depending on its type.
+        #
+        # @return [Integer]
+        #
+        def strength
+          return  1 if polarity == 'positive'
+          return -1 if polarity == 'negative'
+          return  2 if is_intensifier?
+          return -1 if is_shifter?
+          return  0
+        end
+        ##
+        # Returns the sentence id that the term belongs to in the document.
+        #
+        # @return [String]
+        #
+        def get_sentence(document)
+          document
+          .xpath("KAF/text/wf[@wid='#{target_ids.first}']")
+          .first
+          .attr('sent')
+        end
+        ##
+        # Checks if a term is an intensifier.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_intensifier?
+          sentiment_modifier == 'intensifier'
+        end
+        ##
+        # Checks if a term is a shifter.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_shifter?
+          sentiment_modifier == 'shifter'
+        end
+        ##
+        # Checks if a term is an expression.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_expression?
+          use && !!polarity
+        end
+        ##
+        # Checks if a term is a conjunction.
+        #
+        # @return [TrueClass|FalseClass]
+        #
+        def is_conjunction?(language)
+          pos == 'J' || CONJUNCTIONS[language]&.include?(lemma)
+        end
+        private
+        # @return [Oga::XML::Element]
+        def first_sentiment
+          @first_sentiment ||= node.at :sentiment
+        end
+      end
+    end
+  end
+end