RubyGems - opener-opinion-detector-basic - Versions diffs - 2.0.7 → 3.0.0 - Mend

opener-opinion-detector-basic 2.0.7 → 3.0.0

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +1 -5
data/lib/opener/opinion_detector_basic.rb +12 -65
data/lib/opener/opinion_detector_basic/opinion.rb +171 -0
data/lib/opener/opinion_detector_basic/processor.rb +329 -0
data/lib/opener/opinion_detector_basic/term.rb +160 -0
data/lib/opener/opinion_detector_basic/version.rb +1 -1
data/opener-opinion-detector-basic.gemspec +5 -10
metadata +24 -39
data/core/opinion_detector_basic_multi.py +0 -512
data/ext/hack/Rakefile +0 -8
data/pre_install_requirements.txt +0 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 765314f86b29243ff3c007100653a20198d3b890
-  data.tar.gz: 1c8767a8dc9ccc48680d0fca3364ed7d0ef08a80
+  metadata.gz: d07d2a2eb88245eca143655a2fc8b5d301b632dd
+  data.tar.gz: 1c67e6b59421ef2ab4e33f5c3260699c202eab0e
 SHA512:
-  metadata.gz: 9972bff4b61846eac50d946c1350ba65fef31fd15e6876a58b7c85c627efb11fd4427543a25ccb20310e94a3a3ab965a57199a4344f07227434688a75aa610ce
-  data.tar.gz: b454c68ab7ed948db7e39e879646f8e07c64f65c90b2ec1713a0a9aeb8736cb516ba28ed2175f30ea52cfb7f7342585849340b5a756ae934715bc921545cf648
+  metadata.gz: cf26709cea362f73901df7184f2c562ac5b9d597c5386c1bb4845a843a667b8e59a301d6e36e3ed5759fd7a7b904b82a390665c0dd916f803e4dcfdefe3ca7f3
+  data.tar.gz: f978e9dc22837f78a758e28d4612e07732d5c12f7c429711d46716e1869c0fb640e7397b47b9c388c12273b440b7b282d49bcf87899070af80831373789294be

data/README.md CHANGED Viewed

@@ -103,11 +103,7 @@ At least you need the following system setup:
 ### Dependencies for normal use:
-* Ruby 1.9.3 or newer
-* Python 2.6
-* lxml: library for processing xml in python
-* libarchive, on Debian/Ubuntu based systems this can be installed using
-  `sudo apt-get install libarchive-dev`
+* Tested on Ruby 2.1.5, 2.2.2, Rubinius 2.4.0, jruby-1.7.8
 ## Domain Adaption

data/lib/opener/opinion_detector_basic.rb CHANGED Viewed

@@ -1,8 +1,13 @@
-require 'open3'
 require 'slop'
+require 'oga'
+require 'monitor'
+require 'rexml/document'
+require 'rexml/formatters/pretty'
 require_relative 'opinion_detector_basic/version'
 require_relative 'opinion_detector_basic/cli'
+require_relative 'opinion_detector_basic/processor'
 module Opener
   ##
@@ -27,77 +32,19 @@ module Opener
       @args    = options.delete(:args) || []
       @options = options
     end
     ##
-    # Builds the command used to execute the kernel.
-    #
-    # @param [Array] args Commandline arguments passed to the command.
-    #
-    def command
-      return "#{adjust_python_path} python -E #{kernel} #{args.join(' ')}"
-    end
-    ##
-    # Processes an input KAF document and returns the results as a new KAF
-    # document.
+    # Processes the input KAF document.
     #
     # @param [String] input
     # @return [String]
     #
     def run(input)
-      stdout, stderr, process = capture(input)
-      raise stderr unless process.success?
-      return stdout
-    end
-    protected
-    ##
-    # @return [String]
-    #
-    def adjust_python_path
-      site_packages =  File.join(core_dir, 'site-packages')
-      return "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
-    end
-    ##
-    # capture3 method doesn't work properly with Jruby, so
-    # this is a workaround
-    #
-    def capture(input)
-      Open3.popen3(*command.split(" ")) {|i, o, e, t|
-        out_reader = Thread.new { o.read }
-        err_reader = Thread.new { e.read }
-        i.write input
-        i.close
-        [out_reader.value, err_reader.value, t.value]
-      }
-    end
-    ##
-    # @return [String]
-    #
-    def core_dir
-      return File.expand_path('../../../core', __FILE__)
-    end
-    ##
-    # @return [String]
-    #
-    def kernel
-      return File.join(core_dir, 'opinion_detector_basic_multi.py')
-    end
+      options[:timestamp] = !options.delete(:no_time)
-    ##
-    # @return the language from the KAF
-    #
-    def language(input)
-      document = Nokogiri::XML(input)
-      return document.at('KAF').attr('xml:lang')
+      return Processor.new(input, options).process
     end
   end # OpinionDetectorBasic
 end # Opener

data/lib/opener/opinion_detector_basic/opinion.rb ADDED Viewed

@@ -0,0 +1,171 @@
+module Opener
+  class OpinionDetectorBasic
+    class Opinion
+      attr_reader :term
+      attr_accessor :left_candidates, :right_candidates, :target_ids, :holders
+      def initialize(term)
+        @term = term
+        @left_candidates = []
+        @right_candidates = []
+        @holders = []
+        @target_ids = []
+      end
+      ##
+      # Returns the term ids of the opinion expression.
+      #
+      # @return [Array]
+      #
+      def ids
+        @ids ||= term.list_ids.sort
+      end
+      ##
+      # Returns the sentence id of the opinion.
+      #
+      # @return [String]
+      #
+      def sentence
+        @sentence ||= term.sentence
+      end
+      ##
+      # Returns the strength of the opinion.
+      #
+      # @return [Integer]
+      #
+      def strength
+        @strength ||= term.accumulated_strength
+      end
+      ##
+      # Returns the polarity of the opinion.
+      #
+      # @return [String]
+      #
+      def polarity
+        @polarity ||= if strength > 0
+          "positive"
+        elsif strength < 0
+          "negative"
+        else
+          "neutral"
+        end
+      end
+      ##
+      # Obtain the opinion holders from the terms that belong to the same
+      # sentence.
+      #
+      def obtain_holders(sentences, language)
+        sentence_terms = sentences[sentence]
+        sentence_terms.each do |term|
+          if opinion_holders[language].include?(term.lemma)
+            @holders << term.id
+            break
+          end
+        end
+      end
+      ##
+      # Get the potential right and left candidates of the sentence and
+      # decide which ones are the actual targets of the opinion
+      #
+      def obtain_targets(sentences)
+        sentence_terms = sentences[sentence]
+        max_distance = 3
+        terms_count = sentence_terms.count
+        index = -1
+        sentence_terms.each_with_index do |term, i|
+          if ids.include?(term.id)
+            index = i
+          end
+        end
+        unless index+1 >= terms_count
+          min = index+1
+          max = [index+1+max_distance,terms_count].min
+          @right_candidates = filter_candidates(sentence_terms[min..max])
+        end
+        index = 0
+        sentence_terms.each_with_index do |term, i|
+          if ids.include?(term.id)
+            index = i
+            break # needed for left_candidates
+          end
+        end
+        unless index == 0
+          min = [0, index-1-max_distance].max
+          max = index
+          @left_candidates = filter_candidates(sentence_terms[min..max])
+        end
+        unless right_candidates.empty?
+          candidate = right_candidates.first
+          @target_ids << candidate.id
+        end
+        if target_ids.empty?
+          list = mix_lists(right_candidates, left_candidates)
+          list.each do |l|
+            @target_ids << l.id
+            break
+          end
+        end
+      end
+      protected
+      ##
+      # If there are no opinion targets, right and left candidates
+      # are mixed into one list and the first one is picked as the target.
+      #
+      # @return [Array]
+      #
+      def mix_lists(lista, listb)
+        list = []
+        min = [lista.count, listb.count].min
+        (0..min).each do |i|
+          list << lista[i]
+          list << listb[i]
+          if lista.count > listb.count
+            list << lista[min]
+          elsif listb.count > lista.count
+            list << listb[min]
+          end
+        end
+        return list.compact
+      end
+      ##
+      # Filters candidate terms depending on their part of speech and if
+      # they are already part of the expression.
+      #
+      # @return [Hash]
+      #
+      def filter_candidates(sentence_terms)
+        sentence_terms.select{|t| (t.pos == "N" || t.pos == "R") && !ids.include?(t.id)}
+      end
+      ##
+      # Opinion holders for each language code.
+      #
+      # @return [Hash]
+      #
+      def opinion_holders
+        {
+          'nl' => ['ik','we','wij','ze','zij','jullie','u','hij','het','jij','je','mij','me','hem','haar','ons','hen','hun'],
+          'en' => ['i','we','he','she','they','it','you'],
+          'es' => ['yo','tu','nosotros','vosotros','ellos','ellas','nosotras','vosotras'],
+          'it' => ['io','tu','noi','voi','loro','lei','lui'],
+          'de' => ['ich','du','wir','ihr','sie','er'],
+          'fr' => ['je','tu','lui','elle','nous','vous','ils','elles']
+        }
+      end
+    end # Opinion
+  end # OpinionDetectorBasic
+end # Opener

data/lib/opener/opinion_detector_basic/processor.rb ADDED Viewed

@@ -0,0 +1,329 @@
+require_relative 'term'
+require_relative 'opinion'
+module Opener
+  class OpinionDetectorBasic
+    ##
+    # Class that detects opinions in a given input KAF file.
+    #
+    class Processor
+      attr_accessor :document, :timestamp, :opinion_strength, :pretty
+      ##
+      # Parses the input and stores the processing options.
+      #
+      # @param [String|IO] file The KAF file/input to process.
+      # @param [Hash] options
+      # @option options [TrueClass|FalseClass] :timestamp Write the current
+      #  time into the linguistic processor entry instead of "*".
+      # @option options [TrueClass|FalseClass] :opinion_strength Stored as a
+      #  boolean on the processor.
+      # @option options [TrueClass|FalseClass] :pretty Enable pretty
+      #  formatting, disabled by default due to the performance overhead.
+      #
+      # @raise [RuntimeError] When the parsed document has no top-level KAF
+      #  element.
+      #
+      def initialize(file, options = {})
+        @document         = Oga.parse_xml(file)
+        @timestamp        = options[:timestamp] ? true : false
+        @opinion_strength = options[:opinion_strength] ? true : false
+        @pretty           = options[:pretty] || false
+        unless is_kaf?
+          raise 'Error parsing input. Input is required to be KAF'
+        end
+      end
+      ##
+      # Processes the input and returns the new KAF output.
+      # @return [String]
+      #
+      def process
+        add_opinions_layer
+        index = 1
+        opinions.each do |opinion|
+          add_opinion(opinion, index)
+          index += 1
+        end
+        add_linguistic_processor
+        return pretty ? pretty_print(document) : document.to_xml
+      end
+      ##
+      # Get the language of the input file.
+      #
+      # @return [String]
+      #
+      def language
+        return @language ||= document.at_xpath('KAF').get('xml:lang')
+      end
+      ##
+      # Get the terms from the input file
+      # @return [Hash]
+      #
+      def terms
+        unless @terms
+          @terms = []
+          document.xpath('KAF/terms/term').each do |term|
+            @terms << Term.new(term, document, language)
+          end
+        end
+        return @terms
+      end
+      ##
+      # Get the opinions.
+      #
+      # @return [Hash]
+      #
+      def opinions
+        unless @opinions
+          set_accumulated_strength
+          apply_modifiers
+          apply_conjunctions
+          ##
+          # Initialize opinions with their expressions.
+          #
+          @opinions = terms.map do |term|
+            if term.is_expression? && term.accumulated_strength != 0
+              o = Opinion.new(term)
+            end
+          end.compact
+          ##
+          # Obtain targets for each opinion.
+          #
+          @opinions.each do |opinion|
+            opinion.obtain_targets(sentences)
+          end
+          ##
+          # Obtain holders for each opinion.
+          #
+          @opinions.each do |opinion|
+            opinion.obtain_holders(sentences, language)
+          end
+        end
+        return @opinions
+      end
+      ##
+      # Remove the opinions layer from the KAF file if it exists and add a new
+      # one.
+      def add_opinions_layer
+        existing = document.at_xpath('KAF/opinions')
+        existing.remove if existing
+        new_node('opinions', 'KAF')
+      end
+      ##
+      # Adds the entire opinion in the KAF file.
+      #
+      def add_opinion(opinion, index)
+        opinion_node = new_node("opinion", "KAF/opinions")
+        opinion_node.set('oid', "o#{index.to_s}")
+        unless opinion.holders.empty?
+          opinion_holder_node = new_node("opinion_holder", opinion_node)
+          add_opinion_element(opinion_holder_node, opinion.holders)
+        end
+        opinion_target_node = new_node("opinion_target", opinion_node)
+        unless opinion.target_ids.empty?
+          add_opinion_element(opinion_target_node, opinion.target_ids)
+        end
+        expression_node = new_node("opinion_expression", opinion_node)
+        expression_node.set('polarity', opinion.polarity)
+        expression_node.set('strength', opinion.strength.to_s)
+        add_opinion_element(expression_node, opinion.ids)
+      end
+      ##
+      # Method for adding opinion holders, targets and expressions.
+      #
+      def add_opinion_element(node, ids)
+        lemmas = terms.select{|t| ids.include?(t.id)}.map(&:lemma).join(" ")
+        comment = Oga::XML::Comment.new(:text => "#{lemmas}")
+        node.children << comment
+        span_node = new_node("span", node)
+        ids.each do |id|
+          target_node = new_node("target", span_node)
+          target_node.set('id', id.to_s)
+        end
+      end
+      ##
+      # Add linguistic processor layer with basic information
+      # (version, timestamp, description etc) in the KAF file.
+      #
+      def add_linguistic_processor
+        description = 'Basic opinion detector with Pos'
+        last_edited = '13may2015'
+        version     = '2.0'
+        node = new_node('linguisticProcessors', 'KAF/kafHeader')
+        node.set('layer', 'opinions')
+        lp_node = new_node('lp', node)
+        lp_node.set('version', "#{last_edited}-#{version}")
+        lp_node.set('name', description)
+        if timestamp
+          format = '%Y-%m-%dT%H:%M:%S%Z'
+          lp_node.set('timestamp', Time.now.strftime(format))
+        else
+          lp_node.set('timestamp', '*')
+        end
+      end
+      ##
+      # Format the output document properly.
+      #
+      # TODO: this should be handled by Oga in a nice way.
+      #
+      # @return [String]
+      #
+      def pretty_print(document)
+        doc = REXML::Document.new document.to_xml
+        doc.context[:attribute_quote] = :quote
+        out = ""
+        formatter = REXML::Formatters::Pretty.new
+        formatter.compact = true
+        formatter.write(doc, out)
+        return out.strip
+      end
+      ##
+      # Get terms grouped by sentence.
+      #
+      def sentences
+        @sentences ||= terms.group_by{|t| t.sentence}
+      end
+      protected
+      ##
+      # Walks over every pair of neighbouring terms and folds the strength
+      # of the current term into the following one whenever the following
+      # term is a shifter or an intensifier:
+      #
+      # * following term is a shifter: its strength is multiplied by the
+      #   strength of the current term (only when that strength is not 0).
+      # * following term is an intensifier: its strength is combined with
+      #   the strength of the current term using the operator remembered in
+      #   `symbol` (:+, :- or :*).
+      #
+      # A term whose strength has been folded into its successor is marked
+      # as no longer in use, and its ids are added to the successor's
+      # list_ids. `symbol` is updated on every step, so the outcome depends
+      # on the order of the terms.
+      #
+      def set_accumulated_strength
+        # Operator applied when the next intensifier is encountered.
+        symbol    = :+
+        terms_count = terms.count
+        terms.each_with_index do |term, i|
+          # The last term has no successor to fold into.
+          if i+1 < terms_count
+            if terms[i+1].is_shifter?
+              if term.accumulated_strength != 0
+                terms[i+1].accumulated_strength *= term.accumulated_strength
+                terms[i+1].list_ids += term.list_ids
+                term.use = false
+                # The sign of the combined strength selects the next operator.
+                symbol = terms[i+1].accumulated_strength > 0 ? :+ : :-
+              else
+                # Current term carries no strength: nothing is folded, but a
+                # following intensifier will be multiplied.
+                symbol = :*
+              end
+            elsif terms[i+1].is_intensifier?
+              # Combine as <current strength> <symbol> <intensifier strength>.
+              terms[i+1].accumulated_strength = term.accumulated_strength.send(symbol, terms[i+1].accumulated_strength)
+              term.use = false
+              symbol = terms[i+1].accumulated_strength > 0 ? :+ : :-
+              # Ids are only carried over when the current term had strength.
+              if term.accumulated_strength != 0
+                terms[i+1].list_ids += term.list_ids
+              end
+            else
+              # Plain successor: only remember the sign of its strength
+              # (zero counts as positive here).
+              symbol = terms[i+1].accumulated_strength >= 0 ? :+ : :-
+            end
+          end
+        end
+      end
+      ##
+      # Apply strength to the next term after a shifter or intensifier.
+      #
+      def apply_modifiers
+        terms_count = terms.count
+        terms.each_with_index do |term, i|
+          if i+1 < terms_count
+            if term.use && (term.is_shifter? || term.is_intensifier?)
+              terms[i+1].accumulated_strength *= term.accumulated_strength
+              terms[i+1].list_ids += term.list_ids
+              term.use = false
+            end
+          end
+        end
+      end
+      ##
+      # Merges runs of opinionated terms that are only separated by
+      # conjunctions into a single term.
+      #
+      # Starting at every term that is in use and has a non-zero strength,
+      # the following terms are scanned: conjunctions are skipped (and
+      # marked unused), further in-use terms with a non-zero strength are
+      # added to the run, and anything else ends the run. The summed
+      # strength and the collected ids are stored on the last term of the
+      # run, which becomes the only term of the run that is still in use.
+      #
+      def apply_conjunctions
+        terms_count = terms.count
+        i = 0
+        while i < terms_count
+          if terms[i].use && terms[i].accumulated_strength != 0
+            # Indexes of the terms merged into the current run.
+            used     = [i]
+            list_ids = terms[i].list_ids
+            strength = terms[i].accumulated_strength
+            terms[i].use = false
+            j = i+1
+            while true
+              if j >= terms_count
+                break
+              end
+              # NOTE(review): called without a trailing "?", unlike the other
+              # Term predicates used in this class - presumably a plain
+              # attribute of Term; confirm against term.rb.
+              if terms[j].is_conjunction
+                terms[j].use = false
+                j += 1
+              elsif terms[j].use && terms[j].accumulated_strength != 0
+                # += builds a new array, the list of terms[i] is not mutated.
+                list_ids += terms[j].list_ids
+                used << j
+                terms[j].use = false
+                strength += terms[j].accumulated_strength
+                j += 1
+              else
+                break
+              end
+            end
+            # The last merged term represents the whole run from now on.
+            last_used = used.last
+            terms[last_used].accumulated_strength = strength
+            terms[last_used].list_ids = list_ids
+            terms[last_used].use = true
+            # Together with the increment below the scan resumes at j + 1;
+            # the term at j ended the run and cannot start a new one.
+            i = j
+          end
+          i += 1
+        end
+      end
+      ##
+      # Creates a new node in the KAF file.
+      #
+      def new_node(tag, parent)
+        if parent.is_a?(String)
+          parent_node = document.at_xpath(parent)
+        else
+          parent_node = parent
+        end
+        node = Oga::XML::Element.new(:name => tag)
+        parent_node.children << node
+        return node
+      end
+      ##
+      # Check if input is a KAF file.
+      # @return [Boolean]
+      #
+      def is_kaf?
+        return !!document.at_xpath('KAF')
+      end
+    end # Processor
+  end # OpinionDetectorBasic
+end # Opener