RubyGems - opener-opinion-detector-basic - Versions diffs - 2.0.7 → 3.0.0 - Mend

opener-opinion-detector-basic 2.0.7 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +1 -5
data/lib/opener/opinion_detector_basic.rb +12 -65
data/lib/opener/opinion_detector_basic/opinion.rb +171 -0
data/lib/opener/opinion_detector_basic/processor.rb +329 -0
data/lib/opener/opinion_detector_basic/term.rb +160 -0
data/lib/opener/opinion_detector_basic/version.rb +1 -1
data/opener-opinion-detector-basic.gemspec +5 -10
metadata +24 -39
data/core/opinion_detector_basic_multi.py +0 -512
data/ext/hack/Rakefile +0 -8
data/pre_install_requirements.txt +0 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 765314f86b29243ff3c007100653a20198d3b890
-  data.tar.gz: 1c8767a8dc9ccc48680d0fca3364ed7d0ef08a80
+  metadata.gz: d07d2a2eb88245eca143655a2fc8b5d301b632dd
+  data.tar.gz: 1c67e6b59421ef2ab4e33f5c3260699c202eab0e
 SHA512:
-  metadata.gz: 9972bff4b61846eac50d946c1350ba65fef31fd15e6876a58b7c85c627efb11fd4427543a25ccb20310e94a3a3ab965a57199a4344f07227434688a75aa610ce
-  data.tar.gz: b454c68ab7ed948db7e39e879646f8e07c64f65c90b2ec1713a0a9aeb8736cb516ba28ed2175f30ea52cfb7f7342585849340b5a756ae934715bc921545cf648
+  metadata.gz: cf26709cea362f73901df7184f2c562ac5b9d597c5386c1bb4845a843a667b8e59a301d6e36e3ed5759fd7a7b904b82a390665c0dd916f803e4dcfdefe3ca7f3
+  data.tar.gz: f978e9dc22837f78a758e28d4612e07732d5c12f7c429711d46716e1869c0fb640e7397b47b9c388c12273b440b7b282d49bcf87899070af80831373789294be

data/README.md CHANGED Viewed

@@ -103,11 +103,7 @@ At least you need the following system setup:
 ### Dependencies for normal use:
-* Ruby 1.9.3 or newer
-* Python 2.6
-* lxml: library for processing xml in python
-* libarchive, on Debian/Ubuntu based systems this can be installed using
-  `sudo apt-get install libarchive-dev`
+* Tested on Ruby 2.1.5, 2.2.2, Rubinius 2.4.0, jruby-1.7.8
 ## Domain Adaption

data/lib/opener/opinion_detector_basic.rb CHANGED Viewed

@@ -1,8 +1,13 @@
-require 'open3'
 require 'slop'
+require 'oga'
+require 'monitor'
+require 'rexml/document'
+require 'rexml/formatters/pretty'
 require_relative 'opinion_detector_basic/version'
 require_relative 'opinion_detector_basic/cli'
+require_relative 'opinion_detector_basic/processor'
 module Opener
   ##
@@ -27,77 +32,19 @@ module Opener
       @args    = options.delete(:args) || []
       @options = options
     end
     ##
-    # Builds the command used to execute the kernel.
-    #
-    # @param [Array] args Commandline arguments passed to the command.
-    #
-    def command
-      return "#{adjust_python_path} python -E #{kernel} #{args.join(' ')}"
-    end
-    ##
-    # Processes an input KAF document and returns the results as a new KAF
-    # document.
+    # Processes the input KAF document.
     #
     # @param [String] input
     # @return [String]
     #
     def run(input)
-      stdout, stderr, process = capture(input)
-      raise stderr unless process.success?
-      return stdout
-    end
-    protected
-    ##
-    # @return [String]
-    #
-    def adjust_python_path
-      site_packages =  File.join(core_dir, 'site-packages')
-      return "env PYTHONPATH=#{site_packages}:$PYTHONPATH"
-    end
-    ##
-    # capture3 method doesn't work properly with Jruby, so
-    # this is a workaround
-    #
-    def capture(input)
-      Open3.popen3(*command.split(" ")) {|i, o, e, t|
-        out_reader = Thread.new { o.read }
-        err_reader = Thread.new { e.read }
-        i.write input
-        i.close
-        [out_reader.value, err_reader.value, t.value]
-      }
-    end
-    ##
-    # @return [String]
-    #
-    def core_dir
-      return File.expand_path('../../../core', __FILE__)
-    end
-    ##
-    # @return [String]
-    #
-    def kernel
-      return File.join(core_dir, 'opinion_detector_basic_multi.py')
-    end
+      options[:timestamp] = !options.delete(:no_time)
-    ##
-    # @return the language from the KAF
-    #
-    def language(input)
-      document = Nokogiri::XML(input)
-      return document.at('KAF').attr('xml:lang')
+      return Processor.new(input, options).process
     end
   end # OpinionDetectorBasic
 end # Opener

data/lib/opener/opinion_detector_basic/opinion.rb ADDED Viewed

@@ -0,0 +1,171 @@
+module Opener
+  class OpinionDetectorBasic
+    class Opinion
+      attr_reader :term
+      attr_accessor :left_candidates, :right_candidates, :target_ids, :holders
+      # Builds an opinion around +term+, the term carrying the opinion
+      # expression. Candidate, holder and target lists all start out empty.
+      #
+      # @param term The expression term this opinion is based on.
+      #
+      def initialize(term)
+        @term       = term
+        @holders    = []
+        @target_ids = []
+        @left_candidates, @right_candidates = [], []
+      end
+      ##
+      # Returns the term ids of the opinion expression.
+      #
+      # @return [Array]
+      #
+      def ids
+        # Sorted once, then served from the cached copy.
+        return @ids if @ids
+        @ids = term.list_ids.sort
+      end
+      ##
+      # Returns the sentence id of the opinion.
+      #
+      # @return [String]
+      #
+      def sentence
+        # Looked up on the expression term until a truthy value is cached.
+        @sentence = term.sentence unless @sentence
+        @sentence
+      end
+      ##
+      # Returns the strength of the opinion.
+      #
+      # @return [Integer]
+      #
+      def strength
+        # The term's accumulated strength, cached after the first read.
+        return @strength if @strength
+        @strength = term.accumulated_strength
+      end
+      ##
+      # Returns the polarity of the opinion.
+      #
+      # @return [String]
+      #
+      def polarity
+        return @polarity if @polarity
+        # The sign of the strength decides the label; zero maps to neutral.
+        score = strength
+        @polarity = case
+                    when score > 0 then "positive"
+                    when score < 0 then "negative"
+                    else "neutral"
+                    end
+      end
+      ##
+      # Obtain the opinion holders from the terms that belong to the same
+      # sentence.
+      #
+      # Only the first matching term of the sentence is recorded, so at most
+      # one id is appended to the holders list per call.
+      #
+      # @param [Hash] sentences Terms grouped by sentence id.
+      # @param [String] language Language code used to pick the holder lemmas.
+      #
+      def obtain_holders(sentences, language)
+        # A language without a holder list yields no holder instead of
+        # failing with NoMethodError on nil. The list is also looked up once
+        # rather than rebuilt for every term.
+        known_holders = opinion_holders.fetch(language, [])
+        return if known_holders.empty?
+        holder = sentences[sentence].find { |t| known_holders.include?(t.lemma) }
+        @holders << holder.id if holder
+      end
+      ##
+      # Get the potential right and left candidates of the sentence and
+      # decide which ones are the actual targets of the opinion
+      #
+      # @param [Hash] sentences Terms grouped by sentence id.
+      #
+      def obtain_targets(sentences)
+        window   = 3
+        in_scope = sentences[sentence]
+        total    = in_scope.count
+        # Right context starts just after the LAST term of the expression
+        # (position -1 when no term matches, i.e. the scan starts at 0).
+        last_pos = in_scope.rindex { |t| ids.include?(t.id) } || -1
+        if last_pos + 1 < total
+          from = last_pos + 1
+          upto = [from + window, total].min
+          @right_candidates = filter_candidates(in_scope[from..upto])
+        end
+        # Left context ends at the FIRST term of the expression; nothing is
+        # collected when the expression opens the sentence.
+        first_pos = in_scope.index { |t| ids.include?(t.id) } || 0
+        if first_pos != 0
+          from = [0, first_pos - 1 - window].max
+          @left_candidates = filter_candidates(in_scope[from..first_pos])
+        end
+        # The first candidate on the right wins when there is one.
+        @target_ids << right_candidates.first.id unless right_candidates.empty?
+        return unless target_ids.empty?
+        # Otherwise fall back to the first entry of the mixed list.
+        mix_lists(right_candidates, left_candidates).each do |fallback|
+          @target_ids << fallback.id
+          break
+        end
+      end
+      protected
+      ##
+      # If there are no opinion targets, right and left candidates
+      # are mixed into one list and the first one is picked as the target.
+      #
+      # @return [Array]
+      #
+      # Interleaves the two lists (lista[0], listb[0], lista[1], ...) and then
+      # appends whatever is left of the longer list, each element once.
+      #
+      # The previous loop ran over an inclusive range and appended the
+      # element at index +min+ on every pass, which duplicated that element
+      # and dropped everything after it.
+      #
+      # @param [Array] lista Candidates that take precedence in each pair.
+      # @param [Array] listb Candidates interleaved after those of +lista+.
+      #
+      def mix_lists(lista, listb)
+        shared = [lista.count, listb.count].min
+        mixed  = lista.first(shared).zip(listb.first(shared)).flatten(1)
+        # At most one of the two remainders is non-empty.
+        mixed.concat(lista.drop(shared))
+        mixed.concat(listb.drop(shared))
+        return mixed.compact
+      end
+      ##
+      # Filters candidate terms depending on their part of speech and if
+      # they are already part of the expression.
+      #
+      # @return [Array]
+      #
+      def filter_candidates(sentence_terms)
+        sentence_terms.select do |candidate|
+          tag = candidate.pos
+          # Only "N"/"R" tagged terms qualify, and never a term that is
+          # itself part of the expression.
+          next false unless tag == "N" || tag == "R"
+          !ids.include?(candidate.id)
+        end
+      end
+      ##
+      # Opinion holders for each language code.
+      #
+      # @return [Hash]
+      #
+      def opinion_holders
+        # Lemmas are compared as-is against term lemmas; keys are the
+        # language codes.
+        {
+          'nl' => %w[ik we wij ze zij jullie u hij het jij je mij me hem haar ons hen hun],
+          'en' => %w[i we he she they it you],
+          'es' => %w[yo tu nosotros vosotros ellos ellas nosotras vosotras],
+          'it' => %w[io tu noi voi loro lei lui],
+          'de' => %w[ich du wir ihr sie er],
+          'fr' => %w[je tu lui elle nous vous ils elles]
+        }
+      end
+    end # Opinion
+  end # OpinionDetectorBasic
+end # Opener

data/lib/opener/opinion_detector_basic/processor.rb ADDED Viewed

@@ -0,0 +1,329 @@
+require_relative 'term'
+require_relative 'opinion'
+module Opener
+  class OpinionDetectorBasic
+    ##
+    # Class that detects opinions in a given input KAF file.
+    #
+    class Processor
+      attr_accessor :document, :timestamp, :opinion_strength, :pretty
+      ##
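+      # @param [String|IO] file The KAF file/input to process.
+      # @param [Hash] options Options for timestamp, including strength to
+      #  opinions, and pretty formatting.
+      # @option options [TrueClass|FalseClass] :timestamp
+      # @option options [TrueClass|FalseClass] :opinion_strength
+      # @option options [TrueClass|FalseClass] :pretty Enable pretty formatting,
+      #  disabled by default due to the performance overhead.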
+      #
+      def initialize(file, options = {})
+        @document         = Oga.parse_xml(file)
+        # The two flags are coerced to strict booleans.
+        @timestamp        = options[:timestamp] ? true : false
+        @opinion_strength = options[:opinion_strength] ? true : false
+        @pretty           = options[:pretty] || false
+        # Anything without a KAF root element is rejected up front.
+        return if is_kaf?
+        raise 'Error parsing input. Input is required to be KAF'
+      end
+      ##
+      # Processes the input and returns the new KAF output.
+      # @return [String]
+      #
+      def process
+        add_opinions_layer
+        # Opinion ids are numbered from 1 in detection order.
+        opinions.each_with_index do |opinion, position|
+          add_opinion(opinion, position + 1)
+        end
+        add_linguistic_processor
+        return pretty_print(document) if pretty
+        document.to_xml
+      end
+      ##
+      # Get the language of the input file.
+      #
+      # @return [String]
+      #
+      def language
+        # Read from the xml:lang attribute of the KAF root element.
+        @language ||= begin
+          root = document.at_xpath('KAF')
+          root.get('xml:lang')
+        end
+      end
+      ##
+      # Get the terms from the input file
+      # @return [Array]
+      #
+      def terms
+        return @terms if @terms
+        @terms = []
+        # One Term wrapper per term node, kept in document order.
+        document.xpath('KAF/terms/term').each_with_object(@terms) do |node, collected|
+          collected << Term.new(node, document, language)
+        end
+      end
+      ##
+      # Get the opinions.
+      #
+      # @return [Array]
+      #
+      def opinions
+        return @opinions if @opinions
+        # Strengths are settled before the expressions are picked, since the
+        # selection below depends on the accumulated strength.
+        set_accumulated_strength
+        apply_modifiers
+        apply_conjunctions
+        # Every expression term that still carries strength becomes an opinion.
+        carriers  = terms.select { |t| t.is_expression? && t.accumulated_strength != 0 }
+        @opinions = carriers.map { |t| Opinion.new(t) }
+        # Targets are resolved for all opinions first, holders afterwards.
+        @opinions.each { |found| found.obtain_targets(sentences) }
+        @opinions.each { |found| found.obtain_holders(sentences, language) }
+        @opinions
+      end
+      ##
+      # Remove the opinions layer from the KAF file if it exists and add a new
+      # one.
+      def add_opinions_layer
+        # Drop a layer left by an earlier run so the output holds only one.
+        stale = document.at_xpath('KAF/opinions')
+        stale.remove unless stale.nil?
+        new_node('opinions', 'KAF')
+      end
+      ##
+      # Adds the entire opinion in the KAF file.
+      #
+      def add_opinion(opinion, index)
+        opinion_node = new_node("opinion", "KAF/opinions")
+        opinion_node.set('oid', "o#{index}")
+        # The holder element is only emitted when a holder was found.
+        holder_ids = opinion.holders
+        if !holder_ids.empty?
+          add_opinion_element(new_node("opinion_holder", opinion_node), holder_ids)
+        end
+        # The target element is always emitted, possibly without a span.
+        target_node = new_node("opinion_target", opinion_node)
+        found_targets = opinion.target_ids
+        add_opinion_element(target_node, found_targets) if !found_targets.empty?
+        expression = new_node("opinion_expression", opinion_node)
+        expression.set('polarity', opinion.polarity)
+        expression.set('strength', opinion.strength.to_s)
+        add_opinion_element(expression, opinion.ids)
+      end
+      ##
+      # Method for adding opinion holders, targets and expressions.
+      #
+      def add_opinion_element(node, ids)
+        # An XML comment listing the lemmas of the referenced terms precedes
+        # the span.
+        # NOTE(review): the lemma text goes into the comment unmodified;
+        # confirm a lemma such as "--" cannot make the output ill-formed.
+        matching = terms.select { |t| ids.include?(t.id) }
+        text     = matching.map(&:lemma).join(" ")
+        node.children << Oga::XML::Comment.new(:text => text)
+        span = new_node("span", node)
+        # One <target id="..."> child per referenced term id.
+        ids.each { |id| new_node("target", span).set('id', id.to_s) }
+      end
+      ##
+      # Add linguistic processor layer with basic information
+      # (version, timestamp, description etc) in the KAF file.
+      #
+      def add_linguistic_processor
+        layer = new_node('linguisticProcessors', 'KAF/kafHeader')
+        layer.set('layer', 'opinions')
+        entry = new_node('lp', layer)
+        # The version attribute is "<last edited>-<version>".
+        entry.set('version', ['13may2015', '2.0'].join('-'))
+        entry.set('name', 'Basic opinion detector with Pos')
+        # With timestamps disabled a fixed '*' placeholder is written.
+        stamp = timestamp ? Time.now.strftime('%Y-%m-%dT%H:%M:%S%Z') : '*'
+        entry.set('timestamp', stamp)
+      end
+      ##
+      # Format the output document properly.
+      #
+      # TODO: this should be handled by Oga in a nice way.
+      #
+      # @return [String]
+      #
+      def pretty_print(document)
+        # Re-parse the serialized document with REXML, which does the
+        # actual indentation.
+        rexml = REXML::Document.new(document.to_xml)
+        rexml.context[:attribute_quote] = :quote
+        printer = REXML::Formatters::Pretty.new
+        printer.compact = true
+        buffer = ""
+        printer.write(rexml, buffer)
+        buffer.strip
+      end
+      ##
+      # Get terms grouped by sentence.
+      #
+      def sentences
+        # Keys are whatever Term#sentence returns; values keep term order.
+        return @sentences if @sentences
+        @sentences = terms.group_by(&:sentence)
+      end
+      protected
+      ##
+      # The strength of a term depends heavily on the type of the previous
+      # one. For example if the previous one is a shifter, it needs
+      # to be multiplied. If it's an intensifier, it needs to be
+      # added (or subtracted depending on the strength of the previous
+      # term) etc.
+      #
+      # Walks the terms in order and, for each term, looks at the term that
+      # FOLLOWS it: when that following term is a shifter or an intensifier,
+      # the current term's strength (and usually its ids) is folded into it.
+      #
+      def set_accumulated_strength
+        # Operator used when an intensifier is combined with the term
+        # before it; updated on every iteration that has a following term.
+        symbol    = :+
+        terms_count = terms.count
+        terms.each_with_index do |term, i|
+          # The last term has no successor and is left untouched.
+          if i+1 < terms_count
+            if terms[i+1].is_shifter?
+              if term.accumulated_strength != 0
+                # The shifter absorbs the current term: strengths are
+                # multiplied, ids are merged and the current term is retired.
+                terms[i+1].accumulated_strength *= term.accumulated_strength
+                terms[i+1].list_ids += term.list_ids
+                term.use = false
+                symbol = terms[i+1].accumulated_strength > 0 ? :+ : :-
+              else
+                # A shifter after a strength-less term: the next combination
+                # will multiply.
+                symbol = :*
+              end
+            elsif terms[i+1].is_intensifier?
+              # The intensifier's strength becomes
+              # "current <symbol> intensifier"; the current term is always
+              # retired here, even when its strength is zero.
+              terms[i+1].accumulated_strength = term.accumulated_strength.send(symbol, terms[i+1].accumulated_strength)
+              term.use = false
+              symbol = terms[i+1].accumulated_strength > 0 ? :+ : :-
+              # Ids are only merged when the current term carried strength.
+              if term.accumulated_strength != 0
+                terms[i+1].list_ids += term.list_ids
+              end
+            else
+              # Plain following term: the operator follows the sign of its
+              # strength (zero counts as positive).
+              symbol = terms[i+1].accumulated_strength >= 0 ? :+ : :-
+            end
+          end
+        end
+      end
+      ##
+      # Apply strength to the next term after a shifter or intensifier.
+      #
+      def apply_modifiers
+        # Walk adjacent pairs; a shifter/intensifier that is still in use is
+        # folded into the term directly after it and then retired.
+        terms.each_cons(2) do |modifier, following|
+          next unless modifier.use && (modifier.is_shifter? || modifier.is_intensifier?)
+          following.accumulated_strength *= modifier.accumulated_strength
+          following.list_ids += modifier.list_ids
+          modifier.use = false
+        end
+      end
+      ##
+      # Ignore conjunctions when applying strength.
+      #
+      # Merges runs of in-use, non-zero-strength terms that are separated
+      # only by conjunctions into a single term: strengths are summed, ids
+      # are concatenated, and only the last term of the run stays in use.
+      #
+      def apply_conjunctions
+        terms_count = terms.count
+        i = 0
+        while i < terms_count
+          # A run can only start at a term that is in use and has strength.
+          if terms[i].use && terms[i].accumulated_strength != 0
+            used     = [i]
+            list_ids = terms[i].list_ids
+            strength = terms[i].accumulated_strength
+            terms[i].use = false
+            j = i+1
+            while true
+              if j >= terms_count
+                break
+              end
+              if terms[j].is_conjunction
+                # Conjunctions are skipped over and retired, without
+                # contributing strength or ids.
+                terms[j].use = false
+                j += 1
+              elsif terms[j].use && terms[j].accumulated_strength != 0
+                # Another opinionated term: add it to the running totals.
+                list_ids += terms[j].list_ids
+                used << j
+                terms[j].use = false
+                strength += terms[j].accumulated_strength
+                j += 1
+              else
+                # Any other term ends the run.
+                break
+              end
+            end
+            # The last opinionated term of the run carries the merged result.
+            last_used = used.last
+            terms[last_used].accumulated_strength = strength
+            terms[last_used].list_ids = list_ids
+            terms[last_used].use = true
+            # Resume after the term that ended the run (i is incremented
+            # once more below); that term could not have started a run.
+            i = j
+          end
+          i += 1
+        end
+      end
+      ##
+      # Creates a new node in the KAF file.
+      #
+      def new_node(tag, parent)
+        # +parent+ is either an XPath string resolved against the document
+        # or an already resolved node.
+        parent_node = parent.is_a?(String) ? document.at_xpath(parent) : parent
+        Oga::XML::Element.new(:name => tag).tap do |node|
+          parent_node.children << node
+        end
+      end
+      ##
+      # Check if input is a KAF file.
+      # @return [Boolean]
+      #
+      def is_kaf?
+        # True only when the document has a KAF root element.
+        document.at_xpath('KAF') ? true : false
+      end
+    end # Processor
+  end # OpinionDetectorBasic
+end # Opener