RubyGems - proiel - Versions diffs - 1.1.0 → 1.3.1 - Mend

proiel 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +5 -5
data/LICENSE +1 -1
data/README.md +2 -2
data/lib/proiel.rb +16 -1
data/lib/proiel/alignment.rb +3 -0
data/lib/proiel/alignment/builder.rb +220 -0
data/lib/proiel/annotation_schema.rb +11 -4
data/lib/proiel/chronology.rb +80 -0
data/lib/proiel/dictionary.rb +79 -0
data/lib/proiel/dictionary/builder.rb +224 -0
data/lib/proiel/div.rb +22 -3
data/lib/proiel/language.rb +108 -0
data/lib/proiel/lemma.rb +77 -0
data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
data/lib/proiel/proiel_xml/reader.rb +138 -2
data/lib/proiel/proiel_xml/schema.rb +4 -2
data/lib/proiel/proiel_xml/validator.rb +76 -9
data/lib/proiel/sentence.rb +27 -4
data/lib/proiel/source.rb +14 -4
data/lib/proiel/statistics.rb +2 -2
data/lib/proiel/token.rb +14 -6
data/lib/proiel/tokenization.rb +5 -3
data/lib/proiel/treebank.rb +23 -6
data/lib/proiel/utils.rb +0 -1
data/lib/proiel/valency.rb +5 -0
data/lib/proiel/valency/arguments.rb +151 -0
data/lib/proiel/valency/lexicon.rb +59 -0
data/lib/proiel/valency/obliqueness.rb +31 -0
data/lib/proiel/version.rb +2 -3
data/lib/proiel/visualization.rb +1 -0
data/lib/proiel/visualization/graphviz.rb +111 -0
data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
metadata +76 -31

data/lib/proiel/valency/arguments.rb ADDED

@@ -0,0 +1,151 @@
+module PROIEL::Valency::Arguments
+  def self.get_argument_frame(token)
+    arguments = collect_arguments(token)
+    hoisted_arguments = arguments.map { |a| hoist_dependents(a) }
+    a =
+      hoisted_arguments.map do |argument|
+        { relation: argument.relation }.merge(extract_features(argument))
+      end
+    PROIEL::Valency::Obliqueness.sort_arguments(a)
+  end
+  private
+  POS_CLASSIFICATION = {
+    'R' => :functor,
+    'G' => :functor,
+    'N' => :nominal,
+    'P' => :nominal,
+    'A' => :nominal,
+    'M' => :nominal,
+    'V' => :verbal,
+  }.freeze
+  # Collapses dependents based on features
+  def self.collapse_dependents(dependents)
+    # Hoist dependents if any of the dependents is a coordinator
+    dependents = dependents.map { |d| hoist_dependents(d) }
+    # Figure out if all dependents are equivalent for the purposes of
+    # argument frames. Typical examples would be coordinated, identical
+    # prepositions (which is operationalised as same lemma, same POS, no
+    # case) or coordinated nouns in the same case (which is operationalised
+    # as same major POS, same case). If we fail to figure out a way to
+    # hoist and reduce arguments, we keep the coordinator.
+    majors = dependents.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
+    majors = majors.length == 1 ? majors.first : nil
+    case majors
+    when :functor
+      lemmas = dependents.map(&:lemma).uniq
+      if lemmas.length == 1
+        dependents.first
+      else
+        # STDERR.puts "Different lemmas R/G: #{lemmas.inspect}"
+        nil
+      end
+    when :nominal
+      cases = dependents.map { |d| d.morphology_hash[:case] }.uniq
+      if cases.length == 1
+        dependents.first
+      else
+        # STDERR.puts "Different cases N/P: #{cases.inspect}"
+        nil
+      end
+    when :verbal
+      moods = dependents.map { |d| d.morphology_hash[:mood] }.uniq
+      if moods.length == 1
+        dependents.first
+      else
+        # STDERR.puts "Different moods V: #{moods.inspect}"
+        nil
+      end
+    else
+      # STDERR.puts "Unknown combination: #{dependents.map(&:pos).inspect}"
+      nil
+    end
+  end
+  # Hoists the real argument dependents from conjoined arguments
+  def self.hoist_dependents(argument)
+    if argument.part_of_speech == 'C-' or argument.empty_token_sort == 'C'
+      # Pick dependents that have the same relation as the coordinator. This
+      # eliminates auxiliary elements like particles and repeated
+      # conjunctions as well as attributes that scope over all conjuncts.
+      dependents = argument.dependents.select { |d| d.relation == argument.relation }
+      collapse_dependents(dependents) || argument
+    else
+      argument
+    end
+  end
+  # Extracts morphosyntactic features that are relevant to the argument frame
+  def self.extract_features(argument)
+    {}.tap do |features|
+      case argument.part_of_speech_hash[:major]
+      when 'G'
+        features[:lemma] = argument.lemma
+        features[:part_of_speech] = argument.part_of_speech
+        # There may be multiple dependents and dependents may be headed by
+        # coordinators. All relevant dependents have the relation PRED.
+        dependents = argument.dependents.select { |d| d.relation == 'pred' }.map { |a| hoist_dependents(a) }
+        local_argument = collapse_dependents(dependents)
+        if local_argument and local_argument.morphology_hash[:mood]
+          features[:mood] = local_argument.morphology_hash[:mood]
+        end
+      when 'R'
+        features[:lemma] = argument.lemma
+        features[:part_of_speech] = argument.part_of_speech
+        # There may be multiple dependents and dependents may be headed by
+        # coordinators. All relevant dependents have the relation OBL.
+        dependents = argument.dependents.select { |d| d.relation == 'obl' }.map { |a| hoist_dependents(a) }
+        local_argument = collapse_dependents(dependents)
+        if local_argument and local_argument.morphology_hash[:case]
+          features[:case] = local_argument.morphology_hash[:case]
+        end
+      when 'V'
+        features[:mood] = argument.morphology_hash[:mood] if argument.morphology_hash[:mood]
+      when 'D'
+        features[:lemma] = argument.lemma
+        features[:part_of_speech] = argument.part_of_speech
+      when 'P'
+        features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
+        if argument.part_of_speech == 'Pk' # reflexive personal pronoun
+          features[:lemma] = argument.lemma
+          features[:part_of_speech] = argument.part_of_speech
+        end
+      else
+        features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
+      end
+    end
+  end
+  #  Determines the arguments of a predicate
+  def self.collect_arguments(token)
+    token.dependents.select do |dependent|
+      case dependent.relation
+      when 'obj', 'obl', 'xobj', 'comp', 'narg' # arguments
+        true
+      when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl' # non-arguments
+        false
+      when 'arg' # unspecific but always an argument
+        true
+      when 'adnom', 'nonsub', 'per' # unspecific and undetermined with respect to argumenthood
+        false
+      when 'rel' # unspecific but never an argument
+        false
+      when 'pred', 'parpred', 'voc' # shouldn't happen
+        false
+      when 'pid', 'xsub' # really shouldn't happen
+        false
+      else
+        raise "unknown relation #{dependent.relation.inspect}"
+      end
+    end
+  end
+end

data/lib/proiel/valency/lexicon.rb ADDED

@@ -0,0 +1,59 @@
+module PROIEL
+  module Valency
+    class Lexicon
+      attr_reader :frames
+      def initialize
+        @source_ids = Set.new
+        @source_languages = Set.new
+        @frames = {}
+      end
+      # Generates a valency lexicon from the provided sources. In practice the
+      # sources should be in the same language but this is not enforced. This
+      # makes it possible to generate a lexicon from sources in closely related
+      # languages or dialects.
+      def add_source!(source)
+        @source_ids << source.id
+        @source_languages << source.language
+        source.sentences.each do |sentence|
+          tokens = find_verbal_nodes(sentence)
+          tokens.each do |token|
+            frame = PROIEL::Valency::Arguments.get_argument_frame(token)
+            partition =
+              if token.dependents.any? { |d| d.relation == 'aux' and d.part_of_speech == 'Pk' }
+                :r
+              else
+                :a
+              end
+            @frames[token.lemma] ||= {}
+            @frames[token.lemma][token.part_of_speech] ||= {}
+            @frames[token.lemma][token.part_of_speech][frame] ||= { a: [], r: [] }
+            @frames[token.lemma][token.part_of_speech][frame][partition] << token.id
+          end
+        end
+      end
+      def lookup(lemma, part_of_speech)
+        frames =
+          @frames[lemma][part_of_speech].map do |arguments, token_ids|
+            { arguments: arguments, tokens: token_ids }
+          end
+        PROIEL::Valency::Obliqueness.sort_frames(frames)
+      end
+      private
+      # Find verbal nodes in a sentence
+      def find_verbal_nodes(sentence)
+        sentence.tokens.select do |token|
+          # FIXME: is this test in the proiel library already?
+          (token.part_of_speech and token.part_of_speech[/^V/]) or token.empty_token_sort == 'V'
+        end
+      end
+    end
+  end
+end

data/lib/proiel/valency/obliqueness.rb ADDED

@@ -0,0 +1,31 @@
+module PROIEL::Valency::Obliqueness
+  # Sorts frames by obliqueness
+  def self.sort_frames(frames)
+    # Sort frames by obliqueness, then by inspecting them so that we get
+    # a stable, reproducible order.
+    frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
+  end
+  # Sorts arguments by obliqueness
+  def self.sort_arguments(arguments)
+    arguments.sort_by { |argument| obliqueness_of_argument(argument) }
+  end
+  private
+  def self.obliqueness_of_arguments(arguments)
+    arguments.map do |argument|
+      obliqueness_of_argument(argument)
+    end
+  end
+  def self.obliqueness_of_argument(argument)
+    obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
+  end
+  OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze
+  def self.obliqueness_of_relation(relation)
+    OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
+  end
+end

data/lib/proiel/version.rb CHANGED

@@ -1,9 +1,8 @@
 #--
-# Copyright (c) 2015-2016 Marius L. Jøhndal
+# Copyright (c) 2015-2018 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
 module PROIEL
-  # Gem version
-  VERSION = '1.1.0'
+  VERSION = '1.3.1'.freeze
 end

data/lib/proiel/visualization.rb ADDED

	@@ -0,0 +1 @@
1	+ require 'proiel/visualization/graphviz'

data/lib/proiel/visualization/graphviz.rb ADDED

@@ -0,0 +1,111 @@
+module PROIEL
+  module Visualization
+    module Graphviz
+      DEFAULT_GRAPHVIZ_BINARY = 'dot'.freeze
+      DEFAULT_TEMPLATES = %i(classic linearized packed modern aligned-modern).freeze
+      SUPPORTED_OUTPUT_FORMATS = %i(png svg).freeze
+      class GraphvizError < Exception
+      end
+      def self.generate_to_file(template, graph, output_format, output_filename, options = {})
+        raise ArgumentError, 'string expected' unless output_filename.is_a?(String)
+        result = PROIEL::Visualization::Graphviz.generate(template, graph, output_format, options)
+        File.open(output_filename, 'w') do |f|
+          f.write(result)
+        end
+      end
+      def self.generate(template, graph, output_format, options = {})
+        raise ArgumentError, 'string or symbol expected' unless template.is_a?(String) or template.is_a?(Symbol)
+        dot_code = generate_dot(template, graph, options)
+        if output_format.to_sym == :dot
+          dot_code
+        else
+          generate_image(dot_code, output_format, options)
+        end
+      end
+      def self.template_filename(template)
+        raise ArgumentError, 'string or symbol expected' unless template.is_a?(String) or template.is_a?(Symbol)
+        raise ArgumentError, 'invalid template' unless DEFAULT_TEMPLATES.include?(template.to_sym)
+        File.join(File.dirname(__FILE__), 'graphviz', "#{template}.dot.erb")
+      end
+      def self.generate_image(dot_code, output_format, options = {})
+        raise ArgumentError, 'string expected' unless dot_code.is_a?(String)
+        unless output_format.is_a?(String) or output_format.is_a?(Symbol)
+          raise ArgumentError, 'string or symbol expected'
+        end
+        raise ArgumentError, 'invalid output format' unless SUPPORTED_OUTPUT_FORMATS.include?(output_format.to_sym)
+        graphviz_binary = options[:graphviz_binary] || DEFAULT_GRAPHVIZ_BINARY
+        result, errors = nil, nil
+        Open3.popen3("dot -T#{output_format}") do |dot, img, err|
+          dot.write dot_code
+          dot.close
+          result, errors = img.read, err.read
+        end
+        raise GraphvizError, "graphviz exited with errors: #{errors}" unless errors.nil? or errors == ''
+        result
+      end
+      def self.generate_dot(template, graph, options)
+        unless options[:direction].nil? or %(TD LR).include?(options[:direction])
+          raise ArgumentError, 'invalid direction'
+        end
+        filename = template_filename(template)
+        content = File.read(filename)
+        template = ERB.new(content, nil, '-')
+        template.filename = filename
+        TemplateContext.new(graph, options[:direction] || 'TD').generate(template)
+      end
+      class TemplateContext
+        def initialize(graph, direction, title = '')
+          @graph = graph
+          @direction = direction
+          @title = title
+        end
+        def generate(template)
+          template.result(binding)
+        end
+        protected
+        # Creates a node with an identifier and a label.
+        def node(identifier, label = '', options = {})
+          attrs = join_attributes(options.merge(label: label))
+          "#{identifier} [#{attrs}];"
+        end
+        # Creates an edge with a label from one identifier to another identifier.
+        def edge(identifier1, identifier2, label = '', options = {})
+          attrs = join_attributes(options.merge(label: label))
+          "#{identifier1} -> #{identifier2} [#{attrs}];"
+        end
+        def join_attributes(attrs)
+          attrs.map { |a, v| %|#{a}="#{v.to_s.gsub('"', '\\"')}"| }.join(',')
+        end
+      end
+    end
+  end
+end

data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb ADDED

@@ -0,0 +1,83 @@
+<%# Draws two groups of sentences and the alignments between their tokens. -%>
+<%# @graph.left and @graph.right are iterated as lists of token lists; -%>
+<%# @graph.alignments is iterated as pairs of token IDs. -%>
+digraph "<%= @title -%>" {
+  charset="UTF-8";
+  graph [truecolor=true,bgcolor=transparent];
+  rankdir="<%= @direction -%>";
+  nodesep=0.1;
+  ranksep=0.25;
+<%# Left side: one root point per token list, then one relation node per -%>
+<%# token. Tokens whose empty_token_sort is 'P' are skipped and other empty -%>
+<%# tokens are drawn in gray. Tree edges run from the head (or the root) to -%>
+<%# the token; slash edges are dashed and blue. -%>
+  <%- @graph.left.each_with_index do |tokens, i| -%>
+    <%= "rootL#{i}" -%> [label="",shape=point];
+    <%- tokens.select { |t| t.empty_token_sort != 'P' }.each do |token| -%>
+      <%- if token.empty_token_sort -%>
+        <%= node token.id, token.relation.to_s.upcase, shape: :none, fontcolor: :gray -%>
+      <%- else -%>
+        <%= node token.id, token.relation.to_s.upcase, shape: :none -%>
+      <%- end -%>
+      <%- if token.relation -%>
+        <%= edge (token.head ? token.head.id : "rootL#{i}"), token.id, '', weight: 1.0, color: :orange, arrowhead: :none -%>
+      <%- end -%>
+      <%- token.slashes.each do |(relation, target)| -%>
+        <%= edge token.id, target, relation.to_s.upcase, weight: 0.0, fontcolor: :blue, color: :blue, style: :dashed %>
+      <%- end -%>
+    <%- end -%>
+<%# Connect every non-empty token to the node that shows its form. -%>
+    <%- tokens.reject(&:empty_token_sort).each do |token| -%>
+      <%= edge token.id, "T#{token.id}", nil, weight: 10, arrowhead: :none -%>
+    <%- end -%>
+  <%- end -%>
+<%# Form nodes of the left side, kept on one rank and chained with -%>
+<%# invisible edges. -%>
+  {
+    rank="same";
+    <%- @graph.left.each do |tokens| -%>
+      <%- tokens.reject(&:empty_token_sort).each do |token| -%>
+        <%= node "T#{token.id}", token.form, shape: :none, fontcolor: :blue, tooltip: [token.lemma, token.part_of_speech, token.morphology].join("\n") -%>
+      <%- end -%>
+      <%= tokens.reject(&:empty_token_sort).map { |token| "T#{token.id}" }.join('->') -%> [style="invis"];
+    <%- end -%>
+  }
+<%# Right side: same as the left side, except that tree edges run from the -%>
+<%# token to its head (or the root) and form edges run from the form node -%>
+<%# to the token. -%>
+  <%- @graph.right.each_with_index do |tokens, i| -%>
+    <%= "rootR#{i}" -%> [label="",shape=point];
+    <%- tokens.select { |t| t.empty_token_sort != 'P' }.each do |token| -%>
+      <%- if token.empty_token_sort -%>
+        <%= node token.id, token.relation.to_s.upcase, shape: :none, fontcolor: :gray -%>
+      <%- else -%>
+        <%= node token.id, token.relation.to_s.upcase, shape: :none -%>
+      <%- end -%>
+      <%- if token.relation -%>
+        <%= edge token.id, (token.head ? token.head.id : "rootR#{i}"), '', weight: 1.0, color: :orange, arrowhead: :none -%>
+      <%- end -%>
+      <%- token.slashes.each do |(relation, target)| -%>
+        <%= edge token.id, target, relation.to_s.upcase, weight: 0.0, fontcolor: :blue, color: :blue, style: :dashed %>
+      <%- end -%>
+    <%- end -%>
+    <%- tokens.reject(&:empty_token_sort).each do |token| -%>
+      <%= edge "T#{token.id}", token.id, nil, weight: 10, arrowhead: :none -%>
+    <%- end -%>
+  <%- end -%>
+<%# Form nodes of the right side, laid out like those of the left side. -%>
+  {
+    rank="same";
+    <%- @graph.right.each do |tokens| -%>
+      <%- tokens.reject(&:empty_token_sort).each do |token| -%>
+        <%= node "T#{token.id}", token.form, shape: :none, fontcolor: :blue, tooltip: [token.lemma, token.part_of_speech, token.morphology].join("\n") -%>
+      <%- end -%>
+      <%= tokens.reject(&:empty_token_sort).map { |token| "T#{token.id}" }.join('->') -%> [style="invis"];
+    <%- end -%>
+  }
+<%# Alignment edges join the form nodes of aligned tokens. -%>
+  <%- @graph.alignments.each do |x, y| -%>
+    <%= "T#{x}" -%> -> <%= "T#{y}" -%> [color=blue,dir=none];
+  <%- end -%>
+}