RubyGems - proiel - Versions diffs - 1.2.0 → 1.2.1 - Mend

proiel 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +5 -5
data/README.md +2 -2
data/lib/proiel.rb +6 -1
data/lib/proiel/chronology.rb +80 -0
data/lib/proiel/dictionary.rb +3 -0
data/lib/proiel/dictionary/builder.rb +201 -0
data/lib/proiel/div.rb +17 -1
data/lib/proiel/proiel_xml/validator.rb +71 -2
data/lib/proiel/sentence.rb +17 -1
data/lib/proiel/token.rb +10 -2
data/lib/proiel/valency.rb +5 -0
data/lib/proiel/valency/arguments.rb +147 -0
data/lib/proiel/valency/lexicon.rb +59 -0
data/lib/proiel/valency/obliqueness.rb +31 -0
data/lib/proiel/version.rb +2 -2
metadata +37 -16

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 5ead6b41029599129af6a717b0398980876a139c
-  data.tar.gz: 11a555a98a41029dfb721e9613bbb5be23847cfd
+SHA256:
+  metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
+  data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
 SHA512:
-  metadata.gz: 475971ca8443be39f3ef2a634ff8afd50931b248130e439991d473d06fcf0b7695d38d0629856654738ecaa853af1d63ba8675849eef52055b431252d07e205e
-  data.tar.gz: c33088acdb1e3fb130386204b02e3f54ca6566a05310c3397e8471c88360a742aafb8e5edc515e0d185280625994c606fed987f8cb7518a23c34a5e13686474b
+  metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
+  data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309

data/README.md CHANGED

@@ -12,7 +12,7 @@ PROIEL annotation scheme and the PROIEL XML-based interchange format.
 ## Installation
-To install this library you need Ruby 2.1 or newer.
+This library requires Ruby >= 2.2. Install as
 ```shell
 gem install proiel
@@ -35,7 +35,7 @@ bundle
 ```
 To download a sample treebank, initialize a new git repository and add the
-[PROIEL treebank](http://proiel.github.io) as a submodule:
+[PROIEL treebank](https://proiel.github.io) as a submodule:
 ```shell
 git init

data/lib/proiel.rb CHANGED

@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2015-2016 Marius L. Jøhndal
+# Copyright (c) 2015-2017 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
@@ -13,6 +13,8 @@ require 'nokogiri'
 require 'singleton'
 require 'erb'
 require 'open3'
+require 'set'
+require 'builder'
 require 'proiel/version'
 require 'proiel/utils'
@@ -31,3 +33,6 @@ require 'proiel/div'
 require 'proiel/sentence'
 require 'proiel/token'
 require 'proiel/visualization'
+require 'proiel/chronology'
+require 'proiel/valency'
+require 'proiel/dictionary'

data/lib/proiel/chronology.rb ADDED

@@ -0,0 +1,80 @@
+#--
+# Copyright (c) 2016-2017 Marius L. Jøhndal
+#
+# See LICENSE in the top-level source directory for licensing terms.
+#++
+# Methods for parsing chronological descriptions.  Extra care is taken to get
+# the interpretation of centuries and ranges involving the transition between 1
+# BC and AD 1 correct.
+module PROIEL::Chronology
+  # Computes the chronological midpoint of a chronological description.
+  #
+  # The description is first run through {parse}. A single year is returned
+  # unchanged; a range is reduced to the year that contains its midpoint.
+  #
+  # @param s [String] chronological description
+  #
+  # @return [Integer]
+  #
+  # @raise [ArgumentError] if {parse} rejects the description
+  #
+  # @example
+  #   midpoint('1000')         # => 1000
+  #   midpoint('1000 BC')      # => -1000
+  #   midpoint('1000-1020')    # => 1010
+  #   midpoint('2 BC-3')       # => 1
+  def self.midpoint(s)
+    i = parse(s)
+    if i.is_a?(Array)
+      # Handle missing Julian year 0 by shifting years after 1 BC down by 1 and then shifting the midpoint back
+      # up again unless negative
+      if i.first < 0 and i.last > 0
+        # The "- 1" is the downward shift of the AD end point.
+        y = (i.first + i.last - 1)/2.0
+        if y < 0
+          y.floor
+        else
+          # Midpoint falls in the AD part: undo the shift.
+          (y + 1).floor
+        end
+      else
+        ((i.first + i.last)/2.0).floor # a non-integer midpoint is within the year of the integer part
+      end
+    elsif i.is_a?(Integer)
+      i
+    else
+      # NOTE(review): parse only ever returns an Integer or an Array, so this
+      # branch looks unreachable with the parse defined below.
+      raise ArgumentError, 'integer or array expected'
+    end
+  end
+  # Parses a chronological description. The syntax of chronological
+  # descriptions is explained in the [PROIEL XML
+  # documentation](https://proiel.github.io/handbook/developer/proielxml.html#chronological-data).
+  #
+  # Four shapes are recognised: a single year, an AD century, a BC century
+  # and a range of two single years. BC years are returned as negative
+  # integers. An optional leading "c. " on a year is accepted and ignored.
+  #
+  # @param s [String] chronological description
+  #
+  # @return [Integer, Array<Integer,Integer>]
+  #
+  # @raise [ArgumentError] 'invalid year' for year 0, 'invalid range' when
+  #   the start of a range is not before its end, 'unexpected format' when
+  #   no pattern matches
+  #
+  # @example
+  #   parse('1000')         # => 1000
+  #   parse('1000 BC')      # => -1000
+  #   parse('1000-1020')    # => [1000,1020]
+  #   parse('1000 BC-1020') # => [-1000,1020]
+  #   parse('2nd c.')       # => [101,200]
+  #   parse('1st c. BC')    # => [-100,-1]
+  def self.parse(s)
+    # NOTE(review): the patterns below anchor with ^ and $, which match at
+    # line boundaries in Ruby; a multi-line string with one matching line is
+    # therefore accepted. \A and \z would anchor the whole string.
+    case s
+    # Single year, optionally prefixed by "c." and optionally suffixed by "BC".
+    when /^\s*(?:c\.\s+)?(\d+)(\s+BC)?\s*$/
+      i = $1.to_i
+      multiplier = $2 ? -1 : 1
+      # The block parameter below shadows the outer local i.
+      (i * multiplier).to_i.tap do |i|
+        # There is no year zero in the Julian calendar
+        raise ArgumentError, 'invalid year' if i.zero?
+      end
+    # AD century: the Nth century spans years (N-1)*100 + 1 through N*100.
+    # NOTE(review): the ordinal alternation only admits "1st", "2nd", "3rd"
+    # and "<digits>th", so e.g. "21st c." ends up in 'unexpected format'.
+    when /^\s*(1st|2nd|3rd|\d+th)\s+c\.\s*$/
+      # String#to_i reads the leading digits and ignores the ordinal suffix.
+      a = $1.to_i * 100
+      [a - 99, a]
+    # BC century: the Nth century BC spans years -N*100 through -N*100 + 99.
+    when /^\s*(1st|2nd|3rd|\d+th)\s+c\.\s+BC\s*$/
+      a = -$1.to_i * 100
+      [a, a + 99]
+    # Range of two single years; each side is parsed recursively by the
+    # single-year branch above.
+    when /^\s*(?:c\.\s+)?\d+(\s+BC)?\s*-\s*(c\.\s+)?\d+(\s+BC)?\s*$/
+      s.split('-').map { |i| self.parse(i) }.tap do |from, to|
+        raise ArgumentError, 'invalid range' unless from < to
+      end
+    else
+      raise ArgumentError, 'unexpected format'
+    end
+  end
+end

data/lib/proiel/dictionary.rb ADDED

@@ -0,0 +1,3 @@
+module PROIEL::Dictionary; end
+require 'proiel/dictionary/builder'

data/lib/proiel/dictionary/builder.rb ADDED

@@ -0,0 +1,201 @@
+#--
+# Copyright (c) 2016-2017 Marius L. Jøhndal
+#
+# See LICENSE in the top-level source directory for licensing terms.
+#++
+# Methods for synthesising and manipulating dictionaries from treebank data.
+module PROIEL::Dictionary
+  # Accumulates per-lemma statistics (distribution over sources, paradigm,
+  # homographs and valency frames) from the tokens of one or more sources and
+  # serialises the result as XML.
+  class Builder
+    # License shared by all added sources (nil until the first source is added).
+    attr_reader :license
+    # Language shared by all added sources (nil until the first source is added).
+    attr_reader :language
+    # Sources added so far, in the order they were added.
+    attr_reader :sources
+    # Hash of lemma data keyed by the string "lemma,part_of_speech".
+    attr_reader :lemmata
+    # Creates an empty builder.
+    def initialize
+      @language = nil
+      @license = nil
+      @sources = []
+      @lemmata = {}
+      # NOTE(review): @valency is not referenced anywhere else in this class;
+      # valency data is stored per lemma in @lemmata instead.
+      @valency = PROIEL::Valency::Lexicon.new
+    end
+    # Adds a source and indexes all of its tokens.
+    #
+    # The first source added fixes the language and license of the
+    # dictionary; later sources must match both.
+    #
+    # @param source [PROIEL::Source] source to add
+    #
+    # @raise [ArgumentError] if +source+ is not a PROIEL::Source or if its
+    #   language or license differs from those already recorded
+    def add_source!(source)
+      raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
+      raise ArgumentError, 'incompatible language' unless @language.nil? or @language == source.language
+      raise ArgumentError, 'incompatible license' unless @license.nil? or @license == source.license
+      @language ||= source.language
+      @license ||= source.license
+      @sources << source
+      source.tokens.each { |token| index_token!(token) }
+      # Homographs are recomputed over all lemmata after every added source.
+      index_homographs!
+    end
+    # Value written to the schema-version attribute of the root element.
+    CURRENT_SCHEMA_VERSION = '3.0'
+    # Serialises the dictionary as XML.
+    #
+    # @param io [#<<] target handed to Builder::XmlMarkup as its output
+    def to_xml(io)
+      builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
+      builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
+      # NOTE(review): DateTime#xmlschema needs the 'date' library to be
+      # loaded; confirm that something requires it before this is called.
+      builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
+        builder.dictionary(language: @language) do
+          builder.sources do
+            @sources.each do |source|
+              builder.source(id: source.id, license: source.license)
+            end
+          end
+          builder.lemmata(n: @lemmata.count) do
+            # Case-insensitive order on the "lemma,part_of_speech" key.
+            @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form, data|
+              lemma_to_xml(builder, form, data)
+            end
+          end
+        end
+      end
+    end
+    private
+    # Emits one lemma element with its nested sections.
+    #
+    # NOTE(review): +form+ is the @lemmata key, i.e. the joined
+    # "lemma,part_of_speech" string built in index_token!, not data[:lemma].
+    # Confirm that the form attribute is meant to carry the joined value.
+    def lemma_to_xml(builder, form, data)
+      builder.lemma(form: form, part_of_speech: data[:part_of_speech], n: data[:n]) do
+        distribution_to_xml(builder, data)
+        glosses_to_xml(builder, data)
+        homographs_to_xml(builder, data)
+        paradigm_to_xml(builder, data)
+        valency_to_xml(builder, data)
+      end
+    end
+    # Emits the number of occurrences of the lemma per source, ordered by
+    # source id.
+    def distribution_to_xml(builder, data)
+      builder.distribution do
+        data[:distribution].sort_by(&:first).each do |source_id, n|
+          builder.source(id: source_id, n: n)
+        end
+      end
+    end
+    # Emits the glosses element when glosses are present.
+    #
+    # NOTE(review): index_token! initialises :glosses to an empty hash and
+    # nothing in this class adds to it, so this currently emits nothing.
+    def glosses_to_xml(builder, data)
+      if data[:glosses].count > 0
+        builder.glosses do
+          # TODO
+        end
+      end
+    end
+    # Emits the keys of the other lemmata sharing this lemma's form, if any.
+    def homographs_to_xml(builder, data)
+      if data[:homographs].count > 0
+        builder.homographs do
+          data[:homographs].each do |homograph|
+            builder.lemma form: homograph
+          end
+        end
+      end
+    end
+    # Emits the paradigm: one slot1 element per morphology tag, containing
+    # one slot2 element per attested form with its occurrence count.
+    def paradigm_to_xml(builder, data)
+      unless data[:paradigm].empty?
+        builder.paradigm do
+          data[:paradigm].sort_by(&:first).each do |morphology, d|
+            builder.slot1 morphology: morphology do
+              d.sort_by(&:first).each do |form, n|
+                builder.slot2 form: form, n: n
+              end
+            end
+          end
+        end
+      end
+    end
+    # Emits the valency frames of the lemma, ordered by
+    # PROIEL::Valency::Obliqueness.sort_frames. Token ids are listed in two
+    # groups flagged 'a' and 'r' (see index_token! for how they are split).
+    def valency_to_xml(builder, data)
+      unless data[:valency].empty?
+        builder.valency do
+          # Reshape { frame => token ids } into the hashes sort_frames expects.
+          frames =
+            data[:valency].map do |arguments, token_ids|
+              { arguments: arguments, tokens: token_ids }
+            end
+          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
+            builder.frame do
+              builder.arguments do
+                frame[:arguments].each do |argument|
+                  # Each argument is a hash and becomes the element's attributes.
+                  builder.argument argument
+                end
+              end
+              if frame[:tokens][:a].count > 0
+                builder.tokens flags: 'a', n: frame[:tokens][:a].count do
+                  frame[:tokens][:a].each do |token_id|
+                    builder.token id: token_id
+                  end
+                end
+              end
+              if frame[:tokens][:r].count > 0
+                builder.tokens flags: 'r', n: frame[:tokens][:r].count do
+                  frame[:tokens][:r].each do |token_id|
+                    builder.token id: token_id
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+    # Records, for every lemma key, the other keys that share the same text
+    # before the first comma (same lemma, different part of speech).
+    #
+    # NOTE(review): a lemma that itself contains a comma would be grouped by
+    # its first fragment only; confirm that lemmata never contain commas.
+    def index_homographs!
+      @lemmata.keys.group_by { |l| l.split(',').first }.each do |m, homographs|
+        if homographs.count > 1
+          homographs.each do |form|
+            @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
+          end
+        end
+      end
+    end
+    # Adds one token to the statistics of its lemma. Tokens without a lemma
+    # or without a part of speech are ignored.
+    def index_token!(token)
+      if token.lemma and token.part_of_speech
+        encoded_lemma = [token.lemma, token.part_of_speech].join(',')
+        @lemmata[encoded_lemma] ||= {
+          lemma: token.lemma,
+          part_of_speech: token.part_of_speech,
+          distribution: {},
+          glosses: {},
+          homographs: [],
+          paradigm: {},
+          n: 0,
+          valency: {},
+        }
+        lemma = @lemmata[encoded_lemma]
+        # Occurrences per source.
+        lemma[:distribution][token.source.id] ||= 0
+        lemma[:distribution][token.source.id] += 1
+        # Occurrences per morphology tag and form.
+        lemma[:paradigm][token.morphology] ||= {}
+        lemma[:paradigm][token.morphology][token.form] ||= 0
+        lemma[:paradigm][token.morphology][token.form] += 1
+        lemma[:n] += 1
+        # Find verbal nodes
+        if token.part_of_speech[/^V/]
+          frame = PROIEL::Valency::Arguments.get_argument_frame(token)
+          lemma[:valency][frame] ||= { a: [], r: [] }
+          entry = lemma[:valency][frame]
+          # Tokens with an 'aux' dependent whose part of speech is 'Pk' go to
+          # :r, all others to :a (presumably reflexive vs. non-reflexive
+          # uses - confirm).
+          if token.dependents.any? { |d| d.relation == 'aux' and d.part_of_speech == 'Pk' }
+            entry[:r] << token.id
+          else
+            entry[:a] << token.id
+          end
+        end
+      end
+    end
+  end
+end

data/lib/proiel/div.rb CHANGED

@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2015-2016 Marius L. Jøhndal
+# Copyright (c) 2015-2017 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
@@ -135,5 +135,21 @@ module PROIEL
         end
       end
     end
+    # Returns the aligned div if any.
+    #
+    # @return [Div, NilClass] aligned div
+    #
+    # @param aligned_source [#treebank] source whose treebank is searched for
+    #   the div with this div's alignment ID (presumably a Source - confirm)
+    def alignment(aligned_source)
+      # No alignment ID means no lookup is attempted.
+      alignment_id ? aligned_source.treebank.find_div(alignment_id) : nil
+    end
+    # Returns inferred aligned divs if any.
+    #
+    # @return [Array<Div>] inferred aligned divs
+    #
+    # The result is derived from the sentences of this div: the inferred
+    # aligned sentences of each are collected and the distinct divs that
+    # contain them are returned.
+    #
+    # @param aligned_source [Object] passed on unchanged to
+    #   Sentence#inferred_alignment (presumably a Source - confirm)
+    def inferred_alignment(aligned_source)
+      sentences.map do |sentence|
+        sentence.inferred_alignment(aligned_source)
+      end.flatten.compact.map(&:div).uniq
+    end
   end
 end

data/lib/proiel/proiel_xml/validator.rb CHANGED

@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2015 Marius L. Jøhndal
+# Copyright (c) 2015-2017 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
@@ -16,9 +16,11 @@ module PROIEL
       # Creates a new validator for a PROIEL XML file.
       #
       # @param filename [String] name of PROIEL XML file to validate
+      # @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
       #
-      def initialize(filename)
+      def initialize(filename, aligned_filename = nil)
         @filename = filename
+        @aligned_filename = aligned_filename
         @errors = []
       end
@@ -154,6 +156,27 @@ module PROIEL
           end
         end
+        # Pass 5: if div is aligned, sentences and tokens within should belong
+        # to aligned div(s); if sentence aligned, tokens within should belong
+        # to aligned sentence(s). Skip if no alignment_id on source (see pass
+        # 4) or if aligned source not available.
+        if @aligned_filename
+          aligned_tb = PROIEL::Treebank.new
+          aligned_tb.load_from_xml(@aligned_filename)
+          tb.sources.each do |source|
+            if source.alignment_id
+              aligned_source = aligned_tb.find_source(source.alignment_id)
+              if aligned_source
+                check_alignment_integrity(errors, source, aligned_source)
+              else
+                errors << "Aligned source not available in treebank"
+              end
+            end
+          end
+        end
         # Decide if there were any errors
         if errors.empty?
           true
@@ -182,6 +205,52 @@ module PROIEL
           errors << "Token #{token.id}: #{attribute_name} is null"
         end
       end
+      # Checks that token, sentence and div alignments of a source agree.
+      #
+      # For every sentence the aligned tokens are looked up in the aligned
+      # source; the sentences (and in turn divs) that contain them form the
+      # inferred alignment, which is compared with the alignment ID declared
+      # on the sentence (or div) when there is one. Problems are reported by
+      # appending messages to +errors+.
+      #
+      # @param errors [Array<String>] list that error messages are appended to
+      # @param source [Object] source being validated (must respond to +divs+)
+      # @param aligned_source [Object] source that alignment IDs refer to
+      #   (must respond to +id+ and +treebank+)
+      def check_alignment_integrity(errors, source, aligned_source)
+        source.divs.each do |div|
+          # One array of inferred target sentences per sentence of this div.
+          target_sentences =
+            div.sentences.map do |sentence|
+              # Resolve every aligned token; unresolved ones become nil.
+              target_tokens =
+                sentence.tokens.select(&:alignment_id).map do |token|
+                  # Check that target token exists in aligned source
+                  aligned_token = aligned_source.treebank.find_token(token.alignment_id)
+                  if aligned_token
+                    aligned_token
+                  else
+                    errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
+                    nil
+                  end
+                end
+              inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq
+              if sentence.alignment_id
+                # Both sides are normalised to a sorted, comma-separated list.
+                # NOTE(review): a is sorted as strings while i is sorted on the
+                # id objects before joining; if ids are integers the two
+                # orders can differ for multi-id alignments - confirm.
+                a = sentence.alignment_id.to_s.split(',').sort.join(',')
+                i = inferred_target_sentences.map(&:id).sort.join(',')
+                # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
+                if a != i
+                  errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
+                end
+              end
+              inferred_target_sentences
+            end
+          inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq
+          if div.alignment_id
+            # Same comparison as for sentences, one level up.
+            a = div.alignment_id.to_s.split(',').sort.join(',')
+            i = inferred_target_divs.map(&:id).sort.join(',')
+            # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
+            if a != i
+              errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
+            end
+          end
+        end
+      end
     end
   end
 end

data/lib/proiel/sentence.rb CHANGED

@@ -116,7 +116,7 @@ module PROIEL
     # @return [String] the printable form of the sentence
     def printable_form(options = {})
       [presentation_before,
-       @children.map { |t| t.printable_form(options) },
+       @children.reject(&:is_empty?).map { |t| t.printable_form(options) },
        presentation_after].compact.join
     end
@@ -217,5 +217,21 @@ module PROIEL
     def tokens
       @children.to_enum
     end
+    # Returns the aligned sentence if any.
+    #
+    # @return [Sentence, NilClass] aligned sentence
+    #
+    # @param aligned_source [#treebank] source whose treebank is searched for
+    #   the sentence with this sentence's alignment ID (presumably a Source -
+    #   confirm)
+    def alignment(aligned_source)
+      # No alignment ID means no lookup is attempted.
+      alignment_id ? aligned_source.treebank.find_sentence(alignment_id) : nil
+    end
+    # Returns inferred aligned sentences if any.
+    #
+    # @return [Array<Sentence>] inferred aligned sentences
+    #
+    # The result is derived from the tokens of this sentence: every token
+    # with an alignment ID is resolved with Token#alignment and the distinct
+    # sentences containing the resolved tokens are returned. Tokens whose
+    # alignment cannot be resolved are dropped.
+    #
+    # @param aligned_source [Object] passed on unchanged to Token#alignment
+    #   (presumably a Source - confirm)
+    def inferred_alignment(aligned_source)
+      tokens.select(&:alignment_id).map do |token|
+        token.alignment(aligned_source)
+      end.flatten.compact.map(&:sentence).uniq
+    end
   end
 end

data/lib/proiel/token.rb CHANGED

@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2015-2016 Marius L. Jøhndal
+# Copyright (c) 2015-2017 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
@@ -160,12 +160,13 @@ module PROIEL
     # Returns the printable form of the token with any presentation data.
     #
     # @param custom_token_formatter [Lambda] formatting function for tokens
+    # which is passed the token as its sole argument
     #
     # @return [String] the printable form of the token
     def printable_form(custom_token_formatter: nil)
       printable_form =
         if custom_token_formatter
-          custom_token_formatter.call(id, form)
+          custom_token_formatter.call(self)
         else
           form
         end
@@ -393,6 +394,13 @@ module PROIEL
       common_ancestors(other_token, inclusive: inclusive).first
     end
+    # Returns the aligned token if any.
+    #
+    # @return [Token, NilClass] aligned token
+    #
+    # @param aligned_source [#treebank] source whose treebank is searched for
+    #   the token with this token's alignment ID (presumably a Source -
+    #   confirm)
+    def alignment(aligned_source)
+      # No alignment ID means no lookup is attempted.
+      alignment_id ? aligned_source.treebank.find_token(alignment_id) : nil
+    end
     private
     # FIXME: extract this from the header of the PROIEL XML file instead and

data/lib/proiel/valency.rb ADDED

@@ -0,0 +1,5 @@
+module PROIEL::Valency; end
+require 'proiel/valency/obliqueness'
+require 'proiel/valency/arguments'
+require 'proiel/valency/lexicon'

data/lib/proiel/valency/arguments.rb ADDED

@@ -0,0 +1,147 @@
+# Methods for deriving the argument frame of a predicate token from its
+# dependents.
+module PROIEL::Valency::Arguments
+  # Returns the argument frame of a token.
+  #
+  # The argument dependents of the token are collected, coordinated arguments
+  # are replaced by a representative conjunct where possible, and each
+  # argument is reduced to a hash holding its relation and the features
+  # picked by extract_features.
+  #
+  # @param token [Object] predicate token (must respond to +dependents+)
+  #
+  # @return [Array<Hash>] arguments ordered by
+  #   PROIEL::Valency::Obliqueness.sort_arguments
+  #
+  # @raise [RuntimeError] if a dependent has a relation that
+  #   collect_arguments does not know
+  def self.get_argument_frame(token)
+    arguments = collect_arguments(token)
+    hoisted_arguments = arguments.map { |a| hoist_dependents(a) }
+    a =
+      hoisted_arguments.map do |argument|
+        { relation: argument.relation }.merge(extract_features(argument))
+      end
+    PROIEL::Valency::Obliqueness.sort_arguments(a)
+  end
+  # NOTE(review): a bare `private` only affects instance methods defined
+  # after it; the `def self.` methods and the constant below stay publicly
+  # reachable. private_class_method / private_constant would be needed to
+  # actually hide them.
+  private
+  # Maps the major part of speech (or the empty token sort) of a dependent
+  # to the coarse class used when deciding whether conjuncts are equivalent.
+  POS_CLASSIFICATION = {
+    'R' => :functor,
+    'G' => :functor,
+    'N' => :nominal,
+    'P' => :nominal,
+    'A' => :nominal,
+    'M' => :nominal,
+    'V' => :verbal,
+  }
+  # Collapses dependents based on features
+  #
+  # Returns the first dependent when all dependents fall in the same coarse
+  # class and agree on the feature relevant for that class (lemma for
+  # functors, case for nominals, mood for verbals); returns nil otherwise,
+  # including when +dependents+ is empty.
+  def self.collapse_dependents(dependents)
+    # Hoist dependents if any of the dependents is a coordinator
+    dependents = dependents.map { |d| hoist_dependents(d) }
+    # Figure out if all dependents are equivalent for the purposes of
+    # argument frames. Typical examples would be coordinated, identical
+    # prepositions (which is operationalised as same lemma, same POS, no
+    # case) or coordinated nouns in the same case (which is operationalised
+    # as same major POS, same case). If we fail to figure out a way to
+    # hoist and reduce arguments, we keep the coordinator.
+    majors = dependents.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
+    # A single shared class, or nil when the classes are mixed or unknown.
+    majors = majors.length == 1 ? majors.first : nil
+    case majors
+    when :functor
+      lemmas = dependents.map(&:lemma).uniq
+      if lemmas.length == 1
+        dependents.first
+      else
+        #STDERR.puts "Different lemmas R/G: #{lemmas.inspect}"
+        nil
+      end
+    when :nominal
+      cases = dependents.map { |d| d.morphology_hash[:case] }.uniq
+      if cases.length == 1
+        dependents.first
+      else
+        #STDERR.puts "Different cases N/P: #{cases.inspect}"
+        nil
+      end
+    when :verbal
+      moods = dependents.map { |d| d.morphology_hash[:mood] }.uniq
+      if moods.length == 1
+        dependents.first
+      else
+        #STDERR.puts "Different moods V: #{moods.inspect}"
+        nil
+      end
+    else
+      #STDERR.puts "Unknown combination: #{dependents.map(&:pos).inspect}"
+      nil
+    end
+  end
+  # Hoists the real argument dependents from conjoined arguments
+  #
+  # Returns +argument+ itself unless it is a coordinator (part of speech
+  # 'C-' or empty token sort 'C') whose conjuncts can be collapsed.
+  def self.hoist_dependents(argument)
+    if argument.part_of_speech == 'C-' or argument.empty_token_sort == 'C'
+      # Pick dependents that have the same relation as the coordinator. This
+      # eliminates auxiliary elements like particles and repeated
+      # conjunctions as well as attributes that scope over all conjuncts.
+      dependents = argument.dependents.select { |d| d.relation == argument.relation }
+      collapse_dependents(dependents) || argument
+    else
+      argument
+    end
+  end
+  # Extracts morphosyntactic features that are relevant to the argument frame
+  #
+  # Returns a hash that may contain :lemma, :part_of_speech, :mood and :case
+  # depending on the major part of speech of the argument.
+  def self.extract_features(argument)
+    {}.tap do |features|
+      case argument.part_of_speech_hash[:major]
+      when 'G'
+        features[:lemma] = argument.lemma
+        features[:part_of_speech] = argument.part_of_speech
+        # There may be multiple dependents and dependents may be headed by
+        # coordinators. All relevant dependents have the relation PRED.
+        # (collapse_dependents hoists its input again, so the map below is
+        # applied twice in effect.)
+        dependents = argument.dependents.select { |d| d.relation == 'pred' }.map { |a| hoist_dependents(a) }
+        local_argument = collapse_dependents(dependents)
+        features[:mood] = local_argument.morphology_hash[:mood] if local_argument and local_argument.morphology_hash[:mood]
+      when 'R'
+        features[:lemma] = argument.lemma
+        features[:part_of_speech] = argument.part_of_speech
+        # There may be multiple dependents and dependents may be headed by
+        # coordinators. All relevant dependents have the relation OBL.
+        dependents = argument.dependents.select { |d| d.relation == 'obl' }.map { |a| hoist_dependents(a) }
+        local_argument = collapse_dependents(dependents)
+        features[:case] = local_argument.morphology_hash[:case] if local_argument and local_argument.morphology_hash[:case]
+      when 'V'
+        features[:mood] = argument.morphology_hash[:mood] if argument.morphology_hash[:mood]
+      when 'D'
+        features[:lemma] = argument.lemma
+        features[:part_of_speech] = argument.part_of_speech
+      when 'P'
+        features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
+        if argument.part_of_speech == 'Pk' # reflexive personal pronoun
+          features[:lemma] = argument.lemma
+          features[:part_of_speech] = argument.part_of_speech
+        end
+      else
+        # Every other major part of speech contributes its case only.
+        features[:case] = argument.morphology_hash[:case] if argument.morphology_hash[:case]
+      end
+    end
+  end
+  # Determines the arguments of a predicate
+  #
+  # Selects the dependents of +token+ whose relation counts as an argument.
+  # A relation that is not listed below (nil included) raises a RuntimeError.
+  def self.collect_arguments(token)
+    token.dependents.select do |dependent|
+      case dependent.relation
+      when 'obj', 'obl', 'xobj', 'comp', 'narg' # arguments
+        true
+      when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl' # non-arguments
+        false
+      when 'arg' # unspecific but always an argument
+        true
+      when 'adnom', 'nonsub', 'per' # unspecific and undetermined with respect to argumenthood
+        false
+      when 'rel' # unspecific but never an argument
+        false
+      when 'pred', 'parpred', 'voc' # shouldn't happen
+        false
+      when 'pid', 'xsub' # really shouldn't happen
+        false
+      else
+        raise "unknown relation #{dependent.relation.inspect}"
+      end
+    end
+  end
+end

data/lib/proiel/valency/lexicon.rb ADDED

@@ -0,0 +1,59 @@
+module PROIEL
+  module Valency
+    # A valency lexicon: argument frames of verbal tokens indexed by lemma
+    # and part of speech, with the IDs of the tokens attesting each frame.
+    class Lexicon
+      # Nested hash: lemma => part of speech => frame => { a: [...], r: [...] }
+      # where the arrays hold token IDs.
+      attr_reader :frames
+      # Creates an empty lexicon.
+      def initialize
+        @source_ids = Set.new
+        @source_languages = Set.new
+        @frames = {}
+      end
+      # Generates a valency lexicon from the provided sources. In practice the
+      # sources should be in the same language but this is not enforced. This
+      # makes it possible to generate a lexicon from sources in closely related
+      # languages or dialects.
+      #
+      # @param source [Object] source to index (must respond to +id+,
+      #   +language+ and +sentences+)
+      def add_source!(source)
+        @source_ids << source.id
+        @source_languages << source.language
+        source.sentences.each do |sentence|
+          tokens = find_verbal_nodes(sentence)
+          tokens.each do |token|
+            frame = PROIEL::Valency::Arguments.get_argument_frame(token)
+            # Tokens with an 'aux' dependent whose part of speech is 'Pk' are
+            # filed under :r, all others under :a (presumably reflexive vs.
+            # non-reflexive uses - confirm).
+            partition =
+              if token.dependents.any? { |d| d.relation == 'aux' and d.part_of_speech == 'Pk' }
+                :r
+              else
+                :a
+              end
+            # NOTE(review): find_verbal_nodes also returns tokens selected by
+            # empty_token_sort alone; confirm what lemma and part_of_speech
+            # are for those, since they are used as keys here.
+            @frames[token.lemma] ||= {}
+            @frames[token.lemma][token.part_of_speech] ||= {}
+            @frames[token.lemma][token.part_of_speech][frame] ||= { a: [], r: [] }
+            @frames[token.lemma][token.part_of_speech][frame][partition] << token.id
+          end
+        end
+      end
+      # Returns the frames recorded for a lemma and part of speech.
+      #
+      # NOTE(review): @frames has no default value, so an unknown lemma or
+      # part of speech ends in a NoMethodError on nil rather than an empty
+      # result.
+      #
+      # @param lemma [String] lemma to look up
+      # @param part_of_speech [String] part of speech to look up
+      #
+      # @return [Array<Hash>] hashes with the keys :arguments and :tokens,
+      #   ordered by PROIEL::Valency::Obliqueness.sort_frames
+      def lookup(lemma, part_of_speech)
+        frames =
+          @frames[lemma][part_of_speech].map do |arguments, token_ids|
+            { arguments: arguments, tokens: token_ids }
+          end
+        PROIEL::Valency::Obliqueness.sort_frames(frames)
+      end
+      private
+      # Find verbal nodes in a sentence
+      #
+      # A token qualifies when its part of speech starts with 'V' or when its
+      # empty token sort is 'V'.
+      def find_verbal_nodes(sentence)
+        sentence.tokens.select do |token|
+          # FIXME: is this test in the proiel library already?
+          (token.part_of_speech and token.part_of_speech[/^V/]) or token.empty_token_sort == 'V'
+        end
+      end
+    end
+  end
+end

data/lib/proiel/valency/obliqueness.rb ADDED

@@ -0,0 +1,31 @@
+# Methods for ordering arguments and argument frames by the obliqueness of
+# their relations.
+module PROIEL::Valency::Obliqueness
+  # Sorts frames by obliqueness
+  #
+  # @param frames [Array<Hash>] frames, each with an :arguments array
+  #
+  # @return [Array<Hash>] the frames in sorted order
+  def self.sort_frames(frames)
+    # Sort frames by obliqueness, then by inspecting them so that we get
+    # a stable, reproducible order.
+    frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
+  end
+  # Sorts arguments by obliqueness
+  #
+  # @param arguments [Array<Hash>] arguments, each with a :relation and
+  #   optionally a :lemma
+  #
+  # @return [Array<Hash>] the arguments in sorted order
+  def self.sort_arguments(arguments)
+    arguments.sort_by { |argument| obliqueness_of_argument(argument) }
+  end
+  # NOTE(review): a bare `private` only affects instance methods defined
+  # after it; the `def self.` methods and the constant below stay publicly
+  # reachable.
+  private
+  # Returns the obliqueness score of each argument, in the given order.
+  def self.obliqueness_of_arguments(arguments)
+    arguments.map do |argument|
+      obliqueness_of_argument(argument)
+    end
+  end
+  # Returns twice the rank of the argument's relation, plus one when the
+  # argument carries a lemma, so that within one relation arguments without
+  # a lemma sort before those with one.
+  def self.obliqueness_of_argument(argument)
+    obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
+  end
+  # Relations from least to most oblique; the position is the rank.
+  OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg)
+  # Returns the rank of a relation; relations not in the hierarchy rank
+  # after all listed ones.
+  def self.obliqueness_of_relation(relation)
+    OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
+  end
+end

data/lib/proiel/version.rb CHANGED

@@ -1,9 +1,9 @@
 #--
-# Copyright (c) 2015-2016 Marius L. Jøhndal
+# Copyright (c) 2015-2018 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
 module PROIEL
   # Gem version
-  VERSION = '1.2.0'
+  VERSION = '1.2.1'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: proiel
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.2.1
 platform: ruby
 authors:
 - Marius L. Jøhndal
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-31 00:00:00.000000000 Z
+date: 2018-01-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: json
@@ -30,28 +30,28 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.6.6
+        version: '1.8'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.6.6
+        version: '1.8'
 - !ruby/object:Gem::Dependency
   name: sax-machine
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.3.2
+        version: '1.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.3.2
+        version: '1.3'
 - !ruby/object:Gem::Dependency
   name: memoist
   requirement: !ruby/object:Gem::Requirement
@@ -66,48 +66,62 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.12'
+- !ruby/object:Gem::Dependency
+  name: builder
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.2'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.12'
+        version: '1.15'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.12'
+        version: '1.15'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '11.2'
+        version: '12.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '11.2'
+        version: '12.0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.2'
+        version: '3.6'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.2'
+        version: '3.6'
 - !ruby/object:Gem::Dependency
   name: pry
   requirement: !ruby/object:Gem::Requirement
@@ -128,14 +142,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.12'
+        version: '0.14'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.12'
+        version: '0.14'
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
@@ -164,7 +178,10 @@ files:
 - bin/setup
 - lib/proiel.rb
 - lib/proiel/annotation_schema.rb
+- lib/proiel/chronology.rb
 - lib/proiel/citations.rb
+- lib/proiel/dictionary.rb
+- lib/proiel/dictionary/builder.rb
 - lib/proiel/div.rb
 - lib/proiel/positional_tag.rb
 - lib/proiel/proiel_xml/proiel-1.0/proiel-1.0.xsd
@@ -183,6 +200,10 @@ files:
 - lib/proiel/treebank.rb
 - lib/proiel/treebank_object.rb
 - lib/proiel/utils.rb
+- lib/proiel/valency.rb
+- lib/proiel/valency/arguments.rb
+- lib/proiel/valency/lexicon.rb
+- lib/proiel/valency/obliqueness.rb
 - lib/proiel/version.rb
 - lib/proiel/visualization.rb
 - lib/proiel/visualization/graphviz.rb
@@ -201,7 +222,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.1'
+      version: '2.2'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -209,7 +230,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.1
+rubygems_version: 2.7.4
 signing_key:
 specification_version: 4
 summary: A library for working with treebanks using the PROIEL dependency format