proiel 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # A source object in a treebank.
  class Source < TreebankObject
    # @return [String] ID of the source
    attr_reader :id

    # @return [Treebank] treebank that the source belongs to
    attr_reader :treebank

    # @return [String] language of the source as an ISO 639-3 language tag
    attr_reader :language

    # @return [DateTime] export time for the source
    attr_reader :export_time

    # @return [Hash{Symbol, String}] metadata fields for the source
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    attr_reader :metadata

    # Creates a new source object.
    #
    # @param parent [Treebank] treebank that the source belongs to
    # @param id [String] ID of the source
    # @param export_time [String] export time in a format parsable by DateTime
    # @param language [String] language of the source as an ISO 639-3 tag
    # @param metadata [Hash{Symbol, String}] metadata fields for the source
    #
    # @yieldparam source [Source] the source object under construction
    # @yieldreturn [Array<Div>] children of the source
    def initialize(parent, id, export_time, language, metadata, &block)
      @treebank = parent
      @id = id.freeze
      @export_time = DateTime.parse(export_time).freeze
      @language = language.freeze
      @metadata = metadata.freeze

      # Default to an empty list of children when no block is given so that
      # divs, sentences, tokens and printable_form remain safe to call.
      @children = block_given? ? block.call(self) : []
    end

    # @return [String] a complete citation for the source
    def citation
      citation_part
    end

    # Returns the printable form of the source with all token forms and any
    # presentation data.
    #
    # @return [String] the printable form of the source
    def printable_form(options = {})
      @children.map { |d| d.printable_form(options) }.compact.join
    end

    # Accesses metadata fields.
    #
    # Metadata fields are exposed as reader methods named after the
    # metadata keys.
    #
    # @see PROIEL::Treebank::METADATA_ELEMENTS
    def method_missing(method_name, *args, &block)
      if @metadata.key?(method_name) && args.empty?
        @metadata[method_name]
      else
        super
      end
    end

    # Advertises the dynamically handled metadata fields so that
    # +respond_to?+ agrees with +method_missing+.
    def respond_to_missing?(method_name, include_private = false)
      @metadata.key?(method_name) || super
    end

    # Finds all divs in the source.
    #
    # @return [Enumerator] divs in the source
    def divs
      @children.to_enum
    end

    # Finds all sentences in the source.
    #
    # @return [Enumerator] sentences in the source
    #
    # @example Iterating sentences
    #   sentences.each { |s| puts s.id }
    #
    # @example Create an array with only reviewed sentences
    #   sentences.select(&:reviewed?)
    #
    # @example Counting sentences
    #   sentences.count #=> 200
    #
    def sentences
      Enumerator.new do |y|
        @children.each do |div|
          div.sentences.each do |sentence|
            y << sentence
          end
        end
      end
    end

    # Finds all tokens in the source.
    #
    # @return [Enumerator] tokens in the source
    #
    # @example Iterating tokens
    #   tokens.each { |t| puts t.id }
    #
    # @example Create an array with only empty tokens
    #   tokens.select(&:is_empty?)
    #
    # @example Counting tokens
    #   puts tokens.count #=> 200
    #
    def tokens
      Enumerator.new do |y|
        # Reuse the sentence traversal so the two enumerators cannot drift
        # out of sync.
        sentences.each do |sentence|
          sentence.tokens.each do |token|
            y << token
          end
        end
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Statistical utility functions.
  module Statistics
    # Computes the line of best fit using the least-squares method.
    #
    # @param x [Array<Number>] x-values
    # @param y [Array<Number>] y-values
    #
    # @return [Array(Float, Float)] y-intercept and slope
    #
    # @raise [ArgumentError] if +x+ and +y+ are not arrays of equal,
    #   non-zero length
    #
    # @example
    #   x = [8, 2, 11, 6, 5, 4, 12, 9, 6, 1]
    #   y = [3, 10, 3, 6, 8, 12, 1, 4, 9, 14]
    #   a, b = PROIEL::Statistics.least_squares(x, y)
    #   a # => 14.081081081081088
    #   b # => -1.1064189189189197
    #
    def self.least_squares(x, y)
      raise ArgumentError, 'array expected' unless x.is_a?(Array)
      raise ArgumentError, 'array expected' unless y.is_a?(Array)
      raise ArgumentError, 'array lengths differ' unless x.size == y.size
      # Guard against empty input, which would otherwise yield NaN (0.0/0).
      raise ArgumentError, 'arrays are empty' if x.empty?

      n = x.size
      x_mean = x.sum.to_f / n
      y_mean = y.sum.to_f / n

      # Corrected sum of products and corrected sum of squares.
      sxy = x.zip(y).sum { |xi, yi| xi * yi } - n * x_mean * y_mean
      sx2 = x.sum { |xi| xi**2 } - n * x_mean**2

      beta = sxy / sx2
      alpha = y_mean - beta * x_mean

      [alpha, beta]
    end
  end
end
@@ -0,0 +1,407 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # A token object in a treebank.
  class Token < TreebankObject
    extend Memoist

    # @return [Fixnum] ID of the token
    attr_reader :id

    # @return [Sentence] parent sentence object
    attr_accessor :sentence

    # @return [nil, Fixnum] ID of head token
    attr_reader :head_id

    # @return [nil, String] token form
    attr_reader :form

    # @return [nil, String] token lemma
    attr_reader :lemma

    # @return [nil, String] token part of speech tag
    attr_reader :part_of_speech

    # @return [nil, String] token part of speech tag
    alias :pos :part_of_speech

    # @return [nil, String] token morphological tag
    attr_reader :morphology

    # @return [nil, String] token relation tag
    attr_reader :relation

    # @return [nil, String] token empty token sort tag
    attr_reader :empty_token_sort

    # @return [nil, String] citation part
    attr_reader :citation_part

    # @return [nil, String] presentation material before form
    attr_reader :presentation_before

    # @return [nil, String] presentation material after form
    attr_reader :presentation_after

    # @return [nil, Fixnum] ID of antecedent token
    attr_reader :antecedent_id

    # @return [nil, String] information status tag
    attr_reader :information_status

    # @return [nil, String] contrast group tag
    attr_reader :contrast_group

    # @return [nil, String] free-form foreign IDs
    attr_reader :foreign_ids

    # @return [Array<Array<String,Fixnum>>] secondary edges as an array of
    #   pairs of relation tag and target token ID
    attr_reader :slashes

    # Creates a new token object.
    #
    # @param parent [Sentence] sentence that the token belongs to
    # @param slashes [Array] secondary edges as objects responding to
    #   +relation+ and +target_id+
    #
    # @raise [ArgumentError] if an attribute has an unexpected type
    def initialize(parent, id, head_id, form, lemma, part_of_speech,
                   morphology, relation, empty_token_sort, citation_part,
                   presentation_before, presentation_after, antecedent_id,
                   information_status, contrast_group, foreign_ids, slashes)
      @sentence = parent

      raise ArgumentError, 'integer expected' unless id.is_a?(Integer)
      @id = id

      @head_id = nil_or_integer!(head_id)
      @form = nil_or_string!(form)
      @lemma = nil_or_string!(lemma)
      @part_of_speech = nil_or_string!(part_of_speech)
      @morphology = nil_or_string!(morphology)
      @relation = nil_or_string!(relation)
      @empty_token_sort = nil_or_string!(empty_token_sort)
      @citation_part = nil_or_string!(citation_part)
      @presentation_before = nil_or_string!(presentation_before)
      @presentation_after = nil_or_string!(presentation_after)
      @antecedent_id = nil_or_integer!(antecedent_id)
      @information_status = nil_or_string!(information_status)
      @contrast_group = nil_or_string!(contrast_group)
      @foreign_ids = nil_or_string!(foreign_ids)

      raise ArgumentError, 'array expected' unless slashes.is_a?(Array)
      @slashes = slashes.map { |s| [s.relation.freeze, s.target_id] }
    end

    # @return [Div] parent div object
    def div
      @sentence.div
    end

    # @return [Source] parent source object
    def source
      @sentence.div.source
    end

    # @return [Treebank] parent treebank object
    def treebank
      @sentence.div.source.treebank
    end

    # @return [String] language of the token as an ISO 639-3 language tag
    def language
      source.language
    end

    memoize :language

    # @return [nil, String] a complete citation for the token, or +nil+ if
    #   the token has no citation part
    def citation
      return nil unless citation_part

      [source.citation_part, citation_part].compact.join(' ')
    end

    # Returns the printable form of the token with any presentation data.
    #
    # @param custom_token_formatter [Lambda] formatting function for tokens,
    #   called with the token ID and form
    #
    # @return [String] the printable form of the token
    def printable_form(custom_token_formatter: nil)
      formatted_form =
        if custom_token_formatter
          custom_token_formatter.call(id, form)
        else
          form
        end

      [presentation_before, formatted_form, presentation_after].compact.join
    end

    # @return [Hash<Symbol,String>] token part of speech tag as a hash
    def part_of_speech_hash
      if part_of_speech
        POS_POSITIONAL_TAG_SEQUENCE.zip(part_of_speech.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :part_of_speech_hash

    alias :pos_hash :part_of_speech_hash

    # Returns the part of speech tag if set, but also provides a suitable
    # part of speech tag for empty elements.
    #
    # @return [String] part of speech tag
    def part_of_speech_with_nulls
      part_of_speech || NULL_PARTS_OF_SPEECH[empty_token_sort]
    end

    alias :pos_with_nulls :part_of_speech_with_nulls

    # @return [Hash<Symbol,String>] token morphology tag as a hash
    def morphology_hash
      if morphology
        MORPHOLOGY_POSITIONAL_TAG_SEQUENCE.zip(morphology.split('')).reject { |_, v| v == '-' }.to_h
      else
        {}
      end
    end

    memoize :morphology_hash

    # Checks if the token is the root of its dependency graph.
    #
    # If the token belongs to a sentence that lacks dependency annotation,
    # all tokens are treated as roots. If a sentence has partial or complete
    # dependency annotation there may still be multiple root tokens.
    #
    # @return [true, false]
    def is_root?
      head_id.nil?
    end

    # Finds the head of this token.
    #
    # The head is the parent of this token in the tree that has tokens as
    # nodes and primary relations as edges.
    #
    # @return [nil, Token] head, or +nil+ if the token is a root
    def head
      treebank.find_token(head_id) unless is_root?
    end

    memoize :head

    alias :parent :head

    # Finds dependents of this token in the dependency graph.
    #
    # The dependents are the children of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned dependents is indeterminate.
    #
    # @return [Array<Token>] dependents
    def dependents
      @sentence.tokens.select { |t| t.head_id == @id }
    end

    memoize :dependents

    alias :children :dependents

    # Finds ancestors of this token in the dependency graph.
    #
    # The ancestors are the ancestors of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned ancestors is as follows: The first
    # ancestor is the head of this token, the next ancestor is
    # the head of the previous token, and so on.
    #
    # @return [Array<Token>] ancestors
    def ancestors
      is_root? ? [] : [head] + head.ancestors
    end

    memoize :ancestors

    # Finds descendents of this token in the dependency graph.
    #
    # The descendents are the descendents of this token in the tree that has
    # tokens as nodes and primary relations as edges.
    #
    # The order of the returned descendents is indeterminate.
    #
    # @return [Array<Token>] descendents
    def descendents
      dependents.flat_map { |dependent| [dependent] + dependent.descendents }
    end

    memoize :descendents

    alias :descendants :descendents

    # Tests if the token is empty.
    #
    # A token is empty if it does not have a form. If the token is empty,
    # {Token#empty_token_sort} explains its function.
    #
    # @see Token#has_content?
    #
    # @return [true, false]
    def is_empty?
      !empty_token_sort.nil?
    end

    # Tests if the token has content.
    #
    # A token has content if it has a form.
    #
    # @see Token#is_empty?
    #
    # @return [true, false]
    def has_content?
      empty_token_sort.nil?
    end

    # Tests if the token has a citation.
    #
    # A token has a citation if `citation_part` is not `nil`.
    #
    # @return [true, false]
    def has_citation?
      !citation_part.nil?
    end

    # Checks if the token is a PRO token.
    #
    # @return [true, false]
    def pro?
      empty_token_sort == 'P'
    end

    # Finds the common ancestors that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # Ancestors are returned in the same order as {Token#ancestors}.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.common_ancestors(y, inclusive: false) # => [z, u]
    #   x.common_ancestors(w, inclusive: false) # => [z, u]
    #   x.common_ancestors(x, inclusive: false) # => [w, z, u]
    #
    #   x.common_ancestors(y, inclusive: true) # => [z, u]
    #   x.common_ancestors(w, inclusive: true) # => [w, z, u]
    #   x.common_ancestors(x, inclusive: true) # => [x, w, z, u]
    #
    # @see Token#first_common_ancestor
    # @see Token#first_common_ancestor_path
    #
    # @return [Array<Token>] common ancestors
    def common_ancestors(other_token, inclusive: false)
      this_chain = inclusive ? [self] + ancestors : ancestors
      other_chain = inclusive ? [other_token] + other_token.ancestors : other_token.ancestors

      this_chain & other_chain
    end

    # Finds the first common ancestor that this token and another token
    # share in the dependency graph.
    #
    # If `inclusive` is `false`, a common ancestor is defined strictly
    # as a common ancestor of both tokens. If `inclusive` is `true`,
    # one of the tokens can be a common ancestor of the other.
    #
    # @example
    #   x.head # => w
    #   w.head # => z
    #   y.head # => z
    #   z.head # => u
    #
    #   x.first_common_ancestor(y, inclusive: false) # => z
    #   x.first_common_ancestor(w, inclusive: false) # => z
    #   x.first_common_ancestor(x, inclusive: false) # => w
    #
    #   x.first_common_ancestor(y, inclusive: true) # => z
    #   x.first_common_ancestor(w, inclusive: true) # => w
    #   x.first_common_ancestor(x, inclusive: true) # => x
    #
    # @see Token#common_ancestors
    # @see Token#first_common_ancestor_path
    #
    # @return [nil, Token] first common ancestor
    def first_common_ancestor(other_token, inclusive: false)
      common_ancestors(other_token, inclusive: inclusive).first
    end

    private

    # Validates that +value+ is +nil+ or an Integer and returns it.
    def nil_or_integer!(value)
      raise ArgumentError, 'integer or nil expected' unless value.nil? || value.is_a?(Integer)
      value
    end

    # Validates that +value+ is +nil+ or a String and returns it frozen.
    def nil_or_string!(value)
      raise ArgumentError, 'string or nil expected' unless value.nil? || value.is_a?(String)
      value.freeze
    end

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    POS_POSITIONAL_TAG_SEQUENCE = %i(major minor).freeze

    # FIXME: extract this from the header of the PROIEL XML file instead and
    # subclass PositionalTag
    MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
      person number tense mood voice gender case degree strength inflection
    ).freeze

    # Substitute part of speech tags for empty tokens, keyed by empty token
    # sort.
    NULL_PARTS_OF_SPEECH = {
      'V' => 'V-',
      'C' => 'C-',
      'P' => 'Pp',
    }.freeze
  end
end
@@ -0,0 +1,90 @@
1
+ #--
2
+ # Copyright (c) 2015 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # Language-specific tokenization support.
  module Tokenization
    # Compiled tokenization regexes keyed by language tag. A class instance
    # variable (not a class variable) initialised eagerly so that
    # +split_form+ is safe to call before any patterns have been loaded.
    @regexes = {}

    # Loads tokenization patterns from a configuration file.
    #
    # The configuration file should be a JSON file. The keys should
    # be language tags and the values tokenization patterns.
    #
    # The method can be called multiple times. On the first invocation
    # patterns will be loaded, on subsequent invocations patterns will
    # be updated. Only patterns for languages that are defined in the
    # configuration file will be updated, other patterns will remain unchanged.
    #
    # @param filename [String] name of tokenization pattern file
    #
    # @return [Hash] loaded patterns
    #
    def self.load_patterns(filename)
      raise ArgumentError, 'invalid filename' unless filename.is_a?(String)

      patterns = JSON.parse(File.read(filename))
      compiled = patterns.map { |language, pattern| [language, make_regex(pattern)] }.to_h

      @regexes.merge!(compiled)
    end

    # Makes a regular expression from a pattern given in the configuration file.
    #
    # The regular expression is anchored to avoid partial matches.
    # Multi-line matches are allowed in case characters that are interpreted
    # as line separators occur in the data.
    #
    # @param pattern [String] tokenization pattern
    #
    # @return [Regexp]
    #
    def self.make_regex(pattern)
      raise ArgumentError, 'invalid pattern' unless pattern.is_a?(String)

      Regexp.new("^#{pattern}$", Regexp::MULTILINE)
    end

    # Tests if a token form is splitable. Any form with more than one
    # character is splitable.
    #
    # @param form [String, nil] token form to test
    #
    # @return [true, false]
    #
    def self.is_splitable?(form)
      raise ArgumentError, 'invalid form' unless form.is_a?(String) || form.nil?

      !form.nil? && form.length > 1
    end

    # Splits a token form using the tokenization patterns that apply for
    # the specified language. Tokenization patterns must already have been
    # loaded for the language-specific patterns to take effect.
    #
    # @param language_tag [String] ISO 639-3 tag for the language whose patterns
    #   should be used to split the token form
    # @param form [String] token form to split
    #
    # @return [Array<String>]
    #
    def self.split_form(language_tag, form)
      raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
      raise ArgumentError, 'invalid form' unless form.is_a?(String)

      pattern = @regexes[language_tag]

      if form[/\W+/]
        # Split on any non-word character like a space or punctuation,
        # keeping the separators.
        form.split(/(\W+)/)
      elsif pattern && form[pattern]
        # Apply the language-specific pattern.
        form.match(pattern).captures
      elsif form == ''
        ['']
      else
        # Give up and split by character. The empty capture group
        # deliberately interleaves empty strings between the characters.
        form.split(/()/)
      end
    end
  end
end