RubyGems - proiel-cli - Versions diffs - 0.1.0 - Mend

proiel-cli 0.1.0

Files changed (34) hide show

checksums.yaml +7 -0
data/LICENSE +23 -0
data/README.md +34 -0
data/bin/proiel +27 -0
data/bin/setup +7 -0
data/contrib/proiel-giza-train +6 -0
data/contrib/proiel-lexc-compile +18 -0
data/contrib/proiel-maltparser-parse +2 -0
data/contrib/proiel-maltparser-train +6 -0
data/contrib/proiel-tnt-train +15 -0
data/examples/decision-tree.rb +41 -0
data/examples/dep-pos-cooccurrences.rb +84 -0
data/examples/lint-rules.rb +174 -0
data/examples/relation-as-disambiguator.rb +134 -0
data/examples/word-occurrences.rb +30 -0
data/lib/proiel/cli.rb +2 -0
data/lib/proiel/cli/commands.rb +28 -0
data/lib/proiel/cli/commands/convert.rb +94 -0
data/lib/proiel/cli/commands/grep.rb +136 -0
data/lib/proiel/cli/commands/info.rb +126 -0
data/lib/proiel/cli/commands/tokenize.rb +165 -0
data/lib/proiel/cli/commands/validate.rb +42 -0
data/lib/proiel/cli/converters/conll-u.rb +589 -0
data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
data/lib/proiel/cli/converters/conll-x.rb +66 -0
data/lib/proiel/cli/converters/lexc.rb +36 -0
data/lib/proiel/cli/converters/proielxml.rb +152 -0
data/lib/proiel/cli/converters/text.rb +99 -0
data/lib/proiel/cli/converters/tiger.rb +157 -0
data/lib/proiel/cli/converters/tiger2.rb +193 -0
data/lib/proiel/cli/converters/tnt.rb +30 -0
data/lib/proiel/cli/version.rb +5 -0
metadata +248 -0

data/examples/relation-as-disambiguator.rb ADDED Viewed

@@ -0,0 +1,134 @@
+#!/usr/bin/env ruby
+#
+# Does the dependency relation suffice to disambiguate ambiguous morphology?
+#
+require 'colorize'
+require 'proiel'
+if ARGV.length < 1
+  STDERR.puts "Usage: #{$0} treebank-files(s)"
+  exit 1
+end
+tb = PROIEL::Treebank.new
+tb.load_from_xml(ARGV)
+# Harvest morphology
+form_hash = {}
+tb.sources.reject { |source| source.language != language_tag }.each do |source|
+  source.tokens.each do |token|
+    next unless token.form and token.pos and token.morphology and token.relation
+    # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
+    relation_hash = (form_hash[token.form] ||= {})
+    pos_hash = (relation_hash[token.relation] ||= {})
+    morphology_hash = (pos_hash[token.pos] ||= {})
+    morphology_hash[token.morphology] ||= 0
+    morphology_hash[token.morphology] += 1
+  end
+end
+# Calculate by unique forms first
+unique_pos = 0
+relation_predicts_pos = 0
+relation_does_not_predict_pos = 0
+unique_morphology = 0
+relation_predicts_morphology = 0
+relation_does_not_predict_morphology = 0
+form_hash.each do |form, h|
+  number_of_poses = 0
+  number_of_morphologies = 0
+  h.each do |_, i|
+    i.each do |_, j|
+      number_of_poses += 1
+      j.each do |_, k|
+        number_of_morphologies += 1
+      end
+    end
+  end
+  if number_of_poses == 1
+    unique_pos += 1
+  elsif h.all? { |_, i| i.keys.count == 1 }
+    relation_predicts_pos += 1
+  else
+    relation_does_not_predict_pos += 1
+  end
+  if number_of_morphologies == 1
+    unique_morphology += 1
+  elsif h.all? { |_, i| i.all? { |_, j| j.keys.count == 1 } }
+    relation_predicts_morphology += 1
+  else
+    relation_does_not_predict_morphology += 1
+  end
+end
+puts "By unique forms (types)"
+puts "======================="
+puts "Forms with a unique POS:                               #{unique_pos}"
+puts "Forms whose relation predicts its POS:                 #{relation_predicts_pos}"
+puts "Forms whose relation does not predict its POS:         #{relation_does_not_predict_pos}"
+puts "Forms with a unique morphology:                        #{unique_morphology}"
+puts "Forms whose relation predicts its morphology:          #{relation_predicts_morphology}"
+puts "Forms whose relation does not predict its morphology:  #{relation_does_not_predict_morphology}"
+# Calculate by actual number of occurrences
+unique_pos = 0
+relation_predicts_pos = 0
+relation_does_not_predict_pos = 0
+unique_morphology = 0
+relation_predicts_morphology = 0
+relation_does_not_predict_morphology = 0
+form_hash.each do |form, h|
+  n = 0
+  number_of_poses = 0
+  number_of_morphologies = 0
+  h.each do |_, i|
+    i.each do |_, j|
+      number_of_poses += 1
+      j.each do |_, k|
+        number_of_morphologies += 1
+        n += k
+      end
+    end
+  end
+  if number_of_poses == 1
+    unique_pos += n
+  elsif h.all? { |_, i| i.keys.count == 1 }
+    relation_predicts_pos += n
+  else
+    relation_does_not_predict_pos += n
+  end
+  if number_of_morphologies == 1
+    unique_morphology += n
+  elsif h.all? { |_, i| i.all? { |_, j| j.keys.count == 1 } }
+    relation_predicts_morphology += n
+  else
+    relation_does_not_predict_morphology += n
+  end
+end
+puts
+puts "By occurrences of forms (tokens)"
+puts "================================"
+puts "Forms with a unique POS:                               #{unique_pos}"
+puts "Forms whose relation predicts its POS:                 #{relation_predicts_pos}"
+puts "Forms whose relation does not predict its POS:         #{relation_does_not_predict_pos}"
+puts "Forms with a unique morphology:                        #{unique_morphology}"
+puts "Forms whose relation predicts its morphology:          #{relation_predicts_morphology}"
+puts "Forms whose relation does not predict its morphology:  #{relation_does_not_predict_morphology}"
+exit 0

data/examples/word-occurrences.rb ADDED Viewed

@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+#
+# Word form occurrence extraction
+#
+require 'colorize'
+require 'proiel'
+if ARGV.length < 1
+  STDERR.puts "Usage: #{$0} treebank-files(s)"
+  exit 1
+end
+tb = PROIEL::Treebank.new
+tb.load_from_xml(ARGV)
+form_index = {}
+tb.sources.each do |source|
+  source.tokens.each do |token|
+    unless token.form.nil?
+      form_index[token.form] ||= []
+      form_index[token.form] << [source.id, token.id].join(':')
+    end
+  end
+end
+form_index.sort_by(&:first).each do |form, ids|
+  puts "#{form}: #{ids.join(', ')}"
+end

data/lib/proiel/cli.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require 'proiel/cli/version'
2	+ require 'proiel/cli/commands'

data/lib/proiel/cli/commands.rb ADDED Viewed

@@ -0,0 +1,28 @@
+require 'mercenary'
+require 'colorize'
+require 'proiel'
+module PROIEL
+  class Command
+    class << self
+      def subclasses
+        @subclasses ||= []
+      end
+      def inherited(base)
+        subclasses << base
+        super(base)
+      end
+    end
+  end
+end
+def require_all(path)
+  glob = File.join(File.dirname(__FILE__), path, '*.rb')
+  Dir[glob].each do |f|
+    require f
+  end
+end
+require_all 'commands'
+require_all 'converters'

data/lib/proiel/cli/commands/convert.rb ADDED Viewed

@@ -0,0 +1,94 @@
+module PROIEL
+  module Commands
+    class Convert < Command
+      class << self
+        def init_with_program(prog)
+          prog.command(:convert) do |c|
+            c.syntax 'convert format'
+            c.description 'Convert to a different format'
+            c.command(:proielxml) do |f|
+              f.syntax '[options] [filename(s)]'
+              f.description 'Convert to PROIEL XML format'
+              f.option 'remove-not-annotated', '--remove-not-annotated', 'Remove sentences that have not been annotated'
+              f.option 'remove-not-reviewed', '--remove-not-reviewed', 'Remove sentences that have not been reviewed'
+              f.option 'remove-morphology', '--remove-morphology', 'Remove morphological annotation (part of speech, morphology and lemma)'
+              f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
+              f.option 'remove-information-structure', '--remove-information-structure', 'Remove informtion structure annotation (antecedent ID, information status and contrast group)'
+              f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
+              f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
+              f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
+            end
+            c.command(:tnt) do |f|
+              f.syntax '[options] filename(s)'
+              f.description 'Convert to TNT/hunpos format'
+              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
+              f.action { |args, options| process(args, options, PROIEL::Converter::TNT) }
+            end
+            c.command(:"conll-x") do |f|
+              f.syntax 'filename(s)'
+              f.description 'Convert to CoNLL-X format'
+              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLX) }
+            end
+            c.command(:"conll-u") do |f|
+              f.syntax 'filename(s)'
+              f.description 'Convert to CoNLL-U format'
+              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLU) }
+            end
+            c.command(:tiger) do |f|
+              f.syntax 'filename(s)'
+              f.description 'Convert to TIGER XML format'
+              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger) }
+            end
+            c.command(:tiger2) do |f|
+              f.syntax 'filename(s)'
+              f.description 'Convert to TIGER2 format'
+              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger2) }
+            end
+            c.command(:text) do |f|
+              f.syntax 'filename(s)'
+              f.description 'Convert to plain text (UTF-8 with Unix line-endings)'
+              f.option 'diffable', '-d', '--diffable', 'Make the output diffable'
+              f.action { |args, options| process(args, options, PROIEL::Converter::Text) }
+            end
+            c.command(:lexc) do |f|
+              f.syntax '[options] filename(s)'
+              f.description 'Convert to lexc format'
+              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
+              f.action { |args, options| process(args, options, PROIEL::Converter::Lexc) }
+            end
+            c.action do |_, _|
+              STDERR.puts 'Missing or invalid format. Use --help for more information.'
+              exit 1
+            end
+          end
+        end
+        def process(args, options, converter)
+          tb = PROIEL::Treebank.new
+          if args.empty?
+            STDERR.puts "Reading from standard input...".green if options['verbose']
+            tb.load_from_xml(STDIN)
+          else
+            args.each do |filename|
+              STDERR.puts "Reading #{filename}...".green if options['verbose']
+              tb.load_from_xml(filename)
+            end
+          end
+          converter.process(tb, options)
+        end
+      end
+    end
+  end
+end

data/lib/proiel/cli/commands/grep.rb ADDED Viewed

@@ -0,0 +1,136 @@
+module PROIEL
+  module Commands
+    class Grep < Command
+      class << self
+        def init_with_program(prog)
+          prog.command(:grep) do |c|
+            c.syntax 'grep [options] pattern filename(s)'
+            c.description 'Search the text'
+            c.option 'level', '--level LEVEL', 'Select level to match. LEVEL should be "token" or "sentence" (default)'
+            c.option 'nostrip', '--nostrip', 'Do not strip whitespace from start and beginning of strings'
+            c.option 'nosubst', '--nosubst', 'Do not substitute whitespace sequences with a single space'
+            c.option 'nocolour', '--nocolour', 'Do not colour code matches'
+            c.option 'ignore-case', '-i', '--ignore-case', 'Ignore uppercase/lowercase'
+            c.action { |args, options| process(args, options) }
+          end
+        end
+        def match(s, citation, object_id, pattern, options)
+          s.strip! unless options['nostrip']
+          s.gsub!(/\s+/, ' ') unless options['nosubst']
+          if s[pattern]
+            s.gsub!(pattern) { |m| m.yellow } unless options['nocolour']
+            puts "#{citation} (ID = #{object_id}) #{s}"
+          end
+        end
+        def merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end = nil)
+          citation_lower =
+            if citation_lower_start == citation_lower_end
+              citation_lower_start
+            else
+              [citation_lower_start, citation_lower_end].compact.join('-')
+            end
+          [citation_upper, citation_lower].compact.join(' ')
+        end
+        def process_div_for_sentences(citation_upper, div, pattern, options)
+          div.sentences.each do |sentence|
+            s = sentence.presentation_before || ''
+            citation_lower_start = nil
+            citation_lower_end = nil
+            sentence.tokens.each do |token|
+              unless token.is_empty?
+                s += token.presentation_before || ''
+                s += token.form || ''
+                s += token.presentation_after || ''
+                citation_lower_start = token.citation_part if citation_lower_start.nil?
+                citation_lower_end = token.citation_part
+              end
+            end
+            s += sentence.presentation_after || ''
+            citation = merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end)
+            match(s, citation, sentence.id, pattern, options)
+          end
+        end
+        def process_div_for_tokens(citation_upper, div, pattern, options)
+          div.sentences.each do |sentence|
+            sentence.tokens.each do |token|
+              unless token.is_empty?
+                s = token.presentation_before || ''
+                s += token.form || ''
+                s += token.presentation_after || ''
+                citation = merge_citation_parts(citation_upper, token.citation_part)
+                match(s, citation, token.id, pattern, options)
+              end
+            end
+          end
+        end
+        def process_div(citation_upper, div, pattern, options)
+          case options['level']
+          when 'sentence'
+            process_div_for_sentences(citation_upper, div, pattern, options)
+          when 'token'
+            process_div_for_tokens(citation_upper, div, pattern, options)
+          end
+        end
+        def process(args, options)
+          if args.empty?
+            STDERR.puts 'Missing pattern. Use --help for more information.'
+            exit 1
+          end
+          pattern_string = args.shift
+          pattern =
+            if options['ignore-case']
+              Regexp.new(pattern_string, Regexp::IGNORECASE)
+            else
+              Regexp.new(pattern_string)
+            end
+          if args.empty?
+            STDERR.puts 'Missing filename(s). Use --help for more information.'
+            exit 1
+          end
+          options['level'] ||= 'sentence'
+          unless %w(token sentence).include?(options['level'])
+            STDERR.puts 'Invalid matching level. Use --help for more information.'
+            exit 1
+          end
+          tb = PROIEL::Treebank.new
+          args.each do |filename|
+            STDERR.puts "Reading #{filename}...".green if options['verbose']
+            tb.load_from_xml(filename)
+          end
+          tb.sources.each do |source|
+            citation_upper = source.citation_part
+            source.divs.each do |div|
+              process_div(citation_upper, div, pattern, options)
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/proiel/cli/commands/info.rb ADDED Viewed

@@ -0,0 +1,126 @@
+module PROIEL
+  module Commands
+    class CountWords < Command
+      class << self
+        def init_with_program(prog)
+          prog.command(:info) do |c|
+            c.syntax 'info [options] filename(s)'
+            c.description 'Show information about the treebank'
+            c.action do |args, options|
+              if args.empty?
+                STDERR.puts 'Missing filename(s). Use --help for more information.'
+              else
+                process(args, options)
+              end
+            end
+          end
+        end
+        def process(args, options)
+          tb = PROIEL::Treebank.new
+          args.each do |filename|
+            STDERR.puts "Reading #{filename}...".green if options['verbose']
+            tb.load_from_xml(filename)
+          end
+          t = treebank_statistics(tb)
+          puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
+          puts "   Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
+          puts
+          tb.sources.each_with_index do |source, i|
+            s = source_statistics(source)
+            n = s.sentence_count
+            r = s.reviewed_sentence_count * 100.0 / n
+            a = s.annotated_sentence_count * 100.0 / n
+            puts "#{i + 1}. #{pretty_title(source)}".yellow
+            puts "   Version:      #{source.date}"
+            puts "   License:      #{pretty_license(source)}"
+            puts "   Language:     #{pretty_language(source)}"
+            puts "   Printed text: #{pretty_printed_text_info(source)}"
+            puts "   Electr. text: #{pretty_electronic_text_info(source)}"
+            puts "   Size:         #{n} sentence(s), #{s.token_count} token(s)"
+            puts "   Annotation:   %.2f%% reviewed, %.2f%% annotated" % [r, a]
+          end
+        end
+        def treebank_statistics(tb)
+          OpenStruct.new.tap do |s|
+            s.sentence_count = 0
+            s.token_count = 0
+            s.annotated_sentence_count = 0
+            s.reviewed_sentence_count = 0
+            tb.sources.each do |source|
+              s.token_count += source_statistics(source).token_count
+              s.sentence_count += source_statistics(source).sentence_count
+              s.annotated_sentence_count += source_statistics(source).annotated_sentence_count
+              s.reviewed_sentence_count += source_statistics(source).reviewed_sentence_count
+            end
+          end
+        end
+        def source_statistics(source)
+          OpenStruct.new.tap do |s|
+            s.sentence_count = 0
+            s.token_count = 0
+            s.annotated_sentence_count = 0
+            s.reviewed_sentence_count = 0
+            source.divs.each do |div|
+              div.sentences.each do |sentence|
+                s.token_count += sentence.tokens.count
+              end
+              s.sentence_count += div.sentences.count
+              s.annotated_sentence_count += div.sentences.select(&:annotated?).count
+              s.reviewed_sentence_count += div.sentences.select(&:reviewed?).count
+            end
+          end
+        end
+        def pretty_language(source)
+          case source.language
+          when 'lat'
+            'Latin'
+          else
+            "Unknown (language code #{source.language})"
+          end
+        end
+        def pretty_printed_text_info(source)
+          [source.printed_text_title,
+           source.printed_text_editor ? "ed. #{source.printed_text_editor}" : nil,
+           source.printed_text_publisher,
+           source.printed_text_place,
+           source.printed_text_date].compact.join(', ')
+        end
+        def pretty_electronic_text_info(source)
+          [source.electronic_text_title,
+           source.electronic_text_editor ? "ed. #{source.electronic_text_editor}" : nil,
+           source.electronic_text_publisher,
+           source.electronic_text_place,
+           source.electronic_text_date].compact.join(', ')
+        end
+        def pretty_license(source)
+          if source.license_url
+            "#{source.license} (#{source.license_url})"
+          else
+            source.license
+          end
+        end
+        def pretty_title(source)
+          [source.author, source.title].compact.join(', ')
+        end
+      end
+    end
+  end
+end