proiel-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
| @@ -0,0 +1,134 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # Does the dependency relation suffice to disambiguate ambiguous morphology?
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            require 'colorize'
         | 
| 6 | 
            +
            require 'proiel'
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            if ARGV.length < 1
              STDERR.puts "Usage: #{$0} treebank-files(s)"

              exit 1
            end

            tb = PROIEL::Treebank.new
            tb.load_from_xml(ARGV)

            # Harvest morphology into a nested tally:
            #
            #   [language, form] => relation => POS => morphology => occurrence count
            #
            # The script takes no language argument, so every loaded source is
            # harvested. Forms are keyed together with the language of their source so
            # that identical spellings from different languages are counted separately.
            form_hash = {}

            tb.sources.each do |source|
              source.tokens.each do |token|
                # Only tokens carrying all four annotations can be compared.
                next unless token.form && token.pos && token.morphology && token.relation

                # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
                relation_hash = (form_hash[[source.language, token.form]] ||= {})
                pos_hash = (relation_hash[token.relation] ||= {})
                morphology_hash = (pos_hash[token.pos] ||= {})
                morphology_hash[token.morphology] ||= 0
                morphology_hash[token.morphology] += 1
              end
            end

            # Sorts every harvested form into one POS category and one morphology
            # category and returns the six counters as a Hash (missing keys read as 0).
            #
            # form_hash     - the nested tally built above.
            # by_occurrence - when true each form adds its total occurrence count to its
            #                 categories; when false each form adds 1.
            #
            # NOTE(review): "unique" is decided by counting (relation, POS) pairs and
            # (relation, POS, morphology) triples, so a form that has a single POS but
            # occurs with two relations is counted under "relation predicts", not under
            # "unique". Confirm that this is the intended reading.
            def tally_disambiguation(form_hash, by_occurrence)
              counts = Hash.new(0)

              form_hash.each_value do |relations|
                pos_hashes = relations.values                     # one POS hash per relation
                morphology_hashes = pos_hashes.flat_map(&:values) # one per (relation, POS)

                number_of_poses = pos_hashes.map(&:size).inject(0, :+)
                number_of_morphologies = morphology_hashes.map(&:size).inject(0, :+)

                weight =
                  if by_occurrence
                    morphology_hashes.flat_map(&:values).inject(0, :+)
                  else
                    1
                  end

                pos_category =
                  if number_of_poses == 1
                    :unique_pos
                  elsif pos_hashes.all? { |poses| poses.size == 1 }
                    :relation_predicts_pos
                  else
                    :relation_does_not_predict_pos
                  end

                morphology_category =
                  if number_of_morphologies == 1
                    :unique_morphology
                  elsif morphology_hashes.all? { |morphologies| morphologies.size == 1 }
                    :relation_predicts_morphology
                  else
                    :relation_does_not_predict_morphology
                  end

                counts[pos_category] += weight
                counts[morphology_category] += weight
              end

              counts
            end

            # Prints the six counters returned by tally_disambiguation.
            def print_disambiguation_counts(counts)
              puts "Forms with a unique POS:                               #{counts[:unique_pos]}"
              puts "Forms whose relation predicts its POS:                 #{counts[:relation_predicts_pos]}"
              puts "Forms whose relation does not predict its POS:         #{counts[:relation_does_not_predict_pos]}"

              puts "Forms with a unique morphology:                        #{counts[:unique_morphology]}"
              puts "Forms whose relation predicts its morphology:          #{counts[:relation_predicts_morphology]}"
              puts "Forms whose relation does not predict its morphology:  #{counts[:relation_does_not_predict_morphology]}"
            end

            # Calculate by unique forms first
            puts "By unique forms (types)"
            puts "======================="
            print_disambiguation_counts(tally_disambiguation(form_hash, false))

            # Calculate by actual number of occurrences
            puts
            puts "By occurrences of forms (tokens)"
            puts "================================"
            print_disambiguation_counts(tally_disambiguation(form_hash, true))

            exit 0
         | 
| @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # Word form occurrence extraction
         | 
| 4 | 
            +
            #
         | 
| 5 | 
            +
            require 'colorize'
         | 
| 6 | 
            +
            require 'proiel'
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # At least one treebank file is required.
            if ARGV.length < 1
              STDERR.puts "Usage: #{$0} treebank-files(s)"
              exit 1
            end

            treebank = PROIEL::Treebank.new
            treebank.load_from_xml(ARGV)

            # Maps each word form to the list of "source-id:token-id" strings of the
            # tokens that carry it.
            occurrences = {}

            treebank.sources.each do |source|
              source.tokens.each do |token|
                # Tokens without a form cannot be indexed.
                next if token.form.nil?

                locations = (occurrences[token.form] ||= [])
                locations << [source.id, token.id].join(':')
              end
            end

            # One line per form, ordered by form.
            occurrences.sort_by { |form, _| form }.each do |form, locations|
              puts "#{form}: #{locations.join(', ')}"
            end
         | 
    
        data/lib/proiel/cli.rb
    ADDED
    
    
| @@ -0,0 +1,28 @@ | |
| 1 | 
            +
            require 'mercenary'
         | 
| 2 | 
            +
            require 'colorize'
         | 
| 3 | 
            +
            require 'proiel'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module PROIEL
              # Base class for commands. Every class that inherits from a Command class
              # is recorded on its immediate parent, which exposes the list through
              # +subclasses+.
              class Command
                # Returns the classes that directly inherit from the receiver, in the
                # order they were defined. The list is kept per class.
                def self.subclasses
                  @subclasses ||= []
                end

                # Ruby hook invoked when +base+ inherits from the receiver; records
                # +base+ and then continues the normal hook chain.
                def self.inherited(base)
                  subclasses.push(base)
                  super
                end
              end
            end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            # Requires every +*.rb+ file located directly inside +path+, where +path+
            # is resolved relative to the directory of this file.
            #
            # path - directory name relative to this file's directory.
            #
            # Returns the list of matched file names in the order they were required.
            def require_all(path)
              glob = File.join(File.dirname(__FILE__), path, '*.rb')

              # Dir[] only returns sorted results from Ruby 3.0 onwards; sort
              # explicitly so that the load order is the same on every Ruby version
              # and file system.
              Dir[glob].sort.each do |f|
                require f
              end
            end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            require_all 'commands'
         | 
| 28 | 
            +
            require_all 'converters'
         | 
| @@ -0,0 +1,94 @@ | |
| 1 | 
            +
            module PROIEL
              module Commands
                # The +convert+ command. It has one subcommand per output format; each
                # subcommand loads the input treebank and hands it to the matching
                # PROIEL::Converter class.
                class Convert < Command
                  class << self
                    # Registers the +convert+ command and its format subcommands on
                    # +prog+ (presumably a Mercenary program or command; confirm against
                    # the caller).
                    def init_with_program(prog)
                      prog.command(:convert) do |c|
                        c.syntax 'convert format'
                        c.description 'Convert to a different format'

                        c.command(:proielxml) do |f|
                          f.syntax '[options] [filename(s)]'
                          f.description 'Convert to PROIEL XML format'
                          f.option 'remove-not-annotated', '--remove-not-annotated', 'Remove sentences that have not been annotated'
                          f.option 'remove-not-reviewed', '--remove-not-reviewed', 'Remove sentences that have not been reviewed'
                          f.option 'remove-morphology', '--remove-morphology', 'Remove morphological annotation (part of speech, morphology and lemma)'
                          f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
                          f.option 'remove-information-structure', '--remove-information-structure', 'Remove information structure annotation (antecedent ID, information status and contrast group)'
                          f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
                          f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
                          f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
                        end

                        c.command(:tnt) do |f|
                          f.syntax '[options] filename(s)'
                          f.description 'Convert to TNT/hunpos format'
                          f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
                          f.action { |args, options| process(args, options, PROIEL::Converter::TNT) }
                        end

                        c.command(:"conll-x") do |f|
                          f.syntax 'filename(s)'
                          f.description 'Convert to CoNLL-X format'
                          f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLX) }
                        end

                        c.command(:"conll-u") do |f|
                          f.syntax 'filename(s)'
                          f.description 'Convert to CoNLL-U format'
                          f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLU) }
                        end

                        c.command(:tiger) do |f|
                          f.syntax 'filename(s)'
                          f.description 'Convert to TIGER XML format'
                          f.action { |args, options| process(args, options, PROIEL::Converter::Tiger) }
                        end

                        c.command(:tiger2) do |f|
                          f.syntax 'filename(s)'
                          f.description 'Convert to TIGER2 format'
                          f.action { |args, options| process(args, options, PROIEL::Converter::Tiger2) }
                        end

                        c.command(:text) do |f|
                          f.syntax 'filename(s)'
                          f.description 'Convert to plain text (UTF-8 with Unix line-endings)'
                          f.option 'diffable', '-d', '--diffable', 'Make the output diffable'
                          f.action { |args, options| process(args, options, PROIEL::Converter::Text) }
                        end

                        c.command(:lexc) do |f|
                          f.syntax '[options] filename(s)'
                          f.description 'Convert to lexc format'
                          f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
                          f.action { |args, options| process(args, options, PROIEL::Converter::Lexc) }
                        end

                        # Reached when +convert+ is run without a recognised format
                        # subcommand.
                        c.action do |_, _|
                          STDERR.puts 'Missing or invalid format. Use --help for more information.'
                          exit 1
                        end
                      end
                    end

                    # Loads the input into a new treebank and runs +converter+ on it.
                    #
                    # args      - list of file names; when empty the treebank is read
                    #             from standard input.
                    # options   - option hash; 'verbose' enables progress messages on
                    #             standard error, and the whole hash is passed on to
                    #             the converter.
                    # converter - object responding to +process(treebank, options)+.
                    #
                    # Returns whatever +converter.process+ returns.
                    def process(args, options, converter)
                      tb = PROIEL::Treebank.new

                      if args.empty?
                        STDERR.puts "Reading from standard input...".green if options['verbose']
                        tb.load_from_xml(STDIN)
                      else
                        args.each do |filename|
                          STDERR.puts "Reading #{filename}...".green if options['verbose']

                          tb.load_from_xml(filename)
                        end
                      end

                      converter.process(tb, options)
                    end
                  end
                end
              end
            end
         | 
| @@ -0,0 +1,136 @@ | |
| 1 | 
            +
            module PROIEL
         | 
| 2 | 
            +
              module Commands
         | 
| 3 | 
            +
                class Grep < Command
         | 
| 4 | 
            +
                  class << self
         | 
| 5 | 
            +
                    # Registers the +grep+ command, its options and its action on
                    # +prog+ (presumably a Mercenary program or command; confirm against
                    # the caller). The action forwards the parsed arguments and options
                    # to +process+.
                    def init_with_program(prog)
                      prog.command(:grep) do |c|
                        c.syntax 'grep [options] pattern filename(s)'
                        c.description 'Search the text'

                        c.option 'level', '--level LEVEL', 'Select level to match. LEVEL should be "token" or "sentence" (default)'
                        c.option 'nostrip', '--nostrip', 'Do not strip whitespace from start and end of strings'
                        c.option 'nosubst', '--nosubst', 'Do not substitute whitespace sequences with a single space'
                        c.option 'nocolour', '--nocolour', 'Do not colour code matches'
                        c.option 'ignore-case', '-i', '--ignore-case', 'Ignore uppercase/lowercase'

                        c.action { |args, options| process(args, options) }
                      end
                    end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    # Prints one result line for +s+ if it matches +pattern+.
                    #
                    # s         - the text to test; it is never modified, so frozen and
                    #             shared strings are safe to pass.
                    # citation  - citation string printed at the start of the line.
                    # object_id - ID printed after the citation (this parameter shadows
                    #             Object#object_id inside the method).
                    # pattern   - pattern used both for testing and for highlighting.
                    # options   - option hash: 'nostrip' keeps leading and trailing
                    #             whitespace, 'nosubst' keeps whitespace runs as they
                    #             are, 'nocolour' disables highlighting of matches.
                    #
                    # Returns nil.
                    def match(s, citation, object_id, pattern, options)
                      # Normalise a copy; the non-destructive methods leave the
                      # caller's string untouched.
                      text = options['nostrip'] ? s : s.strip
                      text = text.gsub(/\s+/, ' ') unless options['nosubst']

                      return unless text[pattern]

                      text = text.gsub(pattern) { |m| m.yellow } unless options['nocolour']

                      puts "#{citation} (ID = #{object_id}) #{text}"
                    end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                    # Builds a citation string from an upper part and a lower part or
                    # range of lower parts.
                    #
                    # When the start and end of the lower range are equal the start is
                    # used on its own; otherwise the non-nil ends are joined with '-'.
                    # The upper and lower parts are then joined with a space, leaving
                    # out whichever is nil.
                    def merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end = nil)
                      lower = citation_lower_start

                      unless citation_lower_start == citation_lower_end
                        lower = [citation_lower_start, citation_lower_end].compact.join('-')
                      end

                      [citation_upper, lower].reject(&:nil?).join(' ')
                    end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                    # Matches +pattern+ against the text of every sentence in +div+.
                    #
                    # The text of a sentence is its presentation_before, then the
                    # presentation_before, form and presentation_after of each token for
                    # which is_empty? is false, then its presentation_after; nil pieces
                    # count as empty strings. The lower citation range runs from the
                    # first non-nil citation part among those tokens to the citation
                    # part of the last of them.
                    def process_div_for_sentences(citation_upper, div, pattern, options)
                      div.sentences.each do |sentence|
                        pieces = [sentence.presentation_before]
                        first_part = nil
                        last_part = nil

                        sentence.tokens.each do |token|
                          next if token.is_empty?

                          pieces.push(token.presentation_before, token.form, token.presentation_after)

                          first_part = token.citation_part if first_part.nil?
                          last_part = token.citation_part
                        end

                        pieces << sentence.presentation_after
                        text = pieces.map { |piece| piece || '' }.join

                        citation = merge_citation_parts(citation_upper, first_part, last_part)
                        match(text, citation, sentence.id, pattern, options)
                      end
                    end
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    # Matches +pattern+ against every token in +div+ for which
                    # is_empty? is false.
                    #
                    # The text of a token is its presentation_before, form and
                    # presentation_after, with nil pieces counting as empty strings. The
                    # citation combines +citation_upper+ with the token's citation part.
                    def process_div_for_tokens(citation_upper, div, pattern, options)
                      div.sentences.each do |sentence|
                        sentence.tokens.each do |token|
                          next if token.is_empty?

                          pieces = [token.presentation_before, token.form, token.presentation_after]
                          text = pieces.map { |piece| piece || '' }.join

                          citation = merge_citation_parts(citation_upper, token.citation_part)
                          match(text, citation, token.id, pattern, options)
                        end
                      end
                    end
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                    # Dispatches +div+ to the sentence-level or token-level matcher
                    # according to options['level']. Any other level does nothing and
                    # returns nil.
                    def process_div(citation_upper, div, pattern, options)
                      level = options['level']

                      if level == 'sentence'
                        process_div_for_sentences(citation_upper, div, pattern, options)
                      elsif level == 'token'
                        process_div_for_tokens(citation_upper, div, pattern, options)
                      end
                    end
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                    # Runs the +grep+ command.
                    #
                    # args    - the pattern followed by one or more file names. The
                    #           pattern is removed from the array.
                    # options - option hash. 'ignore-case' makes the pattern case
                    #           insensitive, 'level' selects "sentence" (the default,
                    #           written back into the hash) or "token", 'verbose' enables
                    #           progress messages on standard error; the hash is also
                    #           passed on to the matching methods.
                    #
                    # A missing pattern, an invalid pattern, missing file names or an
                    # invalid level print a message to standard error and exit with
                    # status 1.
                    def process(args, options)
                      if args.empty?
                        STDERR.puts 'Missing pattern. Use --help for more information.'
                        exit 1
                      end

                      pattern_string = args.shift

                      pattern =
                        begin
                          if options['ignore-case']
                            Regexp.new(pattern_string, Regexp::IGNORECASE)
                          else
                            Regexp.new(pattern_string)
                          end
                        rescue RegexpError => e
                          # Report a malformed pattern like every other usage error
                          # instead of letting the exception escape with a backtrace.
                          STDERR.puts "Invalid pattern (#{e.message}). Use --help for more information."
                          exit 1
                        end

                      if args.empty?
                        STDERR.puts 'Missing filename(s). Use --help for more information.'
                        exit 1
                      end

                      options['level'] ||= 'sentence'

                      unless %w(token sentence).include?(options['level'])
                        STDERR.puts 'Invalid matching level. Use --help for more information.'
                        exit 1
                      end

                      tb = PROIEL::Treebank.new

                      args.each do |filename|
                        STDERR.puts "Reading #{filename}...".green if options['verbose']

                        tb.load_from_xml(filename)
                      end

                      tb.sources.each do |source|
                        citation_upper = source.citation_part

                        source.divs.each do |div|
                          process_div(citation_upper, div, pattern, options)
                        end
                      end
                    end
         | 
| 133 | 
            +
                  end
         | 
| 134 | 
            +
                end
         | 
| 135 | 
            +
              end
         | 
| 136 | 
            +
            end
         | 
| @@ -0,0 +1,126 @@ | |
| 1 | 
            +
            module PROIEL
         | 
| 2 | 
            +
              module Commands
         | 
| 3 | 
            +
                class CountWords < Command
         | 
| 4 | 
            +
                  class << self
         | 
| 5 | 
            +
                    # Registers the `info` subcommand on +prog+.
                    #
                    # +prog+ must respond to `command`, yielding a command
                    # object that accepts `syntax`, `description` and `action`.
                    # The registered action forwards its arguments and options
                    # to process.
                    #
                    # When no filenames are given the action prints a usage
                    # error on STDERR and terminates with exit status 1, so the
                    # failure is visible to the calling shell just as it is for
                    # the grep command's usage errors.
                    def init_with_program(prog)
                      prog.command(:info) do |c|
                        c.syntax 'info [options] filename(s)'
                        c.description 'Show information about the treebank'

                        c.action do |args, options|
                          if args.empty?
                            STDERR.puts 'Missing filename(s). Use --help for more information.'
                            exit 1
                          end

                          process(args, options)
                        end
                      end
                    end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                    # Loads the given treebank files and prints a summary.
                    #
                    # Every filename in +args+ is loaded into a single
                    # PROIEL::Treebank. The output starts with the number of
                    # sources and the overall sentence/token counts, followed by
                    # one numbered entry per source with its metadata, size and
                    # the percentage of reviewed and annotated sentences.
                    #
                    # When options['verbose'] is set, each filename is reported
                    # on STDERR before it is read.
                    def process(args, options)
                      tb = PROIEL::Treebank.new

                      args.each do |filename|
                        STDERR.puts "Reading #{filename}...".green if options['verbose']

                        tb.load_from_xml(filename)
                      end

                      t = treebank_statistics(tb)

                      puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
                      puts "   Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
                      puts

                      tb.sources.each_with_index do |source, i|
                        s = source_statistics(source)
                        n = s.sentence_count

                        # A source without sentences would otherwise compute
                        # 0.0 / 0 and print "NaN%"; report 0% instead.
                        percent = ->(count) { n.zero? ? 0.0 : count * 100.0 / n }
                        r = percent.call(s.reviewed_sentence_count)
                        a = percent.call(s.annotated_sentence_count)

                        puts "#{i + 1}. #{pretty_title(source)}".yellow
                        puts "   Version:      #{source.date}"
                        puts "   License:      #{pretty_license(source)}"
                        puts "   Language:     #{pretty_language(source)}"
                        puts "   Printed text: #{pretty_printed_text_info(source)}"
                        puts "   Electr. text: #{pretty_electronic_text_info(source)}"
                        puts "   Size:         #{n} sentence(s), #{s.token_count} token(s)"
                        puts format('   Annotation:   %.2f%% reviewed, %.2f%% annotated', r, a)
                      end
                    end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                    # Sums the per-source statistics over every source in +tb+.
                    #
                    # Returns an OpenStruct with sentence_count, token_count,
                    # annotated_sentence_count and reviewed_sentence_count, all
                    # zero when the treebank has no sources.
                    def treebank_statistics(tb)
                      OpenStruct.new.tap do |s|
                        s.sentence_count = 0
                        s.token_count = 0
                        s.annotated_sentence_count = 0
                        s.reviewed_sentence_count = 0

                        tb.sources.each do |source|
                          # source_statistics walks every div and sentence of
                          # the source, so compute it once and reuse it.
                          stats = source_statistics(source)

                          s.token_count += stats.token_count
                          s.sentence_count += stats.sentence_count
                          s.annotated_sentence_count += stats.annotated_sentence_count
                          s.reviewed_sentence_count += stats.reviewed_sentence_count
                        end
                      end
                    end
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                    # Counts sentences and tokens in a single source.
                    #
                    # Walks every div of +source+ and returns an OpenStruct with
                    # sentence_count, token_count, annotated_sentence_count
                    # (sentences answering true to annotated?) and
                    # reviewed_sentence_count (sentences answering true to
                    # reviewed?).
                    def source_statistics(source)
                      totals = OpenStruct.new(sentence_count: 0,
                                              token_count: 0,
                                              annotated_sentence_count: 0,
                                              reviewed_sentence_count: 0)

                      source.divs.each do |div|
                        div.sentences.each { |sentence| totals.token_count += sentence.tokens.count }

                        totals.sentence_count += div.sentences.count
                        totals.annotated_sentence_count += div.sentences.count(&:annotated?)
                        totals.reviewed_sentence_count += div.sentences.count(&:reviewed?)
                      end

                      totals
                    end
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                    # Returns a display name for the source's language code.
                    #
                    # Only 'lat' is recognised (as 'Latin'); every other code is
                    # reported as unknown with the raw code included.
                    def pretty_language(source)
                      return 'Latin' if source.language == 'lat'

                      "Unknown (language code #{source.language})"
                    end
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                    # Builds a one-line description of the printed edition.
                    #
                    # Joins title, editor (prefixed with "ed. "), publisher,
                    # place and date with ", ", leaving out any that are nil.
                    def pretty_printed_text_info(source)
                      editor = source.printed_text_editor

                      parts = [source.printed_text_title]
                      parts << "ed. #{editor}" if editor
                      parts << source.printed_text_publisher
                      parts << source.printed_text_place
                      parts << source.printed_text_date

                      parts.compact.join(', ')
                    end
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                    # Builds a one-line description of the electronic edition.
                    #
                    # Joins title, editor (prefixed with "ed. "), publisher,
                    # place and date with ", ", leaving out any that are nil.
                    def pretty_electronic_text_info(source)
                      editor = source.electronic_text_editor

                      parts = [source.electronic_text_title]
                      parts << "ed. #{editor}" if editor
                      parts << source.electronic_text_publisher
                      parts << source.electronic_text_place
                      parts << source.electronic_text_date

                      parts.compact.join(', ')
                    end
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                    # Returns the source's license, followed by its URL in
                    # parentheses when a license URL is present.
                    def pretty_license(source)
                      return source.license unless source.license_url

                      "#{source.license} (#{source.license_url})"
                    end
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                    # Returns "author, title" for the source, leaving out
                    # whichever of the two is nil.
                    def pretty_title(source)
                      author = source.author
                      title = source.title

                      [author, title].reject(&:nil?).join(', ')
                    end
         | 
| 123 | 
            +
                  end
         | 
| 124 | 
            +
                end
         | 
| 125 | 
            +
              end
         | 
| 126 | 
            +
            end
         |