proiel-cli 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,64 +1,79 @@
1
- module PROIEL
2
- module Converter
3
- # This converts to the CoNLL-X format as described on http://ilk.uvt.nl/conll/#dataformat.
4
- class CoNLLX
5
- class << self
6
- def process(tb, options)
7
- tb.sources.each do |source|
8
- source.divs.each do |div|
9
- div.sentences.each do |sentence|
10
- id_to_number = {}
11
-
12
- # Do not care about prodrop tokens
13
- tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }
14
-
15
- # Renumber to make the sequence continguous after prodrop tokens where left out
16
- tk.map(&:id).each_with_index.each do |id, i|
17
- id_to_number[id] = i + 1
18
- end
1
+ module PROIEL::Converter
2
+ # Converter that outputs the CoNLL-X format as described on
3
+ # http://ilk.uvt.nl/conll/#dataformat.
4
+ #
5
+ # The conversion removes empty tokens. PRO tokens are completely ignored,
6
+ # while null C and null V tokens are eliminated by attaching their
7
+ # dependents to the first non-null ancestor and labelling them with a
8
+ # concatenation of dependency relations.
9
+ #
10
+ # Sequences of whitespace in forms and lemmas are represented by '.'.
11
+ class CoNLLX
12
+ class << self
13
+ def process(tb, _)
14
+ tb.sources.each do |source|
15
+ source.sentences.each do |sentence|
16
+ process_sentence(tb, sentence)
17
+ end
18
+ end
19
+ end
19
20
 
20
- id_to_token = tk.inject({}) { |h, t| h.merge({t.id => t}) }
21
+ def process_sentence(tb, sentence)
22
+ tokens = sentence.tokens
21
23
 
22
- tk.each do |token|
23
- unless token.is_empty?
24
- this_number = id_to_number[token.id]
25
- head_number, relation = find_lexical_head_and_relation(id_to_number, id_to_token, token)
26
- form = token.form.gsub(/[[:space:]]/, '.')
27
- lemma = token.lemma.gsub(/[[:space:]]/, '.')
28
- pos_major = token.part_of_speech_hash[:major]
29
- pos_full = token.part_of_speech
30
- morphology = format_morphology(token)
24
+ # Generate 1-based contiguous numbering of overt tokens with
25
+ # null V and null C tokens appended at the end. We do this
26
+ # manually to ensure that the numbering is correct whatever the
27
+ # sequence is in the treebank.
28
+ id_map = Hash.new { |h, k| h[k] = h.keys.length + 1 }
29
+ tokens.select(&:has_content?).each { |t| id_map[t] } # these blocks have side-effects
30
+ tokens.reject(&:has_content?).reject(&:pro?).each { |t| id_map[t] }
31
31
 
32
- puts [this_number, form, lemma, pos_major, pos_full,
33
- morphology, head_number, relation, "_", "_"].join("\t")
34
- end
35
- end
32
+ # Iterate overt tokens and print one formatted line per token.
33
+ tokens.select(&:has_content?).each do |token|
34
+ this_number = id_map[token]
35
+ head_number, relation = find_lexical_head_and_relation(id_map, tb, token)
36
+ form = format_text(token.form)
37
+ lemma = format_text(token.lemma)
38
+ pos_major, pos_full = format_pos(token)
39
+ morphology = format_morphology(token)
36
40
 
37
- puts
38
- end
39
- end
40
- end
41
+ puts [this_number, form, lemma, pos_major, pos_full,
42
+ morphology, head_number, relation, '_', '_'].join("\t")
41
43
  end
42
44
 
43
- def format_morphology(token)
44
- token.morphology_hash.map do |k, v|
45
- # Remove inflection tag unless when set to inflecting
46
- if k == :inflection and v =='i'
47
- nil
48
- else
49
- "#{k.upcase[0..3]}#{v}"
50
- end
51
- end.compact.join('|')
52
- end
45
+ # Separate sentences by an empty line.
46
+ puts
47
+ end
53
48
 
54
- def find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '')
55
- if t.is_root?
56
- [0, rel + t.relation] # FIXME: may be empty token anyway
57
- elsif id_to_token[t.head_id].has_content?
58
- [id_to_number[t.head_id], rel + t.relation]
49
+ def format_text(s)
50
+ s.gsub(/[[:space:]]+/, '.')
51
+ end
52
+
53
+ def format_pos(token)
54
+ [token.part_of_speech_hash[:major], token.part_of_speech]
55
+ end
56
+
57
+ def format_morphology(token)
58
+ token.morphology_hash.map do |k, v|
59
+ # Remove inflection tag except when set to inflecting
60
+ if k == :inflection and v =='i'
61
+ nil
59
62
  else
60
- find_lexical_head_and_relation(id_to_number, id_to_token, id_to_token[t.head_id], rel + "#{t.relation}(#{id_to_number[t.head_id]})")
63
+ "#{k.upcase[0..3]}#{v}"
61
64
  end
65
+ end.compact.join('|')
66
+ end
67
+
68
+ def find_lexical_head_and_relation(id_map, tb, t, rel = '')
69
+ new_relation = rel + t.relation
70
+
71
+ if t.is_root?
72
+ [0, new_relation]
73
+ elsif t.head.has_content?
74
+ [id_map[t.head], new_relation]
75
+ else
76
+ find_lexical_head_and_relation(id_map, tb, t.head, "#{new_relation}(#{id_map[t.head]})")
62
77
  end
63
78
  end
64
79
  end
@@ -1,33 +1,31 @@
1
- module PROIEL
2
- module Converter
3
- # This converts part of speech and morphology to a lexc file.
4
- class Lexc
5
- class << self
6
- def process(tb, options)
7
- lexicon = {}
1
+ module PROIEL::Converter
2
+ # Converter that outputs a lexc file with part of speech and morphology.
3
+ class Lexc
4
+ class << self
5
+ def process(tb, options)
6
+ lexicon = {}
8
7
 
9
- tb.sources.each do |source|
10
- source.divs.each do |div|
11
- div.sentences.each do |sentence|
12
- sentence.tokens.each do |token|
13
- unless token.is_empty?
14
- lexicon[token.form] ||= []
15
- if options['morphology']
16
- lexicon[token.form] << [token.lemma, [token.part_of_speech, token.morphology].join].join(',')
17
- else
18
- lexicon[token.form] << [token.lemma, token.part_of_speech].join(',')
19
- end
8
+ tb.sources.each do |source|
9
+ source.divs.each do |div|
10
+ div.sentences.each do |sentence|
11
+ sentence.tokens.each do |token|
12
+ unless token.is_empty?
13
+ lexicon[token.form] ||= []
14
+ if options['morphology']
15
+ lexicon[token.form] << [token.lemma, [token.part_of_speech, token.morphology].join].join(',')
16
+ else
17
+ lexicon[token.form] << [token.lemma, token.part_of_speech].join(',')
20
18
  end
21
19
  end
22
20
  end
23
21
  end
24
22
  end
23
+ end
25
24
 
26
- puts "LEXICON Root"
27
- lexicon.sort.each do |form, tags|
28
- tags.sort.uniq.each do |tag|
29
- puts " %s:%s #;" % [tag, form]
30
- end
25
+ puts 'LEXICON Root'
26
+ lexicon.sort.each do |form, tags|
27
+ tags.sort.uniq.each do |tag|
28
+ puts ' %s:%s #;' % [tag, form]
31
29
  end
32
30
  end
33
31
  end
@@ -1,168 +1,209 @@
1
- module PROIEL
2
- module Converter
3
- class PROIELXML
4
- class << self
5
- def process(tb, options)
6
- builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
7
- builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
8
- builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
9
- builder.annotation do
10
- builder.relations do
11
- tb.annotation_schema.relation_tags.each do |tag, value|
12
- attrs = { tag: tag }
13
- attrs.merge!(grab_features(value, %i(summary primary secondary)))
14
- builder.value(attrs)
15
- end
1
+ module PROIEL::Converter
2
+ # Converter that outputs PROIEL XML. This is primarily useful for filtering,
3
+ # merging or splitting PROIEL XML data. It is also useful for "upgrading"
4
+ # PROIEL XML to a new version or for testing round tripping of data.
5
+ class PROIELXML
6
+ class << self
7
+ def process(tb, options)
8
+ builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
9
+ builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
10
+ builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.1') do
11
+ builder.annotation do
12
+ builder.relations do
13
+ tb.annotation_schema.relation_tags.each do |tag, value|
14
+ attrs = { tag: tag }
15
+ attrs.merge!(grab_features(value, %i(summary primary secondary)))
16
+ builder.value(attrs)
16
17
  end
18
+ end
17
19
 
18
- builder.tag! 'parts-of-speech' do
19
- tb.annotation_schema.part_of_speech_tags.each do |tag, value|
20
- attrs = { tag: tag }
21
- attrs.merge!(grab_features(value, %i(summary)))
22
- builder.value(attrs)
23
- end
20
+ builder.tag! 'parts-of-speech' do
21
+ tb.annotation_schema.part_of_speech_tags.each do |tag, value|
22
+ attrs = { tag: tag }
23
+ attrs.merge!(grab_features(value, %i(summary)))
24
+ builder.value(attrs)
24
25
  end
26
+ end
25
27
 
26
- builder.morphology do
27
- tb.annotation_schema.morphology_tags.each do |cat_tag, cat_values|
28
- builder.field(tag: cat_tag) do
29
- cat_values.each do |tag, value|
30
- attrs = { tag: tag }
31
- attrs.merge!(grab_features(value, %i(summary)))
32
- builder.value(attrs)
33
- end
28
+ builder.morphology do
29
+ tb.annotation_schema.morphology_tags.each do |cat_tag, cat_values|
30
+ builder.field(tag: cat_tag) do
31
+ cat_values.each do |tag, value|
32
+ attrs = { tag: tag }
33
+ attrs.merge!(grab_features(value, %i(summary)))
34
+ builder.value(attrs)
34
35
  end
35
36
  end
36
37
  end
38
+ end
37
39
 
38
- builder.tag! 'information-statuses' do
39
- tb.annotation_schema.information_status_tags.each do |tag, value|
40
- attrs = { tag: tag }
41
- attrs.merge!(grab_features(value, %i(summary)))
42
- builder.value(attrs)
43
- end
40
+ builder.tag! 'information-statuses' do
41
+ tb.annotation_schema.information_status_tags.each do |tag, value|
42
+ attrs = { tag: tag }
43
+ attrs.merge!(grab_features(value, %i(summary)))
44
+ builder.value(attrs)
44
45
  end
45
46
  end
47
+ end
46
48
 
47
- tb.sources.each do |source|
48
- mandatory_features = %i(id language)
49
- optional_features = []
50
- optional_features += %i(alignment_id) unless options['remove-alignments']
49
+ tb.sources.each do |source|
50
+ next if options['remove-unaligned-sources'] and source.alignment_id.nil?
51
51
 
52
- builder.source(grab_features(source, mandatory_features, optional_features)) do
53
- PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
54
- builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
55
- end
52
+ mandatory_features = %i(id language)
53
+ optional_features = []
54
+ optional_features += %i(alignment_id) unless options['remove-alignments']
56
55
 
57
- source.divs.each do |div|
58
- if include_div?(div, options)
59
- mandatory_features = %i()
60
-
61
- optional_features = []
62
- optional_features += %i(presentation_before presentation_after)
63
- optional_features += %i(alignment_id) unless options['remove-alignments']
64
-
65
- builder.div(grab_features(div, mandatory_features, optional_features)) do
66
- builder.title div.title if div.title
67
-
68
- div.sentences.each do |sentence|
69
- if include_sentence?(sentence, options)
70
- mandatory_features = %i(id)
71
-
72
- optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
73
- optional_features += %i(status) unless options['remove-status']
74
- optional_features += %i(presentation_before presentation_after)
75
- optional_features += %i(alignment_id) unless options['remove-alignments']
76
- optional_features += %i(annotated_at) unless options['remove-annotator']
77
- optional_features += %i(reviewed_at) unless options['remove-reviewer']
78
- optional_features += %i(annotated_by) unless options['remove-annotator']
79
- optional_features += %i(reviewed_by) unless options['remove-reviewer']
80
-
81
- builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
82
- sentence.tokens.each do |token|
83
- next if token.empty_token_sort == 'P' and options['remove-information-structure']
84
- next if token.empty_token_sort == 'C' and options['remove-syntax']
85
- next if token.empty_token_sort == 'V' and options['remove-syntax']
86
-
87
- mandatory_features = %i(id)
88
-
89
- optional_features = %i(citation_part)
90
- optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
91
- optional_features += %i(head_id relation) unless options['remove-syntax']
92
- optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']
93
-
94
- unless token.is_empty?
95
- mandatory_features << :form
96
- optional_features += %i(presentation_before presentation_after foreign_ids)
97
- else
98
- mandatory_features << :empty_token_sort
99
- end
100
-
101
- optional_features += %i(alignment_id) unless options['remove-alignments']
102
-
103
- attrs = grab_features(token, mandatory_features, optional_features)
104
-
105
- unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
106
- builder.token(attrs) do
107
- token.slashes.each do |relation, target_id|
108
- builder.slash(:"target-id" => target_id, relation: relation)
109
- end
110
- end
111
- else
112
- unless options['remove-syntax'] and token.is_empty?
113
- builder.token(attrs)
114
- end
115
- end
116
- end
117
- end
118
- end
119
- end
120
- end
121
- end
56
+ builder.source(grab_features(source, mandatory_features, optional_features)) do
57
+ PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
58
+ builder.tag!(field.to_s.gsub('_', '-'), source.send(field)) if source.send(field)
59
+ end
60
+
61
+ source.divs.each do |div|
62
+ if include_div?(div, options)
63
+
64
+ overrides = {
65
+ div: {},
66
+ sentence: {},
67
+ token: {}
68
+ }
69
+
70
+ process_div(builder, tb, source, div, options, overrides)
122
71
  end
123
72
  end
124
73
  end
125
74
  end
126
75
  end
76
+ end
77
+
78
+ def include_div?(div, options)
79
+ if options['remove-empty-divs']
80
+ div.sentences.any? { |sentence| include_sentence?(sentence, options) }
81
+ else
82
+ true
83
+ end
84
+ end
85
+
86
+ def include_sentence?(sentence, options)
87
+ case sentence.status
88
+ when :reviewed
89
+ not options['remove-reviewed'] and not options['remove-annotated']
90
+ when :annotated
91
+ not options['remove-not-reviewed'] and not options['remove-annotated']
92
+ else
93
+ not options['remove-not-reviewed'] and not options['remove-not-annotated']
94
+ end
95
+ end
96
+
97
+ def include_token?(token, options)
98
+ if options['remove-syntax'] and (token.empty_token_sort == 'C' or token.empty_token_sort == 'V')
99
+ false
100
+ elsif token.empty_token_sort == 'P' and options['remove-information-structure']
101
+ false
102
+ else
103
+ true
104
+ end
105
+ end
106
+
107
+ def process_div(builder, tb, source, div, options, overrides)
108
+ mandatory_features = %i()
127
109
 
128
- def include_div?(div, options)
129
- if options['remove-empty-divs']
130
- div.sentences.any? { |sentence| include_sentence?(sentence, options) }
131
- else
132
- true
110
+ optional_features = []
111
+ optional_features += %i(presentation_before presentation_after)
112
+ optional_features += %i(id alignment_id) unless options['remove-alignments']
113
+
114
+ if options['infer-alignments'] and source.alignment_id
115
+ aligned_source = tb.find_source(source.alignment_id)
116
+ # FIXME: how to behave here? overwrite existing? what if nil? how to deal with multiple aligned divs?
117
+ overrides[:div][:alignment_id] = div.alignment_id || div.inferred_alignment(aligned_source).map(&:id).join(',')
118
+ end
119
+
120
+ builder.div(grab_features(div, mandatory_features, optional_features, overrides[:div])) do
121
+ builder.title div.title if div.title
122
+
123
+ div.sentences.select do |sentence|
124
+ include_sentence?(sentence, options)
125
+ end.each do |sentence|
126
+ process_sentence(builder, tb, sentence, options, overrides)
133
127
  end
134
128
  end
129
+ end
135
130
 
136
- def include_sentence?(sentence, options)
137
- case sentence.status
138
- when :reviewed
139
- true
140
- when :annotated
141
- not options['remove-not-reviewed']
142
- else
143
- not options['remove-not-reviewed'] and not options['remove-not-annotated']
131
+ def process_sentence(builder, tb, sentence, options, overrides)
132
+ mandatory_features = %i(id)
133
+
134
+ optional_features = [] # we do it this way to preserve the order of status and presentation_* so that diffing files is easier
135
+ optional_features += %i(status) unless options['remove-status']
136
+ optional_features += %i(presentation_before presentation_after)
137
+ optional_features += %i(alignment_id) unless options['remove-alignments']
138
+ optional_features += %i(annotated_at) unless options['remove-annotator']
139
+ optional_features += %i(reviewed_at) unless options['remove-reviewer']
140
+ optional_features += %i(annotated_by) unless options['remove-annotator']
141
+ optional_features += %i(reviewed_by) unless options['remove-reviewer']
142
+
143
+ builder.sentence(grab_features(sentence, mandatory_features, optional_features)) do
144
+ sentence.tokens.select do |token|
145
+ include_token?(token, options)
146
+ end.each do |token|
147
+ process_token(builder, tb, token, options, overrides)
144
148
  end
145
149
  end
150
+ end
146
151
 
147
- def grab_features(obj, mandatory_features, optional_features = [])
148
- attrs = {}
152
+ def process_token(builder, tb, token, options, overrides)
153
+ mandatory_features = %i(id)
149
154
 
150
- mandatory_features.each do |f|
151
- v = obj.send(f)
155
+ optional_features = %i(citation_part)
156
+ optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
157
+ optional_features += %i(head_id relation) unless options['remove-syntax']
158
+ optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']
152
159
 
153
- attrs[f.to_s.gsub('_', '-')] = v
154
- end
160
+ unless token.is_empty?
161
+ mandatory_features << :form
162
+ optional_features += %i(presentation_before presentation_after foreign_ids)
163
+ else
164
+ mandatory_features << :empty_token_sort
165
+ end
166
+
167
+ if options['remove-not-reviewed'] or options['remove-not-annotated'] or options['remove-annotated'] or options['remove-annotated']
168
+ overrides[:token][:antecedent_id] =
169
+ (token.antecedent_id and include_sentence?(tb.find_token(token.antecedent_id.to_i).sentence, options)) ? token.antecedent_id : nil
170
+ end
171
+
172
+ optional_features += %i(alignment_id) unless options['remove-alignments']
155
173
 
156
- optional_features.each do |f|
157
- v = obj.send(f)
174
+ attrs = grab_features(token, mandatory_features, optional_features, overrides[:token])
158
175
 
159
- if v and v.to_s != ''
160
- attrs[f.to_s.gsub('_', '-')] = v
176
+ unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
177
+ builder.token(attrs) do
178
+ token.slashes.each do |relation, target_id|
179
+ builder.slash(:"target-id" => target_id, relation: relation)
161
180
  end
162
181
  end
182
+ else
183
+ unless options['remove-syntax'] and token.is_empty?
184
+ builder.token(attrs)
185
+ end
186
+ end
187
+ end
163
188
 
164
- attrs
189
+ def grab_features(obj, mandatory_features, optional_features = [], overrides = {})
190
+ attrs = {}
191
+
192
+ mandatory_features.each do |f|
193
+ v = overrides.key?(f) ? overrides[f] : obj.send(f)
194
+
195
+ attrs[f.to_s.gsub('_', '-')] = v
165
196
  end
197
+
198
+ optional_features.each do |f|
199
+ v = overrides.key?(f) ? overrides[f] : obj.send(f)
200
+
201
+ if v and v.to_s != ''
202
+ attrs[f.to_s.gsub('_', '-')] = v
203
+ end
204
+ end
205
+
206
+ attrs
166
207
  end
167
208
  end
168
209
  end