RubyGems - proiel-cli - Versions diffs - 1.2.1 → 1.3.0 - Mend

proiel-cli 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +5 -5
data/README.md +11 -3
data/bin/proiel +1 -1
data/lib/proiel/cli/commands/build.rb +91 -0
data/lib/proiel/cli/commands/convert.rb +7 -2
data/lib/proiel/cli/commands/dictionary.rb +46 -0
data/lib/proiel/cli/commands/info.rb +1 -1
data/lib/proiel/cli/commands/shell.rb +34 -0
data/lib/proiel/cli/commands/tokenize.rb +2 -2
data/lib/proiel/cli/commands/validate.rb +1 -1
data/lib/proiel/cli/commands/visualize.rb +14 -11
data/lib/proiel/cli/converters/conll-u/morphology.rb +162 -72
data/lib/proiel/cli/converters/conll-u/syntax.rb +108 -62
data/lib/proiel/cli/converters/conll-u.rb +648 -548
data/lib/proiel/cli/converters/conll-x.rb +67 -52
data/lib/proiel/cli/converters/lexc.rb +21 -23
data/lib/proiel/cli/converters/proielxml.rb +173 -132
data/lib/proiel/cli/converters/text.rb +69 -71
data/lib/proiel/cli/converters/tiger.rb +110 -114
data/lib/proiel/cli/converters/tiger2.rb +139 -141
data/lib/proiel/cli/converters/tnt.rb +19 -15
data/lib/proiel/cli/version.rb +1 -1
data/lib/proiel/cli.rb +26 -1
metadata +43 -58
data/bin/setup +0 -8
data/contrib/proiel-tnt-train +0 -15
data/lib/proiel/cli/commands.rb +0 -28

data/lib/proiel/cli/converters/text.rb CHANGED

@@ -1,98 +1,96 @@
-module PROIEL
-  module Converter
-    class Text
-      class << self
-        def process(tb, options)
-          tb.sources.each do |source|
-            puts "% id = #{source.id}"
-            puts "% export_time = #{source.export_time}"
-            puts "% title = #{source.title}"
-            puts "% author = #{source.author}"
-            puts "% citation_part = #{source.citation_part}"
-            puts "% language = #{source.language}"
-            source.divs.each do |div|
-              puts
-              puts "# #{div.title}"
-              puts
-              if options['diffable']
-                print_diffable_div(div)
-              else
-                print_formatted_div(div)
-              end
+module PROIEL::Converter
+  class Text
+    class << self
+      def process(tb, options)
+        tb.sources.each do |source|
+          puts "% id = #{source.id}"
+          puts "% export_time = #{source.export_time}"
+          puts "% title = #{source.title}"
+          puts "% author = #{source.author}"
+          puts "% citation_part = #{source.citation_part}"
+          puts "% language = #{source.language}"
+          source.divs.each do |div|
+            puts
+            puts "# #{div.title}"
+            puts
+            if options['diffable']
+              print_diffable_div(div)
+            else
+              print_formatted_div(div)
             end
           end
         end
+      end
-        def print_formatted_div(div)
-          p = ''
-          p += div.presentation_before unless div.presentation_before.nil?
-          current_citation = nil
+      def print_formatted_div(div)
+        p = ''
+        p += div.presentation_before unless div.presentation_before.nil?
-          div.sentences.each do |sentence|
-            p += sentence.presentation_before unless sentence.presentation_before.nil?
+        current_citation = nil
-            sentence.tokens.each do |token|
-              if token.has_content?
-                new_citation = token.citation_part
+        div.sentences.each do |sentence|
+          p += sentence.presentation_before unless sentence.presentation_before.nil?
-                if current_citation != new_citation
-                  p += "§#{new_citation.gsub(/\s+/, '_')} "
-                  current_citation = new_citation
-                end
+          sentence.tokens.each do |token|
+            if token.has_content?
+              new_citation = token.citation_part
-                p += [token.presentation_before,
-                      token.form,
-                      token.presentation_after].compact.join
+              if current_citation != new_citation
+                p += "§#{new_citation.gsub(/\s+/, '_')} "
+                current_citation = new_citation
               end
-            end
-            p += sentence.presentation_after unless sentence.presentation_after.nil?
+              p += [token.presentation_before,
+                    token.form,
+                    token.presentation_after].compact.join
+            end
           end
-          p += div.presentation_after unless div.presentation_after.nil?
+          p += sentence.presentation_after unless sentence.presentation_after.nil?
+        end
-          p = p.strip.gsub(/ +/, ' ').split("\n").collect do |line|
-              line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
-            end * "\n"
+        p += div.presentation_after unless div.presentation_after.nil?
-          puts p
-        end
+        p = p.strip.gsub(/ +/, ' ').split("\n").collect do |line|
+            line.length > 80 ? line.gsub(/(.{1,80})(\s+|$)/, "\\1\n").strip : line
+          end * "\n"
-        def print_diffable_div(div)
-          current_citation = nil
+        puts p
+      end
-          p = ''
-          pb = div.presentation_before || ''
+      def print_diffable_div(div)
+        current_citation = nil
-          div.sentences.each do |sentence|
-            pb += sentence.presentation_before || ''
+        p = ''
+        pb = div.presentation_before || ''
-            sentence.tokens.each do |token|
-              if token.has_content?
-                if current_citation != token.citation_part
-                  puts p unless p.empty?
-                  p = "§#{token.citation_part.gsub(/\s+/, '_')} "
-                  current_citation = token.citation_part
-                end
+        div.sentences.each do |sentence|
+          pb += sentence.presentation_before || ''
-                p += [pb,
-                      token.presentation_before,
-                      token.form,
-                      token.presentation_after].compact.join
-                pb = ''
+          sentence.tokens.each do |token|
+            if token.has_content?
+              if current_citation != token.citation_part
+                puts p unless p.empty?
+                p = "§#{token.citation_part.gsub(/\s+/, '_')} "
+                current_citation = token.citation_part
               end
-            end
-            p += sentence.presentation_after unless sentence.presentation_after.nil?
+              p += [pb,
+                    token.presentation_before,
+                    token.form,
+                    token.presentation_after].compact.join
+              pb = ''
+            end
           end
-          p += div.presentation_after unless div.presentation_after.nil?
-          puts p unless p.empty?
+          p += sentence.presentation_after unless sentence.presentation_after.nil?
         end
+        p += div.presentation_after unless div.presentation_after.nil?
+        puts p unless p.empty?
       end
     end
   end

data/lib/proiel/cli/converters/tiger.rb CHANGED

@@ -1,154 +1,150 @@
-require 'builder'
-module PROIEL
-  module Converter
-    # Converter for the TigerXML format
-    # (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
-    # in the variant used by VISL under the name 'TIGER dependency format'
-    # (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
-    class Tiger
-      SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')
-      MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection)
-      OTHER_FEATURES = %i(lemma pos information_status antecedent_id word)
-      class << self
-        def process(tb, options)
-          selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
-          @features = selected_features.map { |f| [f, 'FREC'] }.to_h
-          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
-          builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
-          tb.sources.each do |source|
-            @hack = tb.annotation_schema
-            write_source(builder, source) do
-              source.divs.each do |div|
-                div.sentences.each do |sentence|
-                  write_sentence(builder, sentence)
-                end
+module PROIEL::Converter
+  # Converter for the TigerXML format
+  # (http://www.ims.uni-stuttgart.de/projekte/TIGER/TIGERSearch/doc/html/TigerXML.html)
+  # in the variant used by VISL under the name 'TIGER dependency format'
+  # (http://beta.visl.sdu.dk/treebanks.html#TIGER_dependency_format).
+  class Tiger
+    SCHEMA_FILE = File.join('tigerxml', 'TigerXML.xsd')
+    MORPHOLOGICAL_FEATURES = %i(person_number tense_mood_voice case_number gender degree strength inflection)
+    OTHER_FEATURES = %i(lemma pos information_status antecedent_id word)
+    class << self
+      def process(tb, _)
+        selected_features = MORPHOLOGICAL_FEATURES + OTHER_FEATURES
+        @features = selected_features.map { |f| [f, 'FREC'] }.to_h
+        builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
+        builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
+        tb.sources.each do |source|
+          @hack = tb.annotation_schema
+          write_source(builder, source) do
+            source.divs.each do |div|
+              div.sentences.each do |sentence|
+                write_sentence(builder, sentence)
               end
             end
           end
         end
+      end
-        def write_source(builder, s)
-          builder.corpus(id: s.id) do
-            builder.head do
-              builder.meta do
-                builder.name(s.title)
-              end
-              declare_annotation(builder, @features, @hack)
+      def write_source(builder, s)
+        builder.corpus(id: s.id) do
+          builder.head do
+            builder.meta do
+              builder.name(s.title)
             end
-            builder.body do
-              yield
-            end
+            declare_annotation(builder, @features, @hack)
+          end
+          builder.body do
+            yield
           end
         end
+      end
-        def declare_annotation(builder, features, annotation_schema)
-          builder.annotation do
-            features.each do |name, domain|
-              # FIXME: we may want to list possible values for some of these
-              builder.feature(name: name, domain: domain)
-            end
+      def declare_annotation(builder, features, annotation_schema)
+        builder.annotation do
+          features.each do |name, domain|
+            # FIXME: we may want to list possible values for some of these
+            builder.feature(name: name, domain: domain)
+          end
-            builder.edgelabel do
-              builder.value(name: '--')
+          builder.edgelabel do
+            builder.value(name: '--')
-              annotation_schema.primary_relations.each do |tag, features|
-                builder.value({ name: tag }, features.summary)
-              end
+            annotation_schema.primary_relations.each do |tag, features|
+              builder.value({ name: tag }, features.summary)
             end
+          end
-            builder.secedgelabel do
-              annotation_schema.secondary_relations.each do |tag, features|
-                builder.value({name: tag }, features.summary)
-              end
+          builder.secedgelabel do
+            annotation_schema.secondary_relations.each do |tag, features|
+              builder.value({name: tag }, features.summary)
             end
           end
         end
+      end
-        def token_attrs(s, t, type)
-          attrs = {}
-          @features.each do |name, domain|
-            if domain == 'FREC' or domain == type
-              case name
-              when :word, :cat
-                attrs[name] = t.pro? ? "PRO-#{t.relation.upcase}" : t.form
-              when *@semantic_features
-                attrs[name] = t.sem_tags_to_hash[attr]
-              when :lemma
-                attrs[name] = t.lemma
-              when :pos
-                if t.empty_token_sort
-                  attrs[name] = t.empty_token_sort + "-"
-                else
-                  attrs[name] = t.pos
-                end
-              when *MORPHOLOGICAL_FEATURES
-                attrs[name] = name.to_s.split("_").map { |a| t.morphology_hash[a.to_sym] || '-' }.join
+      def token_attrs(t, type)
+        attrs = {}
+        @features.each do |name, domain|
+          if domain == 'FREC' or domain == type
+            case name
+            when :word, :cat
+              attrs[name] = t.pro? ? "PRO-#{t.relation.upcase}" : t.form
+            when *@semantic_features
+              attrs[name] = t.sem_tags_to_hash[attr]
+            when :lemma
+              attrs[name] = t.lemma
+            when :pos
+              if t.empty_token_sort
+                attrs[name] = t.empty_token_sort + '-'
+              else
+                attrs[name] = t.pos
+              end
+            when *MORPHOLOGICAL_FEATURES
+              attrs[name] = name.to_s.split('_').map { |a| t.morphology_hash[a.to_sym] || '-' }.join
+            else
+              if t.respond_to?(name)
+                attrs[name] = t.send(name)
               else
-                if t.respond_to?(name)
-                  attrs[name] = t.send(name)
-                else
-                  raise "Do not know how to get required attribute #{name}"
-                end
+                raise "Do not know how to get required attribute #{name}"
               end
-              attrs[name] ||= "--"
             end
+            attrs[name] ||= '--'
           end
-          attrs
         end
-        def write_terminals(builder, s)
-          builder.terminals do
-            s.tokens.each do |t|
-              builder.t(token_attrs(s, t, 'T').merge({ id: "w#{t.id}"}))
-            end
+        attrs
+      end
+      def write_terminals(builder, s)
+        builder.terminals do
+          s.tokens.each do |t|
+            builder.t(token_attrs(t, 'T').merge({ id: "w#{t.id}"}))
           end
         end
+      end
-        def write_nonterminals(builder, s)
-          builder.nonterminals do
-            # Add an empty root node
-            h = @features.select { |_, domain| ['FREC', 'NT'].include?(domain) }.map { |name, _| [name, '--'] }.to_h
-            h[:id] = "s#{s.id}_root"
+      def write_nonterminals(builder, s)
+        builder.nonterminals do
+          # Add an empty root node
+          h = @features.select { |_, domain| ['FREC', 'NT'].include?(domain) }.map { |name, _| [name, '--'] }.to_h
+          h[:id] = "s#{s.id}_root"
-            builder.nt(h) do
-              s.tokens.reject { |t| t.head or t.pro? }.each do |t|
-                builder.edge(idref: "p#{t.id}", label: t.relation)
-              end
+          builder.nt(h) do
+            s.tokens.reject { |t| t.head or t.pro? }.each do |t|
+              builder.edge(idref: "p#{t.id}", label: t.relation)
             end
+          end
-            # Add other NTs
-            s.tokens.each do |t|
-              builder.nt(token_attrs(s, t, 'NT').merge(id: "p#{t.id}")) do
-                # Add an edge to the correspoding terminal node
-                builder.edge(idref: "w#{t.id}", label: '--')
+          # Add other NTs
+          s.tokens.each do |t|
+            builder.nt(token_attrs(t, 'NT').merge(id: "p#{t.id}")) do
+              # Add an edge to the correspoding terminal node
+              builder.edge(idref: "w#{t.id}", label: '--')
-                # Add primary dependency edges
-                t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }
+              # Add primary dependency edges
+              t.children.each { |d| builder.edge(idref: "p#{d.id}", label: d.relation) }
-                # Add secondary dependency edges
-                t.slashes.each do |relation, target_id|
-                  builder.secedge(idref: "p#{target_id}", label: relation)
-                end
+              # Add secondary dependency edges
+              t.slashes.each do |relation, target_id|
+                builder.secedge(idref: "p#{target_id}", label: relation)
               end
             end
           end
         end
+      end
-        def write_sentence(builder, s)
-          builder.s(id: "s#{s.id}") do
-            builder.graph(root: "s#{s.id}_root") do
-              write_terminals(builder, s)
-              write_nonterminals(builder, s)
-            end
+      def write_sentence(builder, s)
+        builder.s(id: "s#{s.id}") do
+          builder.graph(root: "s#{s.id}_root") do
+            write_terminals(builder, s)
+            write_nonterminals(builder, s)
           end
         end
       end