RubyGems - corpus-processor - Versions diffs - 0.2.0 → 0.3.0 - Mend

corpus-processor 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +4 -4
data/.travis.yml +5 -0
data/.yardopts +1 -0
data/README.md +235 -34
data/bin/corpus-processor +3 -3
data/corpus-processor.gemspec +16 -14
data/lib/corpus-processor.rb +12 -8
data/lib/corpus-processor/categories.rb +58 -0
data/lib/corpus-processor/categories/default.yml +10 -0
data/lib/corpus-processor/cli.rb +31 -11
data/lib/corpus-processor/generators.rb +5 -1
data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
data/lib/corpus-processor/parsers.rb +5 -1
data/lib/corpus-processor/parsers/lampada.rb +103 -47
data/lib/corpus-processor/processor.rb +19 -4
data/lib/corpus-processor/token.rb +35 -1
data/lib/corpus-processor/version.rb +1 -1
data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
data/spec/corpus-processor/processor_spec.rb +36 -0
data/spec/corpus-processor/token_spec.rb +15 -0
data/spec/spec_helper.rb +7 -4
metadata +39 -27
data/lib/corpus-processor/default_categories.rb +0 -14
data/lib/corpus-processor/tokenizer.rb +0 -17
data/lib/corpus-processor/traverser.rb +0 -19
data/spec/unit/generators/stanford_ner_spec.rb +0 -46
data/spec/unit/parsers/lampada_spec.rb +0 -269
data/spec/unit/processor.rb +0 -37
data/spec/unit/token_spec.rb +0 -8
data/spec/unit/tokenizer_spec.rb +0 -121
data/spec/unit/traverser_spec.rb +0 -68

data/lib/corpus-processor/categories.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# The helper to load categories definitions.
+#
+# Categories definitions is a Hash with two keys named `:input` and `:output`.
+#
+# The `:input` has `String` keys that match the categories found in original
+# corpus. Its values are `Symbol`s that represent the category internally.
+#
+# The `:output` has `Symbol`s keys that represent the category internally
+# and should match the values from the `:input` hash. Its values are the `String`s
+# representing the category in the final converted corpus.
+#
+# An optional `:default` key is allowed in the `:output` hash. If present
+# the resulting loaded hash has the specified default value.
+#
+# @example YAML file defining categories.
+#   ---
+#   :input:
+#     PESSOA: :person
+#     LOCAL: :location
+#     ORGANIZACAO: :organization
+#   :output:
+#     :default: O
+#     :person: PERSON
+#     :location: LOCATION
+#     :organization: ORGANIZATION
+class CorpusProcessor::Categories
+  # Load a set of categories definitions.
+  #
+  # @param path [String] the path to the YAML file that defines the categories.
+  # @return [Hash] the categories extracted from the YAML file.
+  # @see .default
+  def self.load path
+    @@instances[path] ||= YAML.load(File.read(path)).tap { |categories|
+      default = categories[:output] && categories[:output][:default]
+      if default
+        categories[:output].default = default
+        categories[:output].delete :default
+      end
+    }
+  end
+  # The default set of categories definitions.
+  #
+  # The YAML definition file is
+  # {file:lib/corpus-processor/categories/default.yml}.
+  #
+  # @return (see .load)
+  # @see .load
+  def self.default
+    self.load(File.expand_path(File.join('..', 'categories', 'default.yml'),
+                               __FILE__))
+  end
+  protected
+    @@instances = Hash.new
+end

data/lib/corpus-processor/categories/default.yml ADDED Viewed

@@ -0,0 +1,10 @@
+---
+:input:
+  PESSOA: :person
+  LOCAL: :location
+  ORGANIZACAO: :organization
+:output:
+  :default: O
+  :person: PERSON
+  :location: LOCATION
+  :organization: ORGANIZATION

data/lib/corpus-processor/cli.rb CHANGED Viewed

@@ -1,17 +1,37 @@
-require "corpus-processor"
-require "thor"
+require 'thor'
-module CorpusProcessor
-  class Cli < ::Thor
+require 'corpus-processor'
-    desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from LâMPADA format to Stanford-NER format"
-    def process(input_file = $stdin, output_file = $stdout)
-      input_file  = File.new( input_file, "r") if  input_file.is_a? String
-      output_file = File.new(output_file, "w") if output_file.is_a? String
+# The operations available to users from CLI.
+class CorpusProcessor::Cli < Thor
-      output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
+  option :categories,
+         aliases: :c,
+         banner:  'CATEGORIES_FILE',
+         desc:    'Path to categories YAML file'
+  desc 'process [INPUT_FILE [OUTPUT_FILE]]',
+    'convert corpus from LâMPADA format to Stanford-NER format'
+  # Convert a given corpus from one format to another.
+  #
+  # By default the input format is LâMPADA and the output format is the one
+  # used by Stanford NER in training.
+  #
+  # @param input_file [String, IO] the file that contains the original corpus.
+  # @param output_file [String, IO] the file in which the converted corpus
+  #   is written.
+  # @return [void]
+  def process input_file = STDIN, output_file = STDOUT
+    input_file  = File.open( input_file, 'r') if  input_file.is_a? String
+    output_file = File.open(output_file, 'w') if output_file.is_a? String
+    categories  = if options[:categories]
+                    CorpusProcessor::Categories.load(options[:categories])
+                  else
+                    CorpusProcessor::Categories.default
+                  end
-      output_file.close
-    end
+    output_file.puts CorpusProcessor::Processor.new(categories: categories)
+                                               .process(input_file.read)
+    output_file.close
   end
 end

data/lib/corpus-processor/generators.rb CHANGED Viewed

@@ -1 +1,5 @@
-require "corpus-processor/generators/stanford_ner"
+# Namespace for generators.
+module CorpusProcessor::Generators
+end
+require 'corpus-processor/generators/stanford_ner'

data/lib/corpus-processor/generators/stanford_ner.rb CHANGED Viewed

@@ -1,13 +1,22 @@
-module CorpusProcessor::Generators
-  class StanfordNer
-    def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
-      @categories = categories
-    end
+# The generator for Stanford NER corpus.
+#
+# Generates corpus in the format used by Stanford NER training.
+class CorpusProcessor::Generators::StanfordNer
-    def generate(tokens)
-      tokens.map { |token|
-        "#{ token.word }	#{ @categories[token.category] }"
-      }.join("\n") + "\n"
-    end
+  # @param categories [Hash] the categories definitions loaded by
+  #   {CorpusProcessor::Categories}.
+  def initialize categories = CorpusProcessor::Categories.default
+    @categories = categories.fetch :output
+  end
+  # Generate the corpus from tokens.
+  #
+  # @param tokens [Array<CorpusProcessor::Token>] the tokens from which
+  #   the corpus is generated.
+  # @return [String] the generated corpus.
+  def generate tokens
+    tokens.map { |token|
+      "#{ token.word }\t#{ @categories[token.category] }"
+    }.join("\n") + "\n"
   end
 end

data/lib/corpus-processor/parsers.rb CHANGED Viewed

@@ -1 +1,5 @@
-require "corpus-processor/parsers/lampada"
+# Namespace for parsers.
+module CorpusProcessor::Parsers
+end
+require 'corpus-processor/parsers/lampada'

data/lib/corpus-processor/parsers/lampada.rb CHANGED Viewed

@@ -1,52 +1,108 @@
-module CorpusProcessor::Parsers
-  class Lampada
-    CATEGORY_REGEX = /
-      (?<any_text>           .*?                       ){0}
-      (?<entity_attributes>  \s\g<any_text>
-        CATEG="\g<categories>"\g<any_text>             ){0}
-      (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
-      (?<entity_closing_tag> <\/em>                    ){0}
-      # groups of interest
-      (?<inner_text>         \g<any_text>              ){0}
-      (?<categories>         \g<any_text>              ){0}
-      \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
-    /ix
-    def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
-                   traverser  = CorpusProcessor::Traverser.new,
-                   tokenizer  = CorpusProcessor::Tokenizer.new)
-      @categories = categories
-      @traverser  = traverser
-      @tokenizer  = tokenizer
-    end
-    def parse(corpus)
-      [].tap { |tokens|
-        @traverser.traverse(@tokenizer.join_lines(corpus),
-                            CATEGORY_REGEX) do |match|
-          text_to_tokenize, category = case match
-                                       when String
-                                         [match, nil]
-                                       when MatchData
-                                         [
-                                           match[:inner_text],
-                                           extract_category(match[:categories])
-                                         ]
-                                       end
-          tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
-        end
+# The parser for the corpus in LâMPADA format.
+class CorpusProcessor::Parsers::Lampada
+  # @param (see Generators::StanfordNer#initialize)
+  def initialize categories = CorpusProcessor::Categories.default
+    self.categories = categories.fetch :input
+  end
+  # Parse the corpus in LâMPADA format.
+  #
+  # @param corpus [String] the original corpus.
+  # @return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
+  def parse corpus
+    process_nodes Nokogiri::XML(corpus).css('P')
+  end
+  protected
+    attr_accessor :categories
+    attr_accessor :current_category
+    def process_nodes nodes
+      nodes.reduce([]) { |tokens, node| tokens.push(*process_node(node)) }
+    end
+    def process_node node
+      case node
+      when Nokogiri::XML::Text    then process_text    node.text
+      when Nokogiri::XML::Element then process_element node
+      else
+        raise ArgumentError, "#{ node } cannot be handled by " \
+                             "#{ self.class }. This is probably a bug, "\
+                             "please report."
+      end
+    end
+    def process_text text
+      text.gsub(punct, ' \0 ')
+          .strip
+          .split(spaces)
+          .map { |word|
+        CorpusProcessor::Token.new(word, current_category)
       }
     end
-    def extract_category(categories)
-      categories
-        .split("|")
-        .map { |category_string| @categories[category_string] }
-        .compact
-        .first
+    def process_element element
+      case element.name
+      when 'P'   then process_p     element
+      when 'EM'  then process_em    element
+      when 'ALT' then process_alt   element
+      else            process_nodes element.children
+      end
+    end
+    def process_p p
+      tokens = process_nodes p.children
+      tokens << period_token if ! tokens.empty? && tokens.last.word !~ punct
+      tokens
+    end
+    def process_em em
+      with_category em.attributes['CATEG'] { process_nodes em.children }
+    end
+    def process_alt alt
+      alternatives  = alt.inner_html.encode('UTF-8').split('|')
+      fake_xmls     = alternatives.map { |alternative|
+        Nokogiri::XML "<document>#{ alternative }</document>"
+      }
+      alternatives_tokens = fake_xmls.map { |fake_xml|
+        process_nodes fake_xml.children
+      }
+      alternatives_tokens.max_by { |alternative_tokens|
+        alternative_tokens.count { |alternative_token|
+          ! alternative_token.category.nil?
+        }
+      }
+    end
+    def with_category categories_attribute, &block
+      unless categories_attribute.nil?
+        self.current_category = extract categories_attribute.text
+      end
+      tokens = block.call
+      self.current_category = nil
+      tokens
+    end
+    def extract categories_string
+      category = categories_string.split('|').find { |category_string|
+        categories.include? category_string
+      }
+      categories[category]
+    end
+    def punct
+      /[[:punct:]]/
+    end
+    def spaces
+      /\s+/
+    end
+    def period_token
+      @period_token ||= CorpusProcessor::Token.new('.')
     end
-  end
 end

data/lib/corpus-processor/processor.rb CHANGED Viewed

@@ -1,11 +1,26 @@
+# The entry point for processing corpus.
+#
+# @example Simple use with default configuration.
+#   CorpusProcessor::Processor.new.process('<P>Some text</P>')
+#   # => "Some\tO\ntext\tO\n.\tO\n"
 class CorpusProcessor::Processor
-  def initialize(parser    = CorpusProcessor::Parsers::Lampada.new,
-                 generator = CorpusProcessor::Generators::StanfordNer.new)
+  # @param categories [Hash] the categories extracted with {Categories}.
+  # @param parser [#parse] the parser for original corpus.
+  # @param generator [#generate] the generator that computes tokens into
+  #   the transformed corpus.
+  def initialize(
+    categories: CorpusProcessor::Categories.default,
+    parser:     CorpusProcessor::Parsers::Lampada.new(categories),
+    generator:  CorpusProcessor::Generators::StanfordNer.new(categories))
     @parser    = parser
     @generator = generator
   end
-  def process(corpus)
-    @generator.generate(@parser.parse(corpus))
+  # Perform the processing of corpus.
+  #
+  # @return [String] the converted corpus.
+  def process corpus
+    @generator.generate @parser.parse(corpus)
   end
 end

data/lib/corpus-processor/token.rb CHANGED Viewed

@@ -1,2 +1,36 @@
-class CorpusProcessor::Token < Struct.new(:word, :category)
+# The internal representation of a token.
+#
+# Tokens are extracted from original corpus and are defined by single words
+# or punctuation.
+#
+# They also contain a category, which originates from the tagging in the
+# corpus.
+class CorpusProcessor::Token
+  # @return [String] the word from text. It shouldn't contain spaces.
+  attr_reader :word
+  # @return [Symbol] the type of the {Token}. It should be a valid category
+  #   from {Categories}.
+  attr_reader :category
+  # @param word [String] the word from text. It shouldn't contain spaces.
+  # @param category [Symbol] the type of the {Token}. It should be a valid
+  #   category from {Categories}.
+  def initialize word = '', category = nil
+    self.word     = word
+    self.category = category
+  end
+  # Determine equality of two {Token}s.
+  #
+  # @param other [Token] the other {Token} to test.
+  def ==(other)
+    word == other.word && category == other.category
+  end
+  protected
+    attr_writer :word
+    attr_writer :category
 end

data/lib/corpus-processor/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CorpusProcessor
-  VERSION = "0.2.0"
+  VERSION = '0.3.0'
 end

data/spec/{integration → corpus-processor}/cli_spec.rb RENAMED Viewed

@@ -1,17 +1,19 @@
-require "spec_helper"
+require 'spec_helper'
-require "corpus-processor/cli"
+require 'corpus-processor/cli'
 describe CorpusProcessor::Cli do
-  include FakeFS::SpecHelpers
   subject(:cli) { CorpusProcessor::Cli.new }
-  let(:input_file)  { "input_file"  }
-  let(:output_file) { "output_file" }
+  let(:input_file)  { STDIN  }
+  let(:output_file) { STDOUT }
-  before do
-    File.open(input_file, "w") { |file|
-      file.write <<-INPUT
+  describe '#process' do
+    subject { cli.process }
+    before do
+      expect(input_file).to receive(:read)
+        .and_return(<<-INPUT.encode('ISO-8859-1'))
 <?xml version="1.0" encoding="ISO-8859-1"?>
 <!DOCTYPE colHAREM>
 <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
@@ -30,71 +32,79 @@ describe CorpusProcessor::Cli do
       no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
       , o
 INPUT
-    }
-  end
-  describe "#process" do
-    before do
-      cli.process(input_file, output_file)
+      expect(output_file).to receive(:puts).with(<<-OUTPUT)
+Fatores\tO
+Demográficos\tO
+e\tO
+Econômicos\tO
+Subjacentes\tO
+.\tO
+A\tO
+revolta\tO
+histórica\tO
+produz\tO
+normalmente\tO
+uma\tO
+nova\tO
+forma\tO
+de\tO
+pensamento\tO
+quanto\tO
+à\tO
+forma\tO
+de\tO
+organização\tO
+da\tO
+sociedade\tO
+.\tO
+Assim\tO
+foi\tO
+com\tO
+a\tO
+Reforma\tO
+Protestante\tO
+.\tO
+No\tO
+seguimento\tO
+do\tO
+colapso\tO
+de\tO
+instituições\tO
+monásticas\tO
+e\tO
+do\tO
+escolasticismo\tO
+nos\tO
+finais\tO
+da\tO
+Idade\tO
+Média\tO
+na\tO
+Europa\tLOCATION
+,\tO
+acentuado\tO
+pela\tO
+"\tO
+Cativeiro\tO
+Babilónica\tO
+da\tO
+igreja\tO
+"\tO
+no\tO
+papado\tO
+de\tO
+Avignon\tO
+,\tO
+o\tO
+.\tO
+OUTPUT
+      expect(output_file).to receive(:close)
     end
-    specify { File.read(output_file).should == <<-OUTPUT }
-Fatores	O
-Demográficos	O
-e	O
-Econômicos	O
-Subjacentes	O
-A	O
-revolta	O
-histórica	O
-produz	O
-normalmente	O
-uma	O
-nova	O
-forma	O
-de	O
-pensamento	O
-quanto	O
-à	O
-forma	O
-de	O
-organização	O
-da	O
-sociedade	O
-Assim	O
-foi	O
-com	O
-a	O
-Reforma	O
-Protestante	O
-No	O
-seguimento	O
-do	O
-colapso	O
-de	O
-instituições	O
-monásticas	O
-e	O
-do	O
-escolasticismo	O
-nos	O
-finais	O
-da	O
-Idade	O
-Média	O
-na	O
-Europa	LOCATION
-acentuado	O
-pela	O
-Cativeiro	O
-Babilónica	O
-da	O
-igreja	O
-no	O
-papado	O
-de	O
-Avignon	O
-o	O
-OUTPUT
+    it 'processes the corpus' do
+      subject
+    end
   end
 end