corpus-processor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8a0ff96102528239769c105832893034e21434bf
4
+ data.tar.gz: 625ffe80fa8399f20610e048c6ce346a69eef9c0
5
+ SHA512:
6
+ metadata.gz: 1716f52826fa5b895977760e33f5e918a9b7fcebd0d3448b6419c4cb9e8d1b7902f8d99cb6646f4b33693f5743aac3802bc4476e1eac9db555cd188d52acb9e0
7
+ data.tar.gz: 770efa624c0c2fcb0b3170d10dcce05069f90650f04b775f6d5662c9ac4b61b71f7884831e4903c015398d21cb21498206f6f2bc4a41cf59b6905d887222d9b8
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,86 @@
1
+ Corpus Processor
2
+ ================
3
+
4
+ [![Gem Version](https://badge.fury.io/rb/corpus-processor.svg)](https://badge.fury.io/rb/corpus-processor)
5
+
6
+ Tool to work with [Corpus Linguistics](http://en.wikipedia.org/wiki/Corpus_linguistics). Corpus Processor converts _corpora_ between different formats for use in Natural Language Processing (NLP) tools.
7
+
8
+ The first purpose of Corpus Processor — and currently its only feature — is to transform _corpora_ found in [Linguateca](http://www.linguateca.pt) into the format used for training in [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
9
+
10
+ [Linguateca](http://www.linguateca.pt) is an excellent source of _corpora_ in Portuguese.
11
+
12
+ [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) is an excellent implementation of [Named Entity Recognition](http://en.wikipedia.org/wiki/Named-entity_recognition).
13
+
14
+ Installation
15
+ ------------
16
+
17
+ Corpus Processor is a [Ruby](http://www.ruby-lang.org/) [Gem](http://rubygems.org/). To install it, given a working installation of Ruby, run:
18
+
19
+ ```bash
20
+ $ gem install corpus-processor
21
+ ```
22
+
23
+ Usage
24
+ -----
25
+
26
+ Convert corpus from HAREM format to Stanford-NER format:
27
+
28
+ ```bash
29
+ $ corpus-processor process [INPUT_FILE [OUTPUT_FILE]]
30
+ ```
31
+
32
+ Results
33
+ -------
34
+
35
+ For an example of converting one corpus with Corpus Processor, refer to this [gist](https://gist.github.com/leafac/5259008).
36
+
37
+ The corpus is from [Linguateca](http://www.linguateca.pt/HAREM/) and the training used [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
38
+
39
+ Contributing
40
+ ------------
41
+
42
+ 1. Fork it.
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`).
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`).
45
+ 4. Push to the branch (`git push origin my-new-feature`).
46
+ 5. Create new Pull Request.
47
+
48
+ Changelog
49
+ ---------
50
+
51
+ ### 0.0.1
52
+
53
+ * [Harem](http://www.linguateca.pt/HAREM/) Parser.
54
+ * [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) Generator.
55
+
56
+ Thanks
57
+ ------
58
+
59
+ * *Diana Santos* and her team in [Linguateca](http://www.linguateca.pt) for the semantically annotated corpus in Portuguese.
60
+ * *[Stanford NLP team](http://www-nlp.stanford.edu/)* for the [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) tool.
61
+
62
+ License
63
+ -------
64
+
65
+ Copyright (c) 2013 Das Dad
66
+
67
+ MIT License
68
+
69
+ Permission is hereby granted, free of charge, to any person obtaining
70
+ a copy of this software and associated documentation files (the
71
+ "Software"), to deal in the Software without restriction, including
72
+ without limitation the rights to use, copy, modify, merge, publish,
73
+ distribute, sublicense, and/or sell copies of the Software, and to
74
+ permit persons to whom the Software is furnished to do so, subject to
75
+ the following conditions:
76
+
77
+ The above copyright notice and this permission notice shall be
78
+ included in all copies or substantial portions of the Software.
79
+
80
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
81
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
82
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
83
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
84
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
85
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
86
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift(File.expand_path("../../lib", __FILE__))
4
+
5
+ require "bundler/setup"
6
+
7
+ require "corpus-processor/cli"
8
+
9
+ CorpusProcessor::Cli.start(ARGV)
@@ -0,0 +1,28 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'corpus-processor/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "corpus-processor"
7
+ spec.version = CorpusProcessor::VERSION
8
+ spec.authors = ["Das Dad"]
9
+ spec.email = ["feedback@dasdad.com.br"]
10
+ spec.description = %q{Process linguistic corpus}
11
+ spec.summary = %q{Handle linguistic corpus and convert it to use NLP tools}
12
+ spec.homepage = "https://github.com/dasdad/corpus-processor"
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files`.split($/)
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
21
+
22
+ spec.add_dependency "thor"
23
+
24
+ spec.add_development_dependency "bundler", "~> 1.3"
25
+ spec.add_development_dependency "rspec"
26
+ spec.add_development_dependency "fakefs"
27
+ spec.add_development_dependency "pry-nav"
28
+ end
@@ -0,0 +1,8 @@
1
+ require "corpus-processor/version"
2
+ require "corpus-processor/token"
3
+ require "corpus-processor/default_categories"
4
+ require "corpus-processor/traverser"
5
+ require "corpus-processor/tokenizer"
6
+ require "corpus-processor/parsers"
7
+ require "corpus-processor/generators"
8
+ require "corpus-processor/processor"
@@ -0,0 +1,17 @@
1
+ require "corpus-processor"
2
+ require "thor"
3
+
4
+ module CorpusProcessor
5
+ class Cli < ::Thor
6
+
7
+ desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from HAREM format to Stanford-NER format"
8
+ def process(input_file = $stdin, output_file = $stdout)
9
+ input_file = File.new( input_file, "r") if input_file.is_a? String
10
+ output_file = File.new(output_file, "w") if output_file.is_a? String
11
+
12
+ output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
13
+
14
+ output_file.close
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,14 @@
1
+ module CorpusProcessor
2
+ DEFAULT_CATEGORIES = {
3
+ input: {
4
+ "PESSOA" => :person,
5
+ "LOCAL" => :location,
6
+ "ORGANIZACAO" => :organization,
7
+ },
8
+ output: Hash.new("O").merge(
9
+ person: "PERSON",
10
+ location: "LOCATION",
11
+ organization: "ORGANIZATION",
12
+ )
13
+ }
14
+ end
@@ -0,0 +1 @@
1
+ require "corpus-processor/generators/stanford_ner"
@@ -0,0 +1,13 @@
1
+ module CorpusProcessor::Generators
2
+ class StanfordNer
3
+ def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
4
+ @categories = categories
5
+ end
6
+
7
+ def generate(tokens)
8
+ tokens.map { |token|
9
+ "#{ token.word } #{ @categories[token.category] }"
10
+ }.join("\n") + "\n"
11
+ end
12
+ end
13
+ end
@@ -0,0 +1 @@
1
+ require "corpus-processor/parsers/harem"
@@ -0,0 +1,52 @@
1
+ module CorpusProcessor::Parsers
2
+ class Harem
3
+
4
+ CATEGORY_REGEX = /
5
+ (?<any_text> .*? ){0}
6
+ (?<entity_attributes> \s\g<any_text>
7
+ CATEG="\g<categories>"\g<any_text> ){0}
8
+ (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
9
+ (?<entity_closing_tag> <\/em> ){0}
10
+
11
+ # groups of interest
12
+ (?<inner_text> \g<any_text> ){0}
13
+ (?<categories> \g<any_text> ){0}
14
+
15
+ \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
16
+ /ix
17
+
18
+ def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
19
+ traverser = CorpusProcessor::Traverser.new,
20
+ tokenizer = CorpusProcessor::Tokenizer.new)
21
+ @categories = categories
22
+ @traverser = traverser
23
+ @tokenizer = tokenizer
24
+ end
25
+
26
+ def parse(corpus)
27
+ [].tap { |tokens|
28
+ @traverser.traverse(@tokenizer.join_lines(corpus),
29
+ CATEGORY_REGEX) do |match|
30
+ text_to_tokenize, category = case match
31
+ when String
32
+ [match, nil]
33
+ when MatchData
34
+ [
35
+ match[:inner_text],
36
+ extract_category(match[:categories])
37
+ ]
38
+ end
39
+ tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
40
+ end
41
+ }
42
+ end
43
+
44
+ def extract_category(categories)
45
+ categories
46
+ .split("|")
47
+ .map { |category_string| @categories[category_string] }
48
+ .compact
49
+ .first
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,11 @@
1
+ class CorpusProcessor::Processor
2
+ def initialize(parser = CorpusProcessor::Parsers::Harem.new,
3
+ generator = CorpusProcessor::Generators::StanfordNer.new)
4
+ @parser = parser
5
+ @generator = generator
6
+ end
7
+
8
+ def process(corpus)
9
+ @generator.generate(@parser.parse(corpus))
10
+ end
11
+ end
@@ -0,0 +1,2 @@
1
+ class CorpusProcessor::Token < Struct.new(:word, :category)
2
+ end
@@ -0,0 +1,17 @@
1
+ class CorpusProcessor::Tokenizer
2
+ def tokenize(text, category = nil)
3
+ strip_tags(text)
4
+ .gsub(/[[:punct:]]/, "")
5
+ .strip
6
+ .split(/\s+/)
7
+ .map { |word| CorpusProcessor::Token.new(word, category) }
8
+ end
9
+
10
+ def strip_tags(text)
11
+ text.gsub(/<.*?>/, " ").strip
12
+ end
13
+
14
+ def join_lines(text)
15
+ text.gsub(/\s+/, " ").strip
16
+ end
17
+ end
@@ -0,0 +1,19 @@
1
+ class CorpusProcessor::Traverser
2
+ def traverse(text, regexp, &block)
3
+ return if block.nil?
4
+ remaining_search = text
5
+ until remaining_search.empty?
6
+ match = remaining_search.match(regexp)
7
+ if match.nil?
8
+ block.call remaining_search unless remaining_search.empty?
9
+ remaining_search = ""
10
+ else
11
+ before = remaining_search[0...match.begin(0)]
12
+ remaining_search = remaining_search[match.end(0)..-1]
13
+
14
+ block.call before unless before.empty?
15
+ block.call match
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,3 @@
1
+ module CorpusProcessor
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,100 @@
1
+ require "spec_helper"
2
+
3
+ require "corpus-processor/cli"
4
+
5
+ describe CorpusProcessor::Cli do
6
+ include FakeFS::SpecHelpers
7
+ subject(:cli) { CorpusProcessor::Cli.new }
8
+
9
+ let(:input_file) { "input_file" }
10
+ let(:output_file) { "output_file" }
11
+
12
+ before do
13
+ File.open(input_file, "w") { |file|
14
+ file.write <<-INPUT
15
+ <?xml version="1.0" encoding="ISO-8859-1"?>
16
+ <!DOCTYPE colHAREM>
17
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
18
+ <DOC DOCID="H2-dftre765">
19
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
20
+ <P>
21
+ A revolta histórica produz normalmente uma nova forma de pensamento quanto à forma de organização da sociedade. Assim foi com a
22
+ <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
23
+ . No seguimento do colapso de instituições monásticas e do escolasticismo
24
+ nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM>
25
+ na
26
+ <EM ID="H2-dftre765-37" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="DIVISAO">Europa</EM>
27
+ , acentuado pela "
28
+ <OMITIDO> <EM ID="H2-dftre765-17" CATEG="ACONTECIMENTO" TIPO="EFEMERIDE">Cativeiro Babilónica da igreja</EM></OMITIDO>
29
+ "
30
+ no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
31
+ , o
32
+ INPUT
33
+ }
34
+ end
35
+
36
+ describe "#process" do
37
+ before do
38
+ cli.process(input_file, output_file)
39
+ end
40
+
41
+ specify { File.read(output_file).should == <<-OUTPUT }
42
+ Fatores O
43
+ Demográficos O
44
+ e O
45
+ Econômicos O
46
+ Subjacentes O
47
+ A O
48
+ revolta O
49
+ histórica O
50
+ produz O
51
+ normalmente O
52
+ uma O
53
+ nova O
54
+ forma O
55
+ de O
56
+ pensamento O
57
+ quanto O
58
+ à O
59
+ forma O
60
+ de O
61
+ organização O
62
+ da O
63
+ sociedade O
64
+ Assim O
65
+ foi O
66
+ com O
67
+ a O
68
+ Reforma O
69
+ Protestante O
70
+ No O
71
+ seguimento O
72
+ do O
73
+ colapso O
74
+ de O
75
+ instituições O
76
+ monásticas O
77
+ e O
78
+ do O
79
+ escolasticismo O
80
+ nos O
81
+ finais O
82
+ da O
83
+ Idade O
84
+ Média O
85
+ na O
86
+ Europa LOCATION
87
+ acentuado O
88
+ pela O
89
+ Cativeiro O
90
+ Babilónica O
91
+ da O
92
+ igreja O
93
+ no O
94
+ papado O
95
+ de O
96
+ Avignon O
97
+ o O
98
+ OUTPUT
99
+ end
100
+ end
@@ -0,0 +1,22 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+
8
+ require "fakefs/spec_helpers"
9
+
10
+ require "corpus-processor"
11
+
12
+ RSpec.configure do |config|
13
+ config.treat_symbols_as_metadata_keys_with_true_values = true
14
+ config.run_all_when_everything_filtered = true
15
+ config.filter_run :focus
16
+
17
+ # Run specs in random order to surface order dependencies. If you find an
18
+ # order dependency and want to debug it, you can fix the order by providing
19
+ # the seed, which is printed after each run.
20
+ # --seed 1234
21
+ config.order = "random"
22
+ end
@@ -0,0 +1,46 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Generators::StanfordNer do
4
+ subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
5
+
6
+ describe "#generate" do
7
+ subject { stanford_ner.generate(tokens) }
8
+
9
+ context "no tokens" do
10
+ let(:tokens) { [] }
11
+
12
+ it { should == "\n" }
13
+ end
14
+
15
+ context "one token" do
16
+ let(:tokens) { [CorpusProcessor::Token.new("banana")] }
17
+
18
+ it { should == "banana O\n" }
19
+ end
20
+
21
+ context "two tokens" do
22
+ let(:tokens) { [
23
+ CorpusProcessor::Token.new("good"),
24
+ CorpusProcessor::Token.new("banana"),
25
+ ] }
26
+
27
+ it { should == "good O\nbanana O\n" }
28
+ end
29
+
30
+ context "with category" do
31
+ let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }
32
+
33
+ it { should == "Leandro PERSON\n" }
34
+ end
35
+
36
+ context "with non-default categories" do
37
+ let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
38
+ banana: "BANANA"
39
+ ) }
40
+
41
+ let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }
42
+
43
+ it { should == "Nanica BANANA\n" }
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,269 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Parsers::Harem do
4
+ subject(:harem) { CorpusProcessor::Parsers::Harem.new }
5
+
6
+ describe "#parse" do
7
+ subject { harem.parse(corpus) }
8
+
9
+ context "default categories" do
10
+ context "empty corpus" do
11
+ let(:corpus) { "" }
12
+
13
+ it { should == [] }
14
+ end
15
+
16
+ context "doctype" do
17
+ let(:corpus) {
18
+ <<-CORPUS
19
+ <?xml version="1.0" encoding="ISO-8859-1"?>
20
+ <!DOCTYPE colHAREM>
21
+ CORPUS
22
+ }
23
+
24
+ it { should == [] }
25
+ end
26
+
27
+ context "simple phrase" do
28
+ let(:corpus) {
29
+ <<-CORPUS
30
+ <?xml version="1.0" encoding="ISO-8859-1"?>
31
+ <!DOCTYPE colHAREM>
32
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
33
+ <DOC DOCID="H2-dftre765">
34
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
35
+ </DOC>
36
+ </colHAREM>
37
+ CORPUS
38
+ }
39
+
40
+ it { should == [
41
+ CorpusProcessor::Token.new("Fatores"),
42
+ CorpusProcessor::Token.new("Demográficos"),
43
+ CorpusProcessor::Token.new("e"),
44
+ CorpusProcessor::Token.new("Econômicos"),
45
+ CorpusProcessor::Token.new("Subjacentes"),
46
+ ]
47
+ }
48
+ end
49
+
50
+ context "two simple phrases" do
51
+ let(:corpus) {
52
+ <<-CORPUS
53
+ <?xml version="1.0" encoding="ISO-8859-1"?>
54
+ <!DOCTYPE colHAREM>
55
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
56
+ <DOC DOCID="H2-dftre765">
57
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
58
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
59
+ </DOC>
60
+ </colHAREM>
61
+ CORPUS
62
+ }
63
+
64
+ it { should == [
65
+ CorpusProcessor::Token.new("Fatores"),
66
+ CorpusProcessor::Token.new("Demográficos"),
67
+ CorpusProcessor::Token.new("e"),
68
+ CorpusProcessor::Token.new("Econômicos"),
69
+ CorpusProcessor::Token.new("Subjacentes"),
70
+ CorpusProcessor::Token.new("Fatores"),
71
+ CorpusProcessor::Token.new("Demográficos"),
72
+ CorpusProcessor::Token.new("e"),
73
+ CorpusProcessor::Token.new("Econômicos"),
74
+ CorpusProcessor::Token.new("Subjacentes"),
75
+ ]
76
+ }
77
+ end
78
+
79
+ context "useless entity" do
80
+ let(:corpus) {
81
+ <<-CORPUS
82
+ <?xml version="1.0" encoding="ISO-8859-1"?>
83
+ <!DOCTYPE colHAREM>
84
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
85
+ <DOC DOCID="H2-dftre765">
86
+ <P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
87
+ </DOC>
88
+ </colHAREM>
89
+ CORPUS
90
+ }
91
+
92
+ it { should == [
93
+ CorpusProcessor::Token.new("Nos"),
94
+ CorpusProcessor::Token.new("finais"),
95
+ CorpusProcessor::Token.new("da"),
96
+ CorpusProcessor::Token.new("Idade"),
97
+ CorpusProcessor::Token.new("Média"),
98
+ ]
99
+ }
100
+ end
101
+
102
+ context "one entity" do
103
+ let(:corpus) {
104
+ <<-CORPUS
105
+ <?xml version="1.0" encoding="ISO-8859-1"?>
106
+ <!DOCTYPE colHAREM>
107
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
108
+ <DOC DOCID="H2-dftre765">
109
+ <P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
110
+ <EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
111
+ </P>
112
+ </DOC>
113
+ </colHAREM>
114
+ CORPUS
115
+ }
116
+
117
+ it { should == [
118
+ CorpusProcessor::Token.new("Foram"),
119
+ CorpusProcessor::Token.new("igualmente"),
120
+ CorpusProcessor::Token.new("determinantes"),
121
+ CorpusProcessor::Token.new("para"),
122
+ CorpusProcessor::Token.new("evitar"),
123
+ CorpusProcessor::Token.new("que"),
124
+ CorpusProcessor::Token.new("as"),
125
+ CorpusProcessor::Token.new("ideias"),
126
+ CorpusProcessor::Token.new("reformadoras"),
127
+ CorpusProcessor::Token.new("encontrassem"),
128
+ CorpusProcessor::Token.new("divulgação"),
129
+ CorpusProcessor::Token.new("em"),
130
+ CorpusProcessor::Token.new("Portugal", :location),
131
+ ]
132
+ }
133
+ end
134
+
135
+ context "multiple entities" do
136
+ let(:corpus) {
137
+ <<-CORPUS
138
+ <?xml version="1.0" encoding="ISO-8859-1"?>
139
+ <!DOCTYPE colHAREM>
140
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
141
+ <DOC DOCID="H2-dftre765">
142
+ <P>
143
+ A imprensa, inventada na
144
+ <EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
145
+ por
146
+ <EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
147
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
148
+ e a censura
149
+ </P>
150
+ </DOC>
151
+ </colHAREM>
152
+ CORPUS
153
+ }
154
+
155
+ it { should == [
156
+ CorpusProcessor::Token.new("A"),
157
+ CorpusProcessor::Token.new("imprensa"),
158
+ CorpusProcessor::Token.new("inventada"),
159
+ CorpusProcessor::Token.new("na"),
160
+ CorpusProcessor::Token.new("Alemanha", :location),
161
+ CorpusProcessor::Token.new("por"),
162
+ CorpusProcessor::Token.new("John", :person),
163
+ CorpusProcessor::Token.new("Gutenberg", :person),
164
+ CorpusProcessor::Token.new("Inquisição", :organization),
165
+ CorpusProcessor::Token.new("e"),
166
+ CorpusProcessor::Token.new("a"),
167
+ CorpusProcessor::Token.new("censura"),
168
+ ]
169
+ }
170
+ end
171
+
172
+ context "spaces after ponctuation" do
173
+ let(:corpus) {
174
+ <<-CORPUS
175
+ <?xml version="1.0" encoding="ISO-8859-1"?>
176
+ <!DOCTYPE colHAREM>
177
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
178
+ <DOC DOCID="H2-dftre765">
179
+ <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
180
+ . No
181
+ </DOC>
182
+ </colHAREM>
183
+ CORPUS
184
+ }
185
+
186
+ it { should == [
187
+ CorpusProcessor::Token.new("Reforma"),
188
+ CorpusProcessor::Token.new("Protestante"),
189
+ CorpusProcessor::Token.new("No"),
190
+ ]
191
+ }
192
+ end
193
+ end
194
+
195
+ context "user-defined categories" do
196
+ let(:harem) {
197
+ CorpusProcessor::Parsers::Harem.new({
198
+ "FRUTA" => :fruit,
199
+ "LIVRO" => :book,
200
+ })
201
+ }
202
+
203
+ context "multiple entities" do
204
+ let(:corpus) {
205
+ <<-CORPUS
206
+ <?xml version="1.0" encoding="ISO-8859-1"?>
207
+ <!DOCTYPE colHAREM>
208
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
209
+ <DOC DOCID="H2-dftre765">
210
+ <P>
211
+ A imprensa, inventada na
212
+ <EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
213
+ por
214
+ <EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
215
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
216
+ e a censura
217
+ </P>
218
+ </DOC>
219
+ </colHAREM>
220
+ CORPUS
221
+ }
222
+
223
+ it { should == [
224
+ CorpusProcessor::Token.new("A"),
225
+ CorpusProcessor::Token.new("imprensa"),
226
+ CorpusProcessor::Token.new("inventada"),
227
+ CorpusProcessor::Token.new("na"),
228
+ CorpusProcessor::Token.new("Banana", :fruit),
229
+ CorpusProcessor::Token.new("por"),
230
+ CorpusProcessor::Token.new("Harry", :book),
231
+ CorpusProcessor::Token.new("Potter", :book),
232
+ CorpusProcessor::Token.new("Inquisição"),
233
+ CorpusProcessor::Token.new("e"),
234
+ CorpusProcessor::Token.new("a"),
235
+ CorpusProcessor::Token.new("censura"),
236
+ ]
237
+ }
238
+ end
239
+ end
240
+ end
241
+
242
+ describe "#extract_category" do
243
+ subject { harem.extract_category(categories) }
244
+
245
+ context "empty categories" do
246
+ let(:categories) { "" }
247
+
248
+ it { should == nil }
249
+ end
250
+
251
+ context "one category" do
252
+ let(:categories) { "PESSOA" }
253
+
254
+ it { should == :person }
255
+ end
256
+
257
+ context "two categories" do
258
+ let(:categories) { "OUTRA|ORGANIZACAO" }
259
+
260
+ it { should == :organization }
261
+ end
262
+
263
+ context "ambiguidade" do
264
+ let(:categories) { "PESSOA|ORGANIZACAO" }
265
+
266
+ it { should == :person }
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,37 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Processor do
4
+ subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }
5
+
6
+ describe "#process" do
7
+ subject { processor.process(corpus) }
8
+
9
+ let(:corpus) { "Some corpus" }
10
+ let(:processed_corpus) {
11
+ <<-CORPUS
12
+ Some O
13
+ corpus O
14
+ CORPUS
15
+ }
16
+ let(:tokens) {
17
+ [
18
+ CorpusProcessor::Token.new("Some"),
19
+ CorpusProcessor::Token.new("corpus"),
20
+ ]
21
+ }
22
+ let(:parser) { double :parser }
23
+ let(:generator) { double :generator }
24
+
25
+ specify {
26
+ parser.should_receive(:parse)
27
+ .with(corpus)
28
+ .and_return(tokens)
29
+
30
+ generator.should_receive(:generate)
31
+ .with(tokens)
32
+ .and_return(processed_corpus)
33
+
34
+ subject.should == processed_corpus
35
+ }
36
+ end
37
+ end
@@ -0,0 +1,8 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Token do
4
+ subject { CorpusProcessor::Token.new }
5
+
6
+ it { should respond_to(:word) }
7
+ it { should respond_to(:category) }
8
+ end
@@ -0,0 +1,121 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Tokenizer do
4
+ subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
5
+
6
+ describe "#tokenize" do
7
+ subject { tokenizer.tokenize(text, category) }
8
+
9
+ let(:category) { nil }
10
+
11
+ context "empty string" do
12
+ let(:text) { "" }
13
+
14
+ it { should == [] }
15
+ end
16
+
17
+ context "one word" do
18
+ let(:text) { "banana" }
19
+
20
+ it { should == [CorpusProcessor::Token.new("banana")] }
21
+ end
22
+
23
+ context "two words" do
24
+ let(:text) { "good banana" }
25
+
26
+ it { should == [
27
+ CorpusProcessor::Token.new("good"),
28
+ CorpusProcessor::Token.new("banana"),
29
+ ] }
30
+ end
31
+
32
+ context "ponctuation" do
33
+ let(:text) { "good, banana" }
34
+
35
+ it { should == [
36
+ CorpusProcessor::Token.new("good"),
37
+ CorpusProcessor::Token.new("banana"),
38
+ ] }
39
+ end
40
+
41
+ context "default category" do
42
+ let(:text) { "Google" }
43
+ let(:category) { :organization }
44
+
45
+ it { should == [
46
+ CorpusProcessor::Token.new("Google", :organization),
47
+ ] }
48
+ end
49
+
50
+ context "with tags" do
51
+ let(:text) { "good<lalala/>, banana" }
52
+
53
+ it { should == [
54
+ CorpusProcessor::Token.new("good"),
55
+ CorpusProcessor::Token.new("banana"),
56
+ ] }
57
+ end
58
+ end
59
+
60
+ describe "#strip_tags" do
61
+ subject { tokenizer.strip_tags(text) }
62
+
63
+ context "empty text" do
64
+ let(:text) { "" }
65
+
66
+ it { should == "" }
67
+ end
68
+
69
+ context "self closed tag" do
70
+ let(:text) { "<br/>" }
71
+
72
+ it { should == "" }
73
+ end
74
+
75
+ context "tag with content" do
76
+ let(:text) { "<p>Some text</p>" }
77
+
78
+ it { should == "Some text" }
79
+ end
80
+
81
+ context "content after tag" do
82
+ let(:text) { "<p>Some<br/>text</p>" }
83
+
84
+ it { should == "Some text" }
85
+ end
86
+ end
87
+
88
+ describe "#join_lines" do
89
+ subject { tokenizer.join_lines(text) }
90
+
91
+ context "empty text" do
92
+ let(:text) { "" }
93
+
94
+ it { should == "" }
95
+ end
96
+
97
+ context "one word" do
98
+ let(:text) { "banana" }
99
+
100
+ it { should == "banana" }
101
+ end
102
+
103
+ context "two lines" do
104
+ let(:text) { "banana\nquiabo" }
105
+
106
+ it { should == "banana quiabo" }
107
+ end
108
+
109
+ context "line with empty space" do
110
+ let(:text) { "banana\n \nquiabo" }
111
+
112
+ it { should == "banana quiabo" }
113
+ end
114
+
115
+ context "leading spaces" do
116
+ let(:text) { " \n banana\n \nquiabo \n" }
117
+
118
+ it { should == "banana quiabo" }
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,68 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Traverser do
4
+ subject(:traverser) { CorpusProcessor::Traverser.new }
5
+
6
+ describe "#traverse" do
7
+ subject { traverser.traverse(text, regexp) }
8
+
9
+ context "empty text" do
10
+ let(:text) { "" }
11
+ let(:regexp) { // }
12
+
13
+ specify {
14
+ expect { |mock_block|
15
+ traverser.traverse(text, regexp, &mock_block)
16
+ }.not_to yield_control
17
+ }
18
+ end
19
+
20
+ context "simple text" do
21
+ let(:text) { "abc" }
22
+ let(:regexp) { /b/ }
23
+
24
+ specify {
25
+ expect { |mock_block|
26
+ traverser.traverse(text, regexp, &mock_block)
27
+ }.to yield_successive_args "a", text.match(regexp), "c"
28
+ }
29
+ end
30
+
31
+ context "two matches" do
32
+ let(:text) { "abcbd" }
33
+ let(:regexp) { /b/ }
34
+
35
+ specify {
36
+ expect { |mock_block|
37
+ traverser.traverse(text, regexp, &mock_block)
38
+ }.to yield_successive_args "a",
39
+ text.match(regexp),
40
+ "c",
41
+ text[2..-1].match(regexp),
42
+ "d"
43
+ }
44
+ end
45
+
46
+ context "match in beginning" do
47
+ let(:text) { "bc" }
48
+ let(:regexp) { /b/ }
49
+
50
+ specify {
51
+ expect { |mock_block|
52
+ traverser.traverse(text, regexp, &mock_block)
53
+ }.to yield_successive_args text.match(regexp), "c"
54
+ }
55
+ end
56
+
57
+ context "match in ending" do
58
+ let(:text) { "bc" }
59
+ let(:regexp) { /c/ }
60
+
61
+ specify {
62
+ expect { |mock_block|
63
+ traverser.traverse(text, regexp, &mock_block)
64
+ }.to yield_successive_args "b", text.match(regexp)
65
+ }
66
+ end
67
+ end
68
+ end
metadata ADDED
@@ -0,0 +1,149 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: corpus-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Das Dad
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fakefs
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry-nav
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Process linguistic corpus
84
+ email:
85
+ - feedback@dasdad.com.br
86
+ executables:
87
+ - corpus-processor
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - .rspec
93
+ - Gemfile
94
+ - README.md
95
+ - bin/corpus-processor
96
+ - corpus-processor.gemspec
97
+ - lib/corpus-processor.rb
98
+ - lib/corpus-processor/cli.rb
99
+ - lib/corpus-processor/default_categories.rb
100
+ - lib/corpus-processor/generators.rb
101
+ - lib/corpus-processor/generators/stanford_ner.rb
102
+ - lib/corpus-processor/parsers.rb
103
+ - lib/corpus-processor/parsers/harem.rb
104
+ - lib/corpus-processor/processor.rb
105
+ - lib/corpus-processor/token.rb
106
+ - lib/corpus-processor/tokenizer.rb
107
+ - lib/corpus-processor/traverser.rb
108
+ - lib/corpus-processor/version.rb
109
+ - spec/integration/cli_spec.rb
110
+ - spec/spec_helper.rb
111
+ - spec/unit/generators/stanford_ner_spec.rb
112
+ - spec/unit/parsers/harem_spec.rb
113
+ - spec/unit/processor.rb
114
+ - spec/unit/token_spec.rb
115
+ - spec/unit/tokenizer_spec.rb
116
+ - spec/unit/traverser_spec.rb
117
+ homepage: https://github.com/dasdad/corpus-processor
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - '>='
128
+ - !ruby/object:Gem::Version
129
+ version: 2.0.0
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - '>='
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.0.0
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: Handle linguistic corpus and convert it to use NLP tools
141
+ test_files:
142
+ - spec/integration/cli_spec.rb
143
+ - spec/spec_helper.rb
144
+ - spec/unit/generators/stanford_ner_spec.rb
145
+ - spec/unit/parsers/harem_spec.rb
146
+ - spec/unit/processor.rb
147
+ - spec/unit/token_spec.rb
148
+ - spec/unit/tokenizer_spec.rb
149
+ - spec/unit/traverser_spec.rb