corpus-processor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8a0ff96102528239769c105832893034e21434bf
4
+ data.tar.gz: 625ffe80fa8399f20610e048c6ce346a69eef9c0
5
+ SHA512:
6
+ metadata.gz: 1716f52826fa5b895977760e33f5e918a9b7fcebd0d3448b6419c4cb9e8d1b7902f8d99cb6646f4b33693f5743aac3802bc4476e1eac9db555cd188d52acb9e0
7
+ data.tar.gz: 770efa624c0c2fcb0b3170d10dcce05069f90650f04b775f6d5662c9ac4b61b71f7884831e4903c015398d21cb21498206f6f2bc4a41cf59b6905d887222d9b8
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,86 @@
1
+ Corpus Processor
2
+ ================
3
+
4
+ [![Gem Version](https://badge.fury.io/rb/corpus-processor.svg)](https://badge.fury.io/rb/corpus-processor)
5
+
6
+ Tool to work with [Corpus Linguistics](http://en.wikipedia.org/wiki/Corpus_linguistics). Corpus Processor converts _corpora_ between different formats for use in Natural Language Processing (NLP) tools.
7
+
8
+ The first purpose of Corpus Processor — and currently its only feature — is to transform _corpora_ found in [Linguateca](http://www.linguateca.pt) into the format used for training in [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
9
+
10
+ [Linguateca](http://www.linguateca.pt) is an excellent source of _corpora_ in Portuguese.
11
+
12
+ [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) is an excellent implementation of [Named Entity Recognition](http://en.wikipedia.org/wiki/Named-entity_recognition).
13
+
14
+ Installation
15
+ ------------
16
+
17
+ Corpus Processor is a [Ruby](http://www.ruby-lang.org/) [Gem](http://rubygems.org/). To install it, given a working installation of Ruby, run:
18
+
19
+ ```bash
20
+ $ gem install corpus-processor
21
+ ```
22
+
23
+ Usage
24
+ -----
25
+
26
+ Convert corpus from HAREM format to Stanford-NER format:
27
+
28
+ ```bash
29
+ $ corpus-processor process [INPUT_FILE [OUTPUT_FILE]]
30
+ ```
31
+
32
+ Results
33
+ -------
34
+
35
+ For an example of converting one corpus with Corpus Processor, refer to this [gist](https://gist.github.com/leafac/5259008).
36
+
37
+ The corpus is from [Linguateca](http://www.linguateca.pt/HAREM/) and the training used [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
38
+
39
+ Contributing
40
+ ------------
41
+
42
+ 1. Fork it.
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`).
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`).
45
+ 4. Push to the branch (`git push origin my-new-feature`).
46
+ 5. Create new Pull Request.
47
+
48
+ Changelog
49
+ ---------
50
+
51
+ ### 0.0.1
52
+
53
+ * [Harem](http://www.linguateca.pt/HAREM/) Parser.
54
+ * [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) Generator.
55
+
56
+ Thanks
57
+ ------
58
+
59
+ * *Diana Santos* and her team in [Linguateca](http://www.linguateca.pt) for the semantically annotated corpus in Portuguese.
60
+ * *[Stanford NLP team](http://www-nlp.stanford.edu/)* for the [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) tool.
61
+
62
+ License
63
+ -------
64
+
65
+ Copyright (c) 2013 Das Dad
66
+
67
+ MIT License
68
+
69
+ Permission is hereby granted, free of charge, to any person obtaining
70
+ a copy of this software and associated documentation files (the
71
+ "Software"), to deal in the Software without restriction, including
72
+ without limitation the rights to use, copy, modify, merge, publish,
73
+ distribute, sublicense, and/or sell copies of the Software, and to
74
+ permit persons to whom the Software is furnished to do so, subject to
75
+ the following conditions:
76
+
77
+ The above copyright notice and this permission notice shall be
78
+ included in all copies or substantial portions of the Software.
79
+
80
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
81
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
82
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
83
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
84
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
85
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
86
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $LOAD_PATH.unshift(File.expand_path("../../lib", __FILE__))
4
+
5
+ require "bundler/setup"
6
+
7
+ require "corpus-processor/cli"
8
+
9
+ CorpusProcessor::Cli.start(ARGV)
@@ -0,0 +1,28 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'corpus-processor/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "corpus-processor"
7
+ spec.version = CorpusProcessor::VERSION
8
+ spec.authors = ["Das Dad"]
9
+ spec.email = ["feedback@dasdad.com.br"]
10
+ spec.description = %q{Process linguistic corpus}
11
+ spec.summary = %q{Handle linguistic corpus and convert it to use NLP tools}
12
+ spec.homepage = "https://github.com/dasdad/corpus-processor"
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files`.split($/)
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
21
+
22
+ spec.add_dependency "thor"
23
+
24
+ spec.add_development_dependency "bundler", "~> 1.3"
25
+ spec.add_development_dependency "rspec"
26
+ spec.add_development_dependency "fakefs"
27
+ spec.add_development_dependency "pry-nav"
28
+ end
@@ -0,0 +1,8 @@
1
+ require "corpus-processor/version"
2
+ require "corpus-processor/token"
3
+ require "corpus-processor/default_categories"
4
+ require "corpus-processor/traverser"
5
+ require "corpus-processor/tokenizer"
6
+ require "corpus-processor/parsers"
7
+ require "corpus-processor/generators"
8
+ require "corpus-processor/processor"
@@ -0,0 +1,17 @@
1
+ require "corpus-processor"
2
+ require "thor"
3
+
4
+ module CorpusProcessor
5
+ class Cli < ::Thor
6
+
7
+ desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from HAREM format to Stanford-NER format"
8
+ def process(input_file = $stdin, output_file = $stdout)
9
+ input_file = File.new( input_file, "r") if input_file.is_a? String
10
+ output_file = File.new(output_file, "w") if output_file.is_a? String
11
+
12
+ output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
13
+
14
+ output_file.close
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,14 @@
1
+ module CorpusProcessor
2
+ DEFAULT_CATEGORIES = {
3
+ input: {
4
+ "PESSOA" => :person,
5
+ "LOCAL" => :location,
6
+ "ORGANIZACAO" => :organization,
7
+ },
8
+ output: Hash.new("O").merge(
9
+ person: "PERSON",
10
+ location: "LOCATION",
11
+ organization: "ORGANIZATION",
12
+ )
13
+ }
14
+ end
@@ -0,0 +1 @@
1
+ require "corpus-processor/generators/stanford_ner"
@@ -0,0 +1,13 @@
1
+ module CorpusProcessor::Generators
2
+ class StanfordNer
3
+ def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
4
+ @categories = categories
5
+ end
6
+
7
+ def generate(tokens)
8
+ tokens.map { |token|
9
+ "#{ token.word } #{ @categories[token.category] }"
10
+ }.join("\n") + "\n"
11
+ end
12
+ end
13
+ end
@@ -0,0 +1 @@
1
+ require "corpus-processor/parsers/harem"
@@ -0,0 +1,52 @@
1
+ module CorpusProcessor::Parsers
2
+ class Harem
3
+
4
+ CATEGORY_REGEX = /
5
+ (?<any_text> .*? ){0}
6
+ (?<entity_attributes> \s\g<any_text>
7
+ CATEG="\g<categories>"\g<any_text> ){0}
8
+ (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
9
+ (?<entity_closing_tag> <\/em> ){0}
10
+
11
+ # groups of interest
12
+ (?<inner_text> \g<any_text> ){0}
13
+ (?<categories> \g<any_text> ){0}
14
+
15
+ \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
16
+ /ix
17
+
18
+ def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
19
+ traverser = CorpusProcessor::Traverser.new,
20
+ tokenizer = CorpusProcessor::Tokenizer.new)
21
+ @categories = categories
22
+ @traverser = traverser
23
+ @tokenizer = tokenizer
24
+ end
25
+
26
+ def parse(corpus)
27
+ [].tap { |tokens|
28
+ @traverser.traverse(@tokenizer.join_lines(corpus),
29
+ CATEGORY_REGEX) do |match|
30
+ text_to_tokenize, category = case match
31
+ when String
32
+ [match, nil]
33
+ when MatchData
34
+ [
35
+ match[:inner_text],
36
+ extract_category(match[:categories])
37
+ ]
38
+ end
39
+ tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
40
+ end
41
+ }
42
+ end
43
+
44
+ def extract_category(categories)
45
+ categories
46
+ .split("|")
47
+ .map { |category_string| @categories[category_string] }
48
+ .compact
49
+ .first
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,11 @@
1
+ class CorpusProcessor::Processor
2
+ def initialize(parser = CorpusProcessor::Parsers::Harem.new,
3
+ generator = CorpusProcessor::Generators::StanfordNer.new)
4
+ @parser = parser
5
+ @generator = generator
6
+ end
7
+
8
+ def process(corpus)
9
+ @generator.generate(@parser.parse(corpus))
10
+ end
11
+ end
@@ -0,0 +1,2 @@
1
+ class CorpusProcessor::Token < Struct.new(:word, :category)
2
+ end
@@ -0,0 +1,17 @@
1
+ class CorpusProcessor::Tokenizer
2
+ def tokenize(text, category = nil)
3
+ strip_tags(text)
4
+ .gsub(/[[:punct:]]/, "")
5
+ .strip
6
+ .split(/\s+/)
7
+ .map { |word| CorpusProcessor::Token.new(word, category) }
8
+ end
9
+
10
+ def strip_tags(text)
11
+ text.gsub(/<.*?>/, " ").strip
12
+ end
13
+
14
+ def join_lines(text)
15
+ text.gsub(/\s+/, " ").strip
16
+ end
17
+ end
@@ -0,0 +1,19 @@
1
+ class CorpusProcessor::Traverser
2
+ def traverse(text, regexp, &block)
3
+ return if block.nil?
4
+ remaining_search = text
5
+ until remaining_search.empty?
6
+ match = remaining_search.match(regexp)
7
+ if match.nil?
8
+ block.call remaining_search unless remaining_search.empty?
9
+ remaining_search = ""
10
+ else
11
+ before = remaining_search[0...match.begin(0)]
12
+ remaining_search = remaining_search[match.end(0)..-1]
13
+
14
+ block.call before unless before.empty?
15
+ block.call match
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,3 @@
1
+ module CorpusProcessor
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,100 @@
1
+ require "spec_helper"
2
+
3
+ require "corpus-processor/cli"
4
+
5
+ describe CorpusProcessor::Cli do
6
+ include FakeFS::SpecHelpers
7
+ subject(:cli) { CorpusProcessor::Cli.new }
8
+
9
+ let(:input_file) { "input_file" }
10
+ let(:output_file) { "output_file" }
11
+
12
+ before do
13
+ File.open(input_file, "w") { |file|
14
+ file.write <<-INPUT
15
+ <?xml version="1.0" encoding="ISO-8859-1"?>
16
+ <!DOCTYPE colHAREM>
17
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
18
+ <DOC DOCID="H2-dftre765">
19
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
20
+ <P>
21
+ A revolta histórica produz normalmente uma nova forma de pensamento quanto à forma de organização da sociedade. Assim foi com a
22
+ <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
23
+ . No seguimento do colapso de instituições monásticas e do escolasticismo
24
+ nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM>
25
+ na
26
+ <EM ID="H2-dftre765-37" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="DIVISAO">Europa</EM>
27
+ , acentuado pela "
28
+ <OMITIDO> <EM ID="H2-dftre765-17" CATEG="ACONTECIMENTO" TIPO="EFEMERIDE">Cativeiro Babilónica da igreja</EM></OMITIDO>
29
+ "
30
+ no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
31
+ , o
32
+ INPUT
33
+ }
34
+ end
35
+
36
+ describe "#process" do
37
+ before do
38
+ cli.process(input_file, output_file)
39
+ end
40
+
41
+ specify { File.read(output_file).should == <<-OUTPUT }
42
+ Fatores O
43
+ Demográficos O
44
+ e O
45
+ Econômicos O
46
+ Subjacentes O
47
+ A O
48
+ revolta O
49
+ histórica O
50
+ produz O
51
+ normalmente O
52
+ uma O
53
+ nova O
54
+ forma O
55
+ de O
56
+ pensamento O
57
+ quanto O
58
+ à O
59
+ forma O
60
+ de O
61
+ organização O
62
+ da O
63
+ sociedade O
64
+ Assim O
65
+ foi O
66
+ com O
67
+ a O
68
+ Reforma O
69
+ Protestante O
70
+ No O
71
+ seguimento O
72
+ do O
73
+ colapso O
74
+ de O
75
+ instituições O
76
+ monásticas O
77
+ e O
78
+ do O
79
+ escolasticismo O
80
+ nos O
81
+ finais O
82
+ da O
83
+ Idade O
84
+ Média O
85
+ na O
86
+ Europa LOCATION
87
+ acentuado O
88
+ pela O
89
+ Cativeiro O
90
+ Babilónica O
91
+ da O
92
+ igreja O
93
+ no O
94
+ papado O
95
+ de O
96
+ Avignon O
97
+ o O
98
+ OUTPUT
99
+ end
100
+ end
@@ -0,0 +1,22 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+
8
+ require "fakefs/spec_helpers"
9
+
10
+ require "corpus-processor"
11
+
12
+ RSpec.configure do |config|
13
+ config.treat_symbols_as_metadata_keys_with_true_values = true
14
+ config.run_all_when_everything_filtered = true
15
+ config.filter_run :focus
16
+
17
+ # Run specs in random order to surface order dependencies. If you find an
18
+ # order dependency and want to debug it, you can fix the order by providing
19
+ # the seed, which is printed after each run.
20
+ # --seed 1234
21
+ config.order = "random"
22
+ end
@@ -0,0 +1,46 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Generators::StanfordNer do
4
+ subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
5
+
6
+ describe "#generate" do
7
+ subject { stanford_ner.generate(tokens) }
8
+
9
+ context "no tokens" do
10
+ let(:tokens) { [] }
11
+
12
+ it { should == "\n" }
13
+ end
14
+
15
+ context "one token" do
16
+ let(:tokens) { [CorpusProcessor::Token.new("banana")] }
17
+
18
+ it { should == "banana O\n" }
19
+ end
20
+
21
+ context "two tokens" do
22
+ let(:tokens) { [
23
+ CorpusProcessor::Token.new("good"),
24
+ CorpusProcessor::Token.new("banana"),
25
+ ] }
26
+
27
+ it { should == "good O\nbanana O\n" }
28
+ end
29
+
30
+ context "with category" do
31
+ let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }
32
+
33
+ it { should == "Leandro PERSON\n" }
34
+ end
35
+
36
+ context "with non-default categories" do
37
+ let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
38
+ banana: "BANANA"
39
+ ) }
40
+
41
+ let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }
42
+
43
+ it { should == "Nanica BANANA\n" }
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,269 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Parsers::Harem do
4
+ subject(:harem) { CorpusProcessor::Parsers::Harem.new }
5
+
6
+ describe "#parse" do
7
+ subject { harem.parse(corpus) }
8
+
9
+ context "default categories" do
10
+ context "empty corpus" do
11
+ let(:corpus) { "" }
12
+
13
+ it { should == [] }
14
+ end
15
+
16
+ context "doctype" do
17
+ let(:corpus) {
18
+ <<-CORPUS
19
+ <?xml version="1.0" encoding="ISO-8859-1"?>
20
+ <!DOCTYPE colHAREM>
21
+ CORPUS
22
+ }
23
+
24
+ it { should == [] }
25
+ end
26
+
27
+ context "simple phrase" do
28
+ let(:corpus) {
29
+ <<-CORPUS
30
+ <?xml version="1.0" encoding="ISO-8859-1"?>
31
+ <!DOCTYPE colHAREM>
32
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
33
+ <DOC DOCID="H2-dftre765">
34
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
35
+ </DOC>
36
+ </colHAREM>
37
+ CORPUS
38
+ }
39
+
40
+ it { should == [
41
+ CorpusProcessor::Token.new("Fatores"),
42
+ CorpusProcessor::Token.new("Demográficos"),
43
+ CorpusProcessor::Token.new("e"),
44
+ CorpusProcessor::Token.new("Econômicos"),
45
+ CorpusProcessor::Token.new("Subjacentes"),
46
+ ]
47
+ }
48
+ end
49
+
50
+ context "two simple phrases" do
51
+ let(:corpus) {
52
+ <<-CORPUS
53
+ <?xml version="1.0" encoding="ISO-8859-1"?>
54
+ <!DOCTYPE colHAREM>
55
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
56
+ <DOC DOCID="H2-dftre765">
57
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
58
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
59
+ </DOC>
60
+ </colHAREM>
61
+ CORPUS
62
+ }
63
+
64
+ it { should == [
65
+ CorpusProcessor::Token.new("Fatores"),
66
+ CorpusProcessor::Token.new("Demográficos"),
67
+ CorpusProcessor::Token.new("e"),
68
+ CorpusProcessor::Token.new("Econômicos"),
69
+ CorpusProcessor::Token.new("Subjacentes"),
70
+ CorpusProcessor::Token.new("Fatores"),
71
+ CorpusProcessor::Token.new("Demográficos"),
72
+ CorpusProcessor::Token.new("e"),
73
+ CorpusProcessor::Token.new("Econômicos"),
74
+ CorpusProcessor::Token.new("Subjacentes"),
75
+ ]
76
+ }
77
+ end
78
+
79
+ context "useless entity" do
80
+ let(:corpus) {
81
+ <<-CORPUS
82
+ <?xml version="1.0" encoding="ISO-8859-1"?>
83
+ <!DOCTYPE colHAREM>
84
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
85
+ <DOC DOCID="H2-dftre765">
86
+ <P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
87
+ </DOC>
88
+ </colHAREM>
89
+ CORPUS
90
+ }
91
+
92
+ it { should == [
93
+ CorpusProcessor::Token.new("Nos"),
94
+ CorpusProcessor::Token.new("finais"),
95
+ CorpusProcessor::Token.new("da"),
96
+ CorpusProcessor::Token.new("Idade"),
97
+ CorpusProcessor::Token.new("Média"),
98
+ ]
99
+ }
100
+ end
101
+
102
+ context "one entity" do
103
+ let(:corpus) {
104
+ <<-CORPUS
105
+ <?xml version="1.0" encoding="ISO-8859-1"?>
106
+ <!DOCTYPE colHAREM>
107
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
108
+ <DOC DOCID="H2-dftre765">
109
+ <P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
110
+ <EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
111
+ </P>
112
+ </DOC>
113
+ </colHAREM>
114
+ CORPUS
115
+ }
116
+
117
+ it { should == [
118
+ CorpusProcessor::Token.new("Foram"),
119
+ CorpusProcessor::Token.new("igualmente"),
120
+ CorpusProcessor::Token.new("determinantes"),
121
+ CorpusProcessor::Token.new("para"),
122
+ CorpusProcessor::Token.new("evitar"),
123
+ CorpusProcessor::Token.new("que"),
124
+ CorpusProcessor::Token.new("as"),
125
+ CorpusProcessor::Token.new("ideias"),
126
+ CorpusProcessor::Token.new("reformadoras"),
127
+ CorpusProcessor::Token.new("encontrassem"),
128
+ CorpusProcessor::Token.new("divulgação"),
129
+ CorpusProcessor::Token.new("em"),
130
+ CorpusProcessor::Token.new("Portugal", :location),
131
+ ]
132
+ }
133
+ end
134
+
135
+ context "multiple entities" do
136
+ let(:corpus) {
137
+ <<-CORPUS
138
+ <?xml version="1.0" encoding="ISO-8859-1"?>
139
+ <!DOCTYPE colHAREM>
140
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
141
+ <DOC DOCID="H2-dftre765">
142
+ <P>
143
+ A imprensa, inventada na
144
+ <EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
145
+ por
146
+ <EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
147
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
148
+ e a censura
149
+ </P>
150
+ </DOC>
151
+ </colHAREM>
152
+ CORPUS
153
+ }
154
+
155
+ it { should == [
156
+ CorpusProcessor::Token.new("A"),
157
+ CorpusProcessor::Token.new("imprensa"),
158
+ CorpusProcessor::Token.new("inventada"),
159
+ CorpusProcessor::Token.new("na"),
160
+ CorpusProcessor::Token.new("Alemanha", :location),
161
+ CorpusProcessor::Token.new("por"),
162
+ CorpusProcessor::Token.new("John", :person),
163
+ CorpusProcessor::Token.new("Gutenberg", :person),
164
+ CorpusProcessor::Token.new("Inquisição", :organization),
165
+ CorpusProcessor::Token.new("e"),
166
+ CorpusProcessor::Token.new("a"),
167
+ CorpusProcessor::Token.new("censura"),
168
+ ]
169
+ }
170
+ end
171
+
172
+ context "spaces after ponctuation" do
173
+ let(:corpus) {
174
+ <<-CORPUS
175
+ <?xml version="1.0" encoding="ISO-8859-1"?>
176
+ <!DOCTYPE colHAREM>
177
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
178
+ <DOC DOCID="H2-dftre765">
179
+ <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
180
+ . No
181
+ </DOC>
182
+ </colHAREM>
183
+ CORPUS
184
+ }
185
+
186
+ it { should == [
187
+ CorpusProcessor::Token.new("Reforma"),
188
+ CorpusProcessor::Token.new("Protestante"),
189
+ CorpusProcessor::Token.new("No"),
190
+ ]
191
+ }
192
+ end
193
+ end
194
+
195
+ context "user-defined categories" do
196
+ let(:harem) {
197
+ CorpusProcessor::Parsers::Harem.new({
198
+ "FRUTA" => :fruit,
199
+ "LIVRO" => :book,
200
+ })
201
+ }
202
+
203
+ context "multiple entities" do
204
+ let(:corpus) {
205
+ <<-CORPUS
206
+ <?xml version="1.0" encoding="ISO-8859-1"?>
207
+ <!DOCTYPE colHAREM>
208
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
209
+ <DOC DOCID="H2-dftre765">
210
+ <P>
211
+ A imprensa, inventada na
212
+ <EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
213
+ por
214
+ <EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
215
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
216
+ e a censura
217
+ </P>
218
+ </DOC>
219
+ </colHAREM>
220
+ CORPUS
221
+ }
222
+
223
+ it { should == [
224
+ CorpusProcessor::Token.new("A"),
225
+ CorpusProcessor::Token.new("imprensa"),
226
+ CorpusProcessor::Token.new("inventada"),
227
+ CorpusProcessor::Token.new("na"),
228
+ CorpusProcessor::Token.new("Banana", :fruit),
229
+ CorpusProcessor::Token.new("por"),
230
+ CorpusProcessor::Token.new("Harry", :book),
231
+ CorpusProcessor::Token.new("Potter", :book),
232
+ CorpusProcessor::Token.new("Inquisição"),
233
+ CorpusProcessor::Token.new("e"),
234
+ CorpusProcessor::Token.new("a"),
235
+ CorpusProcessor::Token.new("censura"),
236
+ ]
237
+ }
238
+ end
239
+ end
240
+ end
241
+
242
+ describe "#extract_category" do
243
+ subject { harem.extract_category(categories) }
244
+
245
+ context "empty categories" do
246
+ let(:categories) { "" }
247
+
248
+ it { should == nil }
249
+ end
250
+
251
+ context "one category" do
252
+ let(:categories) { "PESSOA" }
253
+
254
+ it { should == :person }
255
+ end
256
+
257
+ context "two categories" do
258
+ let(:categories) { "OUTRA|ORGANIZACAO" }
259
+
260
+ it { should == :organization }
261
+ end
262
+
263
+ context "ambiguidade" do
264
+ let(:categories) { "PESSOA|ORGANIZACAO" }
265
+
266
+ it { should == :person }
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,37 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Processor do
4
+ subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }
5
+
6
+ describe "#process" do
7
+ subject { processor.process(corpus) }
8
+
9
+ let(:corpus) { "Some corpus" }
10
+ let(:processed_corpus) {
11
+ <<-CORPUS
12
+ Some O
13
+ corpus O
14
+ CORPUS
15
+ }
16
+ let(:tokens) {
17
+ [
18
+ CorpusProcessor::Token.new("Some"),
19
+ CorpusProcessor::Token.new("corpus"),
20
+ ]
21
+ }
22
+ let(:parser) { double :parser }
23
+ let(:generator) { double :generator }
24
+
25
+ specify {
26
+ parser.should_receive(:parse)
27
+ .with(corpus)
28
+ .and_return(tokens)
29
+
30
+ generator.should_receive(:generate)
31
+ .with(tokens)
32
+ .and_return(processed_corpus)
33
+
34
+ subject.should == processed_corpus
35
+ }
36
+ end
37
+ end
@@ -0,0 +1,8 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Token do
4
+ subject { CorpusProcessor::Token.new }
5
+
6
+ it { should respond_to(:word) }
7
+ it { should respond_to(:category) }
8
+ end
@@ -0,0 +1,121 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Tokenizer do
4
+ subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
5
+
6
+ describe "#tokenize" do
7
+ subject { tokenizer.tokenize(text, category) }
8
+
9
+ let(:category) { nil }
10
+
11
+ context "empty string" do
12
+ let(:text) { "" }
13
+
14
+ it { should == [] }
15
+ end
16
+
17
+ context "one word" do
18
+ let(:text) { "banana" }
19
+
20
+ it { should == [CorpusProcessor::Token.new("banana")] }
21
+ end
22
+
23
+ context "two words" do
24
+ let(:text) { "good banana" }
25
+
26
+ it { should == [
27
+ CorpusProcessor::Token.new("good"),
28
+ CorpusProcessor::Token.new("banana"),
29
+ ] }
30
+ end
31
+
32
+ context "ponctuation" do
33
+ let(:text) { "good, banana" }
34
+
35
+ it { should == [
36
+ CorpusProcessor::Token.new("good"),
37
+ CorpusProcessor::Token.new("banana"),
38
+ ] }
39
+ end
40
+
41
+ context "default category" do
42
+ let(:text) { "Google" }
43
+ let(:category) { :organization }
44
+
45
+ it { should == [
46
+ CorpusProcessor::Token.new("Google", :organization),
47
+ ] }
48
+ end
49
+
50
+ context "with tags" do
51
+ let(:text) { "good<lalala/>, banana" }
52
+
53
+ it { should == [
54
+ CorpusProcessor::Token.new("good"),
55
+ CorpusProcessor::Token.new("banana"),
56
+ ] }
57
+ end
58
+ end
59
+
60
+ describe "#strip_tags" do
61
+ subject { tokenizer.strip_tags(text) }
62
+
63
+ context "empty text" do
64
+ let(:text) { "" }
65
+
66
+ it { should == "" }
67
+ end
68
+
69
+ context "self closed tag" do
70
+ let(:text) { "<br/>" }
71
+
72
+ it { should == "" }
73
+ end
74
+
75
+ context "tag with content" do
76
+ let(:text) { "<p>Some text</p>" }
77
+
78
+ it { should == "Some text" }
79
+ end
80
+
81
+ context "content after tag" do
82
+ let(:text) { "<p>Some<br/>text</p>" }
83
+
84
+ it { should == "Some text" }
85
+ end
86
+ end
87
+
88
+ describe "#join_lines" do
89
+ subject { tokenizer.join_lines(text) }
90
+
91
+ context "empty text" do
92
+ let(:text) { "" }
93
+
94
+ it { should == "" }
95
+ end
96
+
97
+ context "one word" do
98
+ let(:text) { "banana" }
99
+
100
+ it { should == "banana" }
101
+ end
102
+
103
+ context "two lines" do
104
+ let(:text) { "banana\nquiabo" }
105
+
106
+ it { should == "banana quiabo" }
107
+ end
108
+
109
+ context "line with empty space" do
110
+ let(:text) { "banana\n \nquiabo" }
111
+
112
+ it { should == "banana quiabo" }
113
+ end
114
+
115
+ context "leading spaces" do
116
+ let(:text) { " \n banana\n \nquiabo \n" }
117
+
118
+ it { should == "banana quiabo" }
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,68 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Traverser do
4
+ subject(:traverser) { CorpusProcessor::Traverser.new }
5
+
6
+ describe "#traverse" do
7
+ subject { traverser.traverse(text, regexp) }
8
+
9
+ context "empty text" do
10
+ let(:text) { "" }
11
+ let(:regexp) { // }
12
+
13
+ specify {
14
+ expect { |mock_block|
15
+ traverser.traverse(text, regexp, &mock_block)
16
+ }.not_to yield_control
17
+ }
18
+ end
19
+
20
+ context "simple text" do
21
+ let(:text) { "abc" }
22
+ let(:regexp) { /b/ }
23
+
24
+ specify {
25
+ expect { |mock_block|
26
+ traverser.traverse(text, regexp, &mock_block)
27
+ }.to yield_successive_args "a", text.match(regexp), "c"
28
+ }
29
+ end
30
+
31
+ context "two matches" do
32
+ let(:text) { "abcbd" }
33
+ let(:regexp) { /b/ }
34
+
35
+ specify {
36
+ expect { |mock_block|
37
+ traverser.traverse(text, regexp, &mock_block)
38
+ }.to yield_successive_args "a",
39
+ text.match(regexp),
40
+ "c",
41
+ text[2..-1].match(regexp),
42
+ "d"
43
+ }
44
+ end
45
+
46
+ context "match in beginning" do
47
+ let(:text) { "bc" }
48
+ let(:regexp) { /b/ }
49
+
50
+ specify {
51
+ expect { |mock_block|
52
+ traverser.traverse(text, regexp, &mock_block)
53
+ }.to yield_successive_args text.match(regexp), "c"
54
+ }
55
+ end
56
+
57
+ context "match in ending" do
58
+ let(:text) { "bc" }
59
+ let(:regexp) { /c/ }
60
+
61
+ specify {
62
+ expect { |mock_block|
63
+ traverser.traverse(text, regexp, &mock_block)
64
+ }.to yield_successive_args "b", text.match(regexp)
65
+ }
66
+ end
67
+ end
68
+ end
metadata ADDED
@@ -0,0 +1,149 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: corpus-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Das Dad
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fakefs
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry-nav
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Process linguistic corpus
84
+ email:
85
+ - feedback@dasdad.com.br
86
+ executables:
87
+ - corpus-processor
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - .rspec
93
+ - Gemfile
94
+ - README.md
95
+ - bin/corpus-processor
96
+ - corpus-processor.gemspec
97
+ - lib/corpus-processor.rb
98
+ - lib/corpus-processor/cli.rb
99
+ - lib/corpus-processor/default_categories.rb
100
+ - lib/corpus-processor/generators.rb
101
+ - lib/corpus-processor/generators/stanford_ner.rb
102
+ - lib/corpus-processor/parsers.rb
103
+ - lib/corpus-processor/parsers/harem.rb
104
+ - lib/corpus-processor/processor.rb
105
+ - lib/corpus-processor/token.rb
106
+ - lib/corpus-processor/tokenizer.rb
107
+ - lib/corpus-processor/traverser.rb
108
+ - lib/corpus-processor/version.rb
109
+ - spec/integration/cli_spec.rb
110
+ - spec/spec_helper.rb
111
+ - spec/unit/generators/stanford_ner_spec.rb
112
+ - spec/unit/parsers/harem_spec.rb
113
+ - spec/unit/processor.rb
114
+ - spec/unit/token_spec.rb
115
+ - spec/unit/tokenizer_spec.rb
116
+ - spec/unit/traverser_spec.rb
117
+ homepage: https://github.com/dasdad/corpus-processor
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - '>='
128
+ - !ruby/object:Gem::Version
129
+ version: 2.0.0
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - '>='
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.0.0
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: Handle linguistic corpus and convert it to use NLP tools
141
+ test_files:
142
+ - spec/integration/cli_spec.rb
143
+ - spec/spec_helper.rb
144
+ - spec/unit/generators/stanford_ner_spec.rb
145
+ - spec/unit/parsers/harem_spec.rb
146
+ - spec/unit/processor.rb
147
+ - spec/unit/token_spec.rb
148
+ - spec/unit/tokenizer_spec.rb
149
+ - spec/unit/traverser_spec.rb