corpus-processor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8a0ff96102528239769c105832893034e21434bf
4
+ data.tar.gz: 625ffe80fa8399f20610e048c6ce346a69eef9c0
5
+ SHA512:
6
+ metadata.gz: 1716f52826fa5b895977760e33f5e918a9b7fcebd0d3448b6419c4cb9e8d1b7902f8d99cb6646f4b33693f5743aac3802bc4476e1eac9db555cd188d52acb9e0
7
+ data.tar.gz: 770efa624c0c2fcb0b3170d10dcce05069f90650f04b775f6d5662c9ac4b61b71f7884831e4903c015398d21cb21498206f6f2bc4a41cf59b6905d887222d9b8
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/README.md ADDED
@@ -0,0 +1,86 @@
1
+ Corpus Processor
2
+ ================
3
+
4
+ [![Gem Version](https://badge.fury.io/rb/corpus-processor.svg)](https://badge.fury.io/rb/corpus-processor)
5
+
6
+ Tool to work with [Corpus Linguistics](http://en.wikipedia.org/wiki/Corpus_linguistics). Corpus Processor converts _corpora_ between different formats for use in Natural Language Processing (NLP) tools.
7
+
8
+ The first purpose of Corpus Processor — and currently its only feature — is to transform _corpora_ found in [Linguateca](http://www.linguateca.pt) into the format used for training in [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
9
+
10
+ [Linguateca](http://www.linguateca.pt) is an excellent source of _corpora_ in Portuguese.
11
+
12
+ [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) is an excellent implementation of [Named Entity Recognition](http://en.wikipedia.org/wiki/Named-entity_recognition).
13
+
14
+ Installation
15
+ ------------
16
+
17
+ Corpus Processor is a [Ruby](http://www.ruby-lang.org/) [Gem](http://rubygems.org/). To install it, given a working installation of Ruby, run:
18
+
19
+ ```bash
20
+ $ gem install corpus-processor
21
+ ```
22
+
23
+ Usage
24
+ -----
25
+
26
+ Convert corpus from HAREM format to Stanford-NER format:
27
+
28
+ ```bash
29
+ $ corpus-processor process [INPUT_FILE [OUTPUT_FILE]]
30
+ ```
31
+
32
+ Results
33
+ -------
34
+
35
+ For an example of converting one corpus with Corpus Processor, refer to this [gist](https://gist.github.com/leafac/5259008).
36
+
37
+ The corpus is from [Linguateca](http://www.linguateca.pt/HAREM/) and the training used [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
38
+
39
+ Contributing
40
+ ------------
41
+
42
+ 1. Fork it.
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`).
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`).
45
+ 4. Push to the branch (`git push origin my-new-feature`).
46
+ 5. Create new Pull Request.
47
+
48
+ Changelog
49
+ ---------
50
+
51
+ ### 0.0.1
52
+
53
+ * [Harem](http://www.linguateca.pt/HAREM/) Parser.
54
+ * [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) Generator.
55
+
56
+ Thanks
57
+ ------
58
+
59
+ * *Diana Santos* and her team in [Linguateca](http://www.linguateca.pt) for the semantic annotated corpus in Portuguese.
60
+ * *[Stanford NLP team](http://www-nlp.stanford.edu/)* for the [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) tool.
61
+
62
+ License
63
+ -------
64
+
65
+ Copyright (c) 2013 Das Dad
66
+
67
+ MIT License
68
+
69
+ Permission is hereby granted, free of charge, to any person obtaining
70
+ a copy of this software and associated documentation files (the
71
+ "Software"), to deal in the Software without restriction, including
72
+ without limitation the rights to use, copy, modify, merge, publish,
73
+ distribute, sublicense, and/or sell copies of the Software, and to
74
+ permit persons to whom the Software is furnished to do so, subject to
75
+ the following conditions:
76
+
77
+ The above copyright notice and this permission notice shall be
78
+ included in all copies or substantial portions of the Software.
79
+
80
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
81
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
82
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
83
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
84
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
85
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
86
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,9 @@
1
#!/usr/bin/env ruby
# Executable entry point for the corpus-processor gem.

# Make the gem's lib/ directory loadable when running from a source checkout
# (not only from an installed gem).
$LOAD_PATH.unshift(File.expand_path("../../lib", __FILE__))

require "bundler/setup"

require "corpus-processor/cli"

# Hand the raw command-line arguments to the Thor-based CLI.
CorpusProcessor::Cli.start(ARGV)
@@ -0,0 +1,28 @@
1
# Gem packaging manifest. The version is read from the library itself so it
# is declared in exactly one place (lib/corpus-processor/version.rb).
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'corpus-processor/version'

Gem::Specification.new do |spec|
  spec.name          = "corpus-processor"
  spec.version       = CorpusProcessor::VERSION
  spec.authors       = ["Das Dad"]
  spec.email         = ["feedback@dasdad.com.br"]
  spec.description   = %q{Process linguistic corpus}
  spec.summary       = %q{Handle linguistic corpus and convert it to use NLP tools}
  spec.homepage      = "https://github.com/dasdad/corpus-processor"
  spec.license       = "MIT"

  # Package every git-tracked file; anything under bin/ becomes an executable.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = Gem::Requirement.new(">= 2.0.0")

  # Runtime dependency: Thor powers the command-line interface.
  spec.add_dependency "thor"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "fakefs"
  spec.add_development_dependency "pry-nav"
end
@@ -0,0 +1,8 @@
1
+ require "corpus-processor/version"
2
+ require "corpus-processor/token"
3
+ require "corpus-processor/default_categories"
4
+ require "corpus-processor/traverser"
5
+ require "corpus-processor/tokenizer"
6
+ require "corpus-processor/parsers"
7
+ require "corpus-processor/generators"
8
+ require "corpus-processor/processor"
@@ -0,0 +1,17 @@
1
require "corpus-processor"
require "thor"

module CorpusProcessor
  # Thor-based command-line interface for the gem.
  class Cli < ::Thor

    desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from HAREM format to Stanford-NER format"
    # Reads a HAREM corpus from INPUT_FILE (default: STDIN), converts it and
    # writes Stanford-NER training data to OUTPUT_FILE (default: STDOUT).
    def process(input_file = $stdin, output_file = $stdout)
      input_file  = File.new( input_file, "r") if input_file.is_a? String
      output_file = File.new(output_file, "w") if output_file.is_a? String

      output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
    ensure
      # Close only files this method opened itself. The previous version
      # closed output_file unconditionally (closing $stdout when it was the
      # default), skipped closing entirely on error, and leaked input_file.
      input_file.close  if input_file.is_a?(File)
      output_file.close if output_file.is_a?(File)
    end
  end
end
@@ -0,0 +1,14 @@
1
module CorpusProcessor
  # Category mappings used across the gem:
  #   :input  — HAREM CATEG attribute values mapped to internal symbols.
  #   :output — internal symbols mapped to Stanford-NER labels; any
  #             unrecognized category falls back to "O" ("outside", i.e.
  #             not part of a named entity) via the Hash default.
  # Frozen so the shared defaults cannot be mutated by accident; callers
  # wanting different mappings pass their own hash to the parser/generator.
  DEFAULT_CATEGORIES = {
    input: {
      "PESSOA"      => :person,
      "LOCAL"       => :location,
      "ORGANIZACAO" => :organization,
    }.freeze,
    output: Hash.new("O").merge(
      person:       "PERSON",
      location:     "LOCATION",
      organization: "ORGANIZATION",
    ).freeze,
  }.freeze
end
@@ -0,0 +1 @@
1
+ require "corpus-processor/generators/stanford_ner"
@@ -0,0 +1,13 @@
1
module CorpusProcessor::Generators
  # Emits Stanford-NER training data: one "word LABEL" pair per line.
  class StanfordNer
    # categories: hash mapping token category symbols to NER label strings.
    def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
      @categories = categories
    end

    # Renders each token as "<word> <label>", newline-separated, always
    # ending with a trailing newline (an empty token list yields "\n").
    def generate(tokens)
      lines = tokens.map do |token|
        label = @categories[token.category]
        "#{ token.word } #{ label }"
      end
      result = lines.join("\n")
      result << "\n"
    end
  end
end
@@ -0,0 +1 @@
1
+ require "corpus-processor/parsers/harem"
@@ -0,0 +1,52 @@
1
module CorpusProcessor::Parsers
  # Parser for corpora in the HAREM golden-collection format: XML in which
  # named entities are wrapped in <EM CATEG="..."> tags.
  class Harem

    # Matches a single <EM ...>text</EM> entity (case-insensitive, so it
    # also matches <em>). The `(?<name> ... ){0}` idiom declares a named
    # subexpression without consuming input; the final line composes them
    # into the actual pattern. Groups read by #parse:
    #   inner_text — the entity's text content
    #   categories — the CATEG attribute value (may be "A|B|..." alternatives)
    CATEGORY_REGEX = /
      (?<any_text> .*? ){0}
      (?<entity_attributes> \s\g<any_text>
      CATEG="\g<categories>"\g<any_text> ){0}
      (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
      (?<entity_closing_tag> <\/em> ){0}

      # groups of interest
      (?<inner_text> \g<any_text> ){0}
      (?<categories> \g<any_text> ){0}

      \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
    /ix

    # categories: maps HAREM CATEG values to internal category symbols.
    # traverser / tokenizer: collaborators, injectable for testing.
    def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
                   traverser = CorpusProcessor::Traverser.new,
                   tokenizer = CorpusProcessor::Tokenizer.new)
      @categories = categories
      @traverser = traverser
      @tokenizer = tokenizer
    end

    # Returns an Array of CorpusProcessor::Token. Text outside <EM> tags
    # produces uncategorized tokens; text inside produces tokens tagged
    # with the first recognized category of the entity.
    def parse(corpus)
      [].tap { |tokens|
        @traverser.traverse(@tokenizer.join_lines(corpus),
                            CATEGORY_REGEX) do |match|
          # The traverser yields plain Strings for unmatched stretches and
          # MatchData objects for entity matches.
          text_to_tokenize, category = case match
          when String
            [match, nil]
          when MatchData
            [
              match[:inner_text],
              extract_category(match[:categories])
            ]
          end
          tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
        end
      }
    end

    # Picks the first "|"-separated category name that is present in
    # @categories; returns nil when none is recognized (the entity is then
    # tokenized without a category).
    def extract_category(categories)
      categories
        .split("|")
        .map { |category_string| @categories[category_string] }
        .compact
        .first
    end
  end
end
@@ -0,0 +1,11 @@
1
# Pipeline facade: parses raw corpus text into tokens, then renders them
# with the generator. Both collaborators are injectable for testing.
class CorpusProcessor::Processor
  def initialize(parser = CorpusProcessor::Parsers::Harem.new,
                 generator = CorpusProcessor::Generators::StanfordNer.new)
    @parser    = parser
    @generator = generator
  end

  # Returns the generated output for +corpus+ (a String of HAREM XML when
  # using the default collaborators).
  def process(corpus)
    tokens = @parser.parse(corpus)
    @generator.generate(tokens)
  end
end
@@ -0,0 +1,2 @@
1
# A single corpus token: a word plus its (optional) entity category symbol,
# e.g. Token.new("Portugal", :location). Struct supplies accessors and ==.
class CorpusProcessor::Token < Struct.new(:word, :category)
end
@@ -0,0 +1,17 @@
1
# Splits raw corpus text into CorpusProcessor::Token objects, discarding
# markup, punctuation and surplus whitespace along the way.
class CorpusProcessor::Tokenizer
  # Returns an Array of Tokens for +text+, all tagged with +category+
  # (nil means "no entity category").
  def tokenize(text, category = nil)
    cleaned = strip_tags(text)
              .gsub(/[[:punct:]]/, "")
              .strip
    cleaned.split(/\s+/).map do |word|
      CorpusProcessor::Token.new(word, category)
    end
  end

  # Replaces every XML/HTML tag with a space and trims the result.
  def strip_tags(text)
    text.gsub(/<.*?>/, " ").strip
  end

  # Collapses all whitespace runs (including newlines) into single spaces
  # and trims the result, so multi-line corpora become one long line.
  def join_lines(text)
    text.gsub(/\s+/, " ").strip
  end
end
@@ -0,0 +1,19 @@
1
class CorpusProcessor::Traverser
  # Scans +text+ against +regexp+, yielding in document order:
  #   * String    — each non-empty stretch of text not matched by the regexp
  #   * MatchData — each match
  # Does nothing when no block is given.
  def traverse(text, regexp, &block)
    return if block.nil?

    remaining = text
    until remaining.empty?
      match = remaining.match(regexp)

      if match.nil?
        # No further matches: emit the tail and stop.
        block.call remaining
        remaining = ""
      else
        unmatched_prefix = remaining[0...match.begin(0)]
        block.call unmatched_prefix unless unmatched_prefix.empty?
        block.call match

        if match.end(0).zero?
          # Zero-width match at position 0: the previous implementation
          # looped forever here, since slicing from match.end(0) left the
          # string unchanged. Emit the skipped character as unmatched text
          # and advance one character to guarantee progress.
          block.call remaining[0, 1]
          remaining = remaining[1..-1]
        else
          remaining = remaining[match.end(0)..-1]
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module CorpusProcessor
  # Gem version; referenced by the gemspec so it is declared only here.
  VERSION = "0.0.1"
end
@@ -0,0 +1,100 @@
1
+ require "spec_helper"
2
+
3
+ require "corpus-processor/cli"
4
+
5
+ describe CorpusProcessor::Cli do
6
+ include FakeFS::SpecHelpers
7
+ subject(:cli) { CorpusProcessor::Cli.new }
8
+
9
+ let(:input_file) { "input_file" }
10
+ let(:output_file) { "output_file" }
11
+
12
+ before do
13
+ File.open(input_file, "w") { |file|
14
+ file.write <<-INPUT
15
+ <?xml version="1.0" encoding="ISO-8859-1"?>
16
+ <!DOCTYPE colHAREM>
17
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
18
+ <DOC DOCID="H2-dftre765">
19
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
20
+ <P>
21
+ A revolta histórica produz normalmente uma nova forma de pensamento quanto à forma de organização da sociedade. Assim foi com a
22
+ <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
23
+ . No seguimento do colapso de instituições monásticas e do escolasticismo
24
+ nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM>
25
+ na
26
+ <EM ID="H2-dftre765-37" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="DIVISAO">Europa</EM>
27
+ , acentuado pela "
28
+ <OMITIDO> <EM ID="H2-dftre765-17" CATEG="ACONTECIMENTO" TIPO="EFEMERIDE">Cativeiro Babilónica da igreja</EM></OMITIDO>
29
+ "
30
+ no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
31
+ , o
32
+ INPUT
33
+ }
34
+ end
35
+
36
+ describe "#process" do
37
+ before do
38
+ cli.process(input_file, output_file)
39
+ end
40
+
41
+ specify { File.read(output_file).should == <<-OUTPUT }
42
+ Fatores O
43
+ Demográficos O
44
+ e O
45
+ Econômicos O
46
+ Subjacentes O
47
+ A O
48
+ revolta O
49
+ histórica O
50
+ produz O
51
+ normalmente O
52
+ uma O
53
+ nova O
54
+ forma O
55
+ de O
56
+ pensamento O
57
+ quanto O
58
+ à O
59
+ forma O
60
+ de O
61
+ organização O
62
+ da O
63
+ sociedade O
64
+ Assim O
65
+ foi O
66
+ com O
67
+ a O
68
+ Reforma O
69
+ Protestante O
70
+ No O
71
+ seguimento O
72
+ do O
73
+ colapso O
74
+ de O
75
+ instituições O
76
+ monásticas O
77
+ e O
78
+ do O
79
+ escolasticismo O
80
+ nos O
81
+ finais O
82
+ da O
83
+ Idade O
84
+ Média O
85
+ na O
86
+ Europa LOCATION
87
+ acentuado O
88
+ pela O
89
+ Cativeiro O
90
+ Babilónica O
91
+ da O
92
+ igreja O
93
+ no O
94
+ papado O
95
+ de O
96
+ Avignon O
97
+ o O
98
+ OUTPUT
99
+ end
100
+ end
@@ -0,0 +1,22 @@
1
# This file was generated by the `rspec --init` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# Require this file using `require "spec_helper"` to ensure that it is only
# loaded once.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration

require "fakefs/spec_helpers"

require "corpus-processor"

RSpec.configure do |config|
  # Allow bare `:focus` (instead of `focus: true`) as example metadata.
  config.treat_symbols_as_metadata_keys_with_true_values = true
  # If a filter (e.g. :focus) matches nothing, run the whole suite instead.
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  # --seed 1234
  config.order = "random"
end
@@ -0,0 +1,46 @@
1
require "spec_helper"

# Unit specs for CorpusProcessor::Generators::StanfordNer: a list of Tokens
# in, newline-separated "word LABEL" pairs out, with unknown categories
# falling back to the "O" (outside-any-entity) label.
describe CorpusProcessor::Generators::StanfordNer do
  subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }

  describe "#generate" do
    subject { stanford_ner.generate(tokens) }

    context "no tokens" do
      let(:tokens) { [] }

      # Even an empty token list produces the trailing newline.
      it { should == "\n" }
    end

    context "one token" do
      let(:tokens) { [CorpusProcessor::Token.new("banana")] }

      it { should == "banana O\n" }
    end

    context "two tokens" do
      let(:tokens) { [
        CorpusProcessor::Token.new("good"),
        CorpusProcessor::Token.new("banana"),
      ] }

      it { should == "good O\nbanana O\n" }
    end

    context "with category" do
      let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }

      it { should == "Leandro PERSON\n" }
    end

    context "with non-default categories" do
      # Injecting a custom category map proves the default is overridable.
      let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
        banana: "BANANA"
      ) }

      let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }

      it { should == "Nanica BANANA\n" }
    end
  end
end
@@ -0,0 +1,269 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Parsers::Harem do
4
+ subject(:harem) { CorpusProcessor::Parsers::Harem.new }
5
+
6
+ describe "#parse" do
7
+ subject { harem.parse(corpus) }
8
+
9
+ context "default categories" do
10
+ context "empty corpus" do
11
+ let(:corpus) { "" }
12
+
13
+ it { should == [] }
14
+ end
15
+
16
+ context "doctype" do
17
+ let(:corpus) {
18
+ <<-CORPUS
19
+ <?xml version="1.0" encoding="ISO-8859-1"?>
20
+ <!DOCTYPE colHAREM>
21
+ CORPUS
22
+ }
23
+
24
+ it { should == [] }
25
+ end
26
+
27
+ context "simple phrase" do
28
+ let(:corpus) {
29
+ <<-CORPUS
30
+ <?xml version="1.0" encoding="ISO-8859-1"?>
31
+ <!DOCTYPE colHAREM>
32
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
33
+ <DOC DOCID="H2-dftre765">
34
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
35
+ </DOC>
36
+ </colHAREM>
37
+ CORPUS
38
+ }
39
+
40
+ it { should == [
41
+ CorpusProcessor::Token.new("Fatores"),
42
+ CorpusProcessor::Token.new("Demográficos"),
43
+ CorpusProcessor::Token.new("e"),
44
+ CorpusProcessor::Token.new("Econômicos"),
45
+ CorpusProcessor::Token.new("Subjacentes"),
46
+ ]
47
+ }
48
+ end
49
+
50
+ context "two simple phrases" do
51
+ let(:corpus) {
52
+ <<-CORPUS
53
+ <?xml version="1.0" encoding="ISO-8859-1"?>
54
+ <!DOCTYPE colHAREM>
55
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
56
+ <DOC DOCID="H2-dftre765">
57
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
58
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
59
+ </DOC>
60
+ </colHAREM>
61
+ CORPUS
62
+ }
63
+
64
+ it { should == [
65
+ CorpusProcessor::Token.new("Fatores"),
66
+ CorpusProcessor::Token.new("Demográficos"),
67
+ CorpusProcessor::Token.new("e"),
68
+ CorpusProcessor::Token.new("Econômicos"),
69
+ CorpusProcessor::Token.new("Subjacentes"),
70
+ CorpusProcessor::Token.new("Fatores"),
71
+ CorpusProcessor::Token.new("Demográficos"),
72
+ CorpusProcessor::Token.new("e"),
73
+ CorpusProcessor::Token.new("Econômicos"),
74
+ CorpusProcessor::Token.new("Subjacentes"),
75
+ ]
76
+ }
77
+ end
78
+
79
+ context "useless entity" do
80
+ let(:corpus) {
81
+ <<-CORPUS
82
+ <?xml version="1.0" encoding="ISO-8859-1"?>
83
+ <!DOCTYPE colHAREM>
84
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
85
+ <DOC DOCID="H2-dftre765">
86
+ <P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
87
+ </DOC>
88
+ </colHAREM>
89
+ CORPUS
90
+ }
91
+
92
+ it { should == [
93
+ CorpusProcessor::Token.new("Nos"),
94
+ CorpusProcessor::Token.new("finais"),
95
+ CorpusProcessor::Token.new("da"),
96
+ CorpusProcessor::Token.new("Idade"),
97
+ CorpusProcessor::Token.new("Média"),
98
+ ]
99
+ }
100
+ end
101
+
102
+ context "one entity" do
103
+ let(:corpus) {
104
+ <<-CORPUS
105
+ <?xml version="1.0" encoding="ISO-8859-1"?>
106
+ <!DOCTYPE colHAREM>
107
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
108
+ <DOC DOCID="H2-dftre765">
109
+ <P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
110
+ <EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
111
+ </P>
112
+ </DOC>
113
+ </colHAREM>
114
+ CORPUS
115
+ }
116
+
117
+ it { should == [
118
+ CorpusProcessor::Token.new("Foram"),
119
+ CorpusProcessor::Token.new("igualmente"),
120
+ CorpusProcessor::Token.new("determinantes"),
121
+ CorpusProcessor::Token.new("para"),
122
+ CorpusProcessor::Token.new("evitar"),
123
+ CorpusProcessor::Token.new("que"),
124
+ CorpusProcessor::Token.new("as"),
125
+ CorpusProcessor::Token.new("ideias"),
126
+ CorpusProcessor::Token.new("reformadoras"),
127
+ CorpusProcessor::Token.new("encontrassem"),
128
+ CorpusProcessor::Token.new("divulgação"),
129
+ CorpusProcessor::Token.new("em"),
130
+ CorpusProcessor::Token.new("Portugal", :location),
131
+ ]
132
+ }
133
+ end
134
+
135
+ context "multiple entities" do
136
+ let(:corpus) {
137
+ <<-CORPUS
138
+ <?xml version="1.0" encoding="ISO-8859-1"?>
139
+ <!DOCTYPE colHAREM>
140
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
141
+ <DOC DOCID="H2-dftre765">
142
+ <P>
143
+ A imprensa, inventada na
144
+ <EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
145
+ por
146
+ <EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
147
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
148
+ e a censura
149
+ </P>
150
+ </DOC>
151
+ </colHAREM>
152
+ CORPUS
153
+ }
154
+
155
+ it { should == [
156
+ CorpusProcessor::Token.new("A"),
157
+ CorpusProcessor::Token.new("imprensa"),
158
+ CorpusProcessor::Token.new("inventada"),
159
+ CorpusProcessor::Token.new("na"),
160
+ CorpusProcessor::Token.new("Alemanha", :location),
161
+ CorpusProcessor::Token.new("por"),
162
+ CorpusProcessor::Token.new("John", :person),
163
+ CorpusProcessor::Token.new("Gutenberg", :person),
164
+ CorpusProcessor::Token.new("Inquisição", :organization),
165
+ CorpusProcessor::Token.new("e"),
166
+ CorpusProcessor::Token.new("a"),
167
+ CorpusProcessor::Token.new("censura"),
168
+ ]
169
+ }
170
+ end
171
+
172
+ context "spaces after ponctuation" do
173
+ let(:corpus) {
174
+ <<-CORPUS
175
+ <?xml version="1.0" encoding="ISO-8859-1"?>
176
+ <!DOCTYPE colHAREM>
177
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
178
+ <DOC DOCID="H2-dftre765">
179
+ <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
180
+ . No
181
+ </DOC>
182
+ </colHAREM>
183
+ CORPUS
184
+ }
185
+
186
+ it { should == [
187
+ CorpusProcessor::Token.new("Reforma"),
188
+ CorpusProcessor::Token.new("Protestante"),
189
+ CorpusProcessor::Token.new("No"),
190
+ ]
191
+ }
192
+ end
193
+ end
194
+
195
+ context "user-defined categories" do
196
+ let(:harem) {
197
+ CorpusProcessor::Parsers::Harem.new({
198
+ "FRUTA" => :fruit,
199
+ "LIVRO" => :book,
200
+ })
201
+ }
202
+
203
+ context "multiple entities" do
204
+ let(:corpus) {
205
+ <<-CORPUS
206
+ <?xml version="1.0" encoding="ISO-8859-1"?>
207
+ <!DOCTYPE colHAREM>
208
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
209
+ <DOC DOCID="H2-dftre765">
210
+ <P>
211
+ A imprensa, inventada na
212
+ <EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
213
+ por
214
+ <EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
215
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
216
+ e a censura
217
+ </P>
218
+ </DOC>
219
+ </colHAREM>
220
+ CORPUS
221
+ }
222
+
223
+ it { should == [
224
+ CorpusProcessor::Token.new("A"),
225
+ CorpusProcessor::Token.new("imprensa"),
226
+ CorpusProcessor::Token.new("inventada"),
227
+ CorpusProcessor::Token.new("na"),
228
+ CorpusProcessor::Token.new("Banana", :fruit),
229
+ CorpusProcessor::Token.new("por"),
230
+ CorpusProcessor::Token.new("Harry", :book),
231
+ CorpusProcessor::Token.new("Potter", :book),
232
+ CorpusProcessor::Token.new("Inquisição"),
233
+ CorpusProcessor::Token.new("e"),
234
+ CorpusProcessor::Token.new("a"),
235
+ CorpusProcessor::Token.new("censura"),
236
+ ]
237
+ }
238
+ end
239
+ end
240
+ end
241
+
242
+ describe "#extract_category" do
243
+ subject { harem.extract_category(categories) }
244
+
245
+ context "empty categories" do
246
+ let(:categories) { "" }
247
+
248
+ it { should == nil }
249
+ end
250
+
251
+ context "one category" do
252
+ let(:categories) { "PESSOA" }
253
+
254
+ it { should == :person }
255
+ end
256
+
257
+ context "two categories" do
258
+ let(:categories) { "OUTRA|ORGANIZACAO" }
259
+
260
+ it { should == :organization }
261
+ end
262
+
263
+ context "ambiguidade" do
264
+ let(:categories) { "PESSOA|ORGANIZACAO" }
265
+
266
+ it { should == :person }
267
+ end
268
+ end
269
+ end
@@ -0,0 +1,37 @@
1
require "spec_helper"

# Unit specs for CorpusProcessor::Processor: with parser and generator
# doubles, #process must pipe the parser's tokens into the generator and
# return the generated corpus unchanged.
describe CorpusProcessor::Processor do
  subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }

  describe "#process" do
    subject { processor.process(corpus) }

    let(:corpus) { "Some corpus" }
    # Opaque value returned by the generator double; only identity matters.
    let(:processed_corpus) {
      <<-CORPUS
      Some O
      corpus O
      CORPUS
    }
    let(:tokens) {
      [
        CorpusProcessor::Token.new("Some"),
        CorpusProcessor::Token.new("corpus"),
      ]
    }
    let(:parser)    { double :parser }
    let(:generator) { double :generator }

    specify {
      parser.should_receive(:parse)
            .with(corpus)
            .and_return(tokens)

      generator.should_receive(:generate)
               .with(tokens)
               .and_return(processed_corpus)

      subject.should == processed_corpus
    }
  end
end
@@ -0,0 +1,8 @@
1
require "spec_helper"

# Minimal contract spec for CorpusProcessor::Token: a Struct-backed value
# object exposing #word and #category.
describe CorpusProcessor::Token do
  subject { CorpusProcessor::Token.new }

  it { should respond_to(:word) }
  it { should respond_to(:category) }
end
@@ -0,0 +1,121 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Tokenizer do
4
+ subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
5
+
6
+ describe "#tokenize" do
7
+ subject { tokenizer.tokenize(text, category) }
8
+
9
+ let(:category) { nil }
10
+
11
+ context "empty string" do
12
+ let(:text) { "" }
13
+
14
+ it { should == [] }
15
+ end
16
+
17
+ context "one word" do
18
+ let(:text) { "banana" }
19
+
20
+ it { should == [CorpusProcessor::Token.new("banana")] }
21
+ end
22
+
23
+ context "two words" do
24
+ let(:text) { "good banana" }
25
+
26
+ it { should == [
27
+ CorpusProcessor::Token.new("good"),
28
+ CorpusProcessor::Token.new("banana"),
29
+ ] }
30
+ end
31
+
32
+ context "ponctuation" do
33
+ let(:text) { "good, banana" }
34
+
35
+ it { should == [
36
+ CorpusProcessor::Token.new("good"),
37
+ CorpusProcessor::Token.new("banana"),
38
+ ] }
39
+ end
40
+
41
+ context "default category" do
42
+ let(:text) { "Google" }
43
+ let(:category) { :organization }
44
+
45
+ it { should == [
46
+ CorpusProcessor::Token.new("Google", :organization),
47
+ ] }
48
+ end
49
+
50
+ context "with tags" do
51
+ let(:text) { "good<lalala/>, banana" }
52
+
53
+ it { should == [
54
+ CorpusProcessor::Token.new("good"),
55
+ CorpusProcessor::Token.new("banana"),
56
+ ] }
57
+ end
58
+ end
59
+
60
+ describe "#strip_tags" do
61
+ subject { tokenizer.strip_tags(text) }
62
+
63
+ context "empty text" do
64
+ let(:text) { "" }
65
+
66
+ it { should == "" }
67
+ end
68
+
69
+ context "self closed tag" do
70
+ let(:text) { "<br/>" }
71
+
72
+ it { should == "" }
73
+ end
74
+
75
+ context "tag with content" do
76
+ let(:text) { "<p>Some text</p>" }
77
+
78
+ it { should == "Some text" }
79
+ end
80
+
81
+ context "content after tag" do
82
+ let(:text) { "<p>Some<br/>text</p>" }
83
+
84
+ it { should == "Some text" }
85
+ end
86
+ end
87
+
88
+ describe "#join_lines" do
89
+ subject { tokenizer.join_lines(text) }
90
+
91
+ context "empty text" do
92
+ let(:text) { "" }
93
+
94
+ it { should == "" }
95
+ end
96
+
97
+ context "one word" do
98
+ let(:text) { "banana" }
99
+
100
+ it { should == "banana" }
101
+ end
102
+
103
+ context "two lines" do
104
+ let(:text) { "banana\nquiabo" }
105
+
106
+ it { should == "banana quiabo" }
107
+ end
108
+
109
+ context "line with empty space" do
110
+ let(:text) { "banana\n \nquiabo" }
111
+
112
+ it { should == "banana quiabo" }
113
+ end
114
+
115
+ context "leading spaces" do
116
+ let(:text) { " \n banana\n \nquiabo \n" }
117
+
118
+ it { should == "banana quiabo" }
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,68 @@
1
+ require "spec_helper"
2
+
3
+ describe CorpusProcessor::Traverser do
4
+ subject(:traverser) { CorpusProcessor::Traverser.new }
5
+
6
+ describe "#traverse" do
7
+ subject { traverser.traverse(text, regexp) }
8
+
9
+ context "empty text" do
10
+ let(:text) { "" }
11
+ let(:regexp) { // }
12
+
13
+ specify {
14
+ expect { |mock_block|
15
+ traverser.traverse(text, regexp, &mock_block)
16
+ }.not_to yield_control
17
+ }
18
+ end
19
+
20
+ context "simple text" do
21
+ let(:text) { "abc" }
22
+ let(:regexp) { /b/ }
23
+
24
+ specify {
25
+ expect { |mock_block|
26
+ traverser.traverse(text, regexp, &mock_block)
27
+ }.to yield_successive_args "a", text.match(regexp), "c"
28
+ }
29
+ end
30
+
31
+ context "two matches" do
32
+ let(:text) { "abcbd" }
33
+ let(:regexp) { /b/ }
34
+
35
+ specify {
36
+ expect { |mock_block|
37
+ traverser.traverse(text, regexp, &mock_block)
38
+ }.to yield_successive_args "a",
39
+ text.match(regexp),
40
+ "c",
41
+ text[2..-1].match(regexp),
42
+ "d"
43
+ }
44
+ end
45
+
46
+ context "match in beginning" do
47
+ let(:text) { "bc" }
48
+ let(:regexp) { /b/ }
49
+
50
+ specify {
51
+ expect { |mock_block|
52
+ traverser.traverse(text, regexp, &mock_block)
53
+ }.to yield_successive_args text.match(regexp), "c"
54
+ }
55
+ end
56
+
57
+ context "match in ending" do
58
+ let(:text) { "bc" }
59
+ let(:regexp) { /c/ }
60
+
61
+ specify {
62
+ expect { |mock_block|
63
+ traverser.traverse(text, regexp, &mock_block)
64
+ }.to yield_successive_args "b", text.match(regexp)
65
+ }
66
+ end
67
+ end
68
+ end
metadata ADDED
@@ -0,0 +1,149 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: corpus-processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Das Dad
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-03-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '1.3'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '1.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fakefs
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry-nav
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Process linguistic corpus
84
+ email:
85
+ - feedback@dasdad.com.br
86
+ executables:
87
+ - corpus-processor
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - .rspec
93
+ - Gemfile
94
+ - README.md
95
+ - bin/corpus-processor
96
+ - corpus-processor.gemspec
97
+ - lib/corpus-processor.rb
98
+ - lib/corpus-processor/cli.rb
99
+ - lib/corpus-processor/default_categories.rb
100
+ - lib/corpus-processor/generators.rb
101
+ - lib/corpus-processor/generators/stanford_ner.rb
102
+ - lib/corpus-processor/parsers.rb
103
+ - lib/corpus-processor/parsers/harem.rb
104
+ - lib/corpus-processor/processor.rb
105
+ - lib/corpus-processor/token.rb
106
+ - lib/corpus-processor/tokenizer.rb
107
+ - lib/corpus-processor/traverser.rb
108
+ - lib/corpus-processor/version.rb
109
+ - spec/integration/cli_spec.rb
110
+ - spec/spec_helper.rb
111
+ - spec/unit/generators/stanford_ner_spec.rb
112
+ - spec/unit/parsers/harem_spec.rb
113
+ - spec/unit/processor.rb
114
+ - spec/unit/token_spec.rb
115
+ - spec/unit/tokenizer_spec.rb
116
+ - spec/unit/traverser_spec.rb
117
+ homepage: https://github.com/dasdad/corpus-processor
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - '>='
128
+ - !ruby/object:Gem::Version
129
+ version: 2.0.0
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - '>='
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.0.0
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: Handle linguistic corpus and convert it to use NLP tools
141
+ test_files:
142
+ - spec/integration/cli_spec.rb
143
+ - spec/spec_helper.rb
144
+ - spec/unit/generators/stanford_ner_spec.rb
145
+ - spec/unit/parsers/harem_spec.rb
146
+ - spec/unit/processor.rb
147
+ - spec/unit/token_spec.rb
148
+ - spec/unit/tokenizer_spec.rb
149
+ - spec/unit/traverser_spec.rb