corpus-processor 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: corpus-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Das Dad
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -42,18 +56,18 @@ dependencies:
|
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
|
-
- - '
|
59
|
+
- - '='
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
61
|
+
version: 2.14.0.rc1
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
|
-
- - '
|
66
|
+
- - '='
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
68
|
+
version: 2.14.0.rc1
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
70
|
+
name: pry-nav
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - '>='
|
@@ -67,7 +81,7 @@ dependencies:
|
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
84
|
+
name: coveralls
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - '>='
|
@@ -82,7 +96,7 @@ dependencies:
|
|
82
96
|
version: '0'
|
83
97
|
description: Process linguistic corpus
|
84
98
|
email:
|
85
|
-
-
|
99
|
+
- dev@dasdad.com.br
|
86
100
|
executables:
|
87
101
|
- corpus-processor
|
88
102
|
extensions: []
|
@@ -90,30 +104,29 @@ extra_rdoc_files: []
|
|
90
104
|
files:
|
91
105
|
- .gitignore
|
92
106
|
- .rspec
|
107
|
+
- .travis.yml
|
108
|
+
- .yardopts
|
93
109
|
- Gemfile
|
94
110
|
- README.md
|
95
111
|
- bin/corpus-processor
|
96
112
|
- corpus-processor.gemspec
|
97
113
|
- lib/corpus-processor.rb
|
114
|
+
- lib/corpus-processor/categories.rb
|
115
|
+
- lib/corpus-processor/categories/default.yml
|
98
116
|
- lib/corpus-processor/cli.rb
|
99
|
-
- lib/corpus-processor/default_categories.rb
|
100
117
|
- lib/corpus-processor/generators.rb
|
101
118
|
- lib/corpus-processor/generators/stanford_ner.rb
|
102
119
|
- lib/corpus-processor/parsers.rb
|
103
120
|
- lib/corpus-processor/parsers/lampada.rb
|
104
121
|
- lib/corpus-processor/processor.rb
|
105
122
|
- lib/corpus-processor/token.rb
|
106
|
-
- lib/corpus-processor/tokenizer.rb
|
107
|
-
- lib/corpus-processor/traverser.rb
|
108
123
|
- lib/corpus-processor/version.rb
|
109
|
-
- spec/
|
124
|
+
- spec/corpus-processor/cli_spec.rb
|
125
|
+
- spec/corpus-processor/generators/stanford_ner_spec.rb
|
126
|
+
- spec/corpus-processor/parsers/lampada_spec.rb
|
127
|
+
- spec/corpus-processor/processor_spec.rb
|
128
|
+
- spec/corpus-processor/token_spec.rb
|
110
129
|
- spec/spec_helper.rb
|
111
|
-
- spec/unit/generators/stanford_ner_spec.rb
|
112
|
-
- spec/unit/parsers/lampada_spec.rb
|
113
|
-
- spec/unit/processor.rb
|
114
|
-
- spec/unit/token_spec.rb
|
115
|
-
- spec/unit/tokenizer_spec.rb
|
116
|
-
- spec/unit/traverser_spec.rb
|
117
130
|
homepage: https://github.com/dasdad/corpus-processor
|
118
131
|
licenses:
|
119
132
|
- MIT
|
@@ -134,16 +147,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
147
|
version: '0'
|
135
148
|
requirements: []
|
136
149
|
rubyforge_project:
|
137
|
-
rubygems_version: 2.0.
|
150
|
+
rubygems_version: 2.0.2
|
138
151
|
signing_key:
|
139
152
|
specification_version: 4
|
140
153
|
summary: Handle linguistic corpus and convert it to use NLP tools
|
141
154
|
test_files:
|
142
|
-
- spec/
|
155
|
+
- spec/corpus-processor/cli_spec.rb
|
156
|
+
- spec/corpus-processor/generators/stanford_ner_spec.rb
|
157
|
+
- spec/corpus-processor/parsers/lampada_spec.rb
|
158
|
+
- spec/corpus-processor/processor_spec.rb
|
159
|
+
- spec/corpus-processor/token_spec.rb
|
143
160
|
- spec/spec_helper.rb
|
144
|
-
|
145
|
-
- spec/unit/parsers/lampada_spec.rb
|
146
|
-
- spec/unit/processor.rb
|
147
|
-
- spec/unit/token_spec.rb
|
148
|
-
- spec/unit/tokenizer_spec.rb
|
149
|
-
- spec/unit/traverser_spec.rb
|
161
|
+
has_rdoc:
|
@@ -1,14 +0,0 @@
|
|
1
|
-
module CorpusProcessor
|
2
|
-
DEFAULT_CATEGORIES = {
|
3
|
-
input: {
|
4
|
-
"PESSOA" => :person,
|
5
|
-
"LOCAL" => :location,
|
6
|
-
"ORGANIZACAO" => :organization,
|
7
|
-
},
|
8
|
-
output: Hash.new("O").merge(
|
9
|
-
person: "PERSON",
|
10
|
-
location: "LOCATION",
|
11
|
-
organization: "ORGANIZATION",
|
12
|
-
)
|
13
|
-
}
|
14
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
class CorpusProcessor::Tokenizer
|
2
|
-
def tokenize(text, category = nil)
|
3
|
-
strip_tags(text)
|
4
|
-
.gsub(/[[:punct:]]/, "")
|
5
|
-
.strip
|
6
|
-
.split(/\s+/)
|
7
|
-
.map { |word| CorpusProcessor::Token.new(word, category) }
|
8
|
-
end
|
9
|
-
|
10
|
-
def strip_tags(text)
|
11
|
-
text.gsub(/<.*?>/, " ").strip
|
12
|
-
end
|
13
|
-
|
14
|
-
def join_lines(text)
|
15
|
-
text.gsub(/\s+/, " ").strip
|
16
|
-
end
|
17
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
class CorpusProcessor::Traverser
|
2
|
-
def traverse(text, regexp, &block)
|
3
|
-
return if block.nil?
|
4
|
-
remaining_search = text
|
5
|
-
until remaining_search.empty?
|
6
|
-
match = remaining_search.match(regexp)
|
7
|
-
if match.nil?
|
8
|
-
block.call remaining_search unless remaining_search.empty?
|
9
|
-
remaining_search = ""
|
10
|
-
else
|
11
|
-
before = remaining_search[0...match.begin(0)]
|
12
|
-
remaining_search = remaining_search[match.end(0)..-1]
|
13
|
-
|
14
|
-
block.call before unless before.empty?
|
15
|
-
block.call match
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
@@ -1,46 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Generators::StanfordNer do
|
4
|
-
subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
|
5
|
-
|
6
|
-
describe "#generate" do
|
7
|
-
subject { stanford_ner.generate(tokens) }
|
8
|
-
|
9
|
-
context "no tokens" do
|
10
|
-
let(:tokens) { [] }
|
11
|
-
|
12
|
-
it { should == "\n" }
|
13
|
-
end
|
14
|
-
|
15
|
-
context "one token" do
|
16
|
-
let(:tokens) { [CorpusProcessor::Token.new("banana")] }
|
17
|
-
|
18
|
-
it { should == "banana O\n" }
|
19
|
-
end
|
20
|
-
|
21
|
-
context "two tokens" do
|
22
|
-
let(:tokens) { [
|
23
|
-
CorpusProcessor::Token.new("good"),
|
24
|
-
CorpusProcessor::Token.new("banana"),
|
25
|
-
] }
|
26
|
-
|
27
|
-
it { should == "good O\nbanana O\n" }
|
28
|
-
end
|
29
|
-
|
30
|
-
context "with category" do
|
31
|
-
let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }
|
32
|
-
|
33
|
-
it { should == "Leandro PERSON\n" }
|
34
|
-
end
|
35
|
-
|
36
|
-
context "with non-default categories" do
|
37
|
-
let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
|
38
|
-
banana: "BANANA"
|
39
|
-
) }
|
40
|
-
|
41
|
-
let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }
|
42
|
-
|
43
|
-
it { should == "Nanica BANANA\n" }
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
@@ -1,269 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Parsers::Lampada do
|
4
|
-
subject(:lampada) { CorpusProcessor::Parsers::Lampada.new }
|
5
|
-
|
6
|
-
describe "#parse" do
|
7
|
-
subject { lampada.parse(corpus) }
|
8
|
-
|
9
|
-
context "default categories" do
|
10
|
-
context "empty corpus" do
|
11
|
-
let(:corpus) { "" }
|
12
|
-
|
13
|
-
it { should == [] }
|
14
|
-
end
|
15
|
-
|
16
|
-
context "doctype" do
|
17
|
-
let(:corpus) {
|
18
|
-
<<-CORPUS
|
19
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
20
|
-
<!DOCTYPE colHAREM>
|
21
|
-
CORPUS
|
22
|
-
}
|
23
|
-
|
24
|
-
it { should == [] }
|
25
|
-
end
|
26
|
-
|
27
|
-
context "simple phrase" do
|
28
|
-
let(:corpus) {
|
29
|
-
<<-CORPUS
|
30
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
31
|
-
<!DOCTYPE colHAREM>
|
32
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
33
|
-
<DOC DOCID="H2-dftre765">
|
34
|
-
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
35
|
-
</DOC>
|
36
|
-
</colHAREM>
|
37
|
-
CORPUS
|
38
|
-
}
|
39
|
-
|
40
|
-
it { should == [
|
41
|
-
CorpusProcessor::Token.new("Fatores"),
|
42
|
-
CorpusProcessor::Token.new("Demográficos"),
|
43
|
-
CorpusProcessor::Token.new("e"),
|
44
|
-
CorpusProcessor::Token.new("Econômicos"),
|
45
|
-
CorpusProcessor::Token.new("Subjacentes"),
|
46
|
-
]
|
47
|
-
}
|
48
|
-
end
|
49
|
-
|
50
|
-
context "two simple phrases" do
|
51
|
-
let(:corpus) {
|
52
|
-
<<-CORPUS
|
53
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
54
|
-
<!DOCTYPE colHAREM>
|
55
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
56
|
-
<DOC DOCID="H2-dftre765">
|
57
|
-
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
58
|
-
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
59
|
-
</DOC>
|
60
|
-
</colHAREM>
|
61
|
-
CORPUS
|
62
|
-
}
|
63
|
-
|
64
|
-
it { should == [
|
65
|
-
CorpusProcessor::Token.new("Fatores"),
|
66
|
-
CorpusProcessor::Token.new("Demográficos"),
|
67
|
-
CorpusProcessor::Token.new("e"),
|
68
|
-
CorpusProcessor::Token.new("Econômicos"),
|
69
|
-
CorpusProcessor::Token.new("Subjacentes"),
|
70
|
-
CorpusProcessor::Token.new("Fatores"),
|
71
|
-
CorpusProcessor::Token.new("Demográficos"),
|
72
|
-
CorpusProcessor::Token.new("e"),
|
73
|
-
CorpusProcessor::Token.new("Econômicos"),
|
74
|
-
CorpusProcessor::Token.new("Subjacentes"),
|
75
|
-
]
|
76
|
-
}
|
77
|
-
end
|
78
|
-
|
79
|
-
context "useless entity" do
|
80
|
-
let(:corpus) {
|
81
|
-
<<-CORPUS
|
82
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
83
|
-
<!DOCTYPE colHAREM>
|
84
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
85
|
-
<DOC DOCID="H2-dftre765">
|
86
|
-
<P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
|
87
|
-
</DOC>
|
88
|
-
</colHAREM>
|
89
|
-
CORPUS
|
90
|
-
}
|
91
|
-
|
92
|
-
it { should == [
|
93
|
-
CorpusProcessor::Token.new("Nos"),
|
94
|
-
CorpusProcessor::Token.new("finais"),
|
95
|
-
CorpusProcessor::Token.new("da"),
|
96
|
-
CorpusProcessor::Token.new("Idade"),
|
97
|
-
CorpusProcessor::Token.new("Média"),
|
98
|
-
]
|
99
|
-
}
|
100
|
-
end
|
101
|
-
|
102
|
-
context "one entity" do
|
103
|
-
let(:corpus) {
|
104
|
-
<<-CORPUS
|
105
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
106
|
-
<!DOCTYPE colHAREM>
|
107
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
108
|
-
<DOC DOCID="H2-dftre765">
|
109
|
-
<P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
|
110
|
-
<EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
|
111
|
-
</P>
|
112
|
-
</DOC>
|
113
|
-
</colHAREM>
|
114
|
-
CORPUS
|
115
|
-
}
|
116
|
-
|
117
|
-
it { should == [
|
118
|
-
CorpusProcessor::Token.new("Foram"),
|
119
|
-
CorpusProcessor::Token.new("igualmente"),
|
120
|
-
CorpusProcessor::Token.new("determinantes"),
|
121
|
-
CorpusProcessor::Token.new("para"),
|
122
|
-
CorpusProcessor::Token.new("evitar"),
|
123
|
-
CorpusProcessor::Token.new("que"),
|
124
|
-
CorpusProcessor::Token.new("as"),
|
125
|
-
CorpusProcessor::Token.new("ideias"),
|
126
|
-
CorpusProcessor::Token.new("reformadoras"),
|
127
|
-
CorpusProcessor::Token.new("encontrassem"),
|
128
|
-
CorpusProcessor::Token.new("divulgação"),
|
129
|
-
CorpusProcessor::Token.new("em"),
|
130
|
-
CorpusProcessor::Token.new("Portugal", :location),
|
131
|
-
]
|
132
|
-
}
|
133
|
-
end
|
134
|
-
|
135
|
-
context "multiple entities" do
|
136
|
-
let(:corpus) {
|
137
|
-
<<-CORPUS
|
138
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
139
|
-
<!DOCTYPE colHAREM>
|
140
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
141
|
-
<DOC DOCID="H2-dftre765">
|
142
|
-
<P>
|
143
|
-
A imprensa, inventada na
|
144
|
-
<EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
|
145
|
-
por
|
146
|
-
<EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
|
147
|
-
<EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
|
148
|
-
e a censura
|
149
|
-
</P>
|
150
|
-
</DOC>
|
151
|
-
</colHAREM>
|
152
|
-
CORPUS
|
153
|
-
}
|
154
|
-
|
155
|
-
it { should == [
|
156
|
-
CorpusProcessor::Token.new("A"),
|
157
|
-
CorpusProcessor::Token.new("imprensa"),
|
158
|
-
CorpusProcessor::Token.new("inventada"),
|
159
|
-
CorpusProcessor::Token.new("na"),
|
160
|
-
CorpusProcessor::Token.new("Alemanha", :location),
|
161
|
-
CorpusProcessor::Token.new("por"),
|
162
|
-
CorpusProcessor::Token.new("John", :person),
|
163
|
-
CorpusProcessor::Token.new("Gutenberg", :person),
|
164
|
-
CorpusProcessor::Token.new("Inquisição", :organization),
|
165
|
-
CorpusProcessor::Token.new("e"),
|
166
|
-
CorpusProcessor::Token.new("a"),
|
167
|
-
CorpusProcessor::Token.new("censura"),
|
168
|
-
]
|
169
|
-
}
|
170
|
-
end
|
171
|
-
|
172
|
-
context "spaces after ponctuation" do
|
173
|
-
let(:corpus) {
|
174
|
-
<<-CORPUS
|
175
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
176
|
-
<!DOCTYPE colHAREM>
|
177
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
178
|
-
<DOC DOCID="H2-dftre765">
|
179
|
-
<EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
|
180
|
-
. No
|
181
|
-
</DOC>
|
182
|
-
</colHAREM>
|
183
|
-
CORPUS
|
184
|
-
}
|
185
|
-
|
186
|
-
it { should == [
|
187
|
-
CorpusProcessor::Token.new("Reforma"),
|
188
|
-
CorpusProcessor::Token.new("Protestante"),
|
189
|
-
CorpusProcessor::Token.new("No"),
|
190
|
-
]
|
191
|
-
}
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
context "user-defined categories" do
|
196
|
-
let(:lampada) {
|
197
|
-
CorpusProcessor::Parsers::Lampada.new({
|
198
|
-
"FRUTA" => :fruit,
|
199
|
-
"LIVRO" => :book,
|
200
|
-
})
|
201
|
-
}
|
202
|
-
|
203
|
-
context "multiple entities" do
|
204
|
-
let(:corpus) {
|
205
|
-
<<-CORPUS
|
206
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
207
|
-
<!DOCTYPE colHAREM>
|
208
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
209
|
-
<DOC DOCID="H2-dftre765">
|
210
|
-
<P>
|
211
|
-
A imprensa, inventada na
|
212
|
-
<EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
|
213
|
-
por
|
214
|
-
<EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
|
215
|
-
<EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
|
216
|
-
e a censura
|
217
|
-
</P>
|
218
|
-
</DOC>
|
219
|
-
</colHAREM>
|
220
|
-
CORPUS
|
221
|
-
}
|
222
|
-
|
223
|
-
it { should == [
|
224
|
-
CorpusProcessor::Token.new("A"),
|
225
|
-
CorpusProcessor::Token.new("imprensa"),
|
226
|
-
CorpusProcessor::Token.new("inventada"),
|
227
|
-
CorpusProcessor::Token.new("na"),
|
228
|
-
CorpusProcessor::Token.new("Banana", :fruit),
|
229
|
-
CorpusProcessor::Token.new("por"),
|
230
|
-
CorpusProcessor::Token.new("Harry", :book),
|
231
|
-
CorpusProcessor::Token.new("Potter", :book),
|
232
|
-
CorpusProcessor::Token.new("Inquisição"),
|
233
|
-
CorpusProcessor::Token.new("e"),
|
234
|
-
CorpusProcessor::Token.new("a"),
|
235
|
-
CorpusProcessor::Token.new("censura"),
|
236
|
-
]
|
237
|
-
}
|
238
|
-
end
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
describe "#extract_category" do
|
243
|
-
subject { lampada.extract_category(categories) }
|
244
|
-
|
245
|
-
context "empty categories" do
|
246
|
-
let(:categories) { "" }
|
247
|
-
|
248
|
-
it { should == nil }
|
249
|
-
end
|
250
|
-
|
251
|
-
context "one category" do
|
252
|
-
let(:categories) { "PESSOA" }
|
253
|
-
|
254
|
-
it { should == :person }
|
255
|
-
end
|
256
|
-
|
257
|
-
context "two categories" do
|
258
|
-
let(:categories) { "OUTRA|ORGANIZACAO" }
|
259
|
-
|
260
|
-
it { should == :organization }
|
261
|
-
end
|
262
|
-
|
263
|
-
context "ambiguidade" do
|
264
|
-
let(:categories) { "PESSOA|ORGANIZACAO" }
|
265
|
-
|
266
|
-
it { should == :person }
|
267
|
-
end
|
268
|
-
end
|
269
|
-
end
|