corpus-processor 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: corpus-processor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Das Dad
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-07-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -42,18 +56,18 @@ dependencies:
|
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
|
-
- - '
|
59
|
+
- - '='
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
61
|
+
version: 2.14.0.rc1
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
|
-
- - '
|
66
|
+
- - '='
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
68
|
+
version: 2.14.0.rc1
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
70
|
+
name: pry-nav
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - '>='
|
@@ -67,7 +81,7 @@ dependencies:
|
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
84
|
+
name: coveralls
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
87
|
- - '>='
|
@@ -82,7 +96,7 @@ dependencies:
|
|
82
96
|
version: '0'
|
83
97
|
description: Process linguistic corpus
|
84
98
|
email:
|
85
|
-
-
|
99
|
+
- dev@dasdad.com.br
|
86
100
|
executables:
|
87
101
|
- corpus-processor
|
88
102
|
extensions: []
|
@@ -90,30 +104,29 @@ extra_rdoc_files: []
|
|
90
104
|
files:
|
91
105
|
- .gitignore
|
92
106
|
- .rspec
|
107
|
+
- .travis.yml
|
108
|
+
- .yardopts
|
93
109
|
- Gemfile
|
94
110
|
- README.md
|
95
111
|
- bin/corpus-processor
|
96
112
|
- corpus-processor.gemspec
|
97
113
|
- lib/corpus-processor.rb
|
114
|
+
- lib/corpus-processor/categories.rb
|
115
|
+
- lib/corpus-processor/categories/default.yml
|
98
116
|
- lib/corpus-processor/cli.rb
|
99
|
-
- lib/corpus-processor/default_categories.rb
|
100
117
|
- lib/corpus-processor/generators.rb
|
101
118
|
- lib/corpus-processor/generators/stanford_ner.rb
|
102
119
|
- lib/corpus-processor/parsers.rb
|
103
120
|
- lib/corpus-processor/parsers/lampada.rb
|
104
121
|
- lib/corpus-processor/processor.rb
|
105
122
|
- lib/corpus-processor/token.rb
|
106
|
-
- lib/corpus-processor/tokenizer.rb
|
107
|
-
- lib/corpus-processor/traverser.rb
|
108
123
|
- lib/corpus-processor/version.rb
|
109
|
-
- spec/integration/cli_spec.rb
|
124
|
+
- spec/corpus-processor/cli_spec.rb
|
125
|
+
- spec/corpus-processor/generators/stanford_ner_spec.rb
|
126
|
+
- spec/corpus-processor/parsers/lampada_spec.rb
|
127
|
+
- spec/corpus-processor/processor_spec.rb
|
128
|
+
- spec/corpus-processor/token_spec.rb
|
110
129
|
- spec/spec_helper.rb
|
111
|
-
- spec/unit/generators/stanford_ner_spec.rb
|
112
|
-
- spec/unit/parsers/lampada_spec.rb
|
113
|
-
- spec/unit/processor.rb
|
114
|
-
- spec/unit/token_spec.rb
|
115
|
-
- spec/unit/tokenizer_spec.rb
|
116
|
-
- spec/unit/traverser_spec.rb
|
117
130
|
homepage: https://github.com/dasdad/corpus-processor
|
118
131
|
licenses:
|
119
132
|
- MIT
|
@@ -134,16 +147,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
147
|
version: '0'
|
135
148
|
requirements: []
|
136
149
|
rubyforge_project:
|
137
|
-
rubygems_version: 2.0.
|
150
|
+
rubygems_version: 2.0.2
|
138
151
|
signing_key:
|
139
152
|
specification_version: 4
|
140
153
|
summary: Handle linguistic corpus and convert it to use NLP tools
|
141
154
|
test_files:
|
142
|
-
- spec/integration/cli_spec.rb
|
155
|
+
- spec/corpus-processor/cli_spec.rb
|
156
|
+
- spec/corpus-processor/generators/stanford_ner_spec.rb
|
157
|
+
- spec/corpus-processor/parsers/lampada_spec.rb
|
158
|
+
- spec/corpus-processor/processor_spec.rb
|
159
|
+
- spec/corpus-processor/token_spec.rb
|
143
160
|
- spec/spec_helper.rb
|
144
|
-
- spec/unit/generators/stanford_ner_spec.rb
|
145
|
-
- spec/unit/parsers/lampada_spec.rb
|
146
|
-
- spec/unit/processor.rb
|
147
|
-
- spec/unit/token_spec.rb
|
148
|
-
- spec/unit/tokenizer_spec.rb
|
149
|
-
- spec/unit/traverser_spec.rb
|
161
|
+
has_rdoc:
|
@@ -1,14 +0,0 @@
|
|
1
|
-
module CorpusProcessor
|
2
|
-
DEFAULT_CATEGORIES = {
|
3
|
-
input: {
|
4
|
-
"PESSOA" => :person,
|
5
|
-
"LOCAL" => :location,
|
6
|
-
"ORGANIZACAO" => :organization,
|
7
|
-
},
|
8
|
-
output: Hash.new("O").merge(
|
9
|
-
person: "PERSON",
|
10
|
-
location: "LOCATION",
|
11
|
-
organization: "ORGANIZATION",
|
12
|
-
)
|
13
|
-
}
|
14
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
class CorpusProcessor::Tokenizer
|
2
|
-
def tokenize(text, category = nil)
|
3
|
-
strip_tags(text)
|
4
|
-
.gsub(/[[:punct:]]/, "")
|
5
|
-
.strip
|
6
|
-
.split(/\s+/)
|
7
|
-
.map { |word| CorpusProcessor::Token.new(word, category) }
|
8
|
-
end
|
9
|
-
|
10
|
-
def strip_tags(text)
|
11
|
-
text.gsub(/<.*?>/, " ").strip
|
12
|
-
end
|
13
|
-
|
14
|
-
def join_lines(text)
|
15
|
-
text.gsub(/\s+/, " ").strip
|
16
|
-
end
|
17
|
-
end
|
@@ -1,19 +0,0 @@
|
|
1
|
-
class CorpusProcessor::Traverser
|
2
|
-
def traverse(text, regexp, &block)
|
3
|
-
return if block.nil?
|
4
|
-
remaining_search = text
|
5
|
-
until remaining_search.empty?
|
6
|
-
match = remaining_search.match(regexp)
|
7
|
-
if match.nil?
|
8
|
-
block.call remaining_search unless remaining_search.empty?
|
9
|
-
remaining_search = ""
|
10
|
-
else
|
11
|
-
before = remaining_search[0...match.begin(0)]
|
12
|
-
remaining_search = remaining_search[match.end(0)..-1]
|
13
|
-
|
14
|
-
block.call before unless before.empty?
|
15
|
-
block.call match
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
@@ -1,46 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Generators::StanfordNer do
|
4
|
-
subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
|
5
|
-
|
6
|
-
describe "#generate" do
|
7
|
-
subject { stanford_ner.generate(tokens) }
|
8
|
-
|
9
|
-
context "no tokens" do
|
10
|
-
let(:tokens) { [] }
|
11
|
-
|
12
|
-
it { should == "\n" }
|
13
|
-
end
|
14
|
-
|
15
|
-
context "one token" do
|
16
|
-
let(:tokens) { [CorpusProcessor::Token.new("banana")] }
|
17
|
-
|
18
|
-
it { should == "banana O\n" }
|
19
|
-
end
|
20
|
-
|
21
|
-
context "two tokens" do
|
22
|
-
let(:tokens) { [
|
23
|
-
CorpusProcessor::Token.new("good"),
|
24
|
-
CorpusProcessor::Token.new("banana"),
|
25
|
-
] }
|
26
|
-
|
27
|
-
it { should == "good O\nbanana O\n" }
|
28
|
-
end
|
29
|
-
|
30
|
-
context "with category" do
|
31
|
-
let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }
|
32
|
-
|
33
|
-
it { should == "Leandro PERSON\n" }
|
34
|
-
end
|
35
|
-
|
36
|
-
context "with non-default categories" do
|
37
|
-
let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
|
38
|
-
banana: "BANANA"
|
39
|
-
) }
|
40
|
-
|
41
|
-
let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }
|
42
|
-
|
43
|
-
it { should == "Nanica BANANA\n" }
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
@@ -1,269 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe CorpusProcessor::Parsers::Lampada do
|
4
|
-
subject(:lampada) { CorpusProcessor::Parsers::Lampada.new }
|
5
|
-
|
6
|
-
describe "#parse" do
|
7
|
-
subject { lampada.parse(corpus) }
|
8
|
-
|
9
|
-
context "default categories" do
|
10
|
-
context "empty corpus" do
|
11
|
-
let(:corpus) { "" }
|
12
|
-
|
13
|
-
it { should == [] }
|
14
|
-
end
|
15
|
-
|
16
|
-
context "doctype" do
|
17
|
-
let(:corpus) {
|
18
|
-
<<-CORPUS
|
19
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
20
|
-
<!DOCTYPE colHAREM>
|
21
|
-
CORPUS
|
22
|
-
}
|
23
|
-
|
24
|
-
it { should == [] }
|
25
|
-
end
|
26
|
-
|
27
|
-
context "simple phrase" do
|
28
|
-
let(:corpus) {
|
29
|
-
<<-CORPUS
|
30
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
31
|
-
<!DOCTYPE colHAREM>
|
32
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
33
|
-
<DOC DOCID="H2-dftre765">
|
34
|
-
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
35
|
-
</DOC>
|
36
|
-
</colHAREM>
|
37
|
-
CORPUS
|
38
|
-
}
|
39
|
-
|
40
|
-
it { should == [
|
41
|
-
CorpusProcessor::Token.new("Fatores"),
|
42
|
-
CorpusProcessor::Token.new("Demográficos"),
|
43
|
-
CorpusProcessor::Token.new("e"),
|
44
|
-
CorpusProcessor::Token.new("Econômicos"),
|
45
|
-
CorpusProcessor::Token.new("Subjacentes"),
|
46
|
-
]
|
47
|
-
}
|
48
|
-
end
|
49
|
-
|
50
|
-
context "two simple phrases" do
|
51
|
-
let(:corpus) {
|
52
|
-
<<-CORPUS
|
53
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
54
|
-
<!DOCTYPE colHAREM>
|
55
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
56
|
-
<DOC DOCID="H2-dftre765">
|
57
|
-
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
58
|
-
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
59
|
-
</DOC>
|
60
|
-
</colHAREM>
|
61
|
-
CORPUS
|
62
|
-
}
|
63
|
-
|
64
|
-
it { should == [
|
65
|
-
CorpusProcessor::Token.new("Fatores"),
|
66
|
-
CorpusProcessor::Token.new("Demográficos"),
|
67
|
-
CorpusProcessor::Token.new("e"),
|
68
|
-
CorpusProcessor::Token.new("Econômicos"),
|
69
|
-
CorpusProcessor::Token.new("Subjacentes"),
|
70
|
-
CorpusProcessor::Token.new("Fatores"),
|
71
|
-
CorpusProcessor::Token.new("Demográficos"),
|
72
|
-
CorpusProcessor::Token.new("e"),
|
73
|
-
CorpusProcessor::Token.new("Econômicos"),
|
74
|
-
CorpusProcessor::Token.new("Subjacentes"),
|
75
|
-
]
|
76
|
-
}
|
77
|
-
end
|
78
|
-
|
79
|
-
context "useless entity" do
|
80
|
-
let(:corpus) {
|
81
|
-
<<-CORPUS
|
82
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
83
|
-
<!DOCTYPE colHAREM>
|
84
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
85
|
-
<DOC DOCID="H2-dftre765">
|
86
|
-
<P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
|
87
|
-
</DOC>
|
88
|
-
</colHAREM>
|
89
|
-
CORPUS
|
90
|
-
}
|
91
|
-
|
92
|
-
it { should == [
|
93
|
-
CorpusProcessor::Token.new("Nos"),
|
94
|
-
CorpusProcessor::Token.new("finais"),
|
95
|
-
CorpusProcessor::Token.new("da"),
|
96
|
-
CorpusProcessor::Token.new("Idade"),
|
97
|
-
CorpusProcessor::Token.new("Média"),
|
98
|
-
]
|
99
|
-
}
|
100
|
-
end
|
101
|
-
|
102
|
-
context "one entity" do
|
103
|
-
let(:corpus) {
|
104
|
-
<<-CORPUS
|
105
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
106
|
-
<!DOCTYPE colHAREM>
|
107
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
108
|
-
<DOC DOCID="H2-dftre765">
|
109
|
-
<P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
|
110
|
-
<EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
|
111
|
-
</P>
|
112
|
-
</DOC>
|
113
|
-
</colHAREM>
|
114
|
-
CORPUS
|
115
|
-
}
|
116
|
-
|
117
|
-
it { should == [
|
118
|
-
CorpusProcessor::Token.new("Foram"),
|
119
|
-
CorpusProcessor::Token.new("igualmente"),
|
120
|
-
CorpusProcessor::Token.new("determinantes"),
|
121
|
-
CorpusProcessor::Token.new("para"),
|
122
|
-
CorpusProcessor::Token.new("evitar"),
|
123
|
-
CorpusProcessor::Token.new("que"),
|
124
|
-
CorpusProcessor::Token.new("as"),
|
125
|
-
CorpusProcessor::Token.new("ideias"),
|
126
|
-
CorpusProcessor::Token.new("reformadoras"),
|
127
|
-
CorpusProcessor::Token.new("encontrassem"),
|
128
|
-
CorpusProcessor::Token.new("divulgação"),
|
129
|
-
CorpusProcessor::Token.new("em"),
|
130
|
-
CorpusProcessor::Token.new("Portugal", :location),
|
131
|
-
]
|
132
|
-
}
|
133
|
-
end
|
134
|
-
|
135
|
-
context "multiple entities" do
|
136
|
-
let(:corpus) {
|
137
|
-
<<-CORPUS
|
138
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
139
|
-
<!DOCTYPE colHAREM>
|
140
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
141
|
-
<DOC DOCID="H2-dftre765">
|
142
|
-
<P>
|
143
|
-
A imprensa, inventada na
|
144
|
-
<EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
|
145
|
-
por
|
146
|
-
<EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
|
147
|
-
<EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
|
148
|
-
e a censura
|
149
|
-
</P>
|
150
|
-
</DOC>
|
151
|
-
</colHAREM>
|
152
|
-
CORPUS
|
153
|
-
}
|
154
|
-
|
155
|
-
it { should == [
|
156
|
-
CorpusProcessor::Token.new("A"),
|
157
|
-
CorpusProcessor::Token.new("imprensa"),
|
158
|
-
CorpusProcessor::Token.new("inventada"),
|
159
|
-
CorpusProcessor::Token.new("na"),
|
160
|
-
CorpusProcessor::Token.new("Alemanha", :location),
|
161
|
-
CorpusProcessor::Token.new("por"),
|
162
|
-
CorpusProcessor::Token.new("John", :person),
|
163
|
-
CorpusProcessor::Token.new("Gutenberg", :person),
|
164
|
-
CorpusProcessor::Token.new("Inquisição", :organization),
|
165
|
-
CorpusProcessor::Token.new("e"),
|
166
|
-
CorpusProcessor::Token.new("a"),
|
167
|
-
CorpusProcessor::Token.new("censura"),
|
168
|
-
]
|
169
|
-
}
|
170
|
-
end
|
171
|
-
|
172
|
-
context "spaces after ponctuation" do
|
173
|
-
let(:corpus) {
|
174
|
-
<<-CORPUS
|
175
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
176
|
-
<!DOCTYPE colHAREM>
|
177
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
178
|
-
<DOC DOCID="H2-dftre765">
|
179
|
-
<EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
|
180
|
-
. No
|
181
|
-
</DOC>
|
182
|
-
</colHAREM>
|
183
|
-
CORPUS
|
184
|
-
}
|
185
|
-
|
186
|
-
it { should == [
|
187
|
-
CorpusProcessor::Token.new("Reforma"),
|
188
|
-
CorpusProcessor::Token.new("Protestante"),
|
189
|
-
CorpusProcessor::Token.new("No"),
|
190
|
-
]
|
191
|
-
}
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
context "user-defined categories" do
|
196
|
-
let(:lampada) {
|
197
|
-
CorpusProcessor::Parsers::Lampada.new({
|
198
|
-
"FRUTA" => :fruit,
|
199
|
-
"LIVRO" => :book,
|
200
|
-
})
|
201
|
-
}
|
202
|
-
|
203
|
-
context "multiple entities" do
|
204
|
-
let(:corpus) {
|
205
|
-
<<-CORPUS
|
206
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
207
|
-
<!DOCTYPE colHAREM>
|
208
|
-
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
209
|
-
<DOC DOCID="H2-dftre765">
|
210
|
-
<P>
|
211
|
-
A imprensa, inventada na
|
212
|
-
<EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
|
213
|
-
por
|
214
|
-
<EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
|
215
|
-
<EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
|
216
|
-
e a censura
|
217
|
-
</P>
|
218
|
-
</DOC>
|
219
|
-
</colHAREM>
|
220
|
-
CORPUS
|
221
|
-
}
|
222
|
-
|
223
|
-
it { should == [
|
224
|
-
CorpusProcessor::Token.new("A"),
|
225
|
-
CorpusProcessor::Token.new("imprensa"),
|
226
|
-
CorpusProcessor::Token.new("inventada"),
|
227
|
-
CorpusProcessor::Token.new("na"),
|
228
|
-
CorpusProcessor::Token.new("Banana", :fruit),
|
229
|
-
CorpusProcessor::Token.new("por"),
|
230
|
-
CorpusProcessor::Token.new("Harry", :book),
|
231
|
-
CorpusProcessor::Token.new("Potter", :book),
|
232
|
-
CorpusProcessor::Token.new("Inquisição"),
|
233
|
-
CorpusProcessor::Token.new("e"),
|
234
|
-
CorpusProcessor::Token.new("a"),
|
235
|
-
CorpusProcessor::Token.new("censura"),
|
236
|
-
]
|
237
|
-
}
|
238
|
-
end
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
describe "#extract_category" do
|
243
|
-
subject { lampada.extract_category(categories) }
|
244
|
-
|
245
|
-
context "empty categories" do
|
246
|
-
let(:categories) { "" }
|
247
|
-
|
248
|
-
it { should == nil }
|
249
|
-
end
|
250
|
-
|
251
|
-
context "one category" do
|
252
|
-
let(:categories) { "PESSOA" }
|
253
|
-
|
254
|
-
it { should == :person }
|
255
|
-
end
|
256
|
-
|
257
|
-
context "two categories" do
|
258
|
-
let(:categories) { "OUTRA|ORGANIZACAO" }
|
259
|
-
|
260
|
-
it { should == :organization }
|
261
|
-
end
|
262
|
-
|
263
|
-
context "ambiguidade" do
|
264
|
-
let(:categories) { "PESSOA|ORGANIZACAO" }
|
265
|
-
|
266
|
-
it { should == :person }
|
267
|
-
end
|
268
|
-
end
|
269
|
-
end
|