corpus-processor 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: corpus-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Das Dad
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-01 00:00:00.000000000 Z
11
+ date: 2013-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - '>='
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -42,18 +56,18 @@ dependencies:
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - '>='
59
+ - - '='
46
60
  - !ruby/object:Gem::Version
47
- version: '0'
61
+ version: 2.14.0.rc1
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
- - - '>='
66
+ - - '='
53
67
  - !ruby/object:Gem::Version
54
- version: '0'
68
+ version: 2.14.0.rc1
55
69
  - !ruby/object:Gem::Dependency
56
- name: fakefs
70
+ name: pry-nav
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - '>='
@@ -67,7 +81,7 @@ dependencies:
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: pry-nav
84
+ name: coveralls
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - '>='
@@ -82,7 +96,7 @@ dependencies:
82
96
  version: '0'
83
97
  description: Process linguistic corpus
84
98
  email:
85
- - feedback@dasdad.com.br
99
+ - dev@dasdad.com.br
86
100
  executables:
87
101
  - corpus-processor
88
102
  extensions: []
@@ -90,30 +104,29 @@ extra_rdoc_files: []
90
104
  files:
91
105
  - .gitignore
92
106
  - .rspec
107
+ - .travis.yml
108
+ - .yardopts
93
109
  - Gemfile
94
110
  - README.md
95
111
  - bin/corpus-processor
96
112
  - corpus-processor.gemspec
97
113
  - lib/corpus-processor.rb
114
+ - lib/corpus-processor/categories.rb
115
+ - lib/corpus-processor/categories/default.yml
98
116
  - lib/corpus-processor/cli.rb
99
- - lib/corpus-processor/default_categories.rb
100
117
  - lib/corpus-processor/generators.rb
101
118
  - lib/corpus-processor/generators/stanford_ner.rb
102
119
  - lib/corpus-processor/parsers.rb
103
120
  - lib/corpus-processor/parsers/lampada.rb
104
121
  - lib/corpus-processor/processor.rb
105
122
  - lib/corpus-processor/token.rb
106
- - lib/corpus-processor/tokenizer.rb
107
- - lib/corpus-processor/traverser.rb
108
123
  - lib/corpus-processor/version.rb
109
- - spec/integration/cli_spec.rb
124
+ - spec/corpus-processor/cli_spec.rb
125
+ - spec/corpus-processor/generators/stanford_ner_spec.rb
126
+ - spec/corpus-processor/parsers/lampada_spec.rb
127
+ - spec/corpus-processor/processor_spec.rb
128
+ - spec/corpus-processor/token_spec.rb
110
129
  - spec/spec_helper.rb
111
- - spec/unit/generators/stanford_ner_spec.rb
112
- - spec/unit/parsers/lampada_spec.rb
113
- - spec/unit/processor.rb
114
- - spec/unit/token_spec.rb
115
- - spec/unit/tokenizer_spec.rb
116
- - spec/unit/traverser_spec.rb
117
130
  homepage: https://github.com/dasdad/corpus-processor
118
131
  licenses:
119
132
  - MIT
@@ -134,16 +147,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
147
  version: '0'
135
148
  requirements: []
136
149
  rubyforge_project:
137
- rubygems_version: 2.0.0
150
+ rubygems_version: 2.0.2
138
151
  signing_key:
139
152
  specification_version: 4
140
153
  summary: Handle linguistic corpus and convert it to use NLP tools
141
154
  test_files:
142
- - spec/integration/cli_spec.rb
155
+ - spec/corpus-processor/cli_spec.rb
156
+ - spec/corpus-processor/generators/stanford_ner_spec.rb
157
+ - spec/corpus-processor/parsers/lampada_spec.rb
158
+ - spec/corpus-processor/processor_spec.rb
159
+ - spec/corpus-processor/token_spec.rb
143
160
  - spec/spec_helper.rb
144
- - spec/unit/generators/stanford_ner_spec.rb
145
- - spec/unit/parsers/lampada_spec.rb
146
- - spec/unit/processor.rb
147
- - spec/unit/token_spec.rb
148
- - spec/unit/tokenizer_spec.rb
149
- - spec/unit/traverser_spec.rb
161
+ has_rdoc:
@@ -1,14 +0,0 @@
1
- module CorpusProcessor
2
- DEFAULT_CATEGORIES = {
3
- input: {
4
- "PESSOA" => :person,
5
- "LOCAL" => :location,
6
- "ORGANIZACAO" => :organization,
7
- },
8
- output: Hash.new("O").merge(
9
- person: "PERSON",
10
- location: "LOCATION",
11
- organization: "ORGANIZATION",
12
- )
13
- }
14
- end
@@ -1,17 +0,0 @@
1
- class CorpusProcessor::Tokenizer
2
- def tokenize(text, category = nil)
3
- strip_tags(text)
4
- .gsub(/[[:punct:]]/, "")
5
- .strip
6
- .split(/\s+/)
7
- .map { |word| CorpusProcessor::Token.new(word, category) }
8
- end
9
-
10
- def strip_tags(text)
11
- text.gsub(/<.*?>/, " ").strip
12
- end
13
-
14
- def join_lines(text)
15
- text.gsub(/\s+/, " ").strip
16
- end
17
- end
@@ -1,19 +0,0 @@
1
- class CorpusProcessor::Traverser
2
- def traverse(text, regexp, &block)
3
- return if block.nil?
4
- remaining_search = text
5
- until remaining_search.empty?
6
- match = remaining_search.match(regexp)
7
- if match.nil?
8
- block.call remaining_search unless remaining_search.empty?
9
- remaining_search = ""
10
- else
11
- before = remaining_search[0...match.begin(0)]
12
- remaining_search = remaining_search[match.end(0)..-1]
13
-
14
- block.call before unless before.empty?
15
- block.call match
16
- end
17
- end
18
- end
19
- end
@@ -1,46 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Generators::StanfordNer do
4
- subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
5
-
6
- describe "#generate" do
7
- subject { stanford_ner.generate(tokens) }
8
-
9
- context "no tokens" do
10
- let(:tokens) { [] }
11
-
12
- it { should == "\n" }
13
- end
14
-
15
- context "one token" do
16
- let(:tokens) { [CorpusProcessor::Token.new("banana")] }
17
-
18
- it { should == "banana O\n" }
19
- end
20
-
21
- context "two tokens" do
22
- let(:tokens) { [
23
- CorpusProcessor::Token.new("good"),
24
- CorpusProcessor::Token.new("banana"),
25
- ] }
26
-
27
- it { should == "good O\nbanana O\n" }
28
- end
29
-
30
- context "with category" do
31
- let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }
32
-
33
- it { should == "Leandro PERSON\n" }
34
- end
35
-
36
- context "with non-default categories" do
37
- let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
38
- banana: "BANANA"
39
- ) }
40
-
41
- let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }
42
-
43
- it { should == "Nanica BANANA\n" }
44
- end
45
- end
46
- end
@@ -1,269 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Parsers::Lampada do
4
- subject(:lampada) { CorpusProcessor::Parsers::Lampada.new }
5
-
6
- describe "#parse" do
7
- subject { lampada.parse(corpus) }
8
-
9
- context "default categories" do
10
- context "empty corpus" do
11
- let(:corpus) { "" }
12
-
13
- it { should == [] }
14
- end
15
-
16
- context "doctype" do
17
- let(:corpus) {
18
- <<-CORPUS
19
- <?xml version="1.0" encoding="ISO-8859-1"?>
20
- <!DOCTYPE colHAREM>
21
- CORPUS
22
- }
23
-
24
- it { should == [] }
25
- end
26
-
27
- context "simple phrase" do
28
- let(:corpus) {
29
- <<-CORPUS
30
- <?xml version="1.0" encoding="ISO-8859-1"?>
31
- <!DOCTYPE colHAREM>
32
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
33
- <DOC DOCID="H2-dftre765">
34
- <P>Fatores Demográficos e Econômicos Subjacentes</P>
35
- </DOC>
36
- </colHAREM>
37
- CORPUS
38
- }
39
-
40
- it { should == [
41
- CorpusProcessor::Token.new("Fatores"),
42
- CorpusProcessor::Token.new("Demográficos"),
43
- CorpusProcessor::Token.new("e"),
44
- CorpusProcessor::Token.new("Econômicos"),
45
- CorpusProcessor::Token.new("Subjacentes"),
46
- ]
47
- }
48
- end
49
-
50
- context "two simple phrases" do
51
- let(:corpus) {
52
- <<-CORPUS
53
- <?xml version="1.0" encoding="ISO-8859-1"?>
54
- <!DOCTYPE colHAREM>
55
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
56
- <DOC DOCID="H2-dftre765">
57
- <P>Fatores Demográficos e Econômicos Subjacentes</P>
58
- <P>Fatores Demográficos e Econômicos Subjacentes</P>
59
- </DOC>
60
- </colHAREM>
61
- CORPUS
62
- }
63
-
64
- it { should == [
65
- CorpusProcessor::Token.new("Fatores"),
66
- CorpusProcessor::Token.new("Demográficos"),
67
- CorpusProcessor::Token.new("e"),
68
- CorpusProcessor::Token.new("Econômicos"),
69
- CorpusProcessor::Token.new("Subjacentes"),
70
- CorpusProcessor::Token.new("Fatores"),
71
- CorpusProcessor::Token.new("Demográficos"),
72
- CorpusProcessor::Token.new("e"),
73
- CorpusProcessor::Token.new("Econômicos"),
74
- CorpusProcessor::Token.new("Subjacentes"),
75
- ]
76
- }
77
- end
78
-
79
- context "useless entity" do
80
- let(:corpus) {
81
- <<-CORPUS
82
- <?xml version="1.0" encoding="ISO-8859-1"?>
83
- <!DOCTYPE colHAREM>
84
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
85
- <DOC DOCID="H2-dftre765">
86
- <P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
87
- </DOC>
88
- </colHAREM>
89
- CORPUS
90
- }
91
-
92
- it { should == [
93
- CorpusProcessor::Token.new("Nos"),
94
- CorpusProcessor::Token.new("finais"),
95
- CorpusProcessor::Token.new("da"),
96
- CorpusProcessor::Token.new("Idade"),
97
- CorpusProcessor::Token.new("Média"),
98
- ]
99
- }
100
- end
101
-
102
- context "one entity" do
103
- let(:corpus) {
104
- <<-CORPUS
105
- <?xml version="1.0" encoding="ISO-8859-1"?>
106
- <!DOCTYPE colHAREM>
107
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
108
- <DOC DOCID="H2-dftre765">
109
- <P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
110
- <EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
111
- </P>
112
- </DOC>
113
- </colHAREM>
114
- CORPUS
115
- }
116
-
117
- it { should == [
118
- CorpusProcessor::Token.new("Foram"),
119
- CorpusProcessor::Token.new("igualmente"),
120
- CorpusProcessor::Token.new("determinantes"),
121
- CorpusProcessor::Token.new("para"),
122
- CorpusProcessor::Token.new("evitar"),
123
- CorpusProcessor::Token.new("que"),
124
- CorpusProcessor::Token.new("as"),
125
- CorpusProcessor::Token.new("ideias"),
126
- CorpusProcessor::Token.new("reformadoras"),
127
- CorpusProcessor::Token.new("encontrassem"),
128
- CorpusProcessor::Token.new("divulgação"),
129
- CorpusProcessor::Token.new("em"),
130
- CorpusProcessor::Token.new("Portugal", :location),
131
- ]
132
- }
133
- end
134
-
135
- context "multiple entities" do
136
- let(:corpus) {
137
- <<-CORPUS
138
- <?xml version="1.0" encoding="ISO-8859-1"?>
139
- <!DOCTYPE colHAREM>
140
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
141
- <DOC DOCID="H2-dftre765">
142
- <P>
143
- A imprensa, inventada na
144
- <EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
145
- por
146
- <EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
147
- <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
148
- e a censura
149
- </P>
150
- </DOC>
151
- </colHAREM>
152
- CORPUS
153
- }
154
-
155
- it { should == [
156
- CorpusProcessor::Token.new("A"),
157
- CorpusProcessor::Token.new("imprensa"),
158
- CorpusProcessor::Token.new("inventada"),
159
- CorpusProcessor::Token.new("na"),
160
- CorpusProcessor::Token.new("Alemanha", :location),
161
- CorpusProcessor::Token.new("por"),
162
- CorpusProcessor::Token.new("John", :person),
163
- CorpusProcessor::Token.new("Gutenberg", :person),
164
- CorpusProcessor::Token.new("Inquisição", :organization),
165
- CorpusProcessor::Token.new("e"),
166
- CorpusProcessor::Token.new("a"),
167
- CorpusProcessor::Token.new("censura"),
168
- ]
169
- }
170
- end
171
-
172
- context "spaces after ponctuation" do
173
- let(:corpus) {
174
- <<-CORPUS
175
- <?xml version="1.0" encoding="ISO-8859-1"?>
176
- <!DOCTYPE colHAREM>
177
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
178
- <DOC DOCID="H2-dftre765">
179
- <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
180
- . No
181
- </DOC>
182
- </colHAREM>
183
- CORPUS
184
- }
185
-
186
- it { should == [
187
- CorpusProcessor::Token.new("Reforma"),
188
- CorpusProcessor::Token.new("Protestante"),
189
- CorpusProcessor::Token.new("No"),
190
- ]
191
- }
192
- end
193
- end
194
-
195
- context "user-defined categories" do
196
- let(:lampada) {
197
- CorpusProcessor::Parsers::Lampada.new({
198
- "FRUTA" => :fruit,
199
- "LIVRO" => :book,
200
- })
201
- }
202
-
203
- context "multiple entities" do
204
- let(:corpus) {
205
- <<-CORPUS
206
- <?xml version="1.0" encoding="ISO-8859-1"?>
207
- <!DOCTYPE colHAREM>
208
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
209
- <DOC DOCID="H2-dftre765">
210
- <P>
211
- A imprensa, inventada na
212
- <EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
213
- por
214
- <EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
215
- <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
216
- e a censura
217
- </P>
218
- </DOC>
219
- </colHAREM>
220
- CORPUS
221
- }
222
-
223
- it { should == [
224
- CorpusProcessor::Token.new("A"),
225
- CorpusProcessor::Token.new("imprensa"),
226
- CorpusProcessor::Token.new("inventada"),
227
- CorpusProcessor::Token.new("na"),
228
- CorpusProcessor::Token.new("Banana", :fruit),
229
- CorpusProcessor::Token.new("por"),
230
- CorpusProcessor::Token.new("Harry", :book),
231
- CorpusProcessor::Token.new("Potter", :book),
232
- CorpusProcessor::Token.new("Inquisição"),
233
- CorpusProcessor::Token.new("e"),
234
- CorpusProcessor::Token.new("a"),
235
- CorpusProcessor::Token.new("censura"),
236
- ]
237
- }
238
- end
239
- end
240
- end
241
-
242
- describe "#extract_category" do
243
- subject { lampada.extract_category(categories) }
244
-
245
- context "empty categories" do
246
- let(:categories) { "" }
247
-
248
- it { should == nil }
249
- end
250
-
251
- context "one category" do
252
- let(:categories) { "PESSOA" }
253
-
254
- it { should == :person }
255
- end
256
-
257
- context "two categories" do
258
- let(:categories) { "OUTRA|ORGANIZACAO" }
259
-
260
- it { should == :organization }
261
- end
262
-
263
- context "ambiguidade" do
264
- let(:categories) { "PESSOA|ORGANIZACAO" }
265
-
266
- it { should == :person }
267
- end
268
- end
269
- end