corpus-processor 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: corpus-processor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Das Dad
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-01 00:00:00.000000000 Z
11
+ date: 2013-07-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - '>='
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -42,18 +56,18 @@ dependencies:
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - '>='
59
+ - - '='
46
60
  - !ruby/object:Gem::Version
47
- version: '0'
61
+ version: 2.14.0.rc1
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
- - - '>='
66
+ - - '='
53
67
  - !ruby/object:Gem::Version
54
- version: '0'
68
+ version: 2.14.0.rc1
55
69
  - !ruby/object:Gem::Dependency
56
- name: fakefs
70
+ name: pry-nav
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - '>='
@@ -67,7 +81,7 @@ dependencies:
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: pry-nav
84
+ name: coveralls
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
87
  - - '>='
@@ -82,7 +96,7 @@ dependencies:
82
96
  version: '0'
83
97
  description: Process linguistic corpus
84
98
  email:
85
- - feedback@dasdad.com.br
99
+ - dev@dasdad.com.br
86
100
  executables:
87
101
  - corpus-processor
88
102
  extensions: []
@@ -90,30 +104,29 @@ extra_rdoc_files: []
90
104
  files:
91
105
  - .gitignore
92
106
  - .rspec
107
+ - .travis.yml
108
+ - .yardopts
93
109
  - Gemfile
94
110
  - README.md
95
111
  - bin/corpus-processor
96
112
  - corpus-processor.gemspec
97
113
  - lib/corpus-processor.rb
114
+ - lib/corpus-processor/categories.rb
115
+ - lib/corpus-processor/categories/default.yml
98
116
  - lib/corpus-processor/cli.rb
99
- - lib/corpus-processor/default_categories.rb
100
117
  - lib/corpus-processor/generators.rb
101
118
  - lib/corpus-processor/generators/stanford_ner.rb
102
119
  - lib/corpus-processor/parsers.rb
103
120
  - lib/corpus-processor/parsers/lampada.rb
104
121
  - lib/corpus-processor/processor.rb
105
122
  - lib/corpus-processor/token.rb
106
- - lib/corpus-processor/tokenizer.rb
107
- - lib/corpus-processor/traverser.rb
108
123
  - lib/corpus-processor/version.rb
109
- - spec/integration/cli_spec.rb
124
+ - spec/corpus-processor/cli_spec.rb
125
+ - spec/corpus-processor/generators/stanford_ner_spec.rb
126
+ - spec/corpus-processor/parsers/lampada_spec.rb
127
+ - spec/corpus-processor/processor_spec.rb
128
+ - spec/corpus-processor/token_spec.rb
110
129
  - spec/spec_helper.rb
111
- - spec/unit/generators/stanford_ner_spec.rb
112
- - spec/unit/parsers/lampada_spec.rb
113
- - spec/unit/processor.rb
114
- - spec/unit/token_spec.rb
115
- - spec/unit/tokenizer_spec.rb
116
- - spec/unit/traverser_spec.rb
117
130
  homepage: https://github.com/dasdad/corpus-processor
118
131
  licenses:
119
132
  - MIT
@@ -134,16 +147,15 @@ required_rubygems_version: !ruby/object:Gem::Requirement
134
147
  version: '0'
135
148
  requirements: []
136
149
  rubyforge_project:
137
- rubygems_version: 2.0.0
150
+ rubygems_version: 2.0.2
138
151
  signing_key:
139
152
  specification_version: 4
140
153
  summary: Handle linguistic corpus and convert it to use NLP tools
141
154
  test_files:
142
- - spec/integration/cli_spec.rb
155
+ - spec/corpus-processor/cli_spec.rb
156
+ - spec/corpus-processor/generators/stanford_ner_spec.rb
157
+ - spec/corpus-processor/parsers/lampada_spec.rb
158
+ - spec/corpus-processor/processor_spec.rb
159
+ - spec/corpus-processor/token_spec.rb
143
160
  - spec/spec_helper.rb
144
- - spec/unit/generators/stanford_ner_spec.rb
145
- - spec/unit/parsers/lampada_spec.rb
146
- - spec/unit/processor.rb
147
- - spec/unit/token_spec.rb
148
- - spec/unit/tokenizer_spec.rb
149
- - spec/unit/traverser_spec.rb
161
+ has_rdoc:
@@ -1,14 +0,0 @@
1
- module CorpusProcessor
2
- DEFAULT_CATEGORIES = {
3
- input: {
4
- "PESSOA" => :person,
5
- "LOCAL" => :location,
6
- "ORGANIZACAO" => :organization,
7
- },
8
- output: Hash.new("O").merge(
9
- person: "PERSON",
10
- location: "LOCATION",
11
- organization: "ORGANIZATION",
12
- )
13
- }
14
- end
@@ -1,17 +0,0 @@
1
- class CorpusProcessor::Tokenizer
2
- def tokenize(text, category = nil)
3
- strip_tags(text)
4
- .gsub(/[[:punct:]]/, "")
5
- .strip
6
- .split(/\s+/)
7
- .map { |word| CorpusProcessor::Token.new(word, category) }
8
- end
9
-
10
- def strip_tags(text)
11
- text.gsub(/<.*?>/, " ").strip
12
- end
13
-
14
- def join_lines(text)
15
- text.gsub(/\s+/, " ").strip
16
- end
17
- end
@@ -1,19 +0,0 @@
1
- class CorpusProcessor::Traverser
2
- def traverse(text, regexp, &block)
3
- return if block.nil?
4
- remaining_search = text
5
- until remaining_search.empty?
6
- match = remaining_search.match(regexp)
7
- if match.nil?
8
- block.call remaining_search unless remaining_search.empty?
9
- remaining_search = ""
10
- else
11
- before = remaining_search[0...match.begin(0)]
12
- remaining_search = remaining_search[match.end(0)..-1]
13
-
14
- block.call before unless before.empty?
15
- block.call match
16
- end
17
- end
18
- end
19
- end
@@ -1,46 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Generators::StanfordNer do
4
- subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
5
-
6
- describe "#generate" do
7
- subject { stanford_ner.generate(tokens) }
8
-
9
- context "no tokens" do
10
- let(:tokens) { [] }
11
-
12
- it { should == "\n" }
13
- end
14
-
15
- context "one token" do
16
- let(:tokens) { [CorpusProcessor::Token.new("banana")] }
17
-
18
- it { should == "banana O\n" }
19
- end
20
-
21
- context "two tokens" do
22
- let(:tokens) { [
23
- CorpusProcessor::Token.new("good"),
24
- CorpusProcessor::Token.new("banana"),
25
- ] }
26
-
27
- it { should == "good O\nbanana O\n" }
28
- end
29
-
30
- context "with category" do
31
- let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }
32
-
33
- it { should == "Leandro PERSON\n" }
34
- end
35
-
36
- context "with non-default categories" do
37
- let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
38
- banana: "BANANA"
39
- ) }
40
-
41
- let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }
42
-
43
- it { should == "Nanica BANANA\n" }
44
- end
45
- end
46
- end
@@ -1,269 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe CorpusProcessor::Parsers::Lampada do
4
- subject(:lampada) { CorpusProcessor::Parsers::Lampada.new }
5
-
6
- describe "#parse" do
7
- subject { lampada.parse(corpus) }
8
-
9
- context "default categories" do
10
- context "empty corpus" do
11
- let(:corpus) { "" }
12
-
13
- it { should == [] }
14
- end
15
-
16
- context "doctype" do
17
- let(:corpus) {
18
- <<-CORPUS
19
- <?xml version="1.0" encoding="ISO-8859-1"?>
20
- <!DOCTYPE colHAREM>
21
- CORPUS
22
- }
23
-
24
- it { should == [] }
25
- end
26
-
27
- context "simple phrase" do
28
- let(:corpus) {
29
- <<-CORPUS
30
- <?xml version="1.0" encoding="ISO-8859-1"?>
31
- <!DOCTYPE colHAREM>
32
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
33
- <DOC DOCID="H2-dftre765">
34
- <P>Fatores Demográficos e Econômicos Subjacentes</P>
35
- </DOC>
36
- </colHAREM>
37
- CORPUS
38
- }
39
-
40
- it { should == [
41
- CorpusProcessor::Token.new("Fatores"),
42
- CorpusProcessor::Token.new("Demográficos"),
43
- CorpusProcessor::Token.new("e"),
44
- CorpusProcessor::Token.new("Econômicos"),
45
- CorpusProcessor::Token.new("Subjacentes"),
46
- ]
47
- }
48
- end
49
-
50
- context "two simple phrases" do
51
- let(:corpus) {
52
- <<-CORPUS
53
- <?xml version="1.0" encoding="ISO-8859-1"?>
54
- <!DOCTYPE colHAREM>
55
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
56
- <DOC DOCID="H2-dftre765">
57
- <P>Fatores Demográficos e Econômicos Subjacentes</P>
58
- <P>Fatores Demográficos e Econômicos Subjacentes</P>
59
- </DOC>
60
- </colHAREM>
61
- CORPUS
62
- }
63
-
64
- it { should == [
65
- CorpusProcessor::Token.new("Fatores"),
66
- CorpusProcessor::Token.new("Demográficos"),
67
- CorpusProcessor::Token.new("e"),
68
- CorpusProcessor::Token.new("Econômicos"),
69
- CorpusProcessor::Token.new("Subjacentes"),
70
- CorpusProcessor::Token.new("Fatores"),
71
- CorpusProcessor::Token.new("Demográficos"),
72
- CorpusProcessor::Token.new("e"),
73
- CorpusProcessor::Token.new("Econômicos"),
74
- CorpusProcessor::Token.new("Subjacentes"),
75
- ]
76
- }
77
- end
78
-
79
- context "useless entity" do
80
- let(:corpus) {
81
- <<-CORPUS
82
- <?xml version="1.0" encoding="ISO-8859-1"?>
83
- <!DOCTYPE colHAREM>
84
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
85
- <DOC DOCID="H2-dftre765">
86
- <P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
87
- </DOC>
88
- </colHAREM>
89
- CORPUS
90
- }
91
-
92
- it { should == [
93
- CorpusProcessor::Token.new("Nos"),
94
- CorpusProcessor::Token.new("finais"),
95
- CorpusProcessor::Token.new("da"),
96
- CorpusProcessor::Token.new("Idade"),
97
- CorpusProcessor::Token.new("Média"),
98
- ]
99
- }
100
- end
101
-
102
- context "one entity" do
103
- let(:corpus) {
104
- <<-CORPUS
105
- <?xml version="1.0" encoding="ISO-8859-1"?>
106
- <!DOCTYPE colHAREM>
107
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
108
- <DOC DOCID="H2-dftre765">
109
- <P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
110
- <EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
111
- </P>
112
- </DOC>
113
- </colHAREM>
114
- CORPUS
115
- }
116
-
117
- it { should == [
118
- CorpusProcessor::Token.new("Foram"),
119
- CorpusProcessor::Token.new("igualmente"),
120
- CorpusProcessor::Token.new("determinantes"),
121
- CorpusProcessor::Token.new("para"),
122
- CorpusProcessor::Token.new("evitar"),
123
- CorpusProcessor::Token.new("que"),
124
- CorpusProcessor::Token.new("as"),
125
- CorpusProcessor::Token.new("ideias"),
126
- CorpusProcessor::Token.new("reformadoras"),
127
- CorpusProcessor::Token.new("encontrassem"),
128
- CorpusProcessor::Token.new("divulgação"),
129
- CorpusProcessor::Token.new("em"),
130
- CorpusProcessor::Token.new("Portugal", :location),
131
- ]
132
- }
133
- end
134
-
135
- context "multiple entities" do
136
- let(:corpus) {
137
- <<-CORPUS
138
- <?xml version="1.0" encoding="ISO-8859-1"?>
139
- <!DOCTYPE colHAREM>
140
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
141
- <DOC DOCID="H2-dftre765">
142
- <P>
143
- A imprensa, inventada na
144
- <EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
145
- por
146
- <EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
147
- <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
148
- e a censura
149
- </P>
150
- </DOC>
151
- </colHAREM>
152
- CORPUS
153
- }
154
-
155
- it { should == [
156
- CorpusProcessor::Token.new("A"),
157
- CorpusProcessor::Token.new("imprensa"),
158
- CorpusProcessor::Token.new("inventada"),
159
- CorpusProcessor::Token.new("na"),
160
- CorpusProcessor::Token.new("Alemanha", :location),
161
- CorpusProcessor::Token.new("por"),
162
- CorpusProcessor::Token.new("John", :person),
163
- CorpusProcessor::Token.new("Gutenberg", :person),
164
- CorpusProcessor::Token.new("Inquisição", :organization),
165
- CorpusProcessor::Token.new("e"),
166
- CorpusProcessor::Token.new("a"),
167
- CorpusProcessor::Token.new("censura"),
168
- ]
169
- }
170
- end
171
-
172
- context "spaces after ponctuation" do
173
- let(:corpus) {
174
- <<-CORPUS
175
- <?xml version="1.0" encoding="ISO-8859-1"?>
176
- <!DOCTYPE colHAREM>
177
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
178
- <DOC DOCID="H2-dftre765">
179
- <EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
180
- . No
181
- </DOC>
182
- </colHAREM>
183
- CORPUS
184
- }
185
-
186
- it { should == [
187
- CorpusProcessor::Token.new("Reforma"),
188
- CorpusProcessor::Token.new("Protestante"),
189
- CorpusProcessor::Token.new("No"),
190
- ]
191
- }
192
- end
193
- end
194
-
195
- context "user-defined categories" do
196
- let(:lampada) {
197
- CorpusProcessor::Parsers::Lampada.new({
198
- "FRUTA" => :fruit,
199
- "LIVRO" => :book,
200
- })
201
- }
202
-
203
- context "multiple entities" do
204
- let(:corpus) {
205
- <<-CORPUS
206
- <?xml version="1.0" encoding="ISO-8859-1"?>
207
- <!DOCTYPE colHAREM>
208
- <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
209
- <DOC DOCID="H2-dftre765">
210
- <P>
211
- A imprensa, inventada na
212
- <EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
213
- por
214
- <EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
215
- <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
216
- e a censura
217
- </P>
218
- </DOC>
219
- </colHAREM>
220
- CORPUS
221
- }
222
-
223
- it { should == [
224
- CorpusProcessor::Token.new("A"),
225
- CorpusProcessor::Token.new("imprensa"),
226
- CorpusProcessor::Token.new("inventada"),
227
- CorpusProcessor::Token.new("na"),
228
- CorpusProcessor::Token.new("Banana", :fruit),
229
- CorpusProcessor::Token.new("por"),
230
- CorpusProcessor::Token.new("Harry", :book),
231
- CorpusProcessor::Token.new("Potter", :book),
232
- CorpusProcessor::Token.new("Inquisição"),
233
- CorpusProcessor::Token.new("e"),
234
- CorpusProcessor::Token.new("a"),
235
- CorpusProcessor::Token.new("censura"),
236
- ]
237
- }
238
- end
239
- end
240
- end
241
-
242
- describe "#extract_category" do
243
- subject { lampada.extract_category(categories) }
244
-
245
- context "empty categories" do
246
- let(:categories) { "" }
247
-
248
- it { should == nil }
249
- end
250
-
251
- context "one category" do
252
- let(:categories) { "PESSOA" }
253
-
254
- it { should == :person }
255
- end
256
-
257
- context "two categories" do
258
- let(:categories) { "OUTRA|ORGANIZACAO" }
259
-
260
- it { should == :organization }
261
- end
262
-
263
- context "ambiguidade" do
264
- let(:categories) { "PESSOA|ORGANIZACAO" }
265
-
266
- it { should == :person }
267
- end
268
- end
269
- end