corpus-processor 0.2.0 → 0.3.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+
3
+ describe CorpusProcessor::Generators::StanfordNer do
4
+ subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }
5
+
6
+ describe '#generate' do
7
+ subject { stanford_ner.generate(tokens) }
8
+
9
+ context 'no tokens' do
10
+ let(:tokens) { [] }
11
+
12
+ it 'returns a single new line' do
13
+ expect(subject).to eq("\n")
14
+ end
15
+ end
16
+
17
+ context 'one token' do
18
+ let(:tokens) { [CorpusProcessor::Token.new('banana')] }
19
+
20
+ it 'returns that token' do
21
+ expect(subject).to eq("banana\tO\n")
22
+ end
23
+ end
24
+
25
+ context 'two tokens' do
26
+ let(:tokens) { [
27
+ CorpusProcessor::Token.new('good'),
28
+ CorpusProcessor::Token.new('banana'),
29
+ ] }
30
+
31
+ it 'returns both tokens in separate lines' do
32
+ expect(subject).to eq("good\tO\nbanana\tO\n")
33
+ end
34
+ end
35
+
36
+ context 'with category' do
37
+ let(:tokens) { [CorpusProcessor::Token.new('Leandro', :person)] }
38
+
39
+ it 'returns that token with right category' do
40
+ expect(subject).to eq("Leandro\tPERSON\n")
41
+ end
42
+ end
43
+
44
+ context 'with non-default categories' do
45
+ let(:stanford_ner) {
46
+ CorpusProcessor::Generators::StanfordNer.new(
47
+ output: { banana: 'BANANA' })
48
+ }
49
+
50
+ let(:tokens) { [CorpusProcessor::Token.new('Nanica', :banana)] }
51
+
52
+ it 'uses those categories' do
53
+ expect(subject).to eq("Nanica\tBANANA\n")
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,333 @@
1
+ require 'spec_helper'
2
+
3
+ describe CorpusProcessor::Parsers::Lampada do
4
+ subject(:lampada) { CorpusProcessor::Parsers::Lampada.new }
5
+
6
+ describe '#parse' do
7
+ subject { lampada.parse(corpus) }
8
+
9
+ context 'default categories' do
10
+ context 'empty corpus' do
11
+ let(:corpus) { '' }
12
+
13
+ it 'returns an empty list' do
14
+ expect(subject).to eq([])
15
+ end
16
+ end
17
+
18
+ context 'doctype' do
19
+ let(:corpus) {
20
+ <<-CORPUS.encode('ISO-8859-1')
21
+ <?xml version="1.0" encoding="ISO-8859-1"?>
22
+ <!DOCTYPE colHAREM>
23
+ CORPUS
24
+ }
25
+
26
+ it 'returns an empty list' do
27
+ expect(subject).to eq([])
28
+ end
29
+ end
30
+
31
+ context 'simple phrase' do
32
+ let(:corpus) {
33
+ <<-CORPUS.encode('ISO-8859-1')
34
+ <?xml version="1.0" encoding="ISO-8859-1"?>
35
+ <!DOCTYPE colHAREM>
36
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
37
+ <DOC DOCID="H2-dftre765">
38
+ <P>Fatores Demográficos e Econômicos Subjacentes.</P>
39
+ </DOC>
40
+ </colHAREM>
41
+ CORPUS
42
+ }
43
+
44
+ it 'tokenizes the phrase' do
45
+ expect(subject).to eq([
46
+ CorpusProcessor::Token.new('Fatores'),
47
+ CorpusProcessor::Token.new('Demográficos'),
48
+ CorpusProcessor::Token.new('e'),
49
+ CorpusProcessor::Token.new('Econômicos'),
50
+ CorpusProcessor::Token.new('Subjacentes'),
51
+ CorpusProcessor::Token.new('.'),
52
+ ])
53
+ end
54
+ end
55
+
56
+ context 'two simple phrases' do
57
+ let(:corpus) {
58
+ <<-CORPUS.encode('ISO-8859-1')
59
+ <?xml version="1.0" encoding="ISO-8859-1"?>
60
+ <!DOCTYPE colHAREM>
61
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
62
+ <DOC DOCID="H2-dftre765">
63
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
64
+ <P>Fatores Demográficos e Econômicos Subjacentes</P>
65
+ </DOC>
66
+ </colHAREM>
67
+ CORPUS
68
+ }
69
+
70
+ it 'tokenizes the phrase and appends periods where needed' do
71
+ expect(subject).to eq([
72
+ CorpusProcessor::Token.new('Fatores'),
73
+ CorpusProcessor::Token.new('Demográficos'),
74
+ CorpusProcessor::Token.new('e'),
75
+ CorpusProcessor::Token.new('Econômicos'),
76
+ CorpusProcessor::Token.new('Subjacentes'),
77
+ CorpusProcessor::Token.new('.'),
78
+ CorpusProcessor::Token.new('Fatores'),
79
+ CorpusProcessor::Token.new('Demográficos'),
80
+ CorpusProcessor::Token.new('e'),
81
+ CorpusProcessor::Token.new('Econômicos'),
82
+ CorpusProcessor::Token.new('Subjacentes'),
83
+ CorpusProcessor::Token.new('.'),
84
+ ])
85
+ end
86
+ end
87
+
88
+ context 'useless entity' do
89
+ let(:corpus) {
90
+ <<-CORPUS.encode('ISO-8859-1')
91
+ <?xml version="1.0" encoding="ISO-8859-1"?>
92
+ <!DOCTYPE colHAREM>
93
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
94
+ <DOC DOCID="H2-dftre765">
95
+ <P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
96
+ </DOC>
97
+ </colHAREM>
98
+ CORPUS
99
+ }
100
+
101
+ it 'ignores that entity' do
102
+ expect(subject).to eq([
103
+ CorpusProcessor::Token.new('Nos'),
104
+ CorpusProcessor::Token.new('finais'),
105
+ CorpusProcessor::Token.new('da'),
106
+ CorpusProcessor::Token.new('Idade'),
107
+ CorpusProcessor::Token.new('Média'),
108
+ CorpusProcessor::Token.new('.'),
109
+ ])
110
+ end
111
+ end
112
+
113
+ context 'one relevant entity' do
114
+ let(:corpus) {
115
+ <<-CORPUS.encode('ISO-8859-1')
116
+ <?xml version="1.0" encoding="ISO-8859-1"?>
117
+ <!DOCTYPE colHAREM>
118
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
119
+ <DOC DOCID="H2-dftre765">
120
+ <P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
121
+ <EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
122
+ </P>
123
+ </DOC>
124
+ </colHAREM>
125
+ CORPUS
126
+ }
127
+
128
+ it 'finds that entity' do
129
+ expect(subject).to eq([
130
+ CorpusProcessor::Token.new('Foram'),
131
+ CorpusProcessor::Token.new('igualmente'),
132
+ CorpusProcessor::Token.new('determinantes'),
133
+ CorpusProcessor::Token.new('para'),
134
+ CorpusProcessor::Token.new('evitar'),
135
+ CorpusProcessor::Token.new('que'),
136
+ CorpusProcessor::Token.new('as'),
137
+ CorpusProcessor::Token.new('ideias'),
138
+ CorpusProcessor::Token.new('reformadoras'),
139
+ CorpusProcessor::Token.new('encontrassem'),
140
+ CorpusProcessor::Token.new('divulgação'),
141
+ CorpusProcessor::Token.new('em'),
142
+ CorpusProcessor::Token.new('Portugal', :location),
143
+ CorpusProcessor::Token.new('.'),
144
+ ])
145
+ end
146
+ end
147
+
148
+ context 'multiple relevant entities' do
149
+ let(:corpus) {
150
+ <<-CORPUS.encode('ISO-8859-1')
151
+ <?xml version="1.0" encoding="ISO-8859-1"?>
152
+ <!DOCTYPE colHAREM>
153
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
154
+ <DOC DOCID="H2-dftre765">
155
+ <P>
156
+ A imprensa, inventada na
157
+ <EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
158
+ por
159
+ <EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
160
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
161
+ e a censura
162
+ </P>
163
+ </DOC>
164
+ </colHAREM>
165
+ CORPUS
166
+ }
167
+
168
+ it 'finds all of them' do
169
+ expect(subject).to eq([
170
+ CorpusProcessor::Token.new('A'),
171
+ CorpusProcessor::Token.new('imprensa'),
172
+ CorpusProcessor::Token.new(','),
173
+ CorpusProcessor::Token.new('inventada'),
174
+ CorpusProcessor::Token.new('na'),
175
+ CorpusProcessor::Token.new('Alemanha', :location),
176
+ CorpusProcessor::Token.new('por'),
177
+ CorpusProcessor::Token.new('John', :person),
178
+ CorpusProcessor::Token.new('Gutenberg', :person),
179
+ CorpusProcessor::Token.new('Inquisição', :organization),
180
+ CorpusProcessor::Token.new('e'),
181
+ CorpusProcessor::Token.new('a'),
182
+ CorpusProcessor::Token.new('censura'),
183
+ CorpusProcessor::Token.new('.'),
184
+ ])
185
+ end
186
+ end
187
+
188
+ context 'alternative tags' do
189
+ context 'all options are unknown categories' do
190
+ let(:corpus) {
191
+ <<-CORPUS.encode('ISO-8859-1')
192
+ <?xml version="1.0" encoding="ISO-8859-1"?>
193
+ <!DOCTYPE colHAREM>
194
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
195
+ <DOC DOCID="H2-dftre765">
196
+ <P>
197
+ <ALT> <EM ID="H2-dftre765-12aa" CATEG="OBRA" TIPO="REPRODUZIDA">95 Teses de Martinho Lutero</EM> |
198
+ <EM ID="H2-dftre765-12" CATEG="OBRA" TIPO="REPRODUZIDA" SUBTIPO="LIVRO">95 Teses</EM>
199
+ de
200
+ <EM ID="H2-dftre765-13" CATEG="SER-HUMANO" TIPO="INDIVIDUAL" COREL="H2-dftre765-12 H2-dftre765-9 H2-dftre765-1" TIPOREL="autor_de natural_de PESSOA**participante_em**H2-dftre765-1**ACONTECIMENTO">Martinho Lutero</EM></ALT>
201
+ </P>
202
+ </DOC>
203
+ </colHAREM>
204
+ CORPUS
205
+ }
206
+
207
+ it 'rejects all of them' do
208
+ expect(subject).to eq([
209
+ CorpusProcessor::Token.new('95'),
210
+ CorpusProcessor::Token.new('Teses'),
211
+ CorpusProcessor::Token.new('de'),
212
+ CorpusProcessor::Token.new('Martinho'),
213
+ CorpusProcessor::Token.new('Lutero'),
214
+ CorpusProcessor::Token.new('.'),
215
+ ])
216
+ end
217
+ end
218
+
219
+ context 'one of the options has known categories' do
220
+ let(:corpus) {
221
+ <<-CORPUS.encode('ISO-8859-1')
222
+ <?xml version="1.0" encoding="ISO-8859-1"?>
223
+ <!DOCTYPE colHAREM>
224
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
225
+ <DOC DOCID="H2-dftre765">
226
+ <P>
227
+ <ALT> <EM ID="H2-dftre765-12aa" CATEG="OBRA" TIPO="REPRODUZIDA">95 Teses de Martinho Lutero</EM> |
228
+ <EM ID="H2-dftre765-12" CATEG="OBRA" TIPO="REPRODUZIDA" SUBTIPO="LIVRO">95 Teses</EM>
229
+ de
230
+ <EM ID="H2-dftre765-13" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-12 H2-dftre765-9 H2-dftre765-1" TIPOREL="autor_de natural_de PESSOA**participante_em**H2-dftre765-1**ACONTECIMENTO">Martinho Lutero</EM></ALT>
231
+ </P>
232
+ </DOC>
233
+ </colHAREM>
234
+ CORPUS
235
+ }
236
+
237
+ it 'prefers that option' do
238
+ expect(subject).to eq([
239
+ CorpusProcessor::Token.new('95'),
240
+ CorpusProcessor::Token.new('Teses'),
241
+ CorpusProcessor::Token.new('de'),
242
+ CorpusProcessor::Token.new('Martinho', :person),
243
+ CorpusProcessor::Token.new('Lutero', :person),
244
+ CorpusProcessor::Token.new('.'),
245
+ ])
246
+ end
247
+ end
248
+
249
+ context 'more than one option have known categories' do
250
+ let(:corpus) {
251
+ <<-CORPUS.encode('ISO-8859-1')
252
+ <?xml version="1.0" encoding="ISO-8859-1"?>
253
+ <!DOCTYPE colHAREM>
254
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
255
+ <DOC DOCID="H2-dftre765">
256
+ <P>
257
+ <ALT> <EM ID="H2-dftre765-12aa" CATEG="LOCAL" TIPO="REPRODUZIDA">95 Teses de Martinho Lutero</EM> |
258
+ <EM ID="H2-dftre765-12" CATEG="OBRA" TIPO="REPRODUZIDA" SUBTIPO="LIVRO">95 Teses</EM>
259
+ de
260
+ <EM ID="H2-dftre765-13" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-12 H2-dftre765-9 H2-dftre765-1" TIPOREL="autor_de natural_de PESSOA**participante_em**H2-dftre765-1**ACONTECIMENTO">Martinho Lutero</EM></ALT>
261
+ </P>
262
+ </DOC>
263
+ </colHAREM>
264
+ CORPUS
265
+ }
266
+
267
+ it 'prefers the option that covers most text with known ' \
268
+ 'categories' do
269
+ expect(subject).to eq([
270
+ CorpusProcessor::Token.new('95', :location),
271
+ CorpusProcessor::Token.new('Teses', :location),
272
+ CorpusProcessor::Token.new('de', :location),
273
+ CorpusProcessor::Token.new('Martinho', :location),
274
+ CorpusProcessor::Token.new('Lutero', :location),
275
+ CorpusProcessor::Token.new('.'),
276
+ ])
277
+ end
278
+ end
279
+ end
280
+ end
281
+
282
+ context 'user-defined categories' do
283
+ let(:lampada) {
284
+ CorpusProcessor::Parsers::Lampada.new(
285
+ input: {
286
+ 'FRUTA' => :fruit,
287
+ 'LIVRO' => :book,
288
+ }
289
+ )
290
+ }
291
+
292
+ context 'multiple entities' do
293
+ let(:corpus) {
294
+ <<-CORPUS.encode('ISO-8859-1')
295
+ <?xml version="1.0" encoding="ISO-8859-1"?>
296
+ <!DOCTYPE colHAREM>
297
+ <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
298
+ <DOC DOCID="H2-dftre765">
299
+ <P>
300
+ A imprensa, inventada na
301
+ <EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
302
+ por
303
+ <EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
304
+ <EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
305
+ e a censura
306
+ </P>
307
+ </DOC>
308
+ </colHAREM>
309
+ CORPUS
310
+ }
311
+
312
+ it 'finds all of them' do
313
+ expect(subject).to eq([
314
+ CorpusProcessor::Token.new('A'),
315
+ CorpusProcessor::Token.new('imprensa'),
316
+ CorpusProcessor::Token.new(','),
317
+ CorpusProcessor::Token.new('inventada'),
318
+ CorpusProcessor::Token.new('na'),
319
+ CorpusProcessor::Token.new('Banana', :fruit),
320
+ CorpusProcessor::Token.new('por'),
321
+ CorpusProcessor::Token.new('Harry', :book),
322
+ CorpusProcessor::Token.new('Potter', :book),
323
+ CorpusProcessor::Token.new('Inquisição'),
324
+ CorpusProcessor::Token.new('e'),
325
+ CorpusProcessor::Token.new('a'),
326
+ CorpusProcessor::Token.new('censura'),
327
+ CorpusProcessor::Token.new('.'),
328
+ ])
329
+ end
330
+ end
331
+ end
332
+ end
333
+ end
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe CorpusProcessor::Processor do
4
+ subject(:processor) {
5
+ CorpusProcessor::Processor.new parser: parser, generator: generator
6
+ }
7
+
8
+ describe '#process' do
9
+ subject { processor.process(corpus) }
10
+
11
+ let(:corpus) { 'Some corpus' }
12
+ let(:processed_corpus) {
13
+ <<-CORPUS
14
+ Some\tO
15
+ corpus\tO
16
+ CORPUS
17
+ }
18
+ let(:tokens) {
19
+ [
20
+ CorpusProcessor::Token.new('Some'),
21
+ CorpusProcessor::Token.new('corpus'),
22
+ ]
23
+ }
24
+ let(:parser) { double :parser }
25
+ let(:generator) { double :generator }
26
+
27
+ it 'uses parser and generator to process corpus' do
28
+ expect(parser).to receive(:parse).with(corpus).and_return(tokens)
29
+
30
+ expect(generator).to receive(:generate).with(tokens)
31
+ .and_return(processed_corpus)
32
+
33
+ expect(subject).to eq(processed_corpus)
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe CorpusProcessor::Token do
4
+ subject { CorpusProcessor::Token.new }
5
+
6
+ describe 'attributes' do
7
+ it 'word' do
8
+ expect(subject).to respond_to(:word)
9
+ end
10
+
11
+ it 'category' do
12
+ expect(subject).to respond_to(:category)
13
+ end
14
+ end
15
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,13 +1,16 @@
1
1
  # This file was generated by the `rspec --init` command. Conventionally, all
2
2
  # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
- # Require this file using `require "spec_helper"` to ensure that it is only
3
+ # Require this file using `require 'spec_helper'` to ensure that it is only
4
4
  # loaded once.
5
5
  #
6
6
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
7
 
8
- require "fakefs/spec_helpers"
8
+ if ENV['CI'] == 'true'
9
+ require 'coveralls'
10
+ Coveralls.wear!
11
+ end
9
12
 
10
- require "corpus-processor"
13
+ require 'corpus-processor'
11
14
 
12
15
  RSpec.configure do |config|
13
16
  config.treat_symbols_as_metadata_keys_with_true_values = true
@@ -18,5 +21,5 @@ RSpec.configure do |config|
18
21
  # order dependency and want to debug it, you can fix the order by providing
19
22
  # the seed, which is printed after each run.
20
23
  # --seed 1234
21
- config.order = "random"
24
+ config.order = 'random'
22
25
  end