corpus-processor 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e5579e5e275e1eeeada264b8a42cb0a87f1389a
|
4
|
+
data.tar.gz: 8adc37111c8b7e4a550bef2ff3a919db4ff82ca4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a35d00e8ad4b2ed8f0a98e176dfdd4967d3e55a8a6368304a746b6d8df6a21ad5efd9cf75ccc3dae39cb4032e6d30a749dae777531868958a39a7e7cb244012c
|
7
|
+
data.tar.gz: dd32457b4f1bc1e45fcefa92d0c8bae6a0461f33e60784238e0ce95c6ea5dc078daf07449450ead6f26a46d14a9e0999d6f0adcf844d60419885f08026e97e11
|
data/.travis.yml
ADDED
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--markup markdown
|
data/README.md
CHANGED
@@ -1,26 +1,36 @@
|
|
1
1
|
Corpus Processor
|
2
2
|
================
|
3
3
|
|
4
|
-
[![Gem Version]
|
4
|
+
[![Gem Version][1]](http://badge.fury.io/rb/corpus-processor)
|
5
|
+
[![Build Status][2]](https://travis-ci.org/dasdad/corpus-processor)
|
6
|
+
[![Code Climate][3]](https://codeclimate.com/github/dasdad/corpus-processor)
|
7
|
+
[![Dependency Status][4]](https://gemnasium.com/dasdad/corpus-processor)
|
8
|
+
[![Coverage Status][5]](https://coveralls.io/r/dasdad/corpus-processor)
|
5
9
|
|
6
|
-
* [Versão em português]
|
7
|
-
* [English version]
|
10
|
+
* [Versão em português][6]
|
11
|
+
* [English version][7]
|
8
12
|
|
9
|
-
Versão em
|
13
|
+
Versão em português
|
10
14
|
===================
|
11
15
|
|
12
|
-
Corpus Processor é uma ferramenta para trabalhar com [Linguística de
|
16
|
+
Corpus Processor é uma ferramenta para trabalhar com [Linguística de
|
17
|
+
Corpus][8]. Ele converte _corpora_ entre diferentes formatos para serem usados
|
18
|
+
em ferramentas de Processamento de Linguagem Natural (NLP).
|
13
19
|
|
14
|
-
O primeiro propósito do Corpus Processor e seu único recurso implementado
|
20
|
+
O primeiro propósito do Corpus Processor e seu único recurso implementado
|
21
|
+
até agora é transformar _corpora_ encontrados na [Linguateca][9] para o
|
22
|
+
formato usado pelo treinamento do [Stanford NER][10].
|
15
23
|
|
16
|
-
[Linguateca]
|
24
|
+
[Linguateca][11] é uma fonte de _corpora_ em português.
|
17
25
|
|
18
|
-
[Stanford NER]
|
26
|
+
[Stanford NER][12] é uma implementação de [Reconhecimento de Entidade
|
27
|
+
Mencionada (NER)][13].
|
19
28
|
|
20
29
|
Instalação
|
21
30
|
----------
|
22
31
|
|
23
|
-
Corpus Processor é uma [Ruby]
|
32
|
+
Corpus Processor é uma [Ruby][14] [Gem][15]. Para instalar, dada uma
|
33
|
+
instalação de Ruby, rode:
|
24
34
|
|
25
35
|
```bash
|
26
36
|
$ gem install corpus_processor
|
@@ -35,43 +45,106 @@ Converter _corpus_ do formato do LâMPADA 2.0 para o formato do Stanford-NER:
|
|
35
45
|
$ corpus-processor process [INPUT_FILE [OUTPUT_FILE]]
|
36
46
|
```
|
37
47
|
|
48
|
+
As classes reconhecidas por padrão pelo Corpus Processor são `PESSOA`, `LOCAL`
|
49
|
+
e `ORGANIZACAO`. Para configurar outras classes, veja o arquivo de configuração
|
50
|
+
em `lib/corpus-processor/categories/default.yml`.
|
51
|
+
|
52
|
+
Para usar outras configurações, veja as opções com:
|
53
|
+
|
54
|
+
```bash
|
55
|
+
$ corpus-processor help process
|
56
|
+
```
|
57
|
+
|
38
58
|
Resultados
|
39
59
|
----------
|
40
60
|
|
41
|
-
|
61
|
+
Os resultados do uso do [Corpus Processor][16] com um _corpus_ do
|
62
|
+
[LâMPADA 2.0 / Classic HAREM 2.0 Golden Collection][17] - disponível na
|
63
|
+
[Linguateca][20] - estão [neste diretório][19]:
|
64
|
+
|
65
|
+
* `ner-pt_br.training.txt`: O _corpus_ da [Linguateca][20] convertido com o
|
66
|
+
[Corpus Processor][21] para o formato de treinamento do [Stanford NER][22].
|
67
|
+
* `ner-pt_br.training-partial.txt`: Os primeiros 95% do _corpus_ em
|
68
|
+
`ner-pt_br.training.txt`, usados para o teste de precisão do
|
69
|
+
[Stanford NER][23].
|
70
|
+
* `ner-pt_br.test.txt`: Os últimos 5% do _corpus_ em `ner-pt_br.training.txt`,
|
71
|
+
usados para testar o modelo linguístico.
|
72
|
+
* `ner-pt_br.prop`: O arquivo de propriedades no formato do [Stanford NER][24]
|
73
|
+
que é usado para treinar com o `ner-pt_br.training.txt`.
|
74
|
+
* `ner-pt_br.partial.prop`: O arquivo de propriedades no formato do
|
75
|
+
[Stanford NER][25] que é usado para treinar com o
|
76
|
+
`ner-pt_br.training-partial.txt`.
|
77
|
+
* `ner-pt_br.ser.gz`: O modelo linguístico no formato do [Stanford NER][26]
|
78
|
+
resultante do treinamento com o `ner-pt_br.training.txt`.
|
79
|
+
* `ner-pt_br.ser-partial.gz`: O modelo linguístico no formato do
|
80
|
+
[Stanford NER][27] resultante do treinamento com o
|
81
|
+
`ner-pt_br.training-partial.txt`.
|
82
|
+
|
83
|
+
A performance do modelo linguístico testado é:
|
84
|
+
|
85
|
+
```
|
86
|
+
CRFClassifier tagged 4450 words in 1 documents at 3632.65 words per second.
|
87
|
+
Entity P R F1 TP FP FN
|
88
|
+
LOCATION 0.5667 0.3953 0.4658 17 13 26
|
89
|
+
ORGANIZATION 0.4531 0.2500 0.3222 29 35 87
|
90
|
+
PERSON 0.5333 0.7442 0.6214 32 28 11
|
91
|
+
Totals 0.5065 0.3861 0.4382 78 76 124
|
92
|
+
```
|
93
|
+
|
94
|
+
Essa performance é ruim se comparada com [outros trabalhos][28] sobre o assunto,
|
95
|
+
mas tem servido aos nossos propósitos. Nós continuaremos tentando melhorar
|
96
|
+
essa situação.
|
97
|
+
|
98
|
+
Sugestões são bem-vindas sobre como fazer isso.
|
42
99
|
|
43
|
-
|
100
|
+
---
|
44
101
|
|
45
|
-
**Note** que a transformação do Corpus Processor descarta muita informação do
|
102
|
+
**Note** que a transformação do Corpus Processor descarta muita informação do
|
103
|
+
_corpus_ anotado. Os _corpora_ usados são bastante ricos em anotações e para
|
104
|
+
tirar completo proveito deles considere usar as ferramentas encontradas na
|
105
|
+
[Linguateca][29].
|
46
106
|
|
47
107
|
Para entender melhor, siga as seguintes referências:
|
48
108
|
|
49
|
-
|
50
|
-
|
109
|
+
```
|
110
|
+
Diana Santos. "O modelo semântico usado no Primeiro HAREM". In Diana Santos & Nuno Cardoso (eds.), Reconhecimento de entidades mencionadas em português: Documentação e actas do HAREM, a primeira avaliação conjunta na área. Linguateca, 2007, pp. 43-57.
|
111
|
+
http://www.linguateca.pt/aval_conjunta/LivroHAREM/Cap04-SantosCardoso2007-Santos.pdf
|
112
|
+
|
113
|
+
Diana Santos. "Evaluation in natural language processing". European Summer School on Language, Logic and Information (ESSLLI 2007) (Trinity College, Dublin, Irlanda, 6-17 de Agosto de 2007).
|
114
|
+
```
|
115
|
+
|
116
|
+
---
|
117
|
+
|
118
|
+
[Leia mais sobre o processo de treinamento][30].
|
51
119
|
|
52
|
-
Diana Santos. "Evaluation in natural language processing". European Summer School on Language, Logic and Information (ESSLLI 2007) (Trinity College, Dublin, Irlanda, 6-17 de Agosto de 2007).
|
53
120
|
|
54
121
|
Agradecimentos
|
55
122
|
--------------
|
56
123
|
|
57
|
-
* [Time do HAREM / Linguateca]
|
58
|
-
|
124
|
+
* [Time do HAREM / Linguateca][31] pelo _corpus_ com anotações semânticas em
|
125
|
+
português.
|
126
|
+
* *[Time de NLP de Stanford][32]* pela ferramenta [Stanford NER][33].
|
59
127
|
|
60
128
|
English version
|
61
129
|
===============
|
62
130
|
|
63
|
-
Corpus Processor is a tool to work with [Corpus Linguistics]
|
131
|
+
Corpus Processor is a tool to work with [Corpus Linguistics][34]. It converts
|
132
|
+
_corpora_ between different formats for use in Natural Language Processing
|
133
|
+
(NLP) tools.
|
64
134
|
|
65
|
-
The first purpose of Corpus Processor and its current only feature is to
|
135
|
+
The first purpose of Corpus Processor and its current only feature is to
|
136
|
+
transform _corpora_ found in [Linguateca][35] into the format used for training
|
137
|
+
in [Stanford NER][36].
|
66
138
|
|
67
|
-
[Linguateca]
|
139
|
+
[Linguateca][37] is a source of _corpora_ in Portuguese.
|
68
140
|
|
69
|
-
[Stanford NER]
|
141
|
+
[Stanford NER][38] is an implementation of [Named Entity Recognition][39].
|
70
142
|
|
71
143
|
Installation
|
72
144
|
------------
|
73
145
|
|
74
|
-
Corpus Processor is a [Ruby]
|
146
|
+
Corpus Processor is a [Ruby][40] [Gem][41]. To install it, given a working
|
147
|
+
installation of Ruby, run:
|
75
148
|
|
76
149
|
```bash
|
77
150
|
$ gem install corpus_processor
|
@@ -86,27 +159,82 @@ Convert _corpus_ from LâMPADA 2.0 format to Stanford-NER format:
|
|
86
159
|
$ corpus-processor process [INPUT_FILE [OUTPUT_FILE]]
|
87
160
|
```
|
88
161
|
|
162
|
+
Classes recognized by default in Corpus Processor are `PESSOA` (person),
|
163
|
+
`LOCAL` (location) and `ORGANIZACAO` (organization). In order to configure
|
164
|
+
other classes, refer to `lib/corpus-processor/categories/default.yml`.
|
165
|
+
|
166
|
+
To run with different configurations, consult the options with:
|
167
|
+
|
168
|
+
```bash
|
169
|
+
$ corpus-processor help process
|
170
|
+
```
|
171
|
+
|
89
172
|
Results
|
90
173
|
-------
|
91
174
|
|
92
|
-
|
175
|
+
The results of using [Corpus Processor][42] with a _corpus_ from
|
176
|
+
[LâMPADA 2.0 / Classic HAREM 2.0 Golden Collection][43] - available in
|
177
|
+
[Linguateca][20] - are in [this directory][19]:
|
178
|
+
|
179
|
+
* `ner-pt_br.training.txt`: The _corpus_ from [Linguateca][46] converted with
|
180
|
+
[Corpus Processor][47] to [Stanford NER][48] training format.
|
181
|
+
* `ner-pt_br.training-partial.txt`: The first 95% of the _corpus_ in
|
182
|
+
`ner-pt_br.training.txt`, used for training [Stanford NER][49] for accuracy
|
183
|
+
testing.
|
184
|
+
* `ner-pt_br.test.txt`: The last 5% of the _corpus_ in
|
185
|
+
`ner-pt_br.training.txt`, used to test the language model.
|
186
|
+
* `ner-pt_br.prop`: The property file in [Stanford NER][50]'s format for
|
187
|
+
setting up the training with the whole `ner-pt_br.training.txt`.
|
188
|
+
* `ner-pt_br.partial.prop`: The property file in [Stanford NER][51]'s format
|
189
|
+
for setting up the training with the partial
|
190
|
+
`ner-pt_br.training-partial.txt`.
|
191
|
+
* `ner-pt_br.ser.gz`: The resulting language model for [Stanford NER][52]
|
192
|
+
trained with `ner-pt_br.training.txt`.
|
193
|
+
* `ner-pt_br.ser-partial.gz`: The resulting language model for
|
194
|
+
[Stanford NER][53] trained with `ner-pt_br.training-partial.txt`.
|
195
|
+
|
196
|
+
The performance of the language model under test is:
|
197
|
+
|
198
|
+
```
|
199
|
+
CRFClassifier tagged 4450 words in 1 documents at 3632.65 words per second.
|
200
|
+
Entity P R F1 TP FP FN
|
201
|
+
LOCATION 0.5667 0.3953 0.4658 17 13 26
|
202
|
+
ORGANIZATION 0.4531 0.2500 0.3222 29 35 87
|
203
|
+
PERSON 0.5333 0.7442 0.6214 32 28 11
|
204
|
+
Totals 0.5065 0.3861 0.4382 78 76 124
|
205
|
+
```
|
206
|
+
|
207
|
+
This performance is poor if compared with [other works][54] on the topic,
|
208
|
+
but it has served our purposes well. We'll keep trying to improve on this.
|
209
|
+
|
210
|
+
Suggestions are welcome in this regard.
|
93
211
|
|
94
|
-
|
212
|
+
---
|
95
213
|
|
96
|
-
**Note** that the transformation performed by Corpus Processor discards lots
|
214
|
+
**Note** that the transformation performed by Corpus Processor discards lots
|
215
|
+
of information from the annotated _corpus_. The _corpora_ used in this process
|
216
|
+
are very rich in annotations, in order to extract all of it consider using one
|
217
|
+
of the tools found in [Linguateca][55].
|
97
218
|
|
98
219
|
Further information about the subject can be found in the following sources:
|
99
220
|
|
100
|
-
|
101
|
-
|
221
|
+
```
|
222
|
+
Diana Santos. "O modelo semântico usado no Primeiro HAREM". In Diana Santos & Nuno Cardoso (eds.), Reconhecimento de entidades mencionadas em português: Documentação e actas do HAREM, a primeira avaliação conjunta na área. Linguateca, 2007, pp. 43-57.
|
223
|
+
http://www.linguateca.pt/aval_conjunta/LivroHAREM/Cap04-SantosCardoso2007-Santos.pdf
|
102
224
|
|
103
|
-
|
225
|
+
Diana Santos. "Evaluation in natural language processing". European Summer School on Language, Logic and Information (ESSLLI 2007) (Trinity College, Dublin, Irlanda, 6-17 de Agosto de 2007).
|
226
|
+
```
|
227
|
+
|
228
|
+
---
|
229
|
+
|
230
|
+
[Read more about the process of training][56].
|
104
231
|
|
105
232
|
Thanks
|
106
233
|
------
|
107
234
|
|
108
|
-
* [HAREM / Linguateca team]
|
109
|
-
|
235
|
+
* *[HAREM / Linguateca team][57]* for the semantic annotated _corpus_ in
|
236
|
+
Portuguese.
|
237
|
+
* *[Stanford NLP team][58]* for the [Stanford NER][59] tool.
|
110
238
|
|
111
239
|
Contributing
|
112
240
|
------------
|
@@ -120,15 +248,24 @@ Contributing
|
|
120
248
|
Changelog
|
121
249
|
---------
|
122
250
|
|
123
|
-
### 0.0
|
251
|
+
### 0.3.0
|
124
252
|
|
125
|
-
*
|
126
|
-
*
|
253
|
+
* Stopped using Regex for the parser and started using [Nokogiri][62].
|
254
|
+
* Fixed missing punctuation.
|
255
|
+
* Fixed inconsistencies in tagging. The issue was caused by `<ALT>` tags.
|
256
|
+
* Accepted categories definitions from users.
|
257
|
+
* Installed several measures for quality of code.
|
258
|
+
* Added documentation.
|
127
259
|
|
128
|
-
### 0.0
|
260
|
+
### 0.2.0
|
129
261
|
|
130
262
|
* Renamed Harem to LâMPADA, as asked by Linguateca's team.
|
131
263
|
|
264
|
+
### 0.0.1
|
265
|
+
|
266
|
+
* [LâMPADA 2.0 / Classic HAREM 2.0 Golden Collection][60] Parser.
|
267
|
+
* [Stanford NER][61] Generator.
|
268
|
+
|
132
269
|
License
|
133
270
|
-------
|
134
271
|
|
@@ -154,3 +291,67 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
154
291
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
155
292
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
156
293
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
294
|
+
|
295
|
+
|
296
|
+
[1]: https://fury-badge.herokuapp.com/rb/corpus-processor.png
|
297
|
+
[2]: https://travis-ci.org/dasdad/corpus-processor.png
|
298
|
+
[3]: https://codeclimate.com/github/dasdad/corpus-processor.png
|
299
|
+
[4]: https://gemnasium.com/dasdad/corpus-processor.png
|
300
|
+
[5]: https://coveralls.io/repos/dasdad/corpus-processor/badge.png
|
301
|
+
[6]: #verso-em-portugus
|
302
|
+
[7]: #english-version
|
303
|
+
[8]: http://pt.wikipedia.org/wiki/Lingu%C3%ADstica_de_corpus
|
304
|
+
[9]: http://www.linguateca.pt
|
305
|
+
[10]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
306
|
+
[11]: http://www.linguateca.pt
|
307
|
+
[12]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
308
|
+
[13]: http://pt.wikipedia.org/wiki/Reconhecimento_de_entidade_mencionada
|
309
|
+
[14]: http://www.ruby-lang.org/
|
310
|
+
[15]: http://rubygems.org/
|
311
|
+
[16]: https://github.com/dasdad/corpus-processor
|
312
|
+
[17]: http://www.linguateca.pt/HAREM/
|
313
|
+
[18]: http://www.linguateca.pt/
|
314
|
+
[19]: https://www.dropbox.com/sh/8p6cbbcaoyv23u7/GxY0qKObYV/corpus
|
315
|
+
[20]: http://www.linguateca.pt/
|
316
|
+
[21]: https://github.com/dasdad/corpus-processor
|
317
|
+
[22]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
318
|
+
[23]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
319
|
+
[24]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
320
|
+
[25]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
321
|
+
[26]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
322
|
+
[27]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
323
|
+
[28]: ftp://ftp.inf.puc-rio.br/pub/docs/techreports/07_09_duarte.pdf
|
324
|
+
[29]: http://www.linguateca.pt
|
325
|
+
[30]: http://nlp.stanford.edu/software/crf-faq.shtml
|
326
|
+
[31]: http://www.linguateca.pt/HAREM
|
327
|
+
[32]: http://www-nlp.stanford.edu/
|
328
|
+
[33]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
329
|
+
[34]: http://en.wikipedia.org/wiki/Corpus_linguistics
|
330
|
+
[35]: http://www.linguateca.pt
|
331
|
+
[36]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
332
|
+
[37]: http://www.linguateca.pt
|
333
|
+
[38]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
334
|
+
[39]: http://en.wikipedia.org/wiki/Named-entity_recognition
|
335
|
+
[40]: http://www.ruby-lang.org/
|
336
|
+
[41]: http://rubygems.org/
|
337
|
+
[42]: https://github.com/dasdad/corpus-processor
|
338
|
+
[43]: http://www.linguateca.pt/HAREM/
|
339
|
+
[44]: http://www.linguateca.pt/
|
340
|
+
[45]: https://www.dropbox.com/sh/8p6cbbcaoyv23u7/GxY0qKObYV/corpus
|
341
|
+
[46]: http://www.linguateca.pt/
|
342
|
+
[47]: https://github.com/dasdad/corpus-processor
|
343
|
+
[48]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
344
|
+
[49]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
345
|
+
[50]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
346
|
+
[51]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
347
|
+
[52]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
348
|
+
[53]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
349
|
+
[54]: ftp://ftp.inf.puc-rio.br/pub/docs/techreports/07_09_duarte.pdf
|
350
|
+
[55]: http://www.linguateca.pt
|
351
|
+
[56]: http://nlp.stanford.edu/software/crf-faq.shtml
|
352
|
+
[57]: http://www.linguateca.pt/HAREM
|
353
|
+
[58]: http://www-nlp.stanford.edu/
|
354
|
+
[59]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
355
|
+
[60]: http://www.linguateca.pt/HAREM/
|
356
|
+
[61]: http://nlp.stanford.edu/software/CRF-NER.shtml
|
357
|
+
[62]: http://nokogiri.org/
|
data/bin/corpus-processor
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
$LOAD_PATH.unshift(File.expand_path(
|
3
|
+
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
|
4
4
|
|
5
|
-
require
|
5
|
+
require 'bundler/setup'
|
6
6
|
|
7
|
-
require
|
7
|
+
require 'corpus-processor/cli'
|
8
8
|
|
9
9
|
CorpusProcessor::Cli.start(ARGV)
|
data/corpus-processor.gemspec
CHANGED
@@ -3,26 +3,28 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
3
3
|
require 'corpus-processor/version'
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
|
-
spec.name =
|
6
|
+
spec.name = 'corpus-processor'
|
7
7
|
spec.version = CorpusProcessor::VERSION
|
8
|
-
spec.authors = [
|
9
|
-
spec.email = [
|
10
|
-
spec.description =
|
11
|
-
spec.summary =
|
12
|
-
|
13
|
-
spec.
|
8
|
+
spec.authors = ['Das Dad']
|
9
|
+
spec.email = ['dev@dasdad.com.br']
|
10
|
+
spec.description = 'Process linguistic corpus'
|
11
|
+
spec.summary = 'Handle linguistic corpus and convert it to use NLP ' \
|
12
|
+
'tools'
|
13
|
+
spec.homepage = 'https://github.com/dasdad/corpus-processor'
|
14
|
+
spec.license = 'MIT'
|
14
15
|
|
15
16
|
spec.files = `git ls-files`.split($/)
|
16
17
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
-
spec.require_paths = [
|
19
|
+
spec.require_paths = ['lib']
|
19
20
|
|
20
|
-
spec.required_ruby_version = Gem::Requirement.new(
|
21
|
+
spec.required_ruby_version = Gem::Requirement.new('>= 2.0.0')
|
21
22
|
|
22
|
-
spec.add_dependency
|
23
|
+
spec.add_dependency 'thor'
|
24
|
+
spec.add_dependency 'nokogiri'
|
23
25
|
|
24
|
-
spec.add_development_dependency
|
25
|
-
spec.add_development_dependency
|
26
|
-
spec.add_development_dependency
|
27
|
-
spec.add_development_dependency
|
26
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
27
|
+
spec.add_development_dependency 'rspec', '2.14.0.rc1'
|
28
|
+
spec.add_development_dependency 'pry-nav'
|
29
|
+
spec.add_development_dependency 'coveralls'
|
28
30
|
end
|
data/lib/corpus-processor.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
require
|
8
|
-
require
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module CorpusProcessor
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'corpus-processor/version'
|
8
|
+
require 'corpus-processor/token'
|
9
|
+
require 'corpus-processor/categories'
|
10
|
+
require 'corpus-processor/parsers'
|
11
|
+
require 'corpus-processor/generators'
|
12
|
+
require 'corpus-processor/processor'
|