corpus-processor 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/README.md +86 -0
- data/bin/corpus-processor +9 -0
- data/corpus-processor.gemspec +28 -0
- data/lib/corpus-processor.rb +8 -0
- data/lib/corpus-processor/cli.rb +17 -0
- data/lib/corpus-processor/default_categories.rb +14 -0
- data/lib/corpus-processor/generators.rb +1 -0
- data/lib/corpus-processor/generators/stanford_ner.rb +13 -0
- data/lib/corpus-processor/parsers.rb +1 -0
- data/lib/corpus-processor/parsers/harem.rb +52 -0
- data/lib/corpus-processor/processor.rb +11 -0
- data/lib/corpus-processor/token.rb +2 -0
- data/lib/corpus-processor/tokenizer.rb +17 -0
- data/lib/corpus-processor/traverser.rb +19 -0
- data/lib/corpus-processor/version.rb +3 -0
- data/spec/integration/cli_spec.rb +100 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/unit/generators/stanford_ner_spec.rb +46 -0
- data/spec/unit/parsers/harem_spec.rb +269 -0
- data/spec/unit/processor.rb +37 -0
- data/spec/unit/token_spec.rb +8 -0
- data/spec/unit/tokenizer_spec.rb +121 -0
- data/spec/unit/traverser_spec.rb +68 -0
- metadata +149 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8a0ff96102528239769c105832893034e21434bf
|
4
|
+
data.tar.gz: 625ffe80fa8399f20610e048c6ce346a69eef9c0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1716f52826fa5b895977760e33f5e918a9b7fcebd0d3448b6419c4cb9e8d1b7902f8d99cb6646f4b33693f5743aac3802bc4476e1eac9db555cd188d52acb9e0
|
7
|
+
data.tar.gz: 770efa624c0c2fcb0b3170d10dcce05069f90650f04b775f6d5662c9ac4b61b71f7884831e4903c015398d21cb21498206f6f2bc4a41cf59b6905d887222d9b8
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
Corpus Processor
|
2
|
+
================
|
3
|
+
|
4
|
+
![Corpus Processor](http://badge.fury.io/rb/corpus-processor)
|
5
|
+
|
6
|
+
Tool to work with [Corpus Linguistics](http://en.wikipedia.org/wiki/Corpus_linguistics). Corpus Processor converts _corpora_ between different formats for use in Natural Language Processing (NLP) tools.
|
7
|
+
|
8
|
+
The first purpose of Corpus Processor and its current only feature is to transform _corpora_ found in [Linguateca](http://www.linguateca.pt) into the format used for training in [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
|
9
|
+
|
10
|
+
[Linguateca](http://www.linguateca.pt) is an excellent source of _corpora_ in Portuguese.
|
11
|
+
|
12
|
+
[Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) is an excellent implementation of [Named Entity Recognition](http://en.wikipedia.org/wiki/Named-entity_recognition).
|
13
|
+
|
14
|
+
Installation
|
15
|
+
------------
|
16
|
+
|
17
|
+
Corpus Processor is a [Ruby](http://www.ruby-lang.org/) [Gem](http://rubygems.org/). To install it, given a working installation of Ruby, run:
|
18
|
+
|
19
|
+
```bash
|
20
|
+
$ gem install corpus_processor
|
21
|
+
```
|
22
|
+
|
23
|
+
Usage
|
24
|
+
-----
|
25
|
+
|
26
|
+
Convert corpus from HAREM format to Stanford-NER format:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
$ corpus-processor process [INPUT_FILE [OUTPUT_FILE]]
|
30
|
+
```
|
31
|
+
|
32
|
+
Results
|
33
|
+
-------
|
34
|
+
|
35
|
+
For an example of converting one corpus with Corpus Processor, refer to this [gist](https://gist.github.com/leafac/5259008).
|
36
|
+
|
37
|
+
The corpus is from [Linguateca](http://www.linguateca.pt/HAREM/) and the training used [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml).
|
38
|
+
|
39
|
+
Contributing
|
40
|
+
------------
|
41
|
+
|
42
|
+
1. Fork it.
|
43
|
+
2. Create your feature branch (`git checkout -b my-new-feature`).
|
44
|
+
3. Commit your changes (`git commit -am 'Add some feature'`).
|
45
|
+
4. Push to the branch (`git push origin my-new-feature`).
|
46
|
+
5. Create new Pull Request.
|
47
|
+
|
48
|
+
Changelog
|
49
|
+
---------
|
50
|
+
|
51
|
+
### 0.0.1
|
52
|
+
|
53
|
+
* [Harem](http://www.linguateca.pt/HAREM/) Parser.
|
54
|
+
* [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) Generator.
|
55
|
+
|
56
|
+
Thanks
|
57
|
+
------
|
58
|
+
|
59
|
+
* *Diana Santos* and her team in [Linguateca](http://www.linguateca.pt) for the semantic annotated corpus in Portuguese.
|
60
|
+
* *[Stanford NLP team](http://www-nlp.stanford.edu/)* for the [Stanford NER](http://nlp.stanford.edu/software/CRF-NER.shtml) tool.
|
61
|
+
|
62
|
+
License
|
63
|
+
-------
|
64
|
+
|
65
|
+
Copyright (c) 2013 Das Dad
|
66
|
+
|
67
|
+
MIT License
|
68
|
+
|
69
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
70
|
+
a copy of this software and associated documentation files (the
|
71
|
+
"Software"), to deal in the Software without restriction, including
|
72
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
73
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
74
|
+
permit persons to whom the Software is furnished to do so, subject to
|
75
|
+
the following conditions:
|
76
|
+
|
77
|
+
The above copyright notice and this permission notice shall be
|
78
|
+
included in all copies or substantial portions of the Software.
|
79
|
+
|
80
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
81
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
82
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
83
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
84
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
85
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
86
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# Gem specification for corpus-processor.
# Puts lib/ on the load path so the version constant can be read
# without installing the gem first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'corpus-processor/version'

Gem::Specification.new do |spec|
  spec.name = "corpus-processor"
  spec.version = CorpusProcessor::VERSION
  spec.authors = ["Das Dad"]
  spec.email = ["feedback@dasdad.com.br"]
  spec.description = %q{Process linguistic corpus}
  spec.summary = %q{Handle linguistic corpus and convert it to use NLP tools}
  spec.homepage = "https://github.com/dasdad/corpus-processor"
  spec.license = "MIT"

  # Package every git-tracked file; executables are whatever lives in bin/.
  spec.files = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = Gem::Requirement.new(">= 2.0.0")

  # Runtime dependency: Thor powers the command-line interface.
  spec.add_dependency "thor"

  # Development-only dependencies (tests use RSpec with FakeFS sandboxing).
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "fakefs"
  spec.add_development_dependency "pry-nav"
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require "corpus-processor/version"
|
2
|
+
require "corpus-processor/token"
|
3
|
+
require "corpus-processor/default_categories"
|
4
|
+
require "corpus-processor/traverser"
|
5
|
+
require "corpus-processor/tokenizer"
|
6
|
+
require "corpus-processor/parsers"
|
7
|
+
require "corpus-processor/generators"
|
8
|
+
require "corpus-processor/processor"
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require "corpus-processor"
|
2
|
+
require "thor"
|
3
|
+
|
4
|
+
module CorpusProcessor
  # Thor-based command-line interface for corpus-processor.
  class Cli < ::Thor

    desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from HAREM format to Stanford-NER format"
    # Converts a corpus from the HAREM format to Stanford-NER training
    # format.
    #
    # input_file::  path (String) or readable IO; defaults to $stdin.
    # output_file:: path (String) or writable IO; defaults to $stdout.
    def process(input_file = $stdin, output_file = $stdout)
      # Remember which streams we opened ourselves so we never close a
      # caller-supplied IO. The previous implementation unconditionally
      # closed output_file — closing $stdout when the default was used —
      # and never closed an input file it had opened.
      opened_input  = input_file.is_a?(String)
      opened_output = output_file.is_a?(String)

      input_file  = File.new(input_file,  "r") if opened_input
      output_file = File.new(output_file, "w") if opened_output

      output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
    ensure
      # Release only the files this method opened, even when processing
      # raises.
      input_file.close  if opened_input  && input_file.respond_to?(:close)
      output_file.close if opened_output && output_file.respond_to?(:close)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module CorpusProcessor
  # Default category mappings used across the gem.
  #
  # :input  — HAREM CATEG attribute values mapped to internal symbols.
  # :output — internal symbols mapped to Stanford NER labels; any
  #           symbol without a mapping (including nil) yields "O".
  DEFAULT_CATEGORIES = {
    input: {
      "PESSOA"      => :person,
      "LOCAL"       => :location,
      "ORGANIZACAO" => :organization,
    },
    output: Hash.new("O").tap do |labels|
      labels[:person]       = "PERSON"
      labels[:location]     = "LOCATION"
      labels[:organization] = "ORGANIZATION"
    end,
  }
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require "corpus-processor/generators/stanford_ner"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module CorpusProcessor::Generators
  # Emits Stanford NER training data: one "word LABEL" line per token,
  # with a terminating newline (an empty token list yields "\n").
  class StanfordNer
    # categories maps token category symbols to NER label strings; the
    # default mapping labels any unknown category "O".
    def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
      @categories = categories
    end

    # tokens: enumerable of objects responding to #word and #category.
    # Returns the generated training data as a String.
    def generate(tokens)
      lines = tokens.map do |token|
        label = @categories[token.category]
        "#{token.word} #{label}"
      end
      lines.join("\n") << "\n"
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require "corpus-processor/parsers/harem"
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module CorpusProcessor::Parsers
  # Parses corpora in the HAREM XML format (Linguateca) into a flat
  # Array of CorpusProcessor::Token objects.
  class Harem

    # Matches one <EM ... CATEG="...">inner text</EM> entity.
    # The {0} quantifier defines a named group without consuming input;
    # the groups are recombined by the final \g<...> calls, so only the
    # last line actually matches. Groups of interest captured per match:
    # :inner_text (the entity's text) and :categories (the raw CATEG
    # attribute value, possibly "|"-separated).
    # NOTE(review): /i makes <em ...> match the uppercase <EM> tags seen
    # in HAREM corpora; /x permits this multi-line layout and comments.
    CATEGORY_REGEX = /
      (?<any_text> .*? ){0}
      (?<entity_attributes> \s\g<any_text>
                            CATEG="\g<categories>"\g<any_text> ){0}
      (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
      (?<entity_closing_tag> <\/em> ){0}

      # groups of interest
      (?<inner_text> \g<any_text> ){0}
      (?<categories> \g<any_text> ){0}

      \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
    /ix

    # categories: maps HAREM CATEG strings (e.g. "PESSOA") to internal
    # symbols. traverser and tokenizer are injectable collaborators
    # (swapped for doubles in unit tests).
    def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
                   traverser = CorpusProcessor::Traverser.new,
                   tokenizer = CorpusProcessor::Tokenizer.new)
      @categories = categories
      @traverser = traverser
      @tokenizer = tokenizer
    end

    # Returns an Array of CorpusProcessor::Token for +corpus+.
    # Whitespace/newlines are first collapsed so entities broken across
    # lines still match; plain stretches produce uncategorized tokens,
    # <EM> entities produce tokens tagged with the mapped category.
    def parse(corpus)
      [].tap { |tokens|
        @traverser.traverse(@tokenizer.join_lines(corpus),
                            CATEGORY_REGEX) do |match|
          # The traverser yields Strings for unmatched stretches and
          # MatchData for entity matches.
          text_to_tokenize, category = case match
          when String
            [match, nil]
          when MatchData
            [
              match[:inner_text],
              extract_category(match[:categories])
            ]
          end
          tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
        end
      }
    end

    # Resolves a raw "|"-separated CATEG value to the first category
    # that has a known mapping; returns nil when none maps (the entity
    # is then treated as plain text).
    def extract_category(categories)
      categories
        .split("|")
        .map { |category_string| @categories[category_string] }
        .compact
        .first
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Two-stage pipeline: a parser turns raw corpus text into tokens, and
# a generator renders those tokens into the target training format.
class CorpusProcessor::Processor
  # Both collaborators are injectable; the defaults wire up the
  # HAREM-to-Stanford-NER conversion.
  def initialize(parser = CorpusProcessor::Parsers::Harem.new,
                 generator = CorpusProcessor::Generators::StanfordNer.new)
    @parser = parser
    @generator = generator
  end

  # Converts +corpus+ (a String) and returns the generated output.
  def process(corpus)
    tokens = @parser.parse(corpus)
    @generator.generate(tokens)
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Splits corpus text into Token objects, discarding markup and
# punctuation along the way.
class CorpusProcessor::Tokenizer
  # Returns an Array of CorpusProcessor::Token for +text+, each tagged
  # with +category+ (nil means uncategorized). Tags are removed first,
  # then punctuation, then the remainder is split on whitespace.
  def tokenize(text, category = nil)
    words = strip_tags(text).gsub(/[[:punct:]]/, "").strip.split(/\s+/)
    words.map do |word|
      CorpusProcessor::Token.new(word, category)
    end
  end

  # Replaces every <...> tag (non-greedy) with a space and trims the
  # result, so adjacent words separated only by a tag stay separated.
  def strip_tags(text)
    text.gsub(/<.*?>/, " ").strip
  end

  # Collapses every whitespace run (including newlines) to a single
  # space and trims the result.
  def join_lines(text)
    text.gsub(/\s+/, " ").strip
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Walks a string with a regexp, yielding matched and unmatched spans in
# document order.
class CorpusProcessor::Traverser
  # Yields, in order: a String for every non-empty stretch of +text+
  # that does not match +regexp+, and a MatchData for every match.
  # Does nothing when no block is given.
  def traverse(text, regexp, &block)
    return if block.nil?
    remaining_search = text
    until remaining_search.empty?
      match = remaining_search.match(regexp)
      if match.nil?
        # No further matches: the rest is one unmatched stretch. The
        # loop condition already guarantees it is non-empty, so the
        # original's redundant emptiness guard is dropped.
        block.call remaining_search
        remaining_search = ""
      else
        before = remaining_search[0...match.begin(0)]
        block.call before unless before.empty?
        block.call match

        # Advance past the match, consuming at least one character so a
        # zero-width match (e.g. // against non-empty text) cannot stall
        # the scan — the original sliced exactly match.end(0) chars,
        # which loops forever when begin(0) == end(0) == 0.
        advance = [match.end(0), match.begin(0) + 1].max
        remaining_search = remaining_search[advance..-1] || ""
      end
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
require "corpus-processor/cli"
|
4
|
+
|
5
|
+
describe CorpusProcessor::Cli do
|
6
|
+
include FakeFS::SpecHelpers
|
7
|
+
subject(:cli) { CorpusProcessor::Cli.new }
|
8
|
+
|
9
|
+
let(:input_file) { "input_file" }
|
10
|
+
let(:output_file) { "output_file" }
|
11
|
+
|
12
|
+
before do
|
13
|
+
File.open(input_file, "w") { |file|
|
14
|
+
file.write <<-INPUT
|
15
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
16
|
+
<!DOCTYPE colHAREM>
|
17
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
18
|
+
<DOC DOCID="H2-dftre765">
|
19
|
+
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
20
|
+
<P>
|
21
|
+
A revolta histórica produz normalmente uma nova forma de pensamento quanto à forma de organização da sociedade. Assim foi com a
|
22
|
+
<EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
|
23
|
+
. No seguimento do colapso de instituições monásticas e do escolasticismo
|
24
|
+
nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM>
|
25
|
+
na
|
26
|
+
<EM ID="H2-dftre765-37" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="DIVISAO">Europa</EM>
|
27
|
+
, acentuado pela "
|
28
|
+
<OMITIDO> <EM ID="H2-dftre765-17" CATEG="ACONTECIMENTO" TIPO="EFEMERIDE">Cativeiro Babilónica da igreja</EM></OMITIDO>
|
29
|
+
"
|
30
|
+
no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
|
31
|
+
, o
|
32
|
+
INPUT
|
33
|
+
}
|
34
|
+
end
|
35
|
+
|
36
|
+
describe "#process" do
|
37
|
+
before do
|
38
|
+
cli.process(input_file, output_file)
|
39
|
+
end
|
40
|
+
|
41
|
+
specify { File.read(output_file).should == <<-OUTPUT }
|
42
|
+
Fatores O
|
43
|
+
Demográficos O
|
44
|
+
e O
|
45
|
+
Econômicos O
|
46
|
+
Subjacentes O
|
47
|
+
A O
|
48
|
+
revolta O
|
49
|
+
histórica O
|
50
|
+
produz O
|
51
|
+
normalmente O
|
52
|
+
uma O
|
53
|
+
nova O
|
54
|
+
forma O
|
55
|
+
de O
|
56
|
+
pensamento O
|
57
|
+
quanto O
|
58
|
+
à O
|
59
|
+
forma O
|
60
|
+
de O
|
61
|
+
organização O
|
62
|
+
da O
|
63
|
+
sociedade O
|
64
|
+
Assim O
|
65
|
+
foi O
|
66
|
+
com O
|
67
|
+
a O
|
68
|
+
Reforma O
|
69
|
+
Protestante O
|
70
|
+
No O
|
71
|
+
seguimento O
|
72
|
+
do O
|
73
|
+
colapso O
|
74
|
+
de O
|
75
|
+
instituições O
|
76
|
+
monásticas O
|
77
|
+
e O
|
78
|
+
do O
|
79
|
+
escolasticismo O
|
80
|
+
nos O
|
81
|
+
finais O
|
82
|
+
da O
|
83
|
+
Idade O
|
84
|
+
Média O
|
85
|
+
na O
|
86
|
+
Europa LOCATION
|
87
|
+
acentuado O
|
88
|
+
pela O
|
89
|
+
Cativeiro O
|
90
|
+
Babilónica O
|
91
|
+
da O
|
92
|
+
igreja O
|
93
|
+
no O
|
94
|
+
papado O
|
95
|
+
de O
|
96
|
+
Avignon O
|
97
|
+
o O
|
98
|
+
OUTPUT
|
99
|
+
end
|
100
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# Require this file using `require "spec_helper"` to ensure that it is only
# loaded once.
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration

# FakeFS sandbox helpers, included by specs that touch the filesystem
# (e.g. the CLI integration spec).
require "fakefs/spec_helpers"

require "corpus-processor"

RSpec.configure do |config|
  config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  # `fit`/`:focus` tagging: run only focused examples when present.
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  # --seed 1234
  config.order = "random"
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "spec_helper"

# Unit specs for the Stanford NER generator: one "word LABEL" line per
# token, unknown categories labelled "O", output always newline-terminated.
describe CorpusProcessor::Generators::StanfordNer do
  subject(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new }

  describe "#generate" do
    subject { stanford_ner.generate(tokens) }

    context "no tokens" do
      let(:tokens) { [] }

      # Even an empty token list produces the trailing newline.
      it { should == "\n" }
    end

    context "one token" do
      let(:tokens) { [CorpusProcessor::Token.new("banana")] }

      it { should == "banana O\n" }
    end

    context "two tokens" do
      let(:tokens) { [
        CorpusProcessor::Token.new("good"),
        CorpusProcessor::Token.new("banana"),
      ] }

      it { should == "good O\nbanana O\n" }
    end

    context "with category" do
      let(:tokens) { [CorpusProcessor::Token.new("Leandro", :person)] }

      it { should == "Leandro PERSON\n" }
    end

    context "with non-default categories" do
      # A custom mapping replaces the default one entirely; note it has
      # no "O" fallback, unlike DEFAULT_CATEGORIES[:output].
      let(:stanford_ner) { CorpusProcessor::Generators::StanfordNer.new(
        banana: "BANANA"
      ) }

      let(:tokens) { [CorpusProcessor::Token.new("Nanica", :banana)] }

      it { should == "Nanica BANANA\n" }
    end
  end
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe CorpusProcessor::Parsers::Harem do
|
4
|
+
subject(:harem) { CorpusProcessor::Parsers::Harem.new }
|
5
|
+
|
6
|
+
describe "#parse" do
|
7
|
+
subject { harem.parse(corpus) }
|
8
|
+
|
9
|
+
context "default categories" do
|
10
|
+
context "empty corpus" do
|
11
|
+
let(:corpus) { "" }
|
12
|
+
|
13
|
+
it { should == [] }
|
14
|
+
end
|
15
|
+
|
16
|
+
context "doctype" do
|
17
|
+
let(:corpus) {
|
18
|
+
<<-CORPUS
|
19
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
20
|
+
<!DOCTYPE colHAREM>
|
21
|
+
CORPUS
|
22
|
+
}
|
23
|
+
|
24
|
+
it { should == [] }
|
25
|
+
end
|
26
|
+
|
27
|
+
context "simple phrase" do
|
28
|
+
let(:corpus) {
|
29
|
+
<<-CORPUS
|
30
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
31
|
+
<!DOCTYPE colHAREM>
|
32
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
33
|
+
<DOC DOCID="H2-dftre765">
|
34
|
+
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
35
|
+
</DOC>
|
36
|
+
</colHAREM>
|
37
|
+
CORPUS
|
38
|
+
}
|
39
|
+
|
40
|
+
it { should == [
|
41
|
+
CorpusProcessor::Token.new("Fatores"),
|
42
|
+
CorpusProcessor::Token.new("Demográficos"),
|
43
|
+
CorpusProcessor::Token.new("e"),
|
44
|
+
CorpusProcessor::Token.new("Econômicos"),
|
45
|
+
CorpusProcessor::Token.new("Subjacentes"),
|
46
|
+
]
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
context "two simple phrases" do
|
51
|
+
let(:corpus) {
|
52
|
+
<<-CORPUS
|
53
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
54
|
+
<!DOCTYPE colHAREM>
|
55
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
56
|
+
<DOC DOCID="H2-dftre765">
|
57
|
+
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
58
|
+
<P>Fatores Demográficos e Econômicos Subjacentes</P>
|
59
|
+
</DOC>
|
60
|
+
</colHAREM>
|
61
|
+
CORPUS
|
62
|
+
}
|
63
|
+
|
64
|
+
it { should == [
|
65
|
+
CorpusProcessor::Token.new("Fatores"),
|
66
|
+
CorpusProcessor::Token.new("Demográficos"),
|
67
|
+
CorpusProcessor::Token.new("e"),
|
68
|
+
CorpusProcessor::Token.new("Econômicos"),
|
69
|
+
CorpusProcessor::Token.new("Subjacentes"),
|
70
|
+
CorpusProcessor::Token.new("Fatores"),
|
71
|
+
CorpusProcessor::Token.new("Demográficos"),
|
72
|
+
CorpusProcessor::Token.new("e"),
|
73
|
+
CorpusProcessor::Token.new("Econômicos"),
|
74
|
+
CorpusProcessor::Token.new("Subjacentes"),
|
75
|
+
]
|
76
|
+
}
|
77
|
+
end
|
78
|
+
|
79
|
+
context "useless entity" do
|
80
|
+
let(:corpus) {
|
81
|
+
<<-CORPUS
|
82
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
83
|
+
<!DOCTYPE colHAREM>
|
84
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
85
|
+
<DOC DOCID="H2-dftre765">
|
86
|
+
<P>Nos finais da <EM ID="H2-dftre765-102" CATEG="OUTRO" COMENT="DUVIDA_DIRECTIVASTEMPO">Idade Média</EM></P>
|
87
|
+
</DOC>
|
88
|
+
</colHAREM>
|
89
|
+
CORPUS
|
90
|
+
}
|
91
|
+
|
92
|
+
it { should == [
|
93
|
+
CorpusProcessor::Token.new("Nos"),
|
94
|
+
CorpusProcessor::Token.new("finais"),
|
95
|
+
CorpusProcessor::Token.new("da"),
|
96
|
+
CorpusProcessor::Token.new("Idade"),
|
97
|
+
CorpusProcessor::Token.new("Média"),
|
98
|
+
]
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
102
|
+
context "one entity" do
|
103
|
+
let(:corpus) {
|
104
|
+
<<-CORPUS
|
105
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
106
|
+
<!DOCTYPE colHAREM>
|
107
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
108
|
+
<DOC DOCID="H2-dftre765">
|
109
|
+
<P>Foram igualmente determinantes para evitar que as ideias reformadoras encontrassem divulgação em
|
110
|
+
<EM ID="H2-dftre765-23" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-8 H2-dftre765-37" TIPOREL="local_nascimento_de incluido">Portugal</EM>
|
111
|
+
</P>
|
112
|
+
</DOC>
|
113
|
+
</colHAREM>
|
114
|
+
CORPUS
|
115
|
+
}
|
116
|
+
|
117
|
+
it { should == [
|
118
|
+
CorpusProcessor::Token.new("Foram"),
|
119
|
+
CorpusProcessor::Token.new("igualmente"),
|
120
|
+
CorpusProcessor::Token.new("determinantes"),
|
121
|
+
CorpusProcessor::Token.new("para"),
|
122
|
+
CorpusProcessor::Token.new("evitar"),
|
123
|
+
CorpusProcessor::Token.new("que"),
|
124
|
+
CorpusProcessor::Token.new("as"),
|
125
|
+
CorpusProcessor::Token.new("ideias"),
|
126
|
+
CorpusProcessor::Token.new("reformadoras"),
|
127
|
+
CorpusProcessor::Token.new("encontrassem"),
|
128
|
+
CorpusProcessor::Token.new("divulgação"),
|
129
|
+
CorpusProcessor::Token.new("em"),
|
130
|
+
CorpusProcessor::Token.new("Portugal", :location),
|
131
|
+
]
|
132
|
+
}
|
133
|
+
end
|
134
|
+
|
135
|
+
context "multiple entities" do
|
136
|
+
let(:corpus) {
|
137
|
+
<<-CORPUS
|
138
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
139
|
+
<!DOCTYPE colHAREM>
|
140
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
141
|
+
<DOC DOCID="H2-dftre765">
|
142
|
+
<P>
|
143
|
+
A imprensa, inventada na
|
144
|
+
<EM ID="H2-dftre765-9" CATEG="LOCAL" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Alemanha</EM>
|
145
|
+
por
|
146
|
+
<EM ID="H2-dftre765-10" CATEG="PESSOA" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">John Gutenberg</EM>
|
147
|
+
<EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
|
148
|
+
e a censura
|
149
|
+
</P>
|
150
|
+
</DOC>
|
151
|
+
</colHAREM>
|
152
|
+
CORPUS
|
153
|
+
}
|
154
|
+
|
155
|
+
it { should == [
|
156
|
+
CorpusProcessor::Token.new("A"),
|
157
|
+
CorpusProcessor::Token.new("imprensa"),
|
158
|
+
CorpusProcessor::Token.new("inventada"),
|
159
|
+
CorpusProcessor::Token.new("na"),
|
160
|
+
CorpusProcessor::Token.new("Alemanha", :location),
|
161
|
+
CorpusProcessor::Token.new("por"),
|
162
|
+
CorpusProcessor::Token.new("John", :person),
|
163
|
+
CorpusProcessor::Token.new("Gutenberg", :person),
|
164
|
+
CorpusProcessor::Token.new("Inquisição", :organization),
|
165
|
+
CorpusProcessor::Token.new("e"),
|
166
|
+
CorpusProcessor::Token.new("a"),
|
167
|
+
CorpusProcessor::Token.new("censura"),
|
168
|
+
]
|
169
|
+
}
|
170
|
+
end
|
171
|
+
|
172
|
+
context "spaces after ponctuation" do
|
173
|
+
let(:corpus) {
|
174
|
+
<<-CORPUS
|
175
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
176
|
+
<!DOCTYPE colHAREM>
|
177
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
178
|
+
<DOC DOCID="H2-dftre765">
|
179
|
+
<EM ID="H2-dftre765-1" CATEG="ABSTRACCAO|ACONTECIMENTO" TIPO="IDEIA|EFEMERIDE">Reforma Protestante</EM>
|
180
|
+
. No
|
181
|
+
</DOC>
|
182
|
+
</colHAREM>
|
183
|
+
CORPUS
|
184
|
+
}
|
185
|
+
|
186
|
+
it { should == [
|
187
|
+
CorpusProcessor::Token.new("Reforma"),
|
188
|
+
CorpusProcessor::Token.new("Protestante"),
|
189
|
+
CorpusProcessor::Token.new("No"),
|
190
|
+
]
|
191
|
+
}
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
context "user-defined categories" do
|
196
|
+
let(:harem) {
|
197
|
+
CorpusProcessor::Parsers::Harem.new({
|
198
|
+
"FRUTA" => :fruit,
|
199
|
+
"LIVRO" => :book,
|
200
|
+
})
|
201
|
+
}
|
202
|
+
|
203
|
+
context "multiple entities" do
|
204
|
+
let(:corpus) {
|
205
|
+
<<-CORPUS
|
206
|
+
<?xml version="1.0" encoding="ISO-8859-1"?>
|
207
|
+
<!DOCTYPE colHAREM>
|
208
|
+
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
209
|
+
<DOC DOCID="H2-dftre765">
|
210
|
+
<P>
|
211
|
+
A imprensa, inventada na
|
212
|
+
<EM ID="H2-dftre765-9" CATEG="FRUTA" TIPO="HUMANO" SUBTIPO="PAIS" COREL="H2-dftre765-37" TIPOREL="incluido">Banana</EM>
|
213
|
+
por
|
214
|
+
<EM ID="H2-dftre765-10" CATEG="LIVRO" TIPO="INDIVIDUAL" COREL="H2-dftre765-9" TIPOREL="natural_de">Harry Potter</EM>
|
215
|
+
<EM ID="H2-dftre765-20" CATEG="ORGANIZACAO" TIPO="INSTITUICAO" COMENT="2/3">Inquisição</EM>
|
216
|
+
e a censura
|
217
|
+
</P>
|
218
|
+
</DOC>
|
219
|
+
</colHAREM>
|
220
|
+
CORPUS
|
221
|
+
}
|
222
|
+
|
223
|
+
it { should == [
|
224
|
+
CorpusProcessor::Token.new("A"),
|
225
|
+
CorpusProcessor::Token.new("imprensa"),
|
226
|
+
CorpusProcessor::Token.new("inventada"),
|
227
|
+
CorpusProcessor::Token.new("na"),
|
228
|
+
CorpusProcessor::Token.new("Banana", :fruit),
|
229
|
+
CorpusProcessor::Token.new("por"),
|
230
|
+
CorpusProcessor::Token.new("Harry", :book),
|
231
|
+
CorpusProcessor::Token.new("Potter", :book),
|
232
|
+
CorpusProcessor::Token.new("Inquisição"),
|
233
|
+
CorpusProcessor::Token.new("e"),
|
234
|
+
CorpusProcessor::Token.new("a"),
|
235
|
+
CorpusProcessor::Token.new("censura"),
|
236
|
+
]
|
237
|
+
}
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
241
|
+
|
242
|
+
describe "#extract_category" do
|
243
|
+
subject { harem.extract_category(categories) }
|
244
|
+
|
245
|
+
context "empty categories" do
|
246
|
+
let(:categories) { "" }
|
247
|
+
|
248
|
+
it { should == nil }
|
249
|
+
end
|
250
|
+
|
251
|
+
context "one category" do
|
252
|
+
let(:categories) { "PESSOA" }
|
253
|
+
|
254
|
+
it { should == :person }
|
255
|
+
end
|
256
|
+
|
257
|
+
context "two categories" do
|
258
|
+
let(:categories) { "OUTRA|ORGANIZACAO" }
|
259
|
+
|
260
|
+
it { should == :organization }
|
261
|
+
end
|
262
|
+
|
263
|
+
context "ambiguidade" do
|
264
|
+
let(:categories) { "PESSOA|ORGANIZACAO" }
|
265
|
+
|
266
|
+
it { should == :person }
|
267
|
+
end
|
268
|
+
end
|
269
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require "spec_helper"

# Unit spec for the parse-then-generate pipeline, with both
# collaborators doubled out.
#
# NOTE(review): this file is listed as spec/unit/processor.rb (no
# _spec suffix), so RSpec's default *_spec.rb glob will not run it —
# confirm and consider renaming to processor_spec.rb.
describe CorpusProcessor::Processor do
  subject(:processor) { CorpusProcessor::Processor.new(parser, generator) }

  describe "#process" do
    subject { processor.process(corpus) }

    let(:corpus) { "Some corpus" }
    let(:processed_corpus) {
      <<-CORPUS
Some O
corpus O
      CORPUS
    }
    let(:tokens) {
      [
        CorpusProcessor::Token.new("Some"),
        CorpusProcessor::Token.new("corpus"),
      ]
    }
    let(:parser) { double :parser }
    let(:generator) { double :generator }

    # The processor must feed the parser's tokens to the generator and
    # return the generator's output unchanged.
    specify {
      parser.should_receive(:parse)
        .with(corpus)
        .and_return(tokens)

      generator.should_receive(:generate)
        .with(tokens)
        .and_return(processed_corpus)

      subject.should == processed_corpus
    }
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe CorpusProcessor::Tokenizer do
|
4
|
+
subject(:tokenizer) { CorpusProcessor::Tokenizer.new }
|
5
|
+
|
6
|
+
describe "#tokenize" do
|
7
|
+
subject { tokenizer.tokenize(text, category) }
|
8
|
+
|
9
|
+
let(:category) { nil }
|
10
|
+
|
11
|
+
context "empty string" do
|
12
|
+
let(:text) { "" }
|
13
|
+
|
14
|
+
it { should == [] }
|
15
|
+
end
|
16
|
+
|
17
|
+
context "one word" do
|
18
|
+
let(:text) { "banana" }
|
19
|
+
|
20
|
+
it { should == [CorpusProcessor::Token.new("banana")] }
|
21
|
+
end
|
22
|
+
|
23
|
+
context "two words" do
|
24
|
+
let(:text) { "good banana" }
|
25
|
+
|
26
|
+
it { should == [
|
27
|
+
CorpusProcessor::Token.new("good"),
|
28
|
+
CorpusProcessor::Token.new("banana"),
|
29
|
+
] }
|
30
|
+
end
|
31
|
+
|
32
|
+
context "ponctuation" do
|
33
|
+
let(:text) { "good, banana" }
|
34
|
+
|
35
|
+
it { should == [
|
36
|
+
CorpusProcessor::Token.new("good"),
|
37
|
+
CorpusProcessor::Token.new("banana"),
|
38
|
+
] }
|
39
|
+
end
|
40
|
+
|
41
|
+
context "default category" do
|
42
|
+
let(:text) { "Google" }
|
43
|
+
let(:category) { :organization }
|
44
|
+
|
45
|
+
it { should == [
|
46
|
+
CorpusProcessor::Token.new("Google", :organization),
|
47
|
+
] }
|
48
|
+
end
|
49
|
+
|
50
|
+
context "with tags" do
|
51
|
+
let(:text) { "good<lalala/>, banana" }
|
52
|
+
|
53
|
+
it { should == [
|
54
|
+
CorpusProcessor::Token.new("good"),
|
55
|
+
CorpusProcessor::Token.new("banana"),
|
56
|
+
] }
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#strip_tags" do
|
61
|
+
subject { tokenizer.strip_tags(text) }
|
62
|
+
|
63
|
+
context "empty text" do
|
64
|
+
let(:text) { "" }
|
65
|
+
|
66
|
+
it { should == "" }
|
67
|
+
end
|
68
|
+
|
69
|
+
context "self closed tag" do
|
70
|
+
let(:text) { "<br/>" }
|
71
|
+
|
72
|
+
it { should == "" }
|
73
|
+
end
|
74
|
+
|
75
|
+
context "tag with content" do
|
76
|
+
let(:text) { "<p>Some text</p>" }
|
77
|
+
|
78
|
+
it { should == "Some text" }
|
79
|
+
end
|
80
|
+
|
81
|
+
context "content after tag" do
|
82
|
+
let(:text) { "<p>Some<br/>text</p>" }
|
83
|
+
|
84
|
+
it { should == "Some text" }
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
describe "#join_lines" do
|
89
|
+
subject { tokenizer.join_lines(text) }
|
90
|
+
|
91
|
+
context "empty text" do
|
92
|
+
let(:text) { "" }
|
93
|
+
|
94
|
+
it { should == "" }
|
95
|
+
end
|
96
|
+
|
97
|
+
context "one word" do
|
98
|
+
let(:text) { "banana" }
|
99
|
+
|
100
|
+
it { should == "banana" }
|
101
|
+
end
|
102
|
+
|
103
|
+
context "two lines" do
|
104
|
+
let(:text) { "banana\nquiabo" }
|
105
|
+
|
106
|
+
it { should == "banana quiabo" }
|
107
|
+
end
|
108
|
+
|
109
|
+
context "line with empty space" do
|
110
|
+
let(:text) { "banana\n \nquiabo" }
|
111
|
+
|
112
|
+
it { should == "banana quiabo" }
|
113
|
+
end
|
114
|
+
|
115
|
+
context "leading spaces" do
|
116
|
+
let(:text) { " \n banana\n \nquiabo \n" }
|
117
|
+
|
118
|
+
it { should == "banana quiabo" }
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe CorpusProcessor::Traverser do
|
4
|
+
subject(:traverser) { CorpusProcessor::Traverser.new }
|
5
|
+
|
6
|
+
describe "#traverse" do
|
7
|
+
subject { traverser.traverse(text, regexp) }
|
8
|
+
|
9
|
+
context "empty text" do
|
10
|
+
let(:text) { "" }
|
11
|
+
let(:regexp) { // }
|
12
|
+
|
13
|
+
specify {
|
14
|
+
expect { |mock_block|
|
15
|
+
traverser.traverse(text, regexp, &mock_block)
|
16
|
+
}.not_to yield_control
|
17
|
+
}
|
18
|
+
end
|
19
|
+
|
20
|
+
context "simple text" do
|
21
|
+
let(:text) { "abc" }
|
22
|
+
let(:regexp) { /b/ }
|
23
|
+
|
24
|
+
specify {
|
25
|
+
expect { |mock_block|
|
26
|
+
traverser.traverse(text, regexp, &mock_block)
|
27
|
+
}.to yield_successive_args "a", text.match(regexp), "c"
|
28
|
+
}
|
29
|
+
end
|
30
|
+
|
31
|
+
context "two matches" do
|
32
|
+
let(:text) { "abcbd" }
|
33
|
+
let(:regexp) { /b/ }
|
34
|
+
|
35
|
+
specify {
|
36
|
+
expect { |mock_block|
|
37
|
+
traverser.traverse(text, regexp, &mock_block)
|
38
|
+
}.to yield_successive_args "a",
|
39
|
+
text.match(regexp),
|
40
|
+
"c",
|
41
|
+
text[2..-1].match(regexp),
|
42
|
+
"d"
|
43
|
+
}
|
44
|
+
end
|
45
|
+
|
46
|
+
context "match in beginning" do
|
47
|
+
let(:text) { "bc" }
|
48
|
+
let(:regexp) { /b/ }
|
49
|
+
|
50
|
+
specify {
|
51
|
+
expect { |mock_block|
|
52
|
+
traverser.traverse(text, regexp, &mock_block)
|
53
|
+
}.to yield_successive_args text.match(regexp), "c"
|
54
|
+
}
|
55
|
+
end
|
56
|
+
|
57
|
+
context "match in ending" do
|
58
|
+
let(:text) { "bc" }
|
59
|
+
let(:regexp) { /c/ }
|
60
|
+
|
61
|
+
specify {
|
62
|
+
expect { |mock_block|
|
63
|
+
traverser.traverse(text, regexp, &mock_block)
|
64
|
+
}.to yield_successive_args "b", text.match(regexp)
|
65
|
+
}
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
metadata
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: corpus-processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Das Dad
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: thor
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.3'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: fakefs
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry-nav
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: Process linguistic corpus
|
84
|
+
email:
|
85
|
+
- feedback@dasdad.com.br
|
86
|
+
executables:
|
87
|
+
- corpus-processor
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- .gitignore
|
92
|
+
- .rspec
|
93
|
+
- Gemfile
|
94
|
+
- README.md
|
95
|
+
- bin/corpus-processor
|
96
|
+
- corpus-processor.gemspec
|
97
|
+
- lib/corpus-processor.rb
|
98
|
+
- lib/corpus-processor/cli.rb
|
99
|
+
- lib/corpus-processor/default_categories.rb
|
100
|
+
- lib/corpus-processor/generators.rb
|
101
|
+
- lib/corpus-processor/generators/stanford_ner.rb
|
102
|
+
- lib/corpus-processor/parsers.rb
|
103
|
+
- lib/corpus-processor/parsers/harem.rb
|
104
|
+
- lib/corpus-processor/processor.rb
|
105
|
+
- lib/corpus-processor/token.rb
|
106
|
+
- lib/corpus-processor/tokenizer.rb
|
107
|
+
- lib/corpus-processor/traverser.rb
|
108
|
+
- lib/corpus-processor/version.rb
|
109
|
+
- spec/integration/cli_spec.rb
|
110
|
+
- spec/spec_helper.rb
|
111
|
+
- spec/unit/generators/stanford_ner_spec.rb
|
112
|
+
- spec/unit/parsers/harem_spec.rb
|
113
|
+
- spec/unit/processor.rb
|
114
|
+
- spec/unit/token_spec.rb
|
115
|
+
- spec/unit/tokenizer_spec.rb
|
116
|
+
- spec/unit/traverser_spec.rb
|
117
|
+
homepage: https://github.com/dasdad/corpus-processor
|
118
|
+
licenses:
|
119
|
+
- MIT
|
120
|
+
metadata: {}
|
121
|
+
post_install_message:
|
122
|
+
rdoc_options: []
|
123
|
+
require_paths:
|
124
|
+
- lib
|
125
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - '>='
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: 2.0.0
|
130
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
131
|
+
requirements:
|
132
|
+
- - '>='
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: '0'
|
135
|
+
requirements: []
|
136
|
+
rubyforge_project:
|
137
|
+
rubygems_version: 2.0.0
|
138
|
+
signing_key:
|
139
|
+
specification_version: 4
|
140
|
+
summary: Handle linguistic corpus and convert it to use NLP tools
|
141
|
+
test_files:
|
142
|
+
- spec/integration/cli_spec.rb
|
143
|
+
- spec/spec_helper.rb
|
144
|
+
- spec/unit/generators/stanford_ner_spec.rb
|
145
|
+
- spec/unit/parsers/harem_spec.rb
|
146
|
+
- spec/unit/processor.rb
|
147
|
+
- spec/unit/token_spec.rb
|
148
|
+
- spec/unit/tokenizer_spec.rb
|
149
|
+
- spec/unit/traverser_spec.rb
|