corpus-processor 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
@@ -0,0 +1,58 @@
|
|
1
|
+
# The helper to load categories definitions.
|
2
|
+
#
|
3
|
+
# Categories definitions is a Hash with two keys named `:input` and `:output`.
|
4
|
+
#
|
5
|
+
# The `:input` has `String` keys that match the categories found in original
|
6
|
+
# corpus. Its values are `Symbol`s that represent the category internally.
|
7
|
+
#
|
8
|
+
# The `:output` has `Symbol` keys that represent the category internally
|
9
|
+
# and should match the values from the `:input` hash. Its values are the `String`s
|
10
|
+
# representing the category in the final converted corpus.
|
11
|
+
#
|
12
|
+
# An optional `:default` key is allowed in the `:output` hash. If present
|
13
|
+
# the resulting loaded hash has the specified default value.
|
14
|
+
#
|
15
|
+
# @example YAML file defining categories.
|
16
|
+
# ---
|
17
|
+
# :input:
|
18
|
+
# PESSOA: :person
|
19
|
+
# LOCAL: :location
|
20
|
+
# ORGANIZACAO: :organization
|
21
|
+
# :output:
|
22
|
+
# :default: O
|
23
|
+
# :person: PERSON
|
24
|
+
# :location: LOCATION
|
25
|
+
# :organization: ORGANIZATION
|
26
|
+
class CorpusProcessor::Categories
|
27
|
+
|
28
|
+
# Load a set of categories definitions.
|
29
|
+
#
|
30
|
+
# @param path [String] the path to the YAML file that defines the categories.
|
31
|
+
# @return [Hash] the categories extracted from the YAML file.
|
32
|
+
# @see .default
|
33
|
+
def self.load path
|
34
|
+
@@instances[path] ||= YAML.load(File.read(path)).tap { |categories|
|
35
|
+
default = categories[:output] && categories[:output][:default]
|
36
|
+
if default
|
37
|
+
categories[:output].default = default
|
38
|
+
categories[:output].delete :default
|
39
|
+
end
|
40
|
+
}
|
41
|
+
end
|
42
|
+
|
43
|
+
# The default set of categories definitions.
|
44
|
+
#
|
45
|
+
# The YAML definition file is
|
46
|
+
# {file:lib/corpus-processor/categories/default.yml}.
|
47
|
+
#
|
48
|
+
# @return (see .load)
|
49
|
+
# @see .load
|
50
|
+
def self.default
|
51
|
+
self.load(File.expand_path(File.join('..', 'categories', 'default.yml'),
|
52
|
+
__FILE__))
|
53
|
+
end
|
54
|
+
|
55
|
+
protected
|
56
|
+
|
57
|
+
@@instances = Hash.new
|
58
|
+
end
|
data/lib/corpus-processor/cli.rb
CHANGED
@@ -1,17 +1,37 @@
|
|
1
|
-
require
|
2
|
-
require "thor"
|
1
|
+
require 'thor'
|
3
2
|
|
4
|
-
|
5
|
-
class Cli < ::Thor
|
3
|
+
require 'corpus-processor'
|
6
4
|
|
7
|
-
|
8
|
-
|
9
|
-
input_file = File.new( input_file, "r") if input_file.is_a? String
|
10
|
-
output_file = File.new(output_file, "w") if output_file.is_a? String
|
5
|
+
# The operations available to users from CLI.
|
6
|
+
class CorpusProcessor::Cli < Thor
|
11
7
|
|
12
|
-
|
8
|
+
option :categories,
|
9
|
+
aliases: :c,
|
10
|
+
banner: 'CATEGORIES_FILE',
|
11
|
+
desc: 'Path to categories YAML file'
|
12
|
+
desc 'process [INPUT_FILE [OUTPUT_FILE]]',
|
13
|
+
'convert corpus from LâMPADA format to Stanford-NER format'
|
14
|
+
# Convert a given corpus from one format to another.
|
15
|
+
#
|
16
|
+
# By default the input format is LâMPADA and the output format is the one
|
17
|
+
# used by Stanford NER in training.
|
18
|
+
#
|
19
|
+
# @param input_file [String, IO] the file that contains the original corpus.
|
20
|
+
# @param output_file [String, IO] the file in which the converted corpus
|
21
|
+
# is written.
|
22
|
+
# @return [void]
|
23
|
+
def process input_file = STDIN, output_file = STDOUT
|
24
|
+
input_file = File.open( input_file, 'r') if input_file.is_a? String
|
25
|
+
output_file = File.open(output_file, 'w') if output_file.is_a? String
|
26
|
+
categories = if options[:categories]
|
27
|
+
CorpusProcessor::Categories.load(options[:categories])
|
28
|
+
else
|
29
|
+
CorpusProcessor::Categories.default
|
30
|
+
end
|
13
31
|
|
14
|
-
|
15
|
-
|
32
|
+
output_file.puts CorpusProcessor::Processor.new(categories: categories)
|
33
|
+
.process(input_file.read)
|
34
|
+
|
35
|
+
output_file.close
|
16
36
|
end
|
17
37
|
end
|
@@ -1,13 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
end
|
1
|
+
# The generator for Stanford NER corpus.
|
2
|
+
#
|
3
|
+
# Generates corpus in the format used by Stanford NER training.
|
4
|
+
class CorpusProcessor::Generators::StanfordNer
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
# @param categories [Hash] the categories definitions loaded by
|
7
|
+
# {CorpusProcessor::Categories}.
|
8
|
+
def initialize categories = CorpusProcessor::Categories.default
|
9
|
+
@categories = categories.fetch :output
|
10
|
+
end
|
11
|
+
|
12
|
+
# Generate the corpus from tokens.
|
13
|
+
#
|
14
|
+
# @param tokens [Array<CorpusProcessor::Token>] the tokens from which
|
15
|
+
# the corpus is generated.
|
16
|
+
# @return [String] the generated corpus.
|
17
|
+
def generate tokens
|
18
|
+
tokens.map { |token|
|
19
|
+
"#{ token.word }\t#{ @categories[token.category] }"
|
20
|
+
}.join("\n") + "\n"
|
12
21
|
end
|
13
22
|
end
|
@@ -1,52 +1,108 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
1
|
+
# The parser for the corpus in LâMPADA format.
|
2
|
+
class CorpusProcessor::Parsers::Lampada
|
3
|
+
|
4
|
+
# @param (see Generators::StanfordNer#initialize)
|
5
|
+
def initialize categories = CorpusProcessor::Categories.default
|
6
|
+
self.categories = categories.fetch :input
|
7
|
+
end
|
8
|
+
|
9
|
+
# Parse the corpus in LâMPADA format.
|
10
|
+
#
|
11
|
+
# @param corpus [String] the original corpus.
|
12
|
+
# @return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
|
13
|
+
def parse corpus
|
14
|
+
process_nodes Nokogiri::XML(corpus).css('P')
|
15
|
+
end
|
16
|
+
|
17
|
+
protected
|
18
|
+
|
19
|
+
attr_accessor :categories
|
20
|
+
attr_accessor :current_category
|
21
|
+
|
22
|
+
def process_nodes nodes
|
23
|
+
nodes.reduce([]) { |tokens, node| tokens.push(*process_node(node)) }
|
24
|
+
end
|
25
|
+
|
26
|
+
def process_node node
|
27
|
+
case node
|
28
|
+
when Nokogiri::XML::Text then process_text node.text
|
29
|
+
when Nokogiri::XML::Element then process_element node
|
30
|
+
else
|
31
|
+
raise ArgumentError, "#{ node } cannot be handled by " \
|
32
|
+
"#{ self.class }. This is probably a bug, "\
|
33
|
+
"please report."
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def process_text text
|
38
|
+
text.gsub(punct, ' \0 ')
|
39
|
+
.strip
|
40
|
+
.split(spaces)
|
41
|
+
.map { |word|
|
42
|
+
CorpusProcessor::Token.new(word, current_category)
|
41
43
|
}
|
42
44
|
end
|
43
45
|
|
44
|
-
def
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
def process_element element
|
47
|
+
case element.name
|
48
|
+
when 'P' then process_p element
|
49
|
+
when 'EM' then process_em element
|
50
|
+
when 'ALT' then process_alt element
|
51
|
+
else process_nodes element.children
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def process_p p
|
56
|
+
tokens = process_nodes p.children
|
57
|
+
tokens << period_token if ! tokens.empty? && tokens.last.word !~ punct
|
58
|
+
tokens
|
59
|
+
end
|
60
|
+
|
61
|
+
def process_em em
|
62
|
+
with_category em.attributes['CATEG'] { process_nodes em.children }
|
63
|
+
end
|
64
|
+
|
65
|
+
def process_alt alt
|
66
|
+
alternatives = alt.inner_html.encode('UTF-8').split('|')
|
67
|
+
fake_xmls = alternatives.map { |alternative|
|
68
|
+
Nokogiri::XML "<document>#{ alternative }</document>"
|
69
|
+
}
|
70
|
+
alternatives_tokens = fake_xmls.map { |fake_xml|
|
71
|
+
process_nodes fake_xml.children
|
72
|
+
}
|
73
|
+
alternatives_tokens.max_by { |alternative_tokens|
|
74
|
+
alternative_tokens.count { |alternative_token|
|
75
|
+
! alternative_token.category.nil?
|
76
|
+
}
|
77
|
+
}
|
78
|
+
end
|
79
|
+
|
80
|
+
def with_category categories_attribute, &block
|
81
|
+
unless categories_attribute.nil?
|
82
|
+
self.current_category = extract categories_attribute.text
|
83
|
+
end
|
84
|
+
tokens = block.call
|
85
|
+
self.current_category = nil
|
86
|
+
tokens
|
87
|
+
end
|
88
|
+
|
89
|
+
def extract categories_string
|
90
|
+
category = categories_string.split('|').find { |category_string|
|
91
|
+
categories.include? category_string
|
92
|
+
}
|
93
|
+
|
94
|
+
categories[category]
|
95
|
+
end
|
96
|
+
|
97
|
+
def punct
|
98
|
+
/[[:punct:]]/
|
99
|
+
end
|
100
|
+
|
101
|
+
def spaces
|
102
|
+
/\s+/
|
103
|
+
end
|
104
|
+
|
105
|
+
def period_token
|
106
|
+
@period_token ||= CorpusProcessor::Token.new('.')
|
50
107
|
end
|
51
|
-
end
|
52
108
|
end
|
@@ -1,11 +1,26 @@
|
|
1
|
+
# The entry point for processing corpus.
|
2
|
+
#
|
3
|
+
# @example Simple use with default configuration.
|
4
|
+
# CorpusProcessor::Processor.new.process('<P>Some text</P>')
|
5
|
+
# # => "Some\tO\ntext\tO\n.\tO\n"
|
1
6
|
class CorpusProcessor::Processor
|
2
|
-
|
3
|
-
|
7
|
+
|
8
|
+
# @param categories [Hash] the categories extracted with {Categories}.
|
9
|
+
# @param parser [#parse] the parser for original corpus.
|
10
|
+
# @param generator [#generate] the generator that computes tokens into
|
11
|
+
# the transformed corpus.
|
12
|
+
def initialize(
|
13
|
+
categories: CorpusProcessor::Categories.default,
|
14
|
+
parser: CorpusProcessor::Parsers::Lampada.new(categories),
|
15
|
+
generator: CorpusProcessor::Generators::StanfordNer.new(categories))
|
4
16
|
@parser = parser
|
5
17
|
@generator = generator
|
6
18
|
end
|
7
19
|
|
8
|
-
|
9
|
-
|
20
|
+
# Perform the processing of corpus.
|
21
|
+
#
|
22
|
+
# @return [String] the converted corpus.
|
23
|
+
def process corpus
|
24
|
+
@generator.generate @parser.parse(corpus)
|
10
25
|
end
|
11
26
|
end
|
@@ -1,2 +1,36 @@
|
|
1
|
-
|
1
|
+
# The internal representation of a token.
|
2
|
+
#
|
3
|
+
# Tokens are extracted from original corpus and are defined by single words
|
4
|
+
# or punctuation.
|
5
|
+
#
|
6
|
+
# They also contain a category, which originates from the tagging in the
|
7
|
+
# corpus.
|
8
|
+
class CorpusProcessor::Token
|
9
|
+
|
10
|
+
# @return [String] the word from text. It shouldn't contain spaces.
|
11
|
+
attr_reader :word
|
12
|
+
|
13
|
+
# @return [Symbol] the type of the {Token}. It should be a valid category
|
14
|
+
# from {Categories}.
|
15
|
+
attr_reader :category
|
16
|
+
|
17
|
+
# @param word [String] the word from text. It shouldn't contain spaces.
|
18
|
+
# @param category [Symbol] the type of the {Token}. It should be a valid
|
19
|
+
# category from {Categories}.
|
20
|
+
def initialize word = '', category = nil
|
21
|
+
self.word = word
|
22
|
+
self.category = category
|
23
|
+
end
|
24
|
+
|
25
|
+
# Determine equality of two {Token}s.
|
26
|
+
#
|
27
|
+
# @param other [Token] the other {Token} to test.
|
28
|
+
def ==(other)
|
29
|
+
word == other.word && category == other.category
|
30
|
+
end
|
31
|
+
|
32
|
+
protected
|
33
|
+
|
34
|
+
attr_writer :word
|
35
|
+
attr_writer :category
|
2
36
|
end
|
@@ -1,17 +1,19 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'corpus-processor/cli'
|
4
4
|
|
5
5
|
describe CorpusProcessor::Cli do
|
6
|
-
include FakeFS::SpecHelpers
|
7
6
|
subject(:cli) { CorpusProcessor::Cli.new }
|
8
7
|
|
9
|
-
let(:input_file) {
|
10
|
-
let(:output_file) {
|
8
|
+
let(:input_file) { STDIN }
|
9
|
+
let(:output_file) { STDOUT }
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
describe '#process' do
|
12
|
+
subject { cli.process }
|
13
|
+
|
14
|
+
before do
|
15
|
+
expect(input_file).to receive(:read)
|
16
|
+
.and_return(<<-INPUT.encode('ISO-8859-1'))
|
15
17
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
16
18
|
<!DOCTYPE colHAREM>
|
17
19
|
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
@@ -30,71 +32,79 @@ describe CorpusProcessor::Cli do
|
|
30
32
|
no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
|
31
33
|
, o
|
32
34
|
INPUT
|
33
|
-
}
|
34
|
-
end
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
expect(output_file).to receive(:puts).with(<<-OUTPUT)
|
37
|
+
Fatores\tO
|
38
|
+
Demográficos\tO
|
39
|
+
e\tO
|
40
|
+
Econômicos\tO
|
41
|
+
Subjacentes\tO
|
42
|
+
.\tO
|
43
|
+
A\tO
|
44
|
+
revolta\tO
|
45
|
+
histórica\tO
|
46
|
+
produz\tO
|
47
|
+
normalmente\tO
|
48
|
+
uma\tO
|
49
|
+
nova\tO
|
50
|
+
forma\tO
|
51
|
+
de\tO
|
52
|
+
pensamento\tO
|
53
|
+
quanto\tO
|
54
|
+
à\tO
|
55
|
+
forma\tO
|
56
|
+
de\tO
|
57
|
+
organização\tO
|
58
|
+
da\tO
|
59
|
+
sociedade\tO
|
60
|
+
.\tO
|
61
|
+
Assim\tO
|
62
|
+
foi\tO
|
63
|
+
com\tO
|
64
|
+
a\tO
|
65
|
+
Reforma\tO
|
66
|
+
Protestante\tO
|
67
|
+
.\tO
|
68
|
+
No\tO
|
69
|
+
seguimento\tO
|
70
|
+
do\tO
|
71
|
+
colapso\tO
|
72
|
+
de\tO
|
73
|
+
instituições\tO
|
74
|
+
monásticas\tO
|
75
|
+
e\tO
|
76
|
+
do\tO
|
77
|
+
escolasticismo\tO
|
78
|
+
nos\tO
|
79
|
+
finais\tO
|
80
|
+
da\tO
|
81
|
+
Idade\tO
|
82
|
+
Média\tO
|
83
|
+
na\tO
|
84
|
+
Europa\tLOCATION
|
85
|
+
,\tO
|
86
|
+
acentuado\tO
|
87
|
+
pela\tO
|
88
|
+
"\tO
|
89
|
+
Cativeiro\tO
|
90
|
+
Babilónica\tO
|
91
|
+
da\tO
|
92
|
+
igreja\tO
|
93
|
+
"\tO
|
94
|
+
no\tO
|
95
|
+
papado\tO
|
96
|
+
de\tO
|
97
|
+
Avignon\tO
|
98
|
+
,\tO
|
99
|
+
o\tO
|
100
|
+
.\tO
|
101
|
+
OUTPUT
|
102
|
+
|
103
|
+
expect(output_file).to receive(:close)
|
39
104
|
end
|
40
105
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
e O
|
45
|
-
Econômicos O
|
46
|
-
Subjacentes O
|
47
|
-
A O
|
48
|
-
revolta O
|
49
|
-
histórica O
|
50
|
-
produz O
|
51
|
-
normalmente O
|
52
|
-
uma O
|
53
|
-
nova O
|
54
|
-
forma O
|
55
|
-
de O
|
56
|
-
pensamento O
|
57
|
-
quanto O
|
58
|
-
à O
|
59
|
-
forma O
|
60
|
-
de O
|
61
|
-
organização O
|
62
|
-
da O
|
63
|
-
sociedade O
|
64
|
-
Assim O
|
65
|
-
foi O
|
66
|
-
com O
|
67
|
-
a O
|
68
|
-
Reforma O
|
69
|
-
Protestante O
|
70
|
-
No O
|
71
|
-
seguimento O
|
72
|
-
do O
|
73
|
-
colapso O
|
74
|
-
de O
|
75
|
-
instituições O
|
76
|
-
monásticas O
|
77
|
-
e O
|
78
|
-
do O
|
79
|
-
escolasticismo O
|
80
|
-
nos O
|
81
|
-
finais O
|
82
|
-
da O
|
83
|
-
Idade O
|
84
|
-
Média O
|
85
|
-
na O
|
86
|
-
Europa LOCATION
|
87
|
-
acentuado O
|
88
|
-
pela O
|
89
|
-
Cativeiro O
|
90
|
-
Babilónica O
|
91
|
-
da O
|
92
|
-
igreja O
|
93
|
-
no O
|
94
|
-
papado O
|
95
|
-
de O
|
96
|
-
Avignon O
|
97
|
-
o O
|
98
|
-
OUTPUT
|
106
|
+
it 'processes the corpus' do
|
107
|
+
subject
|
108
|
+
end
|
99
109
|
end
|
100
110
|
end
|