corpus-processor 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +5 -0
- data/.yardopts +1 -0
- data/README.md +235 -34
- data/bin/corpus-processor +3 -3
- data/corpus-processor.gemspec +16 -14
- data/lib/corpus-processor.rb +12 -8
- data/lib/corpus-processor/categories.rb +58 -0
- data/lib/corpus-processor/categories/default.yml +10 -0
- data/lib/corpus-processor/cli.rb +31 -11
- data/lib/corpus-processor/generators.rb +5 -1
- data/lib/corpus-processor/generators/stanford_ner.rb +19 -10
- data/lib/corpus-processor/parsers.rb +5 -1
- data/lib/corpus-processor/parsers/lampada.rb +103 -47
- data/lib/corpus-processor/processor.rb +19 -4
- data/lib/corpus-processor/token.rb +35 -1
- data/lib/corpus-processor/version.rb +1 -1
- data/spec/{integration → corpus-processor}/cli_spec.rb +81 -71
- data/spec/corpus-processor/generators/stanford_ner_spec.rb +57 -0
- data/spec/corpus-processor/parsers/lampada_spec.rb +333 -0
- data/spec/corpus-processor/processor_spec.rb +36 -0
- data/spec/corpus-processor/token_spec.rb +15 -0
- data/spec/spec_helper.rb +7 -4
- metadata +39 -27
- data/lib/corpus-processor/default_categories.rb +0 -14
- data/lib/corpus-processor/tokenizer.rb +0 -17
- data/lib/corpus-processor/traverser.rb +0 -19
- data/spec/unit/generators/stanford_ner_spec.rb +0 -46
- data/spec/unit/parsers/lampada_spec.rb +0 -269
- data/spec/unit/processor.rb +0 -37
- data/spec/unit/token_spec.rb +0 -8
- data/spec/unit/tokenizer_spec.rb +0 -121
- data/spec/unit/traverser_spec.rb +0 -68
@@ -0,0 +1,58 @@
|
|
1
|
+
# The helper that loads category definitions.
#
# A set of category definitions is a Hash with two keys named `:input` and
# `:output`.
#
# The `:input` hash has `String` keys that match the categories found in the
# original corpus. Its values are the `Symbol`s that represent each category
# internally.
#
# The `:output` hash has `Symbol` keys that represent each category internally
# and should match the values of the `:input` hash. Its values are the
# `String`s that represent the category in the final converted corpus.
#
# An optional `:default` key is allowed in the `:output` hash. When present,
# its value becomes the default value of the loaded `:output` hash and the
# `:default` key itself is removed from it.
#
# @example YAML file defining categories.
#   ---
#   :input:
#     PESSOA: :person
#     LOCAL: :location
#     ORGANIZACAO: :organization
#   :output:
#     :default: O
#     :person: PERSON
#     :location: LOCATION
#     :organization: ORGANIZATION
class CorpusProcessor::Categories

  # Definitions that were already loaded, keyed by the path given to {.load}.
  @@instances = Hash.new

  # Load a set of categories definitions.
  #
  # The result is memoized per `path`, so loading the same path again returns
  # the very same Hash without reading the file a second time.
  #
  # @param path [String] the path to the YAML file that defines the categories.
  # @return [Hash] the categories extracted from the YAML file.
  # @see .default
  def self.load path
    cached = @@instances[path]
    return cached if cached

    # NOTE(review): depending on the Psych version, YAML.load may instantiate
    # arbitrary objects -- confirm that only trusted files reach this point.
    definitions = YAML.load(File.read(path))

    output   = definitions[:output]
    fallback = output && output[:default]
    if fallback
      # Promote the `:default` entry to the Hash default so that unknown
      # categories resolve to it, then drop the entry itself.
      output.default = fallback
      output.delete :default
    end

    @@instances[path] = definitions
  end

  # The default set of categories definitions.
  #
  # The YAML definition file is
  # {file:lib/corpus-processor/categories/default.yml}.
  #
  # @return (see .load)
  # @see .load
  def self.default
    relative_path = File.join('..', 'categories', 'default.yml')
    self.load File.expand_path(relative_path, __FILE__)
  end
end
|
data/lib/corpus-processor/cli.rb
CHANGED
@@ -1,17 +1,37 @@
|
|
1
|
-
require
|
2
|
-
require "thor"
|
1
|
+
require 'thor'
|
3
2
|
|
4
|
-
|
5
|
-
class Cli < ::Thor
|
3
|
+
require 'corpus-processor'
|
6
4
|
|
7
|
-
|
8
|
-
|
9
|
-
input_file = File.new( input_file, "r") if input_file.is_a? String
|
10
|
-
output_file = File.new(output_file, "w") if output_file.is_a? String
|
5
|
+
# The operations available to users from the CLI.
class CorpusProcessor::Cli < Thor

  option :categories,
    aliases: :c,
    banner: 'CATEGORIES_FILE',
    desc: 'Path to categories YAML file'
  desc 'process [INPUT_FILE [OUTPUT_FILE]]',
       'convert corpus from LâMPADA format to Stanford-NER format'
  # Convert a given corpus from one format to another.
  #
  # By default the input format is LâMPADA and the output format is the one
  # used by Stanford NER in training.
  #
  # When the `categories` option is given, the categories are loaded from that
  # YAML file; otherwise the default categories are used.
  #
  # The output is always closed once the corpus has been written, even when it
  # was handed in as an IO (`STDOUT` included). An input handed in as an IO is
  # read but left open; an input given as a path is opened and closed here.
  #
  # @param input_file [String, IO] the file that contains the original corpus.
  # @param output_file [String, IO] the file in which the converted corpus
  #   is written.
  # @return [void]
  def process input_file = STDIN, output_file = STDOUT
    categories = if options[:categories]
                   CorpusProcessor::Categories.load(options[:categories])
                 else
                   CorpusProcessor::Categories.default
                 end

    # The block form closes the handle we opened as soon as it has been read.
    corpus = if input_file.is_a? String
               File.open(input_file, 'r') { |file| file.read }
             else
               input_file.read
             end

    # Convert before touching the output: opening it with 'w' truncates the
    # file, which must not happen when reading or converting fails.
    converted = CorpusProcessor::Processor.new(categories: categories)
                                          .process(corpus)

    output_file = File.open(output_file, 'w') if output_file.is_a? String
    begin
      output_file.puts converted
    ensure
      output_file.close
    end
  end
end
|
@@ -1,13 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
end
|
1
|
+
# The generator for Stanford NER corpus.
#
# Generates the corpus in the format used by Stanford NER training: one line
# per token, made of the word, a tab and the output name of its category.
class CorpusProcessor::Generators::StanfordNer

  # @param categories [Hash] the categories definitions loaded by
  #   {CorpusProcessor::Categories}. Only its `:output` entry is kept, so that
  #   entry must be present.
  def initialize categories = CorpusProcessor::Categories.default
    @categories = categories.fetch :output
  end

  # Generate the corpus from tokens.
  #
  # @param tokens [Array<CorpusProcessor::Token>] the tokens from which
  #   the corpus is generated.
  # @return [String] the generated corpus, always terminated by a newline.
  def generate tokens
    lines = tokens.map do |token|
      label = @categories[token.category]
      "#{ token.word }\t#{ label }"
    end

    lines.join("\n") + "\n"
  end
end
|
@@ -1,52 +1,108 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
end
|
25
|
-
|
26
|
-
def
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
1
|
+
# The parser for the corpus in LâMPADA format.
#
# The corpus is read as XML. Every `P` element is turned into a list of
# tokens; `EM` elements give a category to the tokens they contain and `ALT`
# elements offer `|`-separated alternatives, of which one is kept.
class CorpusProcessor::Parsers::Lampada

  # @param (see Generators::StanfordNer#initialize)
  def initialize categories = CorpusProcessor::Categories.default
    self.categories = categories.fetch :input
  end

  # Parse the corpus in LâMPADA format.
  #
  # @param corpus [String] the original corpus.
  # @return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
  def parse corpus
    process_nodes Nokogiri::XML(corpus).css('P')
  end

  protected

  # The `:input` categories: corpus category names mapped to internal ones.
  attr_accessor :categories
  # The category given to the tokens that are currently being created.
  attr_accessor :current_category

  # Collect, in order, the tokens of every node in `nodes`.
  def process_nodes nodes
    nodes.reduce([]) { |tokens, node| tokens.push(*process_node(node)) }
  end

  # Dispatch a node according to its type.
  #
  # @raise [ArgumentError] when the node is neither text nor an element.
  def process_node node
    case node
    when Nokogiri::XML::Text    then process_text node.text
    when Nokogiri::XML::Element then process_element node
    else
      raise ArgumentError, "#{ node } cannot be handled by " \
                           "#{ self.class }. This is probably a bug, "\
                           "please report."
    end
  end

  # Split a text into tokens that carry the current category.
  #
  # Every punctuation character is surrounded by spaces first, so that it
  # becomes a token of its own.
  def process_text text
    text.gsub(punct, ' \0 ')
        .strip
        .split(spaces)
        .map { |word|
          CorpusProcessor::Token.new(word, current_category)
        }
  end

  # Dispatch an element according to its tag name. Unknown elements are
  # transparent: only their children are processed.
  def process_element element
    case element.name
    when 'P'   then process_p   element
    when 'EM'  then process_em  element
    when 'ALT' then process_alt element
    else process_nodes element.children
    end
  end

  # Tokens of a paragraph, closed by a period token when the paragraph is not
  # empty and its last token holds no punctuation.
  def process_p p
    tokens = process_nodes p.children
    tokens << period_token if ! tokens.empty? && tokens.last.word !~ punct
    tokens
  end

  # Tokens of an entity mention, categorized after its `CATEG` attribute.
  def process_em em
    # Parentheses make it explicit that the block belongs to `with_category`.
    with_category(em.attributes['CATEG']) { process_nodes em.children }
  end

  # Tokens of the alternative that has the most categorized tokens.
  #
  # Each `|`-separated alternative is wrapped in a fake document so that it
  # can be parsed on its own.
  #
  # @return [Array<CorpusProcessor::Token>] an empty list when the element
  #   holds no alternative.
  def process_alt alt
    alternatives = alt.inner_html.encode('UTF-8').split('|')
    fake_xmls = alternatives.map { |alternative|
      Nokogiri::XML "<document>#{ alternative }</document>"
    }
    alternatives_tokens = fake_xmls.map { |fake_xml|
      process_nodes fake_xml.children
    }
    best = alternatives_tokens.max_by { |alternative_tokens|
      alternative_tokens.count { |alternative_token|
        ! alternative_token.category.nil?
      }
    }
    best || []
  end

  # Run the block with the category described by `categories_attribute`.
  #
  # When the attribute is missing the current category is left untouched for
  # the duration of the block. The category that was current before the call
  # is restored afterwards, even if the block raises, so that the text that
  # follows a nested mention keeps the category of the enclosing one.
  #
  # @return the value returned by the block.
  def with_category categories_attribute, &block
    previous_category = current_category
    unless categories_attribute.nil?
      self.current_category = extract categories_attribute.text
    end
    block.call
  ensure
    self.current_category = previous_category
  end

  # Internal category of the first known name in a `|`-separated list.
  def extract categories_string
    category = categories_string.split('|').find { |category_string|
      categories.include? category_string
    }

    categories[category]
  end

  # Matches a single punctuation character.
  def punct
    /[[:punct:]]/
  end

  # Matches a run of whitespace.
  def spaces
    /\s+/
  end

  # The uncategorized token that closes a paragraph; created once and reused.
  def period_token
    @period_token ||= CorpusProcessor::Token.new('.')
  end
end
|
@@ -1,11 +1,26 @@
|
|
1
|
+
# The entry point for processing a corpus.
#
# @example Simple use with default configuration.
#   CorpusProcessor::Processor.new.process('<P>Some text</P>')
#   # => "Some\tO\ntext\tO\n.\tO\n"
class CorpusProcessor::Processor

  # @param categories [Hash] the categories extracted with {Categories}. It is
  #   only used to build the default parser and the default generator.
  # @param parser [#parse] the parser for the original corpus.
  # @param generator [#generate] the generator that turns tokens into
  #   the transformed corpus.
  def initialize(categories: CorpusProcessor::Categories.default,
                 parser: CorpusProcessor::Parsers::Lampada.new(categories),
                 generator: CorpusProcessor::Generators::StanfordNer.new(
                   categories))
    @parser    = parser
    @generator = generator
  end

  # Perform the processing of a corpus.
  #
  # @param corpus the original corpus, handed to the parser as is.
  # @return [String] the converted corpus.
  def process corpus
    tokens = @parser.parse(corpus)
    @generator.generate(tokens)
  end
end
|
@@ -1,2 +1,36 @@
|
|
1
|
-
|
1
|
+
# The internal representation of a token.
#
# Tokens are extracted from the original corpus and are defined by single
# words or punctuation.
#
# They also contain a category, which originates from the tagging in the
# corpus.
class CorpusProcessor::Token

  # @return [String] the word from text. It shouldn't contain spaces.
  attr_reader :word

  # @return [Symbol, nil] the type of the {Token}. It should be a valid
  #   category from {Categories}, or `nil` when the token has no category.
  attr_reader :category

  # @param word [String] the word from text. It shouldn't contain spaces.
  # @param category [Symbol, nil] the type of the {Token}. It should be a valid
  #   category from {Categories}.
  def initialize word = '', category = nil
    self.word     = word
    self.category = category
  end

  # Determine equality of two {Token}s.
  #
  # Two tokens are equal when both their words and their categories are.
  # Objects that expose no `word` or no `category` are never equal to a token,
  # so comparing against `nil` or an unrelated object answers `false` instead
  # of raising.
  #
  # @param other [Token, Object] the other {Token} to test.
  # @return [Boolean] whether `other` has the same word and category.
  def ==(other)
    other.respond_to?(:word) && other.respond_to?(:category) &&
      word == other.word && category == other.category
  end

  protected

  attr_writer :word
  attr_writer :category
end
|
@@ -1,17 +1,19 @@
|
|
1
|
-
require
|
1
|
+
require 'spec_helper'
|
2
2
|
|
3
|
-
require
|
3
|
+
require 'corpus-processor/cli'
|
4
4
|
|
5
5
|
describe CorpusProcessor::Cli do
|
6
|
-
include FakeFS::SpecHelpers
|
7
6
|
subject(:cli) { CorpusProcessor::Cli.new }
|
8
7
|
|
9
|
-
let(:input_file) {
|
10
|
-
let(:output_file) {
|
8
|
+
let(:input_file) { STDIN }
|
9
|
+
let(:output_file) { STDOUT }
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
describe '#process' do
|
12
|
+
subject { cli.process }
|
13
|
+
|
14
|
+
before do
|
15
|
+
expect(input_file).to receive(:read)
|
16
|
+
.and_return(<<-INPUT.encode('ISO-8859-1'))
|
15
17
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
16
18
|
<!DOCTYPE colHAREM>
|
17
19
|
<colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
|
@@ -30,71 +32,79 @@ describe CorpusProcessor::Cli do
|
|
30
32
|
no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
|
31
33
|
, o
|
32
34
|
INPUT
|
33
|
-
}
|
34
|
-
end
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
expect(output_file).to receive(:puts).with(<<-OUTPUT)
|
37
|
+
Fatores\tO
|
38
|
+
Demográficos\tO
|
39
|
+
e\tO
|
40
|
+
Econômicos\tO
|
41
|
+
Subjacentes\tO
|
42
|
+
.\tO
|
43
|
+
A\tO
|
44
|
+
revolta\tO
|
45
|
+
histórica\tO
|
46
|
+
produz\tO
|
47
|
+
normalmente\tO
|
48
|
+
uma\tO
|
49
|
+
nova\tO
|
50
|
+
forma\tO
|
51
|
+
de\tO
|
52
|
+
pensamento\tO
|
53
|
+
quanto\tO
|
54
|
+
à\tO
|
55
|
+
forma\tO
|
56
|
+
de\tO
|
57
|
+
organização\tO
|
58
|
+
da\tO
|
59
|
+
sociedade\tO
|
60
|
+
.\tO
|
61
|
+
Assim\tO
|
62
|
+
foi\tO
|
63
|
+
com\tO
|
64
|
+
a\tO
|
65
|
+
Reforma\tO
|
66
|
+
Protestante\tO
|
67
|
+
.\tO
|
68
|
+
No\tO
|
69
|
+
seguimento\tO
|
70
|
+
do\tO
|
71
|
+
colapso\tO
|
72
|
+
de\tO
|
73
|
+
instituições\tO
|
74
|
+
monásticas\tO
|
75
|
+
e\tO
|
76
|
+
do\tO
|
77
|
+
escolasticismo\tO
|
78
|
+
nos\tO
|
79
|
+
finais\tO
|
80
|
+
da\tO
|
81
|
+
Idade\tO
|
82
|
+
Média\tO
|
83
|
+
na\tO
|
84
|
+
Europa\tLOCATION
|
85
|
+
,\tO
|
86
|
+
acentuado\tO
|
87
|
+
pela\tO
|
88
|
+
"\tO
|
89
|
+
Cativeiro\tO
|
90
|
+
Babilónica\tO
|
91
|
+
da\tO
|
92
|
+
igreja\tO
|
93
|
+
"\tO
|
94
|
+
no\tO
|
95
|
+
papado\tO
|
96
|
+
de\tO
|
97
|
+
Avignon\tO
|
98
|
+
,\tO
|
99
|
+
o\tO
|
100
|
+
.\tO
|
101
|
+
OUTPUT
|
102
|
+
|
103
|
+
expect(output_file).to receive(:close)
|
39
104
|
end
|
40
105
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
e O
|
45
|
-
Econômicos O
|
46
|
-
Subjacentes O
|
47
|
-
A O
|
48
|
-
revolta O
|
49
|
-
histórica O
|
50
|
-
produz O
|
51
|
-
normalmente O
|
52
|
-
uma O
|
53
|
-
nova O
|
54
|
-
forma O
|
55
|
-
de O
|
56
|
-
pensamento O
|
57
|
-
quanto O
|
58
|
-
à O
|
59
|
-
forma O
|
60
|
-
de O
|
61
|
-
organização O
|
62
|
-
da O
|
63
|
-
sociedade O
|
64
|
-
Assim O
|
65
|
-
foi O
|
66
|
-
com O
|
67
|
-
a O
|
68
|
-
Reforma O
|
69
|
-
Protestante O
|
70
|
-
No O
|
71
|
-
seguimento O
|
72
|
-
do O
|
73
|
-
colapso O
|
74
|
-
de O
|
75
|
-
instituições O
|
76
|
-
monásticas O
|
77
|
-
e O
|
78
|
-
do O
|
79
|
-
escolasticismo O
|
80
|
-
nos O
|
81
|
-
finais O
|
82
|
-
da O
|
83
|
-
Idade O
|
84
|
-
Média O
|
85
|
-
na O
|
86
|
-
Europa LOCATION
|
87
|
-
acentuado O
|
88
|
-
pela O
|
89
|
-
Cativeiro O
|
90
|
-
Babilónica O
|
91
|
-
da O
|
92
|
-
igreja O
|
93
|
-
no O
|
94
|
-
papado O
|
95
|
-
de O
|
96
|
-
Avignon O
|
97
|
-
o O
|
98
|
-
OUTPUT
|
106
|
+
it 'processes the corpus' do
|
107
|
+
subject
|
108
|
+
end
|
99
109
|
end
|
100
110
|
end
|