corpus-processor 0.2.0 → 0.3.0

@@ -0,0 +1,58 @@
+ # The helper to load categories definitions.
+ #
+ # A categories definition is a Hash with two keys, `:input` and `:output`.
+ #
+ # The `:input` hash has `String` keys that match the categories found in the
+ # original corpus. Its values are `Symbol`s that represent the category internally.
+ #
+ # The `:output` hash has `Symbol` keys that represent the category internally
+ # and should be the values from the `:input` hash. Its values are the `String`s
+ # representing the category in the final converted corpus.
+ #
+ # An optional `:default` key is allowed in the `:output` hash. If present,
+ # the resulting loaded hash has the specified default value.
+ #
+ # @example YAML file defining categories.
+ #   ---
+ #   :input:
+ #     PESSOA: :person
+ #     LOCAL: :location
+ #     ORGANIZACAO: :organization
+ #   :output:
+ #     :default: O
+ #     :person: PERSON
+ #     :location: LOCATION
+ #     :organization: ORGANIZATION
+ class CorpusProcessor::Categories
+
+   # Load a set of categories definitions.
+   #
+   # @param path [String] the path to the YAML file that defines the categories.
+   # @return [Hash] the categories extracted from the YAML file.
+   # @see .default
+   def self.load path
+     @@instances[path] ||= YAML.load(File.read(path)).tap { |categories|
+       default = categories[:output] && categories[:output][:default]
+       if default
+         categories[:output].default = default
+         categories[:output].delete :default
+       end
+     }
+   end
+
+   # The default set of categories definitions.
+   #
+   # The YAML definition file is
+   # {file:lib/corpus-processor/categories/default.yml}.
+   #
+   # @return (see .load)
+   # @see .load
+   def self.default
+     self.load(File.expand_path(File.join('..', 'categories', 'default.yml'),
+                                __FILE__))
+   end
+
+   protected
+
+   @@instances = Hash.new
+ end
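
As a quick orientation, here is a minimal usage sketch of the new Categories helper. The custom file path is hypothetical; with no custom file, `Categories.default` falls back to the bundled YAML shown in the next hunk:

    require 'corpus-processor'

    # Load a hypothetical custom definitions file; CorpusProcessor::Categories.default
    # loads the bundled default.yml instead.
    categories = CorpusProcessor::Categories.load('config/categories.yml')

    categories[:input]['PESSOA']   # => :person
    categories[:output][:person]   # => "PERSON"
    categories[:output][:unknown]  # => "O"  (the :default value)
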
@@ -0,0 +1,10 @@
+ ---
+ :input:
+   PESSOA: :person
+   LOCAL: :location
+   ORGANIZACAO: :organization
+ :output:
+   :default: O
+   :person: PERSON
+   :location: LOCATION
+   :organization: ORGANIZATION
@@ -1,17 +1,37 @@
- require "corpus-processor"
- require "thor"
+ require 'thor'
 
- module CorpusProcessor
-   class Cli < ::Thor
+ require 'corpus-processor'
 
-     desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from LâMPADA format to Stanford-NER format"
-     def process(input_file = $stdin, output_file = $stdout)
-       input_file = File.new( input_file, "r") if input_file.is_a? String
-       output_file = File.new(output_file, "w") if output_file.is_a? String
+ # The operations available to users from the CLI.
+ class CorpusProcessor::Cli < Thor
 
-       output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
+   option :categories,
+          aliases: :c,
+          banner: 'CATEGORIES_FILE',
+          desc: 'Path to categories YAML file'
+   desc 'process [INPUT_FILE [OUTPUT_FILE]]',
+        'convert corpus from LâMPADA format to Stanford-NER format'
+   # Convert a given corpus from one format to another.
+   #
+   # By default the input format is LâMPADA and the output format is the one
+   # used by Stanford NER for training.
+   #
+   # @param input_file [String, IO] the file that contains the original corpus.
+   # @param output_file [String, IO] the file in which the converted corpus
+   #   is written.
+   # @return [void]
+   def process input_file = STDIN, output_file = STDOUT
+     input_file = File.open( input_file, 'r') if input_file.is_a? String
+     output_file = File.open(output_file, 'w') if output_file.is_a? String
+     categories = if options[:categories]
+       CorpusProcessor::Categories.load(options[:categories])
+     else
+       CorpusProcessor::Categories.default
+     end
 
-       output_file.close
-     end
+     output_file.puts CorpusProcessor::Processor.new(categories: categories)
+                        .process(input_file.read)
+
+     output_file.close
    end
  end
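
A sketch of exercising the new `--categories` option through Thor's standard `start` interface; the file names are only illustrative:

    require 'corpus-processor/cli'

    # Roughly what the executable does when invoked as:
    #   process corpus.xml corpus.tsv --categories my_categories.yml
    CorpusProcessor::Cli.start(
      ['process', 'corpus.xml', 'corpus.tsv', '--categories', 'my_categories.yml']
    )
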
@@ -1 +1,5 @@
- require "corpus-processor/generators/stanford_ner"
+ # Namespace for generators.
+ module CorpusProcessor::Generators
+ end
+
+ require 'corpus-processor/generators/stanford_ner'
@@ -1,13 +1,22 @@
- module CorpusProcessor::Generators
-   class StanfordNer
-     def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
-       @categories = categories
-     end
+ # The generator for the Stanford NER corpus.
+ #
+ # Generates a corpus in the format used for Stanford NER training.
+ class CorpusProcessor::Generators::StanfordNer
 
-     def generate(tokens)
-       tokens.map { |token|
-         "#{ token.word } #{ @categories[token.category] }"
-       }.join("\n") + "\n"
-     end
+   # @param categories [Hash] the categories definitions loaded by
+   #   {CorpusProcessor::Categories}.
+   def initialize categories = CorpusProcessor::Categories.default
+     @categories = categories.fetch :output
+   end
+
+   # Generate the corpus from tokens.
+   #
+   # @param tokens [Array<CorpusProcessor::Token>] the tokens from which
+   #   the corpus is generated.
+   # @return [String] the generated corpus.
+   def generate tokens
+     tokens.map { |token|
+       "#{ token.word }\t#{ @categories[token.category] }"
+     }.join("\n") + "\n"
    end
  end
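
A small sketch of the generator in isolation, using the default categories so that tokens without a category fall back to the "O" default:

    require 'corpus-processor'

    tokens = [
      CorpusProcessor::Token.new('Europa', :location),
      CorpusProcessor::Token.new('.'),
    ]

    CorpusProcessor::Generators::StanfordNer.new.generate(tokens)
    # => "Europa\tLOCATION\n.\tO\n"
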
@@ -1 +1,5 @@
- require "corpus-processor/parsers/lampada"
+ # Namespace for parsers.
+ module CorpusProcessor::Parsers
+ end
+
+ require 'corpus-processor/parsers/lampada'
@@ -1,52 +1,108 @@
- module CorpusProcessor::Parsers
-   class Lampada
-
-     CATEGORY_REGEX = /
-       (?<any_text> .*? ){0}
-       (?<entity_attributes> \s\g<any_text>
-         CATEG="\g<categories>"\g<any_text> ){0}
-       (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
-       (?<entity_closing_tag> <\/em> ){0}
-
-       # groups of interest
-       (?<inner_text> \g<any_text> ){0}
-       (?<categories> \g<any_text> ){0}
-
-       \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
-     /ix
-
-     def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
-                    traverser = CorpusProcessor::Traverser.new,
-                    tokenizer = CorpusProcessor::Tokenizer.new)
-       @categories = categories
-       @traverser = traverser
-       @tokenizer = tokenizer
-     end
-
-     def parse(corpus)
-       [].tap { |tokens|
-         @traverser.traverse(@tokenizer.join_lines(corpus),
-                             CATEGORY_REGEX) do |match|
-           text_to_tokenize, category = case match
-           when String
-             [match, nil]
-           when MatchData
-             [
-               match[:inner_text],
-               extract_category(match[:categories])
-             ]
-           end
-           tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
-         end
+ # The parser for the corpus in LâMPADA format.
+ class CorpusProcessor::Parsers::Lampada
+
+   # @param (see Generators::StanfordNer#initialize)
+   def initialize categories = CorpusProcessor::Categories.default
+     self.categories = categories.fetch :input
+   end
+
+   # Parse the corpus in LâMPADA format.
+   #
+   # @param corpus [String] the original corpus.
+   # @return [Array<CorpusProcessor::Token>] the tokens extracted from the corpus.
+   def parse corpus
+     process_nodes Nokogiri::XML(corpus).css('P')
+   end
+
+   protected
+
+   attr_accessor :categories
+   attr_accessor :current_category
+
+   def process_nodes nodes
+     nodes.reduce([]) { |tokens, node| tokens.push(*process_node(node)) }
+   end
+
+   def process_node node
+     case node
+     when Nokogiri::XML::Text then process_text node.text
+     when Nokogiri::XML::Element then process_element node
+     else
+       raise ArgumentError, "#{ node } cannot be handled by " \
+                            "#{ self.class }. This is probably a bug, "\
+                            "please report."
+     end
+   end
+
+   def process_text text
+     text.gsub(punct, ' \0 ')
+         .strip
+         .split(spaces)
+         .map { |word|
+           CorpusProcessor::Token.new(word, current_category)
          }
    end
 
-     def extract_category(categories)
-       categories
-         .split("|")
-         .map { |category_string| @categories[category_string] }
-         .compact
-         .first
+   def process_element element
+     case element.name
+     when 'P' then process_p element
+     when 'EM' then process_em element
+     when 'ALT' then process_alt element
+     else process_nodes element.children
+     end
+   end
+
+   def process_p p
+     tokens = process_nodes p.children
+     tokens << period_token if ! tokens.empty? && tokens.last.word !~ punct
+     tokens
+   end
+
+   def process_em em
+     with_category(em.attributes['CATEG']) { process_nodes em.children }
+   end
+
+   def process_alt alt
+     alternatives = alt.inner_html.encode('UTF-8').split('|')
+     fake_xmls = alternatives.map { |alternative|
+       Nokogiri::XML "<document>#{ alternative }</document>"
+     }
+     alternatives_tokens = fake_xmls.map { |fake_xml|
+       process_nodes fake_xml.children
+     }
+     alternatives_tokens.max_by { |alternative_tokens|
+       alternative_tokens.count { |alternative_token|
+         ! alternative_token.category.nil?
+       }
+     }
+   end
+
+   def with_category categories_attribute, &block
+     unless categories_attribute.nil?
+       self.current_category = extract categories_attribute.text
+     end
+     tokens = block.call
+     self.current_category = nil
+     tokens
+   end
+
+   def extract categories_string
+     category = categories_string.split('|').find { |category_string|
+       categories.include? category_string
+     }
+
+     categories[category]
+   end
+
+   def punct
+     /[[:punct:]]/
+   end
+
+   def spaces
+     /\s+/
+   end
+
+   def period_token
+     @period_token ||= CorpusProcessor::Token.new('.')
    end
-   end
  end
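
A sketch of the new Nokogiri-based parser on its own. The fragment below is a minimal hand-written LâMPADA-style snippet, not taken from the real corpus:

    require 'corpus-processor'

    parser = CorpusProcessor::Parsers::Lampada.new
    tokens = parser.parse('<P>Viagem a <EM CATEG="LOCAL">Lisboa</EM></P>')

    tokens.map { |token| [token.word, token.category] }
    # => [["Viagem", nil], ["a", nil], ["Lisboa", :location], [".", nil]]
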
@@ -1,11 +1,26 @@
+ # The entry point for processing a corpus.
+ #
+ # @example Simple use with default configuration.
+ #   CorpusProcessor::Processor.new.process('<P>Some text</P>')
+ #   # => "Some\tO\ntext\tO\n.\tO\n"
  class CorpusProcessor::Processor
-   def initialize(parser = CorpusProcessor::Parsers::Lampada.new,
-                  generator = CorpusProcessor::Generators::StanfordNer.new)
+
+   # @param categories [Hash] the categories extracted with {Categories}.
+   # @param parser [#parse] the parser for the original corpus.
+   # @param generator [#generate] the generator that turns tokens into
+   #   the transformed corpus.
+   def initialize(
+     categories: CorpusProcessor::Categories.default,
+     parser: CorpusProcessor::Parsers::Lampada.new(categories),
+     generator: CorpusProcessor::Generators::StanfordNer.new(categories))
      @parser = parser
      @generator = generator
    end
 
-   def process(corpus)
-     @generator.generate(@parser.parse(corpus))
+   # Perform the processing of the corpus.
+   #
+   # @return [String] the converted corpus.
+   def process corpus
+     @generator.generate @parser.parse(corpus)
    end
  end
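
For completeness, a sketch of wiring a custom categories file through the new keyword constructor; both paths are hypothetical:

    require 'corpus-processor'

    categories = CorpusProcessor::Categories.load('my_categories.yml')
    processor  = CorpusProcessor::Processor.new(categories: categories)

    puts processor.process(File.read('corpus.xml'))
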
@@ -1,2 +1,36 @@
- class CorpusProcessor::Token < Struct.new(:word, :category)
+ # The internal representation of a token.
+ #
+ # Tokens are extracted from the original corpus and are defined by single
+ # words or punctuation.
+ #
+ # They also contain a category, which originates from the tagging in the
+ # corpus.
+ class CorpusProcessor::Token
+
+   # @return [String] the word from the text. It shouldn't contain spaces.
+   attr_reader :word
+
+   # @return [Symbol] the type of the {Token}. It should be a valid category
+   #   from {Categories}.
+   attr_reader :category
+
+   # @param word [String] the word from the text. It shouldn't contain spaces.
+   # @param category [Symbol] the type of the {Token}. It should be a valid
+   #   category from {Categories}.
+   def initialize word = '', category = nil
+     self.word = word
+     self.category = category
+   end
+
+   # Determine equality of two {Token}s.
+   #
+   # @param other [Token] the other {Token} to test.
+   def ==(other)
+     word == other.word && category == other.category
+   end
+
+   protected
+
+   attr_writer :word
+   attr_writer :category
  end
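
A tiny sketch of the value-object behaviour the rewritten Token class provides:

    require 'corpus-processor'

    token = CorpusProcessor::Token.new('Europa', :location)

    token.word                                                # => "Europa"
    token == CorpusProcessor::Token.new('Europa', :location)  # => true
    token == CorpusProcessor::Token.new('Europa')             # => false
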
@@ -1,3 +1,3 @@
  module CorpusProcessor
-   VERSION = "0.2.0"
+   VERSION = '0.3.0'
  end
@@ -1,17 +1,19 @@
- require "spec_helper"
+ require 'spec_helper'
 
- require "corpus-processor/cli"
+ require 'corpus-processor/cli'
 
  describe CorpusProcessor::Cli do
-   include FakeFS::SpecHelpers
    subject(:cli) { CorpusProcessor::Cli.new }
 
-   let(:input_file) { "input_file" }
-   let(:output_file) { "output_file" }
+   let(:input_file) { STDIN }
+   let(:output_file) { STDOUT }
 
-   before do
-     File.open(input_file, "w") { |file|
-       file.write <<-INPUT
+   describe '#process' do
+     subject { cli.process }
+
+     before do
+       expect(input_file).to receive(:read)
+         .and_return(<<-INPUT.encode('ISO-8859-1'))
  <?xml version="1.0" encoding="ISO-8859-1"?>
  <!DOCTYPE colHAREM>
  <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
@@ -30,71 +32,79 @@ describe CorpusProcessor::Cli do
  no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
  , o
  INPUT
-     }
-   end
 
-   describe "#process" do
-     before do
-       cli.process(input_file, output_file)
+       expect(output_file).to receive(:puts).with(<<-OUTPUT)
+ Fatores\tO
+ Demográficos\tO
+ e\tO
+ Econômicos\tO
+ Subjacentes\tO
+ .\tO
+ A\tO
+ revolta\tO
+ histórica\tO
+ produz\tO
+ normalmente\tO
+ uma\tO
+ nova\tO
+ forma\tO
+ de\tO
+ pensamento\tO
+ quanto\tO
+ à\tO
+ forma\tO
+ de\tO
+ organização\tO
+ da\tO
+ sociedade\tO
+ .\tO
+ Assim\tO
+ foi\tO
+ com\tO
+ a\tO
+ Reforma\tO
+ Protestante\tO
+ .\tO
+ No\tO
+ seguimento\tO
+ do\tO
+ colapso\tO
+ de\tO
+ instituições\tO
+ monásticas\tO
+ e\tO
+ do\tO
+ escolasticismo\tO
+ nos\tO
+ finais\tO
+ da\tO
+ Idade\tO
+ Média\tO
+ na\tO
+ Europa\tLOCATION
+ ,\tO
+ acentuado\tO
+ pela\tO
+ "\tO
+ Cativeiro\tO
+ Babilónica\tO
+ da\tO
+ igreja\tO
+ "\tO
+ no\tO
+ papado\tO
+ de\tO
+ Avignon\tO
+ ,\tO
+ o\tO
+ .\tO
+ OUTPUT
+
+       expect(output_file).to receive(:close)
      end
 
-     specify { File.read(output_file).should == <<-OUTPUT }
- Fatores O
- Demográficos O
- e O
- Econômicos O
- Subjacentes O
- A O
- revolta O
- histórica O
- produz O
- normalmente O
- uma O
- nova O
- forma O
- de O
- pensamento O
- quanto O
- à O
- forma O
- de O
- organização O
- da O
- sociedade O
- Assim O
- foi O
- com O
- a O
- Reforma O
- Protestante O
- No O
- seguimento O
- do O
- colapso O
- de O
- instituições O
- monásticas O
- e O
- do O
- escolasticismo O
- nos O
- finais O
- da O
- Idade O
- Média O
- na O
- Europa LOCATION
- acentuado O
- pela O
- Cativeiro O
- Babilónica O
- da O
- igreja O
- no O
- papado O
- de O
- Avignon O
- o O
- OUTPUT
+     it 'processes the corpus' do
+       subject
+     end
    end
  end