corpus-processor 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ # The helper to load categories definitions.
2
+ #
3
+ # A categories definition is a Hash with two keys named `:input` and `:output`.
4
+ #
5
+ # The `:input` has `String` keys that match the categories found in the original
6
+ # corpus. Its values are `Symbol`s that represent the category internally.
7
+ #
8
+ # The `:output` has `Symbol` keys that represent the category internally
9
+ # and should be the values from the `:input` hash. Its values are the `String`s
10
+ # representing the category in the final converted corpus.
11
+ #
12
+ # An optional `:default` key is allowed in the `:output` hash. If present
13
+ # the resulting loaded hash has the specified default value.
14
+ #
15
+ # @example YAML file defining categories.
16
+ # ---
17
+ # :input:
18
+ # PESSOA: :person
19
+ # LOCAL: :location
20
+ # ORGANIZACAO: :organization
21
+ # :output:
22
+ # :default: O
23
+ # :person: PERSON
24
+ # :location: LOCATION
25
+ # :organization: ORGANIZATION
26
+ class CorpusProcessor::Categories
27
+
28
+ # Load a set of categories definitions.
29
+ #
30
+ # @param path [String] the path to the YAML file that defines the categories.
31
+ # @return [Hash] the categories extracted from the YAML file.
32
+ # @see .default
33
+ def self.load path
34
+ @@instances[path] ||= YAML.load(File.read(path)).tap { |categories|
35
+ default = categories[:output] && categories[:output][:default]
36
+ if default
37
+ categories[:output].default = default
38
+ categories[:output].delete :default
39
+ end
40
+ }
41
+ end
42
+
43
+ # The default set of categories definitions.
44
+ #
45
+ # The YAML definition file is
46
+ # {file:lib/corpus-processor/categories/default.yml}.
47
+ #
48
+ # @return (see .load)
49
+ # @see .load
50
+ def self.default
51
+ self.load(File.expand_path(File.join('..', 'categories', 'default.yml'),
52
+ __FILE__))
53
+ end
54
+
55
+ protected
56
+
57
+ @@instances = Hash.new
58
+ end
@@ -0,0 +1,10 @@
1
+ ---
2
+ :input:
3
+ PESSOA: :person
4
+ LOCAL: :location
5
+ ORGANIZACAO: :organization
6
+ :output:
7
+ :default: O
8
+ :person: PERSON
9
+ :location: LOCATION
10
+ :organization: ORGANIZATION
@@ -1,17 +1,37 @@
1
- require "corpus-processor"
2
- require "thor"
1
+ require 'thor'
3
2
 
4
- module CorpusProcessor
5
- class Cli < ::Thor
3
+ require 'corpus-processor'
6
4
 
7
- desc "process [INPUT_FILE [OUTPUT_FILE]] ", "convert corpus from LâMPADA format to Stanford-NER format"
8
- def process(input_file = $stdin, output_file = $stdout)
9
- input_file = File.new( input_file, "r") if input_file.is_a? String
10
- output_file = File.new(output_file, "w") if output_file.is_a? String
5
+ # The operations available to users from CLI.
6
+ class CorpusProcessor::Cli < Thor
11
7
 
12
- output_file.puts(CorpusProcessor::Processor.new.process(input_file.read))
8
+ option :categories,
9
+ aliases: :c,
10
+ banner: 'CATEGORIES_FILE',
11
+ desc: 'Path to categories YAML file'
12
+ desc 'process [INPUT_FILE [OUTPUT_FILE]]',
13
+ 'convert corpus from LâMPADA format to Stanford-NER format'
14
+ # Convert a given corpus from one format to other.
15
+ #
16
+ # By default the input format is LâMPADA and the output format is the one
17
+ # used by Stanford NER in training.
18
+ #
19
+ # @param input_file [String, IO] the file that contains the original corpus.
20
+ # @param output_file [String, IO] the file in which the converted corpus
21
+ # is written.
22
+ # @return [void]
23
+ def process input_file = STDIN, output_file = STDOUT
24
+ input_file = File.open( input_file, 'r') if input_file.is_a? String
25
+ output_file = File.open(output_file, 'w') if output_file.is_a? String
26
+ categories = if options[:categories]
27
+ CorpusProcessor::Categories.load(options[:categories])
28
+ else
29
+ CorpusProcessor::Categories.default
30
+ end
13
31
 
14
- output_file.close
15
- end
32
+ output_file.puts CorpusProcessor::Processor.new(categories: categories)
33
+ .process(input_file.read)
34
+
35
+ output_file.close
16
36
  end
17
37
  end
@@ -1 +1,5 @@
1
- require "corpus-processor/generators/stanford_ner"
1
+ # Namespace for generators.
2
+ module CorpusProcessor::Generators
3
+ end
4
+
5
+ require 'corpus-processor/generators/stanford_ner'
@@ -1,13 +1,22 @@
1
- module CorpusProcessor::Generators
2
- class StanfordNer
3
- def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:output])
4
- @categories = categories
5
- end
1
+ # The generator for Stanford NER corpus.
2
+ #
3
+ # Generates corpus in the format used by Stanford NER training.
4
+ class CorpusProcessor::Generators::StanfordNer
6
5
 
7
- def generate(tokens)
8
- tokens.map { |token|
9
- "#{ token.word } #{ @categories[token.category] }"
10
- }.join("\n") + "\n"
11
- end
6
+ # @param categories [Hash] the categories definitions loaded by
7
+ # {CorpusProcessor::Categories}.
8
+ def initialize categories = CorpusProcessor::Categories.default
9
+ @categories = categories.fetch :output
10
+ end
11
+
12
+ # Generate the corpus from tokens.
13
+ #
14
+ # @param tokens [Array<CorpusProcessor::Token>] the tokens from which
15
+ # the corpus is generated.
16
+ # @return [String] the generated corpus.
17
+ def generate tokens
18
+ tokens.map { |token|
19
+ "#{ token.word }\t#{ @categories[token.category] }"
20
+ }.join("\n") + "\n"
12
21
  end
13
22
  end
@@ -1 +1,5 @@
1
- require "corpus-processor/parsers/lampada"
1
+ # Namespace for parsers.
2
+ module CorpusProcessor::Parsers
3
+ end
4
+
5
+ require 'corpus-processor/parsers/lampada'
@@ -1,52 +1,108 @@
1
- module CorpusProcessor::Parsers
2
- class Lampada
3
-
4
- CATEGORY_REGEX = /
5
- (?<any_text> .*? ){0}
6
- (?<entity_attributes> \s\g<any_text>
7
- CATEG="\g<categories>"\g<any_text> ){0}
8
- (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
9
- (?<entity_closing_tag> <\/em> ){0}
10
-
11
- # groups of interest
12
- (?<inner_text> \g<any_text> ){0}
13
- (?<categories> \g<any_text> ){0}
14
-
15
- \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
16
- /ix
17
-
18
- def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
19
- traverser = CorpusProcessor::Traverser.new,
20
- tokenizer = CorpusProcessor::Tokenizer.new)
21
- @categories = categories
22
- @traverser = traverser
23
- @tokenizer = tokenizer
24
- end
25
-
26
- def parse(corpus)
27
- [].tap { |tokens|
28
- @traverser.traverse(@tokenizer.join_lines(corpus),
29
- CATEGORY_REGEX) do |match|
30
- text_to_tokenize, category = case match
31
- when String
32
- [match, nil]
33
- when MatchData
34
- [
35
- match[:inner_text],
36
- extract_category(match[:categories])
37
- ]
38
- end
39
- tokens.push(*@tokenizer.tokenize(text_to_tokenize, category))
40
- end
1
+ # The parser for the corpus in LâMPADA format.
2
+ class CorpusProcessor::Parsers::Lampada
3
+
4
+ # @param (see Generators::StanfordNer#initialize)
5
+ def initialize categories = CorpusProcessor::Categories.default
6
+ self.categories = categories.fetch :input
7
+ end
8
+
9
+ # Parse the corpus in LâMPADA format.
10
+ #
11
+ # @param corpus [String] the original corpus.
12
+ # @return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
13
+ def parse corpus
14
+ process_nodes Nokogiri::XML(corpus).css('P')
15
+ end
16
+
17
+ protected
18
+
19
+ attr_accessor :categories
20
+ attr_accessor :current_category
21
+
22
+ def process_nodes nodes
23
+ nodes.reduce([]) { |tokens, node| tokens.push(*process_node(node)) }
24
+ end
25
+
26
+ def process_node node
27
+ case node
28
+ when Nokogiri::XML::Text then process_text node.text
29
+ when Nokogiri::XML::Element then process_element node
30
+ else
31
+ raise ArgumentError, "#{ node } cannot be handled by " \
32
+ "#{ self.class }. This is probably a bug, "\
33
+ "please report."
34
+ end
35
+ end
36
+
37
+ def process_text text
38
+ text.gsub(punct, ' \0 ')
39
+ .strip
40
+ .split(spaces)
41
+ .map { |word|
42
+ CorpusProcessor::Token.new(word, current_category)
41
43
  }
42
44
  end
43
45
 
44
- def extract_category(categories)
45
- categories
46
- .split("|")
47
- .map { |category_string| @categories[category_string] }
48
- .compact
49
- .first
46
+ def process_element element
47
+ case element.name
48
+ when 'P' then process_p element
49
+ when 'EM' then process_em element
50
+ when 'ALT' then process_alt element
51
+ else process_nodes element.children
52
+ end
53
+ end
54
+
55
+ def process_p p
56
+ tokens = process_nodes p.children
57
+ tokens << period_token if ! tokens.empty? && tokens.last.word !~ punct
58
+ tokens
59
+ end
60
+
61
+ def process_em em
62
+ with_category em.attributes['CATEG'] { process_nodes em.children }
63
+ end
64
+
65
+ def process_alt alt
66
+ alternatives = alt.inner_html.encode('UTF-8').split('|')
67
+ fake_xmls = alternatives.map { |alternative|
68
+ Nokogiri::XML "<document>#{ alternative }</document>"
69
+ }
70
+ alternatives_tokens = fake_xmls.map { |fake_xml|
71
+ process_nodes fake_xml.children
72
+ }
73
+ alternatives_tokens.max_by { |alternative_tokens|
74
+ alternative_tokens.count { |alternative_token|
75
+ ! alternative_token.category.nil?
76
+ }
77
+ }
78
+ end
79
+
80
+ def with_category categories_attribute, &block
81
+ unless categories_attribute.nil?
82
+ self.current_category = extract categories_attribute.text
83
+ end
84
+ tokens = block.call
85
+ self.current_category = nil
86
+ tokens
87
+ end
88
+
89
+ def extract categories_string
90
+ category = categories_string.split('|').find { |category_string|
91
+ categories.include? category_string
92
+ }
93
+
94
+ categories[category]
95
+ end
96
+
97
+ def punct
98
+ /[[:punct:]]/
99
+ end
100
+
101
+ def spaces
102
+ /\s+/
103
+ end
104
+
105
+ def period_token
106
+ @period_token ||= CorpusProcessor::Token.new('.')
50
107
  end
51
- end
52
108
  end
@@ -1,11 +1,26 @@
1
+ # The entry point for processing corpus.
2
+ #
3
+ # @example Simple use with default configuration.
4
+ # CorpusProcessor::Processor.new.process('<P>Some text</P>')
5
+ # # => "Some\tO\ntext\tO\n.\tO\n"
1
6
  class CorpusProcessor::Processor
2
- def initialize(parser = CorpusProcessor::Parsers::Lampada.new,
3
- generator = CorpusProcessor::Generators::StanfordNer.new)
7
+
8
+ # @param categories [Hash] the categories extracted with {Categories}.
9
+ # @param parser [#parse] the parser for original corpus.
10
+ # @param generator [#generate] the generator that computes tokens into
11
+ # the transformed corpus.
12
+ def initialize(
13
+ categories: CorpusProcessor::Categories.default,
14
+ parser: CorpusProcessor::Parsers::Lampada.new(categories),
15
+ generator: CorpusProcessor::Generators::StanfordNer.new(categories))
4
16
  @parser = parser
5
17
  @generator = generator
6
18
  end
7
19
 
8
- def process(corpus)
9
- @generator.generate(@parser.parse(corpus))
20
+ # Perform the processing of corpus.
21
+ #
22
+ # @return [String] the converted corpus.
23
+ def process corpus
24
+ @generator.generate @parser.parse(corpus)
10
25
  end
11
26
  end
@@ -1,2 +1,36 @@
1
- class CorpusProcessor::Token < Struct.new(:word, :category)
1
+ # The internal representation of a token.
2
+ #
3
+ # Tokens are extracted from original corpus and are defined by single words
4
+ # or punctuation.
5
+ #
6
+ # They also contain a category, which originates from the tagging in the
7
+ # corpus.
8
+ class CorpusProcessor::Token
9
+
10
+ # @return [String] the word from text. It shouldn't contain spaces.
11
+ attr_reader :word
12
+
13
+ # @return [Symbol] the type of the {Token}. It should be a valid category
14
+ # from {Categories}.
15
+ attr_reader :category
16
+
17
+ # @param word [String] the word from text. It shouldn't contain spaces.
18
+ # @param category [Symbol] the type of the {Token}. It should be a valid
19
+ # category from {Categories}.
20
+ def initialize word = '', category = nil
21
+ self.word = word
22
+ self.category = category
23
+ end
24
+
25
+ # Determine equality of two {Token}s.
26
+ #
27
+ # @param other [Token] the other {Token} to test.
28
+ def ==(other)
29
+ word == other.word && category == other.category
30
+ end
31
+
32
+ protected
33
+
34
+ attr_writer :word
35
+ attr_writer :category
2
36
  end
@@ -1,3 +1,3 @@
1
1
  module CorpusProcessor
2
- VERSION = "0.2.0"
2
+ VERSION = '0.3.0'
3
3
  end
@@ -1,17 +1,19 @@
1
- require "spec_helper"
1
+ require 'spec_helper'
2
2
 
3
- require "corpus-processor/cli"
3
+ require 'corpus-processor/cli'
4
4
 
5
5
  describe CorpusProcessor::Cli do
6
- include FakeFS::SpecHelpers
7
6
  subject(:cli) { CorpusProcessor::Cli.new }
8
7
 
9
- let(:input_file) { "input_file" }
10
- let(:output_file) { "output_file" }
8
+ let(:input_file) { STDIN }
9
+ let(:output_file) { STDOUT }
11
10
 
12
- before do
13
- File.open(input_file, "w") { |file|
14
- file.write <<-INPUT
11
+ describe '#process' do
12
+ subject { cli.process }
13
+
14
+ before do
15
+ expect(input_file).to receive(:read)
16
+ .and_return(<<-INPUT.encode('ISO-8859-1'))
15
17
  <?xml version="1.0" encoding="ISO-8859-1"?>
16
18
  <!DOCTYPE colHAREM>
17
19
  <colHAREM versao="Segundo_dourada_com_relacoes_14Abril2010">
@@ -30,71 +32,79 @@ describe CorpusProcessor::Cli do
30
32
  no papado de <EM ID="H2-dftre765-11" CATEG="ACONTECIMENTO" TIPO="EVENTO">Avignon</EM>
31
33
  , o
32
34
  INPUT
33
- }
34
- end
35
35
 
36
- describe "#process" do
37
- before do
38
- cli.process(input_file, output_file)
36
+ expect(output_file).to receive(:puts).with(<<-OUTPUT)
37
+ Fatores\tO
38
+ Demográficos\tO
39
+ e\tO
40
+ Econômicos\tO
41
+ Subjacentes\tO
42
+ .\tO
43
+ A\tO
44
+ revolta\tO
45
+ histórica\tO
46
+ produz\tO
47
+ normalmente\tO
48
+ uma\tO
49
+ nova\tO
50
+ forma\tO
51
+ de\tO
52
+ pensamento\tO
53
+ quanto\tO
54
+ à\tO
55
+ forma\tO
56
+ de\tO
57
+ organização\tO
58
+ da\tO
59
+ sociedade\tO
60
+ .\tO
61
+ Assim\tO
62
+ foi\tO
63
+ com\tO
64
+ a\tO
65
+ Reforma\tO
66
+ Protestante\tO
67
+ .\tO
68
+ No\tO
69
+ seguimento\tO
70
+ do\tO
71
+ colapso\tO
72
+ de\tO
73
+ instituições\tO
74
+ monásticas\tO
75
+ e\tO
76
+ do\tO
77
+ escolasticismo\tO
78
+ nos\tO
79
+ finais\tO
80
+ da\tO
81
+ Idade\tO
82
+ Média\tO
83
+ na\tO
84
+ Europa\tLOCATION
85
+ ,\tO
86
+ acentuado\tO
87
+ pela\tO
88
+ "\tO
89
+ Cativeiro\tO
90
+ Babilónica\tO
91
+ da\tO
92
+ igreja\tO
93
+ "\tO
94
+ no\tO
95
+ papado\tO
96
+ de\tO
97
+ Avignon\tO
98
+ ,\tO
99
+ o\tO
100
+ .\tO
101
+ OUTPUT
102
+
103
+ expect(output_file).to receive(:close)
39
104
  end
40
105
 
41
- specify { File.read(output_file).should == <<-OUTPUT }
42
- Fatores O
43
- Demográficos O
44
- e O
45
- Econômicos O
46
- Subjacentes O
47
- A O
48
- revolta O
49
- histórica O
50
- produz O
51
- normalmente O
52
- uma O
53
- nova O
54
- forma O
55
- de O
56
- pensamento O
57
- quanto O
58
- à O
59
- forma O
60
- de O
61
- organização O
62
- da O
63
- sociedade O
64
- Assim O
65
- foi O
66
- com O
67
- a O
68
- Reforma O
69
- Protestante O
70
- No O
71
- seguimento O
72
- do O
73
- colapso O
74
- de O
75
- instituições O
76
- monásticas O
77
- e O
78
- do O
79
- escolasticismo O
80
- nos O
81
- finais O
82
- da O
83
- Idade O
84
- Média O
85
- na O
86
- Europa LOCATION
87
- acentuado O
88
- pela O
89
- Cativeiro O
90
- Babilónica O
91
- da O
92
- igreja O
93
- no O
94
- papado O
95
- de O
96
- Avignon O
97
- o O
98
- OUTPUT
106
+ it 'processes the corpus' do
107
+ subject
108
+ end
99
109
  end
100
110
  end