proiel-cli 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,134 @@
1
#!/usr/bin/env ruby
#
# Does the dependency relation suffice to disambiguate ambiguous morphology?
#
# Reads one or more PROIEL XML treebank files and classifies every word form
# as (a) having a single part of speech/morphology, (b) being ambiguous but
# with the dependency relation determining the analysis, or (c) remaining
# ambiguous. The statistics are printed twice: once counted by distinct forms
# (types) and once weighted by the number of occurrences (tokens).
#
require 'colorize'
require 'proiel'

# Classifies the part-of-speech ambiguity of one form.
#
# @param relations [Hash] relation => { pos => { morphology => count } }
# @return [Symbol] :unique when the form has exactly one distinct POS,
#   :predicted when every relation occurs with exactly one POS, otherwise
#   :unpredicted
def pos_category(relations)
  by_relation = relations.values

  # Count distinct POS values, not (relation, POS) pairs: a form with one POS
  # under several relations is still unambiguous.
  if by_relation.flat_map(&:keys).uniq.size == 1
    :unique
  elsif by_relation.all? { |poses| poses.size == 1 }
    :predicted
  else
    :unpredicted
  end
end

# Classifies the morphological ambiguity of one form.
#
# @param relations [Hash] relation => { pos => { morphology => count } }
# @return [Symbol] :unique when the form has exactly one distinct
#   (POS, morphology) analysis, :predicted when every (relation, POS) pair
#   occurs with exactly one morphology, otherwise :unpredicted
def morphology_category(relations)
  by_relation = relations.values

  analyses = by_relation.flat_map do |poses|
    poses.flat_map { |pos, morphologies| morphologies.keys.map { |m| [pos, m] } }
  end

  if analyses.uniq.size == 1
    :unique
  elsif by_relation.all? { |poses| poses.values.all? { |morphologies| morphologies.size == 1 } }
    :predicted
  else
    :unpredicted
  end
end

# Number of occurrences recorded for one form.
#
# @param relations [Hash] relation => { pos => { morphology => count } }
# @return [Integer]
def occurrence_count(relations)
  relations.values.flat_map(&:values).flat_map(&:values).inject(0, :+)
end

# Tallies all forms by category. The block receives the per-form relation
# hash and returns the weight that form contributes (1 for a count by types,
# the occurrence count for a count by tokens).
#
# @param form_hash [Hash] form key => relation hash
# @return [Hash] { pos: { category => total }, morphology: { category => total } }
def tally_forms(form_hash)
  totals = { pos: Hash.new(0), morphology: Hash.new(0) }

  form_hash.each_value do |relations|
    weight = yield relations
    totals[:pos][pos_category(relations)] += weight
    totals[:morphology][morphology_category(relations)] += weight
  end

  totals
end

# Prints one report section with an underlined title.
#
# @param title [String] section heading
# @param totals [Hash] result of tally_forms
def print_report(title, totals)
  pos = totals[:pos]
  morphology = totals[:morphology]

  puts title
  puts '=' * title.length
  puts "Forms with a unique POS: #{pos[:unique]}"
  puts "Forms whose relation predicts its POS: #{pos[:predicted]}"
  puts "Forms whose relation does not predict its POS: #{pos[:unpredicted]}"

  puts "Forms with a unique morphology: #{morphology[:unique]}"
  puts "Forms whose relation predicts its morphology: #{morphology[:predicted]}"
  puts "Forms whose relation does not predict its morphology: #{morphology[:unpredicted]}"
end

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

tb = PROIEL::Treebank.new
tb.load_from_xml(ARGV)

# Harvest morphology
form_hash = {}

# The original filtered sources by an undefined `language_tag` variable, which
# raised NameError. All sources are processed instead, and forms are keyed by
# language so that homographs from different languages are not conflated.
tb.sources.each do |source|
  source.tokens.each do |token|
    next unless token.form && token.pos && token.morphology && token.relation

    # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
    relation_hash = (form_hash[[source.language, token.form]] ||= {})
    pos_hash = (relation_hash[token.relation] ||= {})
    morphology_hash = (pos_hash[token.pos] ||= Hash.new(0))
    morphology_hash[token.morphology] += 1
  end
end

# Calculate by unique forms first
print_report('By unique forms (types)', tally_forms(form_hash) { 1 })

# Calculate by actual number of occurrences
puts
print_report('By occurrences of forms (tokens)', tally_forms(form_hash) { |relations| occurrence_count(relations) })

exit 0
@@ -0,0 +1,30 @@
1
#!/usr/bin/env ruby
#
# Word form occurrence extraction
#
# Loads the treebank files named on the command line and prints, for each
# word form in sorted order, the "source-id:token-id" of every occurrence.
#
require 'colorize'
require 'proiel'

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# form => list of "source-id:token-id" strings, in reading order
occurrences = Hash.new { |index, form| index[form] = [] }

treebank.sources.each do |source|
  source.tokens.each do |token|
    # Tokens without a form are skipped.
    next if token.form.nil?

    occurrences[token.form] << "#{source.id}:#{token.id}"
  end
end

occurrences.sort_by { |form, _| form }.each do |form, ids|
  puts "#{form}: #{ids.join(', ')}"
end
data/lib/proiel/cli.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'proiel/cli/version'
2
+ require 'proiel/cli/commands'
@@ -0,0 +1,28 @@
1
+ require 'mercenary'
2
+ require 'colorize'
3
+ require 'proiel'
4
+
5
module PROIEL
  # Base class for CLI commands. Every class that inherits from it is
  # recorded so that the registered commands can be enumerated later.
  class Command
    # Classes that have directly inherited from the receiver, in definition
    # order. The list is kept per class.
    #
    # @return [Array<Class>]
    def self.subclasses
      @subclasses ||= []
    end

    # Ruby hook invoked when a class inherits from the receiver; records the
    # new class before delegating to the default implementation.
    #
    # @param base [Class] the newly defined subclass
    def self.inherited(base)
      subclasses.push(base)
      super
    end
  end
end
19
+
20
# Requires every `*.rb` file found directly in +path+, which is resolved
# relative to the directory containing this file.
#
# Files are required in sorted order. `Dir[]` only guarantees sorted results
# from Ruby 3.0 onwards, so sorting explicitly keeps the load order
# deterministic on every Ruby version and filesystem.
#
# @param path [String] directory name relative to this file
def require_all(path)
  glob = File.join(File.dirname(__FILE__), path, '*.rb')

  Dir[glob].sort.each do |f|
    require f
  end
end

require_all 'commands'
require_all 'converters'
@@ -0,0 +1,94 @@
1
module PROIEL
  module Commands
    # CLI command `convert`: loads PROIEL XML treebank data and hands it to a
    # converter class selected by the subcommand (the output format).
    class Convert < Command
      class << self
        # Registers the `convert` command and one subcommand per output
        # format on +prog+. Invoking `convert` without a valid format prints
        # an error and exits with status 1.
        #
        # @param prog the program object the command is registered on
        def init_with_program(prog)
          prog.command(:convert) do |c|
            c.syntax 'convert format'
            c.description 'Convert to a different format'

            c.command(:proielxml) do |f|
              f.syntax '[options] [filename(s)]'
              f.description 'Convert to PROIEL XML format'
              f.option 'remove-not-annotated', '--remove-not-annotated', 'Remove sentences that have not been annotated'
              f.option 'remove-not-reviewed', '--remove-not-reviewed', 'Remove sentences that have not been reviewed'
              f.option 'remove-morphology', '--remove-morphology', 'Remove morphological annotation (part of speech, morphology and lemma)'
              f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
              f.option 'remove-information-structure', '--remove-information-structure', 'Remove information structure annotation (antecedent ID, information status and contrast group)'
              f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
              f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
              f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
            end

            c.command(:tnt) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to TNT/hunpos format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::TNT) }
            end

            c.command(:"conll-x") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-X format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLX) }
            end

            c.command(:"conll-u") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-U format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLU) }
            end

            c.command(:tiger) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER XML format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger) }
            end

            c.command(:tiger2) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER2 format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger2) }
            end

            c.command(:text) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to plain text (UTF-8 with Unix line-endings)'
              f.option 'diffable', '-d', '--diffable', 'Make the output diffable'
              f.action { |args, options| process(args, options, PROIEL::Converter::Text) }
            end

            c.command(:lexc) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to lexc format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::Lexc) }
            end

            # Reached when no (or an unknown) format subcommand was given.
            c.action do |_, _|
              STDERR.puts 'Missing or invalid format. Use --help for more information.'
              exit 1
            end
          end
        end

        # Loads the treebank and runs +converter+ on it.
        #
        # With no filenames the treebank is read from standard input;
        # otherwise each file is loaded in turn into the same treebank.
        #
        # @param args [Array<String>] filenames (may be empty)
        # @param options [Hash] parsed options; 'verbose' enables progress
        #   messages on standard error (the option itself is not defined in
        #   this class; presumably it is a global option declared elsewhere)
        # @param converter the converter class; must respond to
        #   `process(treebank, options)`
        def process(args, options, converter)
          tb = PROIEL::Treebank.new

          if args.empty?
            STDERR.puts "Reading from standard input...".green if options['verbose']
            tb.load_from_xml(STDIN)
          else
            args.each do |filename|
              STDERR.puts "Reading #{filename}...".green if options['verbose']

              tb.load_from_xml(filename)
            end
          end

          converter.process(tb, options)
        end
      end
    end
  end
end
@@ -0,0 +1,136 @@
1
module PROIEL
  module Commands
    # CLI command `grep`: searches the text of treebank files for a regular
    # expression, matching either whole sentences or individual tokens, and
    # prints each hit together with its citation and object ID.
    class Grep < Command
      class << self
        # Registers the `grep` command and its options on +prog+.
        #
        # @param prog the program object the command is registered on
        def init_with_program(prog)
          prog.command(:grep) do |c|
            c.syntax 'grep [options] pattern filename(s)'
            c.description 'Search the text'

            c.option 'level', '--level LEVEL', 'Select level to match. LEVEL should be "token" or "sentence" (default)'
            c.option 'nostrip', '--nostrip', 'Do not strip whitespace from start and end of strings'
            c.option 'nosubst', '--nosubst', 'Do not substitute whitespace sequences with a single space'
            c.option 'nocolour', '--nocolour', 'Do not colour code matches'
            c.option 'ignore-case', '-i', '--ignore-case', 'Ignore uppercase/lowercase'

            c.action { |args, options| process(args, options) }
          end
        end

        # Normalises +s+ according to the options and, if it matches
        # +pattern+, prints one result line. The argument is not modified.
        #
        # @param s [String] text to search
        # @param citation [String] citation shown in front of the hit
        # @param object_id the ID of the sentence or token being searched
        # @param pattern [Regexp] pattern to look for
        # @param options [Hash] 'nostrip', 'nosubst' and 'nocolour' switch off
        #   stripping, whitespace squeezing and match highlighting
        def match(s, citation, object_id, pattern, options)
          text = s
          text = text.strip unless options['nostrip']
          text = text.gsub(/\s+/, ' ') unless options['nosubst']

          return unless text =~ pattern

          text = text.gsub(pattern) { |m| m.yellow } unless options['nocolour']

          puts "#{citation} (ID = #{object_id}) #{text}"
        end

        # Builds a citation string such as "upper start-end". Identical start
        # and end collapse to a single value; nil parts are dropped.
        #
        # @param citation_upper [String, nil] source-level citation part
        # @param citation_lower_start [String, nil] citation part of the first token
        # @param citation_lower_end [String, nil] citation part of the last token
        # @return [String]
        def merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end = nil)
          citation_lower =
            if citation_lower_start == citation_lower_end
              citation_lower_start
            else
              [citation_lower_start, citation_lower_end].compact.join('-')
            end

          [citation_upper, citation_lower].compact.join(' ')
        end

        # Matches +pattern+ against the full text of every sentence in +div+.
        # The text is assembled from the presentation strings and forms of the
        # sentence and its non-empty tokens.
        def process_div_for_sentences(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            parts = [sentence.presentation_before]
            citation_lower_start = nil
            citation_lower_end = nil

            sentence.tokens.each do |token|
              next if token.is_empty?

              parts << token.presentation_before << token.form << token.presentation_after

              citation_lower_start = token.citation_part if citation_lower_start.nil?
              citation_lower_end = token.citation_part
            end

            parts << sentence.presentation_after

            # Collect the pieces and join once; nil pieces count as empty.
            s = parts.compact.join

            citation = merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end)
            match(s, citation, sentence.id, pattern, options)
          end
        end

        # Matches +pattern+ against each non-empty token in +div+ separately.
        def process_div_for_tokens(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              next if token.is_empty?

              s = [token.presentation_before, token.form, token.presentation_after].compact.join

              citation = merge_citation_parts(citation_upper, token.citation_part)
              match(s, citation, token.id, pattern, options)
            end
          end
        end

        # Dispatches to sentence- or token-level matching according to
        # options['level']; any other level is silently ignored.
        def process_div(citation_upper, div, pattern, options)
          case options['level']
          when 'sentence'
            process_div_for_sentences(citation_upper, div, pattern, options)
          when 'token'
            process_div_for_tokens(citation_upper, div, pattern, options)
          end
        end

        # Entry point of the command. Validates the arguments, loads the
        # treebank files and searches every div of every source.
        #
        # Exits with status 1 when the pattern is missing or is not a valid
        # regular expression, when no filenames are given, or when the level
        # is neither "token" nor "sentence".
        #
        # @param args [Array<String>] pattern followed by filename(s); the
        #   pattern is removed from the array
        # @param options [Hash] parsed options; 'level' defaults to 'sentence'
        def process(args, options)
          if args.empty?
            STDERR.puts 'Missing pattern. Use --help for more information.'
            exit 1
          end

          pattern_string = args.shift

          pattern =
            begin
              Regexp.new(pattern_string, options['ignore-case'] ? Regexp::IGNORECASE : 0)
            rescue RegexpError => e
              # Report a malformed pattern as a usage error, not a stack trace.
              STDERR.puts "Invalid pattern (#{e.message}). Use --help for more information."
              exit 1
            end

          if args.empty?
            STDERR.puts 'Missing filename(s). Use --help for more information.'
            exit 1
          end

          options['level'] ||= 'sentence'

          unless %w(token sentence).include?(options['level'])
            STDERR.puts 'Invalid matching level. Use --help for more information.'
            exit 1
          end

          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          tb.sources.each do |source|
            citation_upper = source.citation_part

            source.divs.each do |div|
              process_div(citation_upper, div, pattern, options)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,126 @@
1
# OpenStruct is used for the statistics records below and is not loaded by
# default.
require 'ostruct'

module PROIEL
  module Commands
    # CLI command `info`: prints metadata and size/annotation statistics for
    # the sources in one or more treebank files.
    class CountWords < Command
      class << self
        # Registers the `info` command on +prog+. Exits with status 1 when no
        # filenames are given.
        #
        # @param prog the program object the command is registered on
        def init_with_program(prog)
          prog.command(:info) do |c|
            c.syntax 'info [options] filename(s)'
            c.description 'Show information about the treebank'

            c.action do |args, options|
              if args.empty?
                STDERR.puts 'Missing filename(s). Use --help for more information.'
                exit 1
              end

              process(args, options)
            end
          end
        end

        # Loads the given files and prints an overall summary followed by one
        # section per source.
        #
        # @param args [Array<String>] treebank filenames
        # @param options [Hash] parsed options; 'verbose' enables progress
        #   messages on standard error
        def process(args, options)
          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          t = treebank_statistics(tb)

          puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
          puts " Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
          puts

          tb.sources.each_with_index do |source, i|
            s = source_statistics(source)
            n = s.sentence_count
            r = percentage(s.reviewed_sentence_count, n)
            a = percentage(s.annotated_sentence_count, n)

            puts "#{i + 1}. #{pretty_title(source)}".yellow
            puts " Version: #{source.date}"
            puts " License: #{pretty_license(source)}"
            puts " Language: #{pretty_language(source)}"
            puts " Printed text: #{pretty_printed_text_info(source)}"
            puts " Electr. text: #{pretty_electronic_text_info(source)}"
            puts " Size: #{n} sentence(s), #{s.token_count} token(s)"
            puts " Annotation: %.2f%% reviewed, %.2f%% annotated" % [r, a]
          end
        end

        # Sums the per-source statistics over all sources in +tb+.
        #
        # @return [OpenStruct] with sentence_count, token_count,
        #   annotated_sentence_count and reviewed_sentence_count
        def treebank_statistics(tb)
          empty_statistics.tap do |total|
            tb.sources.each do |source|
              # Compute the per-source figures once and reuse them.
              s = source_statistics(source)

              total.token_count += s.token_count
              total.sentence_count += s.sentence_count
              total.annotated_sentence_count += s.annotated_sentence_count
              total.reviewed_sentence_count += s.reviewed_sentence_count
            end
          end
        end

        # Counts sentences, tokens, annotated sentences and reviewed
        # sentences across all divs of +source+.
        #
        # @return [OpenStruct] with sentence_count, token_count,
        #   annotated_sentence_count and reviewed_sentence_count
        def source_statistics(source)
          empty_statistics.tap do |s|
            source.divs.each do |div|
              div.sentences.each do |sentence|
                s.token_count += sentence.tokens.count
              end

              s.sentence_count += div.sentences.count
              s.annotated_sentence_count += div.sentences.count(&:annotated?)
              s.reviewed_sentence_count += div.sentences.count(&:reviewed?)
            end
          end
        end

        # Human-readable language name for +source+. Only 'lat' is mapped;
        # any other code is reported as unknown together with the code.
        def pretty_language(source)
          case source.language
          when 'lat'
            'Latin'
          else
            "Unknown (language code #{source.language})"
          end
        end

        # Comma-separated bibliographic details of the printed text; missing
        # fields are skipped.
        def pretty_printed_text_info(source)
          [source.printed_text_title,
           source.printed_text_editor ? "ed. #{source.printed_text_editor}" : nil,
           source.printed_text_publisher,
           source.printed_text_place,
           source.printed_text_date].compact.join(', ')
        end

        # Comma-separated bibliographic details of the electronic text;
        # missing fields are skipped.
        def pretty_electronic_text_info(source)
          [source.electronic_text_title,
           source.electronic_text_editor ? "ed. #{source.electronic_text_editor}" : nil,
           source.electronic_text_publisher,
           source.electronic_text_place,
           source.electronic_text_date].compact.join(', ')
        end

        # License name, followed by its URL in parentheses when one is set.
        def pretty_license(source)
          if source.license_url
            "#{source.license} (#{source.license_url})"
          else
            source.license
          end
        end

        # "Author, Title" with missing parts skipped.
        def pretty_title(source)
          [source.author, source.title].compact.join(', ')
        end

        private

        # A statistics record with all counters at zero.
        def empty_statistics
          OpenStruct.new(sentence_count: 0,
                         token_count: 0,
                         annotated_sentence_count: 0,
                         reviewed_sentence_count: 0)
        end

        # +part+ as a percentage of +total+; 0.0 when +total+ is zero so that
        # a source without sentences does not print NaN.
        def percentage(part, total)
          total.zero? ? 0.0 : part * 100.0 / total
        end
      end
    end
  end
end