proiel-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,134 @@
#!/usr/bin/env ruby
#
# Does the dependency relation suffice to disambiguate ambiguous morphology?
#
# For every word form the script records which relation / part of speech /
# morphology combinations it occurs with. It then reports, once per distinct
# form (types) and once weighted by number of occurrences (tokens), whether
# the POS and the morphology of the form are unique, are determined by the
# relation, or remain ambiguous.
#
require 'colorize'
require 'proiel'

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

# Classifies every harvested form and returns a hash of counters.
#
# form_hash maps a form key to { relation => { pos => { morphology => count } } }.
# When weighted is false every form contributes 1 to its categories (count by
# type); when true it contributes its total number of occurrences (count by
# token).
def classify_forms(form_hash, weighted)
  stats = Hash.new(0)

  form_hash.each_value do |by_relation|
    pos_hashes = by_relation.values

    # Count *distinct* POS tags and distinct [POS, morphology] readings across
    # all relations. Counting (relation, POS) pairs instead would report an
    # unambiguous form as ambiguous as soon as it occurs with two relations.
    poses = pos_hashes.flat_map(&:keys).uniq
    readings = pos_hashes.flat_map do |by_pos|
      by_pos.flat_map { |pos, by_morphology| by_morphology.keys.map { |m| [pos, m] } }
    end.uniq

    weight =
      if weighted
        pos_hashes.flat_map(&:values).flat_map(&:values).inject(0, :+)
      else
        1
      end

    pos_category =
      if poses.size == 1
        :unique_pos
      elsif pos_hashes.all? { |by_pos| by_pos.size == 1 }
        :relation_predicts_pos
      else
        :relation_does_not_predict_pos
      end

    # "Predicts" here means: for every relation and POS seen with the form
    # there is exactly one morphology.
    morphology_category =
      if readings.size == 1
        :unique_morphology
      elsif pos_hashes.all? { |by_pos| by_pos.all? { |_, by_morphology| by_morphology.size == 1 } }
        :relation_predicts_morphology
      else
        :relation_does_not_predict_morphology
      end

    stats[pos_category] += weight
    stats[morphology_category] += weight
  end

  stats
end

# Prints the six counters produced by classify_forms.
def print_statistics(stats)
  puts "Forms with a unique POS: #{stats[:unique_pos]}"
  puts "Forms whose relation predicts its POS: #{stats[:relation_predicts_pos]}"
  puts "Forms whose relation does not predict its POS: #{stats[:relation_does_not_predict_pos]}"

  puts "Forms with a unique morphology: #{stats[:unique_morphology]}"
  puts "Forms whose relation predicts its morphology: #{stats[:relation_predicts_morphology]}"
  puts "Forms whose relation does not predict its morphology: #{stats[:relation_does_not_predict_morphology]}"
end

tb = PROIEL::Treebank.new
tb.load_from_xml(ARGV)

# Harvest morphology
form_hash = {}

# The original filter compared each source's language with an undefined
# local (language_tag) and raised NameError. All sources are processed
# instead; forms are keyed by [language, form] so that identical spellings in
# different languages are never merged.
tb.sources.each do |source|
  source.tokens.each do |token|
    next unless token.form && token.pos && token.morphology && token.relation

    # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
    relation_hash = (form_hash[[source.language, token.form]] ||= {})
    pos_hash = (relation_hash[token.relation] ||= {})
    morphology_hash = (pos_hash[token.pos] ||= {})
    morphology_hash[token.morphology] ||= 0
    morphology_hash[token.morphology] += 1
  end
end

# Calculate by unique forms first
puts "By unique forms (types)"
puts "======================="
print_statistics(classify_forms(form_hash, false))

# Calculate by actual number of occurrences
puts
puts "By occurrences of forms (tokens)"
puts "================================"
print_statistics(classify_forms(form_hash, true))

exit 0
@@ -0,0 +1,30 @@
#!/usr/bin/env ruby
#
# Word form occurrence extraction
#
# Prints every distinct token form, sorted, followed by the
# "source-id:token-id" locations at which it occurs.
#
require 'colorize'
require 'proiel'

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# form => list of "source-id:token-id" strings, in reading order
occurrences = Hash.new { |index, form| index[form] = [] }

treebank.sources.each do |source|
  source.tokens.each do |token|
    # Tokens without a form have nothing to index.
    next if token.form.nil?

    occurrences[token.form] << "#{source.id}:#{token.id}"
  end
end

occurrences.sort_by { |form, _| form }.each do |form, locations|
  puts "#{form}: #{locations.join(', ')}"
end
data/lib/proiel/cli.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'proiel/cli/version'
2
+ require 'proiel/cli/commands'
@@ -0,0 +1,28 @@
1
+ require 'mercenary'
2
+ require 'colorize'
3
+ require 'proiel'
4
+
module PROIEL
  # Base class that keeps a registry of the classes inheriting from it.
  class Command
    # Returns the list of classes that directly inherit from the receiver, in
    # the order in which they were defined. The list is stored per class, so
    # each class tracks only its own direct subclasses.
    def self.subclasses
      @subclasses ||= []
    end

    # Ruby hook invoked whenever a class inherits from the receiver; records
    # the new class and then defers to the default implementation.
    def self.inherited(base)
      subclasses.push(base)
      super
    end
  end
end
19
+
# Requires every `*.rb` file located directly inside +path+, where +path+ is
# resolved relative to the directory containing this file.
#
# Files are required in sorted order. Dir[] only returns sorted results on
# Ruby 3.0 and later; on older versions the order depends on the file system,
# which would make the load order differ between machines.
#
# Returns the sorted list of file names that were required.
def require_all(path)
  glob = File.join(File.dirname(__FILE__), path, '*.rb')
  Dir[glob].sort.each do |f|
    require f
  end
end
26
+
27
+ require_all 'commands'
28
+ require_all 'converters'
@@ -0,0 +1,94 @@
module PROIEL
  module Commands
    # The `convert` command: loads treebank data and hands it to the
    # converter class for the requested output format.
    class Convert < Command
      class << self
        # Registers `convert` and one sub-command per output format on +prog+.
        # Each sub-command's action calls #process with the matching
        # converter class. Invoking `convert` without a valid format prints
        # an error and exits with status 1.
        def init_with_program(prog)
          prog.command(:convert) do |c|
            c.syntax 'convert format'
            c.description 'Convert to a different format'

            c.command(:proielxml) do |f|
              f.syntax '[options] [filename(s)]'
              f.description 'Convert to PROIEL XML format'
              f.option 'remove-not-annotated', '--remove-not-annotated', 'Remove sentences that have not been annotated'
              f.option 'remove-not-reviewed', '--remove-not-reviewed', 'Remove sentences that have not been reviewed'
              f.option 'remove-morphology', '--remove-morphology', 'Remove morphological annotation (part of speech, morphology and lemma)'
              f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
              f.option 'remove-information-structure', '--remove-information-structure', 'Remove information structure annotation (antecedent ID, information status and contrast group)'
              f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
              f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
              f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
            end

            c.command(:tnt) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to TNT/hunpos format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::TNT) }
            end

            c.command(:"conll-x") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-X format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLX) }
            end

            c.command(:"conll-u") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-U format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLU) }
            end

            c.command(:tiger) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER XML format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger) }
            end

            c.command(:tiger2) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER2 format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger2) }
            end

            c.command(:text) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to plain text (UTF-8 with Unix line-endings)'
              f.option 'diffable', '-d', '--diffable', 'Make the output diffable'
              f.action { |args, options| process(args, options, PROIEL::Converter::Text) }
            end

            c.command(:lexc) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to lexc format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::Lexc) }
            end

            # Reached when no known format sub-command was given.
            c.action do |_, _|
              STDERR.puts 'Missing or invalid format. Use --help for more information.'
              exit 1
            end
          end
        end

        # Loads the treebank and runs +converter+ on it.
        #
        # args      - list of file names; when empty the treebank is read
        #             from standard input instead.
        # options   - option hash; 'verbose' enables progress messages on
        #             standard error (presumably a program-wide option, it is
        #             not declared in this command). The hash is passed on to
        #             the converter unchanged.
        # converter - object responding to process(treebank, options).
        def process(args, options, converter)
          tb = PROIEL::Treebank.new

          if args.empty?
            STDERR.puts "Reading from standard input...".green if options['verbose']
            tb.load_from_xml(STDIN)
          else
            args.each do |filename|
              STDERR.puts "Reading #{filename}...".green if options['verbose']

              tb.load_from_xml(filename)
            end
          end

          converter.process(tb, options)
        end
      end
    end
  end
end
@@ -0,0 +1,136 @@
module PROIEL
  module Commands
    # The `grep` command: searches the text of the loaded treebank files for
    # a regular expression, at sentence or token level.
    class Grep < Command
      class << self
        # Registers the `grep` command and its options on +prog+.
        def init_with_program(prog)
          prog.command(:grep) do |c|
            c.syntax 'grep [options] pattern filename(s)'
            c.description 'Search the text'

            c.option 'level', '--level LEVEL', 'Select level to match. LEVEL should be "token" or "sentence" (default)'
            c.option 'nostrip', '--nostrip', 'Do not strip whitespace from start and end of strings'
            c.option 'nosubst', '--nosubst', 'Do not substitute whitespace sequences with a single space'
            c.option 'nocolour', '--nocolour', 'Do not colour code matches'
            c.option 'ignore-case', '-i', '--ignore-case', 'Ignore uppercase/lowercase'

            c.action { |args, options| process(args, options) }
          end
        end

        # Prints one result line if +s+ matches +pattern+.
        #
        # The text is normalised first (leading/trailing whitespace stripped
        # unless 'nostrip', whitespace runs collapsed unless 'nosubst') and
        # matches are highlighted unless 'nocolour' is set. The caller's
        # string is left untouched; all transformations work on copies.
        def match(s, citation, object_id, pattern, options)
          s = s.strip unless options['nostrip']
          s = s.gsub(/\s+/, ' ') unless options['nosubst']

          return unless s[pattern]

          s = s.gsub(pattern) { |m| m.yellow } unless options['nocolour']

          puts "#{citation} (ID = #{object_id}) #{s}"
        end

        # Builds a citation string from an upper part and a lower range.
        # When start and end of the range are equal only one of them is
        # shown; nil parts are dropped.
        def merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end = nil)
          citation_lower =
            if citation_lower_start == citation_lower_end
              citation_lower_start
            else
              [citation_lower_start, citation_lower_end].compact.join('-')
            end

          [citation_upper, citation_lower].compact.join(' ')
        end

        # Matches the pattern against the full text of each sentence in
        # +div+. The sentence text is assembled from the presentation
        # strings and forms of its non-empty tokens; the citation spans the
        # first to the last of those tokens.
        def process_div_for_sentences(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            s = sentence.presentation_before || ''
            citation_lower_start = nil
            citation_lower_end = nil

            sentence.tokens.each do |token|
              next if token.is_empty?

              s += token.presentation_before || ''
              s += token.form || ''
              s += token.presentation_after || ''

              citation_lower_start = token.citation_part if citation_lower_start.nil?
              citation_lower_end = token.citation_part
            end

            s += sentence.presentation_after || ''

            citation = merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end)
            match(s, citation, sentence.id, pattern, options)
          end
        end

        # Matches the pattern against each non-empty token in +div+
        # individually.
        def process_div_for_tokens(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              next if token.is_empty?

              s = token.presentation_before || ''
              s += token.form || ''
              s += token.presentation_after || ''

              citation = merge_citation_parts(citation_upper, token.citation_part)
              match(s, citation, token.id, pattern, options)
            end
          end
        end

        # Dispatches to the sentence- or token-level search according to
        # options['level']; any other value does nothing.
        def process_div(citation_upper, div, pattern, options)
          case options['level']
          when 'sentence'
            process_div_for_sentences(citation_upper, div, pattern, options)
          when 'token'
            process_div_for_tokens(citation_upper, div, pattern, options)
          end
        end

        # Entry point of the command.
        #
        # args    - the pattern followed by one or more file names. The
        #           pattern is removed from the array.
        # options - option hash; 'level' defaults to 'sentence'.
        #
        # Prints an error and exits with status 1 when the pattern or the
        # file names are missing, when the pattern is not a valid regular
        # expression, or when the level is not 'token' or 'sentence'.
        def process(args, options)
          if args.empty?
            STDERR.puts 'Missing pattern. Use --help for more information.'
            exit 1
          end

          pattern_string = args.shift
          flags = options['ignore-case'] ? Regexp::IGNORECASE : 0

          pattern =
            begin
              Regexp.new(pattern_string, flags)
            rescue RegexpError => e
              # Report a malformed pattern like the other usage errors
              # instead of aborting with a stack trace.
              STDERR.puts "Invalid pattern: #{e.message}. Use --help for more information."
              exit 1
            end

          if args.empty?
            STDERR.puts 'Missing filename(s). Use --help for more information.'
            exit 1
          end

          options['level'] ||= 'sentence'

          unless %w(token sentence).include?(options['level'])
            STDERR.puts 'Invalid matching level. Use --help for more information.'
            exit 1
          end

          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          tb.sources.each do |source|
            citation_upper = source.citation_part

            source.divs.each do |div|
              process_div(citation_upper, div, pattern, options)
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,126 @@
# OpenStruct is used for the statistics records below; require it explicitly
# rather than relying on another library having loaded it.
require 'ostruct'

module PROIEL
  module Commands
    # The `info` command: prints metadata and size/annotation statistics for
    # the loaded treebank files.
    class CountWords < Command
      class << self
        # Registers the `info` command on +prog+. Without file names an
        # error is printed and the program exits with status 1.
        def init_with_program(prog)
          prog.command(:info) do |c|
            c.syntax 'info [options] filename(s)'
            c.description 'Show information about the treebank'

            c.action do |args, options|
              if args.empty?
                STDERR.puts 'Missing filename(s). Use --help for more information.'
                exit 1
              end

              process(args, options)
            end
          end
        end

        # Loads all files in +args+ and prints an overall summary followed by
        # one numbered section per source.
        def process(args, options)
          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          t = treebank_statistics(tb)

          puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
          puts " Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
          puts

          tb.sources.each_with_index do |source, i|
            s = source_statistics(source)
            n = s.sentence_count
            r = percentage(s.reviewed_sentence_count, n)
            a = percentage(s.annotated_sentence_count, n)

            puts "#{i + 1}. #{pretty_title(source)}".yellow
            puts " Version: #{source.date}"
            puts " License: #{pretty_license(source)}"
            puts " Language: #{pretty_language(source)}"
            puts " Printed text: #{pretty_printed_text_info(source)}"
            puts " Electr. text: #{pretty_electronic_text_info(source)}"
            puts " Size: #{n} sentence(s), #{s.token_count} token(s)"
            puts " Annotation: %.2f%% reviewed, %.2f%% annotated" % [r, a]
          end
        end

        # Returns +part+ as a percentage of +total+. A total of zero yields
        # 0.0 so that a source without sentences is not reported as "NaN%".
        def percentage(part, total)
          return 0.0 if total.zero?

          part * 100.0 / total
        end

        # Returns an OpenStruct with sentence_count, token_count,
        # annotated_sentence_count and reviewed_sentence_count summed over
        # all sources of +tb+.
        def treebank_statistics(tb)
          OpenStruct.new.tap do |s|
            s.sentence_count = 0
            s.token_count = 0
            s.annotated_sentence_count = 0
            s.reviewed_sentence_count = 0

            tb.sources.each do |source|
              # Walk each source only once; every call re-counts all of its
              # divs and sentences.
              stats = source_statistics(source)

              s.token_count += stats.token_count
              s.sentence_count += stats.sentence_count
              s.annotated_sentence_count += stats.annotated_sentence_count
              s.reviewed_sentence_count += stats.reviewed_sentence_count
            end
          end
        end

        # Returns an OpenStruct with sentence_count, token_count,
        # annotated_sentence_count and reviewed_sentence_count for +source+.
        def source_statistics(source)
          OpenStruct.new.tap do |s|
            s.sentence_count = 0
            s.token_count = 0
            s.annotated_sentence_count = 0
            s.reviewed_sentence_count = 0

            source.divs.each do |div|
              div.sentences.each do |sentence|
                s.token_count += sentence.tokens.count
              end

              s.sentence_count += div.sentences.count
              s.annotated_sentence_count += div.sentences.count(&:annotated?)
              s.reviewed_sentence_count += div.sentences.count(&:reviewed?)
            end
          end
        end

        # Human-readable language name; only 'lat' is known, other codes are
        # reported verbatim.
        def pretty_language(source)
          case source.language
          when 'lat'
            'Latin'
          else
            "Unknown (language code #{source.language})"
          end
        end

        # Comma-separated description of the printed edition; missing fields
        # are omitted.
        def pretty_printed_text_info(source)
          [source.printed_text_title,
           source.printed_text_editor ? "ed. #{source.printed_text_editor}" : nil,
           source.printed_text_publisher,
           source.printed_text_place,
           source.printed_text_date].compact.join(', ')
        end

        # Comma-separated description of the electronic edition; missing
        # fields are omitted.
        def pretty_electronic_text_info(source)
          [source.electronic_text_title,
           source.electronic_text_editor ? "ed. #{source.electronic_text_editor}" : nil,
           source.electronic_text_publisher,
           source.electronic_text_place,
           source.electronic_text_date].compact.join(', ')
        end

        # License name, followed by its URL in parentheses when one is set.
        def pretty_license(source)
          if source.license_url
            "#{source.license} (#{source.license_url})"
          else
            source.license
          end
        end

        # "Author, Title", omitting whichever part is missing.
        def pretty_title(source)
          [source.author, source.title].compact.join(', ')
        end
      end
    end
  end
end