proiel-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
@@ -0,0 +1,134 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Does the dependency relation suffice to disambiguate ambiguous morphology?
#
# Usage: relation-as-disambiguator.rb [--language=TAG] treebank-file(s)
#
# Every token that has a form, a part of speech, a morphology and a relation
# is tallied in a nested hash (form -> relation -> POS -> morphology ->
# number of occurrences). Each form is then put in one of three classes, once
# for POS and once for morphology:
#
#   * unique: exactly one combination was recorded for the form,
#   * predicted by relation: several combinations were recorded, but every
#     relation occurs with exactly one of them,
#   * not predicted by relation: some relation occurs with more than one.
#
# The classes are reported twice: counting each form once (types) and
# counting each form by its number of occurrences (tokens).
#
require 'colorize'
require 'proiel'

# Classifies the forms in +form_hash+ and returns a hash of counters keyed by
# class name (:unique_pos, :relation_predicts_pos, ...). Missing classes
# read as 0.
#
# form_hash - Hash of the shape form => relation => POS => morphology => count.
# weighted  - when true each form adds its number of occurrences to its
#             class, otherwise it adds 1.
#
# NOTE(review): "unique" is decided on the number of relation/POS (and
# relation/POS/morphology) combinations, so a form seen with a single POS
# under two relations is classed as "predicted by relation", not "unique".
# This is how the tallies were originally computed - confirm it is intended.
def classify_forms(form_hash, weighted)
  counts = Hash.new(0)

  form_hash.each_value do |relations|
    # One "POS => morphologies" hash per relation.
    pos_tables = relations.values
    # One "morphology => count" hash per relation/POS combination.
    morphology_tables = pos_tables.flat_map(&:values)

    number_of_poses = pos_tables.map(&:size).inject(0, :+)
    number_of_morphologies = morphology_tables.map(&:size).inject(0, :+)
    weight = weighted ? morphology_tables.flat_map(&:values).inject(0, :+) : 1

    pos_class =
      if number_of_poses == 1
        :unique_pos
      elsif pos_tables.all? { |table| table.size == 1 }
        :relation_predicts_pos
      else
        :relation_does_not_predict_pos
      end

    morphology_class =
      if number_of_morphologies == 1
        :unique_morphology
      elsif morphology_tables.all? { |table| table.size == 1 }
        :relation_predicts_morphology
      else
        :relation_does_not_predict_morphology
      end

    counts[pos_class] += weight
    counts[morphology_class] += weight
  end

  counts
end

# Prints the six counters produced by classify_forms.
def print_report(counts)
  puts "Forms with a unique POS: #{counts[:unique_pos]}"
  puts "Forms whose relation predicts its POS: #{counts[:relation_predicts_pos]}"
  puts "Forms whose relation does not predict its POS: #{counts[:relation_does_not_predict_pos]}"

  puts "Forms with a unique morphology: #{counts[:unique_morphology]}"
  puts "Forms whose relation predicts its morphology: #{counts[:relation_predicts_morphology]}"
  puts "Forms whose relation does not predict its morphology: #{counts[:relation_does_not_predict_morphology]}"
end

# The language filter used to reference an undefined variable, which raised
# NameError as soon as a source was loaded. It is now an optional leading
# argument; without it every loaded source is included.
language_tag = ARGV.first.to_s[/\A--language=(.+)\z/, 1]
ARGV.shift if language_tag

if ARGV.empty?
  STDERR.puts "Usage: #{$0} [--language=TAG] treebank-file(s)"

  exit 1
end

tb = PROIEL::Treebank.new
tb.load_from_xml(ARGV)

# Harvest morphology
form_hash = {}

sources = tb.sources
sources = sources.select { |source| source.language == language_tag } if language_tag

sources.each do |source|
  source.tokens.each do |token|
    next unless token.form && token.pos && token.morphology && token.relation

    # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
    relation_hash = (form_hash[token.form] ||= {})
    pos_hash = (relation_hash[token.relation] ||= {})
    morphology_hash = (pos_hash[token.pos] ||= {})
    morphology_hash[token.morphology] ||= 0
    morphology_hash[token.morphology] += 1
  end
end

# Calculate by unique forms first
puts "By unique forms (types)"
puts "======================="
print_report(classify_forms(form_hash, false))

# Calculate by actual number of occurrences
puts
puts "By occurrences of forms (tokens)"
puts "================================"
print_report(classify_forms(form_hash, true))

exit 0
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Word form occurrence extraction
#
# Prints, for every word form in the given treebank files, the list of
# "source ID:token ID" locations where the form occurs, sorted by form.
#
require 'colorize'
require 'proiel'

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# form => list of locations; a list is created the first time a form is seen.
occurrences = Hash.new { |index, form| index[form] = [] }

treebank.sources.each do |source|
  source.tokens.each do |token|
    # Tokens without a form have nothing to index.
    next if token.form.nil?

    occurrences[token.form] << "#{source.id}:#{token.id}"
  end
end

occurrences.sort_by { |form, _| form }.each do |form, locations|
  puts "#{form}: #{locations.join(', ')}"
end
|
data/lib/proiel/cli.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'mercenary'
|
2
|
+
require 'colorize'
|
3
|
+
require 'proiel'
|
4
|
+
|
5
|
+
module PROIEL
  # Base class for CLI commands. Each class keeps a list of the classes that
  # inherit directly from it.
  class Command
    # Returns the classes that have inherited directly from the receiver, in
    # the order they were defined. The list is created on first use.
    def self.subclasses
      @subclasses ||= []
    end

    # Ruby hook called whenever a class inherits from the receiver: records
    # the new class, then lets the default hook run.
    def self.inherited(base)
      subclasses.push(base)
      super
    end
  end
end
|
19
|
+
|
20
|
+
# Requires every *.rb file found directly inside +path+, where +path+ is
# relative to the directory of this file. A missing or empty directory is a
# no-op.
#
# The file list is sorted before loading: the order in which Dir[] returns
# entries is not guaranteed on every Ruby version, and sorting makes the load
# order the same everywhere.
def require_all(path)
  glob = File.join(File.dirname(__FILE__), path, '*.rb')

  Dir[glob].sort.each do |f|
    require f
  end
end

require_all 'commands'
require_all 'converters'
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module PROIEL
  module Commands
    # The +convert+ command: loads PROIEL XML into a treebank and hands it to
    # the converter class that belongs to the chosen sub-command.
    class Convert < Command
      class << self
        # Registers +convert+ and one sub-command per output format on +prog+.
        #
        # prog - the program object to register on (presumably a Mercenary
        #        program; confirm against the executable that calls this).
        def init_with_program(prog)
          prog.command(:convert) do |c|
            c.syntax 'convert format'
            c.description 'Convert to a different format'

            c.command(:proielxml) do |f|
              f.syntax '[options] [filename(s)]'
              f.description 'Convert to PROIEL XML format'
              f.option 'remove-not-annotated', '--remove-not-annotated', 'Remove sentences that have not been annotated'
              f.option 'remove-not-reviewed', '--remove-not-reviewed', 'Remove sentences that have not been reviewed'
              f.option 'remove-morphology', '--remove-morphology', 'Remove morphological annotation (part of speech, morphology and lemma)'
              f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
              f.option 'remove-information-structure', '--remove-information-structure', 'Remove information structure annotation (antecedent ID, information status and contrast group)'
              f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
              f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
              f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
            end

            c.command(:tnt) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to TNT/hunpos format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::TNT) }
            end

            c.command(:"conll-x") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-X format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLX) }
            end

            c.command(:"conll-u") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-U format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLU) }
            end

            c.command(:tiger) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER XML format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger) }
            end

            c.command(:tiger2) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER2 format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger2) }
            end

            c.command(:text) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to plain text (UTF-8 with Unix line-endings)'
              f.option 'diffable', '-d', '--diffable', 'Make the output diffable'
              f.action { |args, options| process(args, options, PROIEL::Converter::Text) }
            end

            c.command(:lexc) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to lexc format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::Lexc) }
            end

            # Reached when +convert+ is run without a recognised sub-command.
            c.action do |_, _|
              STDERR.puts 'Missing or invalid format. Use --help for more information.'
              exit 1
            end
          end
        end

        # Loads the input into a new treebank and runs +converter+ on it.
        #
        # args      - filenames to load; when empty the treebank is read from
        #             standard input instead.
        # options   - option hash from the command line; 'verbose' enables
        #             progress messages on standard error, and the whole hash
        #             is passed on to the converter.
        # converter - object responding to process(treebank, options).
        def process(args, options, converter)
          tb = PROIEL::Treebank.new

          if args.empty?
            STDERR.puts "Reading from standard input...".green if options['verbose']
            tb.load_from_xml(STDIN)
          else
            args.each do |filename|
              STDERR.puts "Reading #{filename}...".green if options['verbose']

              tb.load_from_xml(filename)
            end
          end

          converter.process(tb, options)
        end
      end
    end
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module PROIEL
  module Commands
    # The +grep+ command: matches a regular expression against the text of
    # each sentence or each token and prints the matches with their citation.
    class Grep < Command
      class << self
        # Registers the +grep+ command and its options on +prog+.
        def init_with_program(prog)
          prog.command(:grep) do |c|
            c.syntax 'grep [options] pattern filename(s)'
            c.description 'Search the text'

            c.option 'level', '--level LEVEL', 'Select level to match. LEVEL should be "token" or "sentence" (default)'
            c.option 'nostrip', '--nostrip', 'Do not strip whitespace from start and end of strings'
            c.option 'nosubst', '--nosubst', 'Do not substitute whitespace sequences with a single space'
            c.option 'nocolour', '--nocolour', 'Do not colour code matches'
            c.option 'ignore-case', '-i', '--ignore-case', 'Ignore uppercase/lowercase'

            c.action { |args, options| process(args, options) }
          end
        end

        # Prints "citation (ID = id) text" when +pattern+ matches +s+.
        #
        # s         - the text to search; it is not modified.
        # citation  - citation string printed in front of the match.
        # object_id - ID of the sentence or token the text belongs to.
        # pattern   - Regexp to look for.
        # options   - 'nostrip' keeps surrounding whitespace, 'nosubst' keeps
        #             whitespace runs, 'nocolour' disables highlighting.
        def match(s, citation, object_id, pattern, options)
          # Work on copies so the caller's string is never changed in place.
          text = s
          text = text.strip unless options['nostrip']
          text = text.gsub(/\s+/, ' ') unless options['nosubst']

          return unless text[pattern]

          text = text.gsub(pattern) { |m| m.yellow } unless options['nocolour']

          puts "#{citation} (ID = #{object_id}) #{text}"
        end

        # Builds a citation from an upper part and a range of lower parts.
        # Identical start and end collapse to a single value, otherwise they
        # are joined with '-'. nil parts are left out.
        def merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end = nil)
          citation_lower =
            if citation_lower_start == citation_lower_end
              citation_lower_start
            else
              [citation_lower_start, citation_lower_end].compact.join('-')
            end

          [citation_upper, citation_lower].compact.join(' ')
        end

        # Matches +pattern+ against the full text of every sentence in +div+.
        # Empty tokens are skipped. The citation runs from the first non-nil
        # citation part to the citation part of the last token used.
        def process_div_for_sentences(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            tokens = sentence.tokens.reject(&:is_empty?)
            citation_parts = tokens.map(&:citation_part)

            s = "#{sentence.presentation_before}#{tokens.map { |token| token_text(token) }.join}#{sentence.presentation_after}"

            citation = merge_citation_parts(citation_upper, citation_parts.compact.first, citation_parts.last)
            match(s, citation, sentence.id, pattern, options)
          end
        end

        # Matches +pattern+ against each non-empty token in +div+ separately.
        def process_div_for_tokens(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              next if token.is_empty?

              citation = merge_citation_parts(citation_upper, token.citation_part)
              match(token_text(token), citation, token.id, pattern, options)
            end
          end
        end

        # Dispatches on options['level']; any other value does nothing.
        def process_div(citation_upper, div, pattern, options)
          case options['level']
          when 'sentence'
            process_div_for_sentences(citation_upper, div, pattern, options)
          when 'token'
            process_div_for_tokens(citation_upper, div, pattern, options)
          end
        end

        # Entry point for the command. The first argument is the pattern, the
        # rest are filenames. Exits with status 1 on a missing or invalid
        # pattern, missing filenames or an invalid level. Sets
        # options['level'] to 'sentence' when it was not given.
        def process(args, options)
          if args.empty?
            STDERR.puts 'Missing pattern. Use --help for more information.'
            exit 1
          end

          pattern_string = args.shift
          flags = options['ignore-case'] ? Regexp::IGNORECASE : 0

          pattern =
            begin
              Regexp.new(pattern_string, flags)
            rescue RegexpError => e
              # Report a malformed pattern like any other argument error
              # instead of letting the exception escape.
              STDERR.puts "Invalid pattern (#{e.message}). Use --help for more information."
              exit 1
            end

          if args.empty?
            STDERR.puts 'Missing filename(s). Use --help for more information.'
            exit 1
          end

          options['level'] ||= 'sentence'

          unless %w(token sentence).include?(options['level'])
            STDERR.puts 'Invalid matching level. Use --help for more information.'
            exit 1
          end

          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          tb.sources.each do |source|
            citation_upper = source.citation_part

            source.divs.each do |div|
              process_div(citation_upper, div, pattern, options)
            end
          end
        end

        private

        # Text of a token with its surrounding presentation text; nil parts
        # contribute nothing.
        def token_text(token)
          "#{token.presentation_before}#{token.form}#{token.presentation_after}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# OpenStruct is used for the statistics records below; require it here so the
# file does not rely on another library having loaded it.
require 'ostruct'

module PROIEL
  module Commands
    # The +info+ command: prints size and annotation statistics plus
    # metadata for every source in the loaded treebank files.
    class CountWords < Command
      class << self
        # Registers the +info+ command on +prog+.
        def init_with_program(prog)
          prog.command(:info) do |c|
            c.syntax 'info [options] filename(s)'
            c.description 'Show information about the treebank'

            c.action do |args, options|
              if args.empty?
                STDERR.puts 'Missing filename(s). Use --help for more information.'
                # Signal the failure to the caller, as the other commands do.
                exit 1
              end

              process(args, options)
            end
          end
        end

        # Loads the files in +args+ and prints the report to standard output.
        # options['verbose'] enables progress messages on standard error.
        def process(args, options)
          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          t = treebank_statistics(tb)

          puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
          puts " Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
          puts

          tb.sources.each_with_index do |source, i|
            s = source_statistics(source)
            n = s.sentence_count
            r = percentage(s.reviewed_sentence_count, n)
            a = percentage(s.annotated_sentence_count, n)

            puts "#{i + 1}. #{pretty_title(source)}".yellow
            puts " Version: #{source.date}"
            puts " License: #{pretty_license(source)}"
            puts " Language: #{pretty_language(source)}"
            puts " Printed text: #{pretty_printed_text_info(source)}"
            puts " Electr. text: #{pretty_electronic_text_info(source)}"
            puts " Size: #{n} sentence(s), #{s.token_count} token(s)"
            puts " Annotation: %.2f%% reviewed, %.2f%% annotated" % [r, a]
          end
        end

        # Returns an OpenStruct with sentence_count, token_count,
        # annotated_sentence_count and reviewed_sentence_count summed over
        # all sources in +tb+.
        def treebank_statistics(tb)
          OpenStruct.new.tap do |s|
            s.sentence_count = 0
            s.token_count = 0
            s.annotated_sentence_count = 0
            s.reviewed_sentence_count = 0

            tb.sources.each do |source|
              # Compute the per-source figures once and reuse them.
              stats = source_statistics(source)

              s.token_count += stats.token_count
              s.sentence_count += stats.sentence_count
              s.annotated_sentence_count += stats.annotated_sentence_count
              s.reviewed_sentence_count += stats.reviewed_sentence_count
            end
          end
        end

        # Returns an OpenStruct with the same four counters for one source,
        # summed over the sentences of all its divs.
        def source_statistics(source)
          OpenStruct.new.tap do |s|
            s.sentence_count = 0
            s.token_count = 0
            s.annotated_sentence_count = 0
            s.reviewed_sentence_count = 0

            source.divs.each do |div|
              div.sentences.each do |sentence|
                s.token_count += sentence.tokens.count
              end

              s.sentence_count += div.sentences.count
              s.annotated_sentence_count += div.sentences.count(&:annotated?)
              s.reviewed_sentence_count += div.sentences.count(&:reviewed?)
            end
          end
        end

        # Human-readable language name; only 'lat' is recognised, any other
        # code is reported as unknown together with the code itself.
        def pretty_language(source)
          case source.language
          when 'lat'
            'Latin'
          else
            "Unknown (language code #{source.language})"
          end
        end

        # Comma-separated description of the printed text; missing fields
        # are left out.
        def pretty_printed_text_info(source)
          [source.printed_text_title,
           source.printed_text_editor ? "ed. #{source.printed_text_editor}" : nil,
           source.printed_text_publisher,
           source.printed_text_place,
           source.printed_text_date].compact.join(', ')
        end

        # Comma-separated description of the electronic text; missing fields
        # are left out.
        def pretty_electronic_text_info(source)
          [source.electronic_text_title,
           source.electronic_text_editor ? "ed. #{source.electronic_text_editor}" : nil,
           source.electronic_text_publisher,
           source.electronic_text_place,
           source.electronic_text_date].compact.join(', ')
        end

        # License name, followed by the license URL in parentheses when the
        # source has one.
        def pretty_license(source)
          if source.license_url
            "#{source.license} (#{source.license_url})"
          else
            source.license
          end
        end

        # "author, title", leaving out whichever part is missing.
        def pretty_title(source)
          [source.author, source.title].compact.join(', ')
        end

        private

        # +part+ as a percentage of +total+. Returns 0.0 for an empty total
        # so a source without sentences is not reported as NaN.
        def percentage(part, total)
          total.zero? ? 0.0 : part * 100.0 / total
        end
      end
    end
  end
end
|