proiel-cli 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
@@ -0,0 +1,134 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Does the dependency relation suffice to disambiguate ambiguous morphology?
|
4
|
+
#
|
5
|
+
require 'colorize'
|
6
|
+
require 'proiel'
|
7
|
+
|
8
|
+
if ARGV.length < 1
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

# Classifies one form from the harvested index.
#
# @param relations [Hash] relation => { POS => { morphology => count } }
# @return [Array] the POS category and the morphology category (each one of
#   :unique, :predicted or :not_predicted) followed by the total number of
#   occurrences of the form
def classify_form(relations)
  pos_hashes = relations.values
  morphology_hashes = pos_hashes.flat_map(&:values)

  # NOTE(review): these count (relation, POS) and (relation, POS, morphology)
  # combinations, so a form seen with the same POS under two different
  # relations is not counted as "unique". This mirrors the original counting;
  # confirm that it is the intended definition.
  number_of_poses = morphology_hashes.size
  number_of_morphologies = morphology_hashes.inject(0) { |sum, m| sum + m.size }
  occurrences = morphology_hashes.inject(0) { |sum, m| sum + m.values.inject(0, :+) }

  pos_category =
    if number_of_poses == 1
      :unique
    elsif pos_hashes.all? { |poses| poses.size == 1 }
      :predicted
    else
      :not_predicted
    end

  morphology_category =
    if number_of_morphologies == 1
      :unique
    elsif morphology_hashes.all? { |morphologies| morphologies.size == 1 }
      :predicted
    else
      :not_predicted
    end

  [pos_category, morphology_category, occurrences]
end

# Tallies all harvested forms by category.
#
# @param form_hash [Hash] the harvested index (see the harvesting loop below)
# @param by_occurrences [Boolean] when true each form contributes its number
#   of occurrences (tokens); otherwise each form contributes 1 (types)
# @return [Hash] { pos: { category => count }, morphology: { category => count } }
def tally_forms(form_hash, by_occurrences)
  tally = { pos: Hash.new(0), morphology: Hash.new(0) }

  form_hash.each_value do |relations|
    pos_category, morphology_category, occurrences = classify_form(relations)
    weight = by_occurrences ? occurrences : 1

    tally[:pos][pos_category] += weight
    tally[:morphology][morphology_category] += weight
  end

  tally
end

# Prints one underlined report section for a tally produced by tally_forms.
def print_report(title, tally)
  puts title
  puts '=' * title.length
  puts "Forms with a unique POS: #{tally[:pos][:unique]}"
  puts "Forms whose relation predicts its POS: #{tally[:pos][:predicted]}"
  puts "Forms whose relation does not predict its POS: #{tally[:pos][:not_predicted]}"

  puts "Forms with a unique morphology: #{tally[:morphology][:unique]}"
  puts "Forms whose relation predicts its morphology: #{tally[:morphology][:predicted]}"
  puts "Forms whose relation does not predict its morphology: #{tally[:morphology][:not_predicted]}"
end

tb = PROIEL::Treebank.new
tb.load_from_xml(ARGV)

# Harvest morphology. The index is keyed as
# [language, form] => relation => POS => morphology => occurrence count.
#
# The sources used to be filtered against `language_tag`, a name that was
# never defined, so the script raised NameError as soon as a source had been
# loaded. All loaded sources are processed instead, and the source language
# is part of the key so identical forms in different languages stay apart.
form_hash = {}

tb.sources.each do |source|
  source.tokens.each do |token|
    next unless token.form && token.pos && token.morphology && token.relation

    # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
    relation_hash = (form_hash[[source.language, token.form]] ||= {})
    pos_hash = (relation_hash[token.relation] ||= {})
    morphology_hash = (pos_hash[token.pos] ||= {})
    morphology_hash[token.morphology] ||= 0
    morphology_hash[token.morphology] += 1
  end
end

# Calculate by unique forms first
print_report('By unique forms (types)', tally_forms(form_hash, false))

# Calculate by actual number of occurrences
puts
print_report('By occurrences of forms (tokens)', tally_forms(form_hash, true))

exit 0
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Word form occurrence extraction
|
4
|
+
#
|
5
|
+
require 'colorize'
|
6
|
+
require 'proiel'
|
7
|
+
|
8
|
+
# Bail out with a usage message when no treebank files were given.
if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"

  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# Maps each word form to the list of "source ID:token ID" keys of the tokens
# that carry it. Tokens without a form are skipped.
occurrences = Hash.new { |index, form| index[form] = [] }

treebank.sources.each do |source|
  source.tokens.each do |token|
    next if token.form.nil?

    occurrences[token.form] << [source.id, token.id].join(':')
  end
end

# One line per form, sorted by form.
occurrences.sort_by { |form, _| form }.each do |form, keys|
  puts "#{form}: #{keys.join(', ')}"
end
|
data/lib/proiel/cli.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'mercenary'
|
2
|
+
require 'colorize'
|
3
|
+
require 'proiel'
|
4
|
+
|
5
|
+
module PROIEL
  # Base class for commands. Every class that inherits from it is recorded
  # so that the registered commands can be enumerated later.
  class Command
    # Returns the list of classes that directly inherit from the receiver,
    # in the order in which they were defined.
    def self.subclasses
      @subclasses = [] if @subclasses.nil?
      @subclasses
    end

    # Ruby hook invoked whenever a class inherits from the receiver; records
    # the new class before passing the notification up the chain.
    def self.inherited(base)
      subclasses.push(base)
      super
    end
  end
end
|
19
|
+
|
20
|
+
# Requires every `*.rb` file found directly inside +path+, where +path+ is
# given relative to the directory of this file.
#
# The file list is sorted before loading: the order returned by Dir[] is not
# guaranteed on every Ruby version and platform, and the load order decides
# the order in which Command subclasses get registered.
#
# @param path [String] directory name relative to this file's directory
# @return [Array<String>] the paths that were required, in load order
def require_all(path)
  glob = File.join(__dir__, path, '*.rb')
  Dir[glob].sort.each do |f|
    require f
  end
end
|
26
|
+
|
27
|
+
require_all 'commands'
|
28
|
+
require_all 'converters'
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module PROIEL
  module Commands
    # Implements the `convert` command, which has one subcommand per output
    # format. Each subcommand loads the treebank and hands it to the matching
    # PROIEL::Converter class.
    class Convert < Command
      class << self
        # Registers the `convert` command and its per-format subcommands.
        #
        # @param prog the program object the command is registered on
        #   (presumably the Mercenary program; confirm against the caller)
        def init_with_program(prog)
          prog.command(:convert) do |c|
            c.syntax 'convert format'
            c.description 'Convert to a different format'

            c.command(:proielxml) do |f|
              f.syntax '[options] [filename(s)]'
              f.description 'Convert to PROIEL XML format'
              f.option 'remove-not-annotated', '--remove-not-annotated', 'Remove sentences that have not been annotated'
              f.option 'remove-not-reviewed', '--remove-not-reviewed', 'Remove sentences that have not been reviewed'
              f.option 'remove-morphology', '--remove-morphology', 'Remove morphological annotation (part of speech, morphology and lemma)'
              f.option 'remove-syntax', '--remove-syntax', 'Remove syntactic annotation (relation, head ID and slashes)'
              f.option 'remove-information-structure', '--remove-information-structure', 'Remove information structure annotation (antecedent ID, information status and contrast group)'
              f.option 'remove-status', '--remove-status', 'Remove sentence status (i.e. revert all sentences to unannotated status)'
              f.option 'remove-empty-divs', '--remove-empty-divs', 'Remove div elements that do not contain any sentences'
              f.action { |args, options| process(args, options, PROIEL::Converter::PROIELXML) }
            end

            c.command(:tnt) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to TNT/hunpos format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::TNT) }
            end

            c.command(:"conll-x") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-X format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLX) }
            end

            c.command(:"conll-u") do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to CoNLL-U format'
              f.action { |args, options| process(args, options, PROIEL::Converter::CoNLLU) }
            end

            c.command(:tiger) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER XML format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger) }
            end

            c.command(:tiger2) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to TIGER2 format'
              f.action { |args, options| process(args, options, PROIEL::Converter::Tiger2) }
            end

            c.command(:text) do |f|
              f.syntax 'filename(s)'
              f.description 'Convert to plain text (UTF-8 with Unix line-endings)'
              f.option 'diffable', '-d', '--diffable', 'Make the output diffable'
              f.action { |args, options| process(args, options, PROIEL::Converter::Text) }
            end

            c.command(:lexc) do |f|
              f.syntax '[options] filename(s)'
              f.description 'Convert to lexc format'
              f.option 'morphology', '-m', '--morphology', 'Include morphological tags'
              f.action { |args, options| process(args, options, PROIEL::Converter::Lexc) }
            end

            # Reached when `convert` is invoked without a known format.
            c.action do |_, _|
              STDERR.puts 'Missing or invalid format. Use --help for more information.'
              exit 1
            end
          end
        end

        # Loads the treebank and passes it to +converter+.
        #
        # @param args [Array<String>] filenames to load; when empty the
        #   treebank is read from standard input instead
        # @param options [Hash] parsed command-line options; 'verbose'
        #   enables progress messages on standard error
        # @param converter the converter class; must respond to
        #   process(treebank, options)
        def process(args, options, converter)
          tb = PROIEL::Treebank.new

          if args.empty?
            STDERR.puts "Reading from standard input...".green if options['verbose']
            tb.load_from_xml(STDIN)
          else
            args.each do |filename|
              STDERR.puts "Reading #{filename}...".green if options['verbose']

              tb.load_from_xml(filename)
            end
          end

          converter.process(tb, options)
        end
      end
    end
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
module PROIEL
  module Commands
    # Implements the `grep` command, which searches the text of the loaded
    # sources for a regular expression at sentence or token level and prints
    # each match with its citation and ID.
    class Grep < Command
      class << self
        # Registers the `grep` command and its options.
        #
        # @param prog the program object the command is registered on
        #   (presumably the Mercenary program; confirm against the caller)
        def init_with_program(prog)
          prog.command(:grep) do |c|
            c.syntax 'grep [options] pattern filename(s)'
            c.description 'Search the text'

            c.option 'level', '--level LEVEL', 'Select level to match. LEVEL should be "token" or "sentence" (default)'
            c.option 'nostrip', '--nostrip', 'Do not strip whitespace from start and end of strings'
            c.option 'nosubst', '--nosubst', 'Do not substitute whitespace sequences with a single space'
            c.option 'nocolour', '--nocolour', 'Do not colour code matches'
            c.option 'ignore-case', '-i', '--ignore-case', 'Ignore uppercase/lowercase'

            c.action { |args, options| process(args, options) }
          end
        end

        # Prints one result line if +s+ matches +pattern+.
        #
        # The text is normalised according to the 'nostrip' and 'nosubst'
        # options before matching, and matches are highlighted unless
        # 'nocolour' is set. The string passed in is not modified.
        #
        # @param s [String] text to test
        # @param citation [String] citation printed in front of the match
        # @param object_id ID of the sentence or token, printed with the match
        # @param pattern [Regexp] pattern to search for
        # @param options [Hash] parsed command-line options
        def match(s, citation, object_id, pattern, options)
          text = s
          text = text.strip unless options['nostrip']
          text = text.gsub(/\s+/, ' ') unless options['nosubst']

          return unless text[pattern]

          text = text.gsub(pattern) { |m| m.yellow } unless options['nocolour']

          puts "#{citation} (ID = #{object_id}) #{text}"
        end

        # Builds a citation string from a source-level part and a start/end
        # pair of lower-level parts. Identical start and end collapse into a
        # single part, otherwise they are joined with '-'; nil parts are
        # dropped.
        def merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end = nil)
          citation_lower =
            if citation_lower_start == citation_lower_end
              citation_lower_start
            else
              [citation_lower_start, citation_lower_end].compact.join('-')
            end

          [citation_upper, citation_lower].compact.join(' ')
        end

        # Matches +pattern+ against the text of each sentence in +div+. The
        # sentence text is assembled from the presentation strings and forms
        # of its non-empty tokens; the citation spans the first to the last
        # of those tokens.
        def process_div_for_sentences(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            s = sentence.presentation_before || ''
            citation_lower_start = nil
            citation_lower_end = nil

            sentence.tokens.each do |token|
              unless token.is_empty?
                s += token.presentation_before || ''
                s += token.form || ''
                s += token.presentation_after || ''

                citation_lower_start = token.citation_part if citation_lower_start.nil?
                citation_lower_end = token.citation_part
              end
            end

            s += sentence.presentation_after || ''

            citation = merge_citation_parts(citation_upper, citation_lower_start, citation_lower_end)
            match(s, citation, sentence.id, pattern, options)
          end
        end

        # Matches +pattern+ against each non-empty token in +div+
        # individually.
        def process_div_for_tokens(citation_upper, div, pattern, options)
          div.sentences.each do |sentence|
            sentence.tokens.each do |token|
              unless token.is_empty?
                s = token.presentation_before || ''
                s += token.form || ''
                s += token.presentation_after || ''

                citation = merge_citation_parts(citation_upper, token.citation_part)
                match(s, citation, token.id, pattern, options)
              end
            end
          end
        end

        # Dispatches to sentence- or token-level matching according to
        # options['level'], which #process has validated beforehand.
        def process_div(citation_upper, div, pattern, options)
          case options['level']
          when 'sentence'
            process_div_for_sentences(citation_upper, div, pattern, options)
          when 'token'
            process_div_for_tokens(citation_upper, div, pattern, options)
          end
        end

        # Entry point of the command. Validates the arguments, loads the
        # treebank files and searches every div of every source.
        #
        # Exits with status 1 when the pattern or the filenames are missing,
        # when the pattern is not a valid regular expression, or when the
        # matching level is unknown.
        #
        # @param args [Array<String>] the pattern followed by the filenames
        # @param options [Hash] parsed command-line options
        def process(args, options)
          if args.empty?
            STDERR.puts 'Missing pattern. Use --help for more information.'
            exit 1
          end

          pattern_string = args.shift

          # The pattern is user input; report a malformed expression as a
          # usage error instead of letting RegexpError escape.
          pattern =
            begin
              Regexp.new(pattern_string, options['ignore-case'] ? Regexp::IGNORECASE : 0)
            rescue RegexpError => e
              STDERR.puts "Invalid pattern (#{e.message}). Use --help for more information."
              exit 1
            end

          if args.empty?
            STDERR.puts 'Missing filename(s). Use --help for more information.'
            exit 1
          end

          options['level'] ||= 'sentence'

          unless %w(token sentence).include?(options['level'])
            STDERR.puts 'Invalid matching level. Use --help for more information.'
            exit 1
          end

          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          tb.sources.each do |source|
            citation_upper = source.citation_part

            source.divs.each do |div|
              process_div(citation_upper, div, pattern, options)
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# OpenStruct is used for the statistics records below; require it here so
# this file does not depend on another library having loaded it first.
require 'ostruct'

module PROIEL
  module Commands
    # Implements the `info` command, which prints metadata and size
    # statistics for the loaded treebank files.
    class CountWords < Command
      class << self
        # Registers the `info` command.
        #
        # @param prog the program object the command is registered on
        #   (presumably the Mercenary program; confirm against the caller)
        def init_with_program(prog)
          prog.command(:info) do |c|
            c.syntax 'info [options] filename(s)'
            c.description 'Show information about the treebank'

            c.action do |args, options|
              if args.empty?
                STDERR.puts 'Missing filename(s). Use --help for more information.'
                # Signal the usage error to the caller, as `grep` does.
                exit 1
              else
                process(args, options)
              end
            end
          end
        end

        # Loads the given files and prints an overall summary followed by
        # one numbered entry per source.
        #
        # @param args [Array<String>] filenames to load
        # @param options [Hash] parsed command-line options; 'verbose'
        #   enables progress messages on standard error
        def process(args, options)
          tb = PROIEL::Treebank.new

          args.each do |filename|
            STDERR.puts "Reading #{filename}...".green if options['verbose']

            tb.load_from_xml(filename)
          end

          t = treebank_statistics(tb)

          puts "Loaded treebank files contain #{tb.sources.count} source(s)".yellow
          puts "   Overall size: #{t.sentence_count} sentence(s), #{t.token_count} token(s)"
          puts

          tb.sources.each_with_index do |source, i|
            s = source_statistics(source)
            n = s.sentence_count
            r = percentage(s.reviewed_sentence_count, n)
            a = percentage(s.annotated_sentence_count, n)

            puts "#{i + 1}. #{pretty_title(source)}".yellow
            puts "   Version:      #{source.date}"
            puts "   License:      #{pretty_license(source)}"
            puts "   Language:     #{pretty_language(source)}"
            puts "   Printed text: #{pretty_printed_text_info(source)}"
            puts "   Electr. text: #{pretty_electronic_text_info(source)}"
            puts "   Size:         #{n} sentence(s), #{s.token_count} token(s)"
            puts "   Annotation:   %.2f%% reviewed, %.2f%% annotated" % [r, a]
          end
        end

        # Returns +part+ as a percentage of +total+, or 0.0 when +total+ is
        # zero (a plain division would yield NaN for an empty source).
        def percentage(part, total)
          total.zero? ? 0.0 : part * 100.0 / total
        end

        # Sums the statistics of all sources in the treebank.
        #
        # @return [OpenStruct] with sentence_count, token_count,
        #   annotated_sentence_count and reviewed_sentence_count
        def treebank_statistics(tb)
          OpenStruct.new.tap do |s|
            s.sentence_count = 0
            s.token_count = 0
            s.annotated_sentence_count = 0
            s.reviewed_sentence_count = 0

            tb.sources.each do |source|
              # Walk each source once and reuse the result for all counters.
              source_stats = source_statistics(source)

              s.token_count += source_stats.token_count
              s.sentence_count += source_stats.sentence_count
              s.annotated_sentence_count += source_stats.annotated_sentence_count
              s.reviewed_sentence_count += source_stats.reviewed_sentence_count
            end
          end
        end

        # Counts sentences, tokens, annotated sentences and reviewed
        # sentences in a single source.
        #
        # @return [OpenStruct] with sentence_count, token_count,
        #   annotated_sentence_count and reviewed_sentence_count
        def source_statistics(source)
          OpenStruct.new.tap do |s|
            s.sentence_count = 0
            s.token_count = 0
            s.annotated_sentence_count = 0
            s.reviewed_sentence_count = 0

            source.divs.each do |div|
              div.sentences.each do |sentence|
                s.sentence_count += 1
                s.token_count += sentence.tokens.count
                s.annotated_sentence_count += 1 if sentence.annotated?
                s.reviewed_sentence_count += 1 if sentence.reviewed?
              end
            end
          end
        end

        # Returns a display name for the language code of the source. Only
        # 'lat' is known; other codes are reported as unknown.
        def pretty_language(source)
          case source.language
          when 'lat'
            'Latin'
          else
            "Unknown (language code #{source.language})"
          end
        end

        # Joins the available metadata of the printed text with commas.
        def pretty_printed_text_info(source)
          [source.printed_text_title,
           source.printed_text_editor ? "ed. #{source.printed_text_editor}" : nil,
           source.printed_text_publisher,
           source.printed_text_place,
           source.printed_text_date].compact.join(', ')
        end

        # Joins the available metadata of the electronic text with commas.
        def pretty_electronic_text_info(source)
          [source.electronic_text_title,
           source.electronic_text_editor ? "ed. #{source.electronic_text_editor}" : nil,
           source.electronic_text_publisher,
           source.electronic_text_place,
           source.electronic_text_date].compact.join(', ')
        end

        # Returns the license name, followed by its URL in parentheses when
        # the source has one.
        def pretty_license(source)
          if source.license_url
            "#{source.license} (#{source.license_url})"
          else
            source.license
          end
        end

        # Returns "author, title", omitting whichever of the two is missing.
        def pretty_title(source)
          [source.author, source.title].compact.join(', ')
        end
      end
    end
  end
end
|