proiel 1.1.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +1 -1
- data/README.md +2 -2
- data/lib/proiel.rb +16 -1
- data/lib/proiel/alignment.rb +3 -0
- data/lib/proiel/alignment/builder.rb +220 -0
- data/lib/proiel/annotation_schema.rb +11 -4
- data/lib/proiel/chronology.rb +80 -0
- data/lib/proiel/dictionary.rb +79 -0
- data/lib/proiel/dictionary/builder.rb +224 -0
- data/lib/proiel/div.rb +22 -3
- data/lib/proiel/language.rb +108 -0
- data/lib/proiel/lemma.rb +77 -0
- data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
- data/lib/proiel/proiel_xml/reader.rb +138 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/proiel_xml/validator.rb +76 -9
- data/lib/proiel/sentence.rb +27 -4
- data/lib/proiel/source.rb +14 -4
- data/lib/proiel/statistics.rb +2 -2
- data/lib/proiel/token.rb +14 -6
- data/lib/proiel/tokenization.rb +5 -3
- data/lib/proiel/treebank.rb +23 -6
- data/lib/proiel/utils.rb +0 -1
- data/lib/proiel/valency.rb +5 -0
- data/lib/proiel/valency/arguments.rb +151 -0
- data/lib/proiel/valency/lexicon.rb +59 -0
- data/lib/proiel/valency/obliqueness.rb +31 -0
- data/lib/proiel/version.rb +2 -3
- data/lib/proiel/visualization.rb +1 -0
- data/lib/proiel/visualization/graphviz.rb +111 -0
- data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
- data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
- data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
- data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
- data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
- metadata +76 -31
@@ -16,7 +16,7 @@ module PROIEL
|
|
16
16
|
# @return [String] schema version number
|
17
17
|
#
|
18
18
|
def self.current_proiel_xml_schema_version
|
19
|
-
'2.1'
|
19
|
+
'3.0'
|
20
20
|
end
|
21
21
|
|
22
22
|
# Invalid PROIEL XML schema version error.
|
@@ -41,6 +41,8 @@ module PROIEL
|
|
41
41
|
'2.0'
|
42
42
|
when '2.1'
|
43
43
|
'2.1'
|
44
|
+
when '3.0'
|
45
|
+
'3.0'
|
44
46
|
when NilClass
|
45
47
|
'1.0'
|
46
48
|
else
|
@@ -70,7 +72,7 @@ module PROIEL
|
|
70
72
|
# @raise ArgumentError
|
71
73
|
#
|
72
74
|
def self.proiel_xml_schema_filename(schema_version)
|
73
|
-
if schema_version == '1.0' or schema_version == '2.0' or schema_version == '2.1'
|
75
|
+
if schema_version == '1.0' or schema_version == '2.0' or schema_version == '2.1' or schema_version == '3.0'
|
74
76
|
File.join(File.dirname(__FILE__),
|
75
77
|
"proiel-#{schema_version}",
|
76
78
|
"proiel-#{schema_version}.xsd")
|
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -16,9 +16,11 @@ module PROIEL
|
|
16
16
|
# Creates a new validator for a PROIEL XML file.
|
17
17
|
#
|
18
18
|
# @param filename [String] name of PROIEL XML file to validate
|
19
|
+
# @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
|
19
20
|
#
|
20
|
-
def initialize(filename)
|
21
|
+
def initialize(filename, aligned_filename = nil)
|
21
22
|
@filename = filename
|
23
|
+
@aligned_filename = aligned_filename
|
22
24
|
@errors = []
|
23
25
|
end
|
24
26
|
|
@@ -41,15 +43,13 @@ module PROIEL
|
|
41
43
|
# @return [true, false]
|
42
44
|
#
|
43
45
|
def wellformed?
|
44
|
-
|
45
|
-
Nokogiri::XML(File.read(@filename)) { |config| config.strict }
|
46
|
+
Nokogiri::XML(File.read(@filename)) { |config| config.strict }
|
46
47
|
|
47
|
-
|
48
|
-
|
49
|
-
|
48
|
+
true
|
49
|
+
rescue Nokogiri::XML::SyntaxError => _
|
50
|
+
@errors << 'XML file is not wellformed'
|
50
51
|
|
51
|
-
|
52
|
-
end
|
52
|
+
false
|
53
53
|
end
|
54
54
|
|
55
55
|
# Checks if the PROIEL XML file has a valid schema version number.
|
@@ -154,6 +154,27 @@ module PROIEL
|
|
154
154
|
end
|
155
155
|
end
|
156
156
|
|
157
|
+
# Pass 5: if div is aligned, sentences and tokens within should belong
|
158
|
+
# to aligned div(s); if sentence aligned, tokens within should belong
|
159
|
+
# to aligned sentence(s). Skip if no alignment_id on source (see pass
|
160
|
+
# 4) or if aligned source not available.
|
161
|
+
if @aligned_filename
|
162
|
+
aligned_tb = PROIEL::Treebank.new
|
163
|
+
aligned_tb.load_from_xml(@aligned_filename)
|
164
|
+
|
165
|
+
tb.sources.each do |source|
|
166
|
+
if source.alignment_id
|
167
|
+
aligned_source = aligned_tb.find_source(source.alignment_id)
|
168
|
+
|
169
|
+
if aligned_source
|
170
|
+
check_alignment_integrity(errors, source, aligned_source)
|
171
|
+
else
|
172
|
+
errors << "Aligned source not available in treebank"
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
157
178
|
# Decide if there were any errors
|
158
179
|
if errors.empty?
|
159
180
|
true
|
@@ -182,6 +203,52 @@ module PROIEL
|
|
182
203
|
errors << "Token #{token.id}: #{attribute_name} is null"
|
183
204
|
end
|
184
205
|
end
|
206
|
+
|
207
|
+
def check_alignment_integrity(errors, source, aligned_source)
|
208
|
+
source.divs.each do |div|
|
209
|
+
target_sentences =
|
210
|
+
div.sentences.map do |sentence|
|
211
|
+
target_tokens =
|
212
|
+
sentence.tokens.select(&:alignment_id).map do |token|
|
213
|
+
# Check that target token exists in aligned source
|
214
|
+
aligned_token = aligned_source.treebank.find_token(token.alignment_id)
|
215
|
+
|
216
|
+
if aligned_token
|
217
|
+
aligned_token
|
218
|
+
else
|
219
|
+
errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
|
220
|
+
nil
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq
|
225
|
+
|
226
|
+
if sentence.alignment_id
|
227
|
+
a = sentence.alignment_id.to_s.split(',').sort.join(',')
|
228
|
+
i = inferred_target_sentences.map(&:id).sort.join(',')
|
229
|
+
|
230
|
+
# FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
|
231
|
+
if a != i
|
232
|
+
errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
236
|
+
inferred_target_sentences
|
237
|
+
end
|
238
|
+
|
239
|
+
inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq
|
240
|
+
|
241
|
+
if div.alignment_id
|
242
|
+
a = div.alignment_id.to_s.split(',').sort.join(',')
|
243
|
+
i = inferred_target_divs.map(&:id).sort.join(',')
|
244
|
+
|
245
|
+
# FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
|
246
|
+
if a != i
|
247
|
+
errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
185
252
|
end
|
186
253
|
end
|
187
254
|
end
|
data/lib/proiel/sentence.rb
CHANGED
@@ -57,10 +57,14 @@ module PROIEL
|
|
57
57
|
raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
|
58
58
|
@alignment_id = alignment_id
|
59
59
|
|
60
|
-
|
60
|
+
unless annotated_at.nil? or PROIEL::Utilities.xmlschema_datetime?(annotated_at)
|
61
|
+
raise ArgumentError, 'XML schema date time or nil expected'
|
62
|
+
end
|
61
63
|
@annotated_at = annotated_at ? DateTime.xmlschema(annotated_at).freeze : nil
|
62
64
|
|
63
|
-
|
65
|
+
unless reviewed_at.nil? or PROIEL::Utilities.xmlschema_datetime?(reviewed_at)
|
66
|
+
raise ArgumentError, 'XML schema date time or nil expected'
|
67
|
+
end
|
64
68
|
@reviewed_at = reviewed_at ? DateTime.xmlschema(reviewed_at).freeze : nil
|
65
69
|
|
66
70
|
raise ArgumentError, 'string or nil expected' unless annotated_by.nil? or annotated_by.is_a?(String)
|
@@ -113,10 +117,13 @@ module PROIEL
|
|
113
117
|
# Returns the printable form of the sentence with all token forms and any
|
114
118
|
# presentation data.
|
115
119
|
#
|
120
|
+
# @param custom_token_formatter [Lambda] formatting function for tokens
|
121
|
+
# which is passed the token as its sole argument
|
122
|
+
#
|
116
123
|
# @return [String] the printable form of the sentence
|
117
|
-
def printable_form(
|
124
|
+
def printable_form(custom_token_formatter: nil)
|
118
125
|
[presentation_before,
|
119
|
-
@children.map { |t| t.printable_form(
|
126
|
+
@children.reject(&:is_empty?).map { |t| t.printable_form(custom_token_formatter: custom_token_formatter) },
|
120
127
|
presentation_after].compact.join
|
121
128
|
end
|
122
129
|
|
@@ -217,5 +224,21 @@ module PROIEL
|
|
217
224
|
def tokens
|
218
225
|
@children.to_enum
|
219
226
|
end
|
227
|
+
|
228
|
+
# Returns the aligned sentence if any.
|
229
|
+
#
|
230
|
+
# @return [Sentence, NilClass] aligned sentence
|
231
|
+
def alignment(aligned_source)
|
232
|
+
alignment_id ? aligned_source.treebank.find_sentence(alignment_id) : nil
|
233
|
+
end
|
234
|
+
|
235
|
+
# Returns inferred aligned sentences if any.
|
236
|
+
#
|
237
|
+
# @return [Array<Sentence>] inferred aligned sentences
|
238
|
+
def inferred_alignment(aligned_source)
|
239
|
+
tokens.select(&:alignment_id).map do |token|
|
240
|
+
token.alignment(aligned_source)
|
241
|
+
end.flatten.compact.map(&:sentence).uniq
|
242
|
+
end
|
220
243
|
end
|
221
244
|
end
|
data/lib/proiel/source.rb
CHANGED
@@ -15,6 +15,9 @@ module PROIEL
|
|
15
15
|
# @return [String] language of the source as an ISO 639-3 language tag
|
16
16
|
attr_reader :language
|
17
17
|
|
18
|
+
# @return [String] dialect of the source
|
19
|
+
attr_reader :dialect
|
20
|
+
|
18
21
|
# @return [DateTime] export time for the source
|
19
22
|
attr_reader :export_time
|
20
23
|
|
@@ -26,11 +29,15 @@ module PROIEL
|
|
26
29
|
attr_reader :alignment_id
|
27
30
|
|
28
31
|
# Creates a new source object.
|
29
|
-
def initialize(parent, id, export_time, language, metadata, alignment_id, &block)
|
32
|
+
def initialize(parent, id, export_time, language, dialect, metadata, alignment_id, &block)
|
30
33
|
@treebank = parent
|
31
34
|
@id = id.freeze
|
32
|
-
|
35
|
+
|
36
|
+
raise ArgumentError, 'string or nil expected' unless export_time.nil? or export_time.is_a?(String)
|
37
|
+
@export_time = export_time.nil? ? nil : DateTime.parse(export_time).freeze
|
38
|
+
|
33
39
|
@language = language.freeze
|
40
|
+
@dialect = dialect ? dialect.freeze : nil
|
34
41
|
@metadata = metadata.freeze
|
35
42
|
|
36
43
|
raise ArgumentError, 'string or nil expected' unless alignment_id.nil? or alignment_id.is_a?(String)
|
@@ -47,9 +54,12 @@ module PROIEL
|
|
47
54
|
# Returns the printable form of the source with all token forms and any
|
48
55
|
# presentation data.
|
49
56
|
#
|
57
|
+
# @param custom_token_formatter [Lambda] formatting function for tokens
|
58
|
+
# which is passed the token as its sole argument
|
59
|
+
#
|
50
60
|
# @return [String] the printable form of the source
|
51
|
-
def printable_form(
|
52
|
-
@children.map { |d| d.printable_form(
|
61
|
+
def printable_form(custom_token_formatter: nil)
|
62
|
+
@children.map { |d| d.printable_form(custom_token_formatter: custom_token_formatter) }.compact.join
|
53
63
|
end
|
54
64
|
|
55
65
|
# Accesses metadata fields.
|
data/lib/proiel/statistics.rb
CHANGED
@@ -26,11 +26,11 @@ module PROIEL
|
|
26
26
|
|
27
27
|
x_mean = x.reduce(&:+).to_f / x.size
|
28
28
|
y_mean = y.reduce(&:+).to_f / y.size
|
29
|
-
x_sqsum = x.reduce(0.0) { |sum, n| sum + n
|
29
|
+
x_sqsum = x.reduce(0.0) { |sum, n| sum + n**2 }
|
30
30
|
xy_sum = x.zip(y).reduce(0.0) { |sum, (m, n)| sum + m * n }
|
31
31
|
|
32
32
|
sxy = xy_sum - x.length * x_mean * y_mean
|
33
|
-
sx2 = x_sqsum - x.length * (x_mean
|
33
|
+
sx2 = x_sqsum - x.length * (x_mean**2)
|
34
34
|
|
35
35
|
beta = sxy / sx2
|
36
36
|
alfa = y_mean - beta * x_mean
|
data/lib/proiel/token.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -160,12 +160,13 @@ module PROIEL
|
|
160
160
|
# Returns the printable form of the token with any presentation data.
|
161
161
|
#
|
162
162
|
# @param custom_token_formatter [Lambda] formatting function for tokens
|
163
|
+
# which is passed the token as its sole argument
|
163
164
|
#
|
164
165
|
# @return [String] the printable form of the token
|
165
166
|
def printable_form(custom_token_formatter: nil)
|
166
167
|
printable_form =
|
167
168
|
if custom_token_formatter
|
168
|
-
custom_token_formatter.call(
|
169
|
+
custom_token_formatter.call(self)
|
169
170
|
else
|
170
171
|
form
|
171
172
|
end
|
@@ -281,7 +282,7 @@ module PROIEL
|
|
281
282
|
#
|
282
283
|
# @return [Array<Token>] descendents
|
283
284
|
def descendents
|
284
|
-
dependents.map { |dependent| [dependent
|
285
|
+
dependents.map { |dependent| [dependent] + dependent.descendents }.flatten
|
285
286
|
end
|
286
287
|
|
287
288
|
memoize :descendents
|
@@ -393,22 +394,29 @@ module PROIEL
|
|
393
394
|
common_ancestors(other_token, inclusive: inclusive).first
|
394
395
|
end
|
395
396
|
|
397
|
+
# Returns the aligned token if any.
|
398
|
+
#
|
399
|
+
# @return [Token, NilClass] aligned token
|
400
|
+
def alignment(aligned_source)
|
401
|
+
alignment_id ? aligned_source.treebank.find_token(alignment_id) : nil
|
402
|
+
end
|
403
|
+
|
396
404
|
private
|
397
405
|
|
398
406
|
# FIXME: extract this from the header of the PROIEL XML file instead and
|
399
407
|
# subclass PositionalTag
|
400
|
-
POS_POSITIONAL_TAG_SEQUENCE = %i(major minor)
|
408
|
+
POS_POSITIONAL_TAG_SEQUENCE = %i(major minor).freeze
|
401
409
|
|
402
410
|
# FIXME: extract this from the header of the PROIEL XML file instead and
|
403
411
|
# subclass PositionalTag
|
404
412
|
MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
|
405
413
|
person number tense mood voice gender case degree strength inflection
|
406
|
-
)
|
414
|
+
).freeze
|
407
415
|
|
408
416
|
NULL_PARTS_OF_SPEECH = {
|
409
417
|
'V' => 'V-',
|
410
418
|
'C' => 'C-',
|
411
419
|
'P' => 'Pp',
|
412
|
-
}
|
420
|
+
}.freeze
|
413
421
|
end
|
414
422
|
end
|
data/lib/proiel/tokenization.rb
CHANGED
@@ -24,7 +24,7 @@ module PROIEL
|
|
24
24
|
|
25
25
|
patterns = JSON.parse(File.read(filename))
|
26
26
|
|
27
|
-
regexes = patterns.map { |l, p| [l,
|
27
|
+
regexes = patterns.map { |l, p| [l, make_regex(p)] }.to_h
|
28
28
|
|
29
29
|
@@regexes ||= {}
|
30
30
|
@@regexes.merge!(regexes)
|
@@ -59,6 +59,8 @@ module PROIEL
|
|
59
59
|
form and form.length > 1
|
60
60
|
end
|
61
61
|
|
62
|
+
WORD_PATTERN = /([^[\u{E000}-\u{F8FF}][[:word:]]]+)/.freeze
|
63
|
+
|
62
64
|
# Splits a token form using the tokenization patterns that apply for
|
63
65
|
# the specified language. Tokenization patterns must already have been
|
64
66
|
# loaded.
|
@@ -73,9 +75,9 @@ module PROIEL
|
|
73
75
|
raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
|
74
76
|
raise ArgumentError, 'invalid form' unless form.is_a?(String)
|
75
77
|
|
76
|
-
if form[
|
78
|
+
if form[WORD_PATTERN]
|
77
79
|
# Split on any non-word character like a space or punctuation
|
78
|
-
form.split(
|
80
|
+
form.split(WORD_PATTERN)
|
79
81
|
elsif @@regexes.key?(language_tag) and form[@@regexes[language_tag]]
|
80
82
|
# Apply language-specific pattern
|
81
83
|
form.match(@@regexes[language_tag]).captures
|
data/lib/proiel/treebank.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015 Marius L. Jøhndal
|
2
|
+
# Copyright (c) 2015-2018 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -23,9 +23,13 @@ module PROIEL
|
|
23
23
|
# @return [Array<Source>] sources in the treebank
|
24
24
|
attr_reader :sources
|
25
25
|
|
26
|
+
# @return [Array<Dictionary>] dictionaries in the treebank
|
27
|
+
attr_reader :dictionaries
|
28
|
+
|
26
29
|
# Available metadata elements for sources.
|
27
30
|
METADATA_ELEMENTS = %i(
|
28
31
|
title
|
32
|
+
alternative_title
|
29
33
|
author
|
30
34
|
citation_part
|
31
35
|
principal
|
@@ -55,13 +59,16 @@ module PROIEL
|
|
55
59
|
printed_text_publisher
|
56
60
|
printed_text_place
|
57
61
|
printed_text_date
|
58
|
-
|
62
|
+
chronology_composition
|
63
|
+
chronology_manuscript
|
64
|
+
).freeze
|
59
65
|
|
60
66
|
# Creates a new treebank object.
|
61
67
|
def initialize
|
62
68
|
@annotation_schema = nil
|
63
69
|
@schema_version = nil
|
64
70
|
@sources = []
|
71
|
+
@dictionaries = []
|
65
72
|
|
66
73
|
@source_index = {}
|
67
74
|
@div_index = {}
|
@@ -85,12 +92,18 @@ module PROIEL
|
|
85
92
|
tf = PROIELXML::Reader.parse_io(f)
|
86
93
|
|
87
94
|
tf.proiel.sources.each do |s|
|
88
|
-
@sources << Source.new(self, s.id, tf.proiel.export_time, s.language,
|
95
|
+
@sources << Source.new(self, s.id, tf.proiel.export_time, s.language, s.dialect,
|
89
96
|
bundle_metadata(s), s.alignment_id) do |source|
|
90
97
|
build_divs(s, source)
|
91
98
|
end
|
92
99
|
|
93
|
-
|
100
|
+
index_source_objects!(@sources.last)
|
101
|
+
end
|
102
|
+
|
103
|
+
tf.proiel.dictionaries.each do |s|
|
104
|
+
@dictionaries << Dictionary.new(self, tf.proiel.export_time, s.language, s.dialect, s)
|
105
|
+
|
106
|
+
index_dictionary_objects!(@dictionaries.last)
|
94
107
|
end
|
95
108
|
|
96
109
|
annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
|
@@ -103,7 +116,7 @@ module PROIEL
|
|
103
116
|
# FIXME: consolidate export times? This is a design flaw in PROIEL XML
|
104
117
|
# 2.0: export time ought to be per source not per PROIEL XML file, so
|
105
118
|
# not clear what to do here. Pass it down to the source object?
|
106
|
-
|
119
|
+
# @export_time = tf.proiel.export_time
|
107
120
|
else
|
108
121
|
raise SchemaMismatch
|
109
122
|
end
|
@@ -198,7 +211,7 @@ module PROIEL
|
|
198
211
|
end
|
199
212
|
end
|
200
213
|
|
201
|
-
def
|
214
|
+
def index_source_objects!(source)
|
202
215
|
@source_index[source.id] = source
|
203
216
|
|
204
217
|
source.divs.each do |div|
|
@@ -213,5 +226,9 @@ module PROIEL
|
|
213
226
|
end
|
214
227
|
end
|
215
228
|
end
|
229
|
+
|
230
|
+
def index_dictionary_objects!(dictionary)
|
231
|
+
# TODO
|
232
|
+
end
|
216
233
|
end
|
217
234
|
end
|
data/lib/proiel/utils.rb
CHANGED