proiel 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +5 -5
  2. data/LICENSE +1 -1
  3. data/README.md +2 -2
  4. data/lib/proiel.rb +16 -1
  5. data/lib/proiel/alignment.rb +3 -0
  6. data/lib/proiel/alignment/builder.rb +220 -0
  7. data/lib/proiel/annotation_schema.rb +11 -4
  8. data/lib/proiel/chronology.rb +80 -0
  9. data/lib/proiel/dictionary.rb +79 -0
  10. data/lib/proiel/dictionary/builder.rb +224 -0
  11. data/lib/proiel/div.rb +22 -3
  12. data/lib/proiel/language.rb +108 -0
  13. data/lib/proiel/lemma.rb +77 -0
  14. data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
  15. data/lib/proiel/proiel_xml/reader.rb +138 -2
  16. data/lib/proiel/proiel_xml/schema.rb +4 -2
  17. data/lib/proiel/proiel_xml/validator.rb +76 -9
  18. data/lib/proiel/sentence.rb +27 -4
  19. data/lib/proiel/source.rb +14 -4
  20. data/lib/proiel/statistics.rb +2 -2
  21. data/lib/proiel/token.rb +14 -6
  22. data/lib/proiel/tokenization.rb +5 -3
  23. data/lib/proiel/treebank.rb +23 -6
  24. data/lib/proiel/utils.rb +0 -1
  25. data/lib/proiel/valency.rb +5 -0
  26. data/lib/proiel/valency/arguments.rb +151 -0
  27. data/lib/proiel/valency/lexicon.rb +59 -0
  28. data/lib/proiel/valency/obliqueness.rb +31 -0
  29. data/lib/proiel/version.rb +2 -3
  30. data/lib/proiel/visualization.rb +1 -0
  31. data/lib/proiel/visualization/graphviz.rb +111 -0
  32. data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
  33. data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
  34. data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
  35. data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
  36. data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
  37. metadata +76 -31
@@ -16,7 +16,7 @@ module PROIEL
16
16
  # @return [String] schema version number
17
17
  #
18
18
  def self.current_proiel_xml_schema_version
19
- '2.1'
19
+ '3.0'
20
20
  end
21
21
 
22
22
  # Invalid PROIEL XML schema version error.
@@ -41,6 +41,8 @@ module PROIEL
41
41
  '2.0'
42
42
  when '2.1'
43
43
  '2.1'
44
+ when '3.0'
45
+ '3.0'
44
46
  when NilClass
45
47
  '1.0'
46
48
  else
@@ -70,7 +72,7 @@ module PROIEL
70
72
  # @raise ArgumentError
71
73
  #
72
74
  def self.proiel_xml_schema_filename(schema_version)
73
- if schema_version == '1.0' or schema_version == '2.0' or schema_version == '2.1'
75
+ if schema_version == '1.0' or schema_version == '2.0' or schema_version == '2.1' or schema_version == '3.0'
74
76
  File.join(File.dirname(__FILE__),
75
77
  "proiel-#{schema_version}",
76
78
  "proiel-#{schema_version}.xsd")
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -16,9 +16,11 @@ module PROIEL
16
16
  # Creates a new validator for a PROIEL XML file.
17
17
  #
18
18
  # @param filename [String] name of PROIEL XML file to validate
19
+ # @param aligned_filename [NilClass, String] name of PROIEL XML file to validate alignments against
19
20
  #
20
- def initialize(filename)
21
+ def initialize(filename, aligned_filename = nil)
21
22
  @filename = filename
23
+ @aligned_filename = aligned_filename
22
24
  @errors = []
23
25
  end
24
26
 
@@ -41,15 +43,13 @@ module PROIEL
41
43
  # @return [true, false]
42
44
  #
43
45
  def wellformed?
44
- begin
45
- Nokogiri::XML(File.read(@filename)) { |config| config.strict }
46
+ Nokogiri::XML(File.read(@filename)) { |config| config.strict }
46
47
 
47
- true
48
- rescue Nokogiri::XML::SyntaxError => _
49
- @errors << 'XML file is not wellformed'
48
+ true
49
+ rescue Nokogiri::XML::SyntaxError => _
50
+ @errors << 'XML file is not wellformed'
50
51
 
51
- false
52
- end
52
+ false
53
53
  end
54
54
 
55
55
  # Checks if the PROIEL XML file has a valid schema version number.
@@ -154,6 +154,27 @@ module PROIEL
154
154
  end
155
155
  end
156
156
 
157
+ # Pass 5: if div is aligned, sentences and tokens within should belong
158
+ # to aligned div(s); if sentence aligned, tokens within should belong
159
+ # to aligned sentence(s). Skip if no alignment_id on source (see pass
160
+ # 4) or if aligned source not available.
161
+ if @aligned_filename
162
+ aligned_tb = PROIEL::Treebank.new
163
+ aligned_tb.load_from_xml(@aligned_filename)
164
+
165
+ tb.sources.each do |source|
166
+ if source.alignment_id
167
+ aligned_source = aligned_tb.find_source(source.alignment_id)
168
+
169
+ if aligned_source
170
+ check_alignment_integrity(errors, source, aligned_source)
171
+ else
172
+ errors << "Aligned source not available in treebank"
173
+ end
174
+ end
175
+ end
176
+ end
177
+
157
178
  # Decide if there were any errors
158
179
  if errors.empty?
159
180
  true
@@ -182,6 +203,52 @@ module PROIEL
182
203
  errors << "Token #{token.id}: #{attribute_name} is null"
183
204
  end
184
205
  end
206
+
207
+ def check_alignment_integrity(errors, source, aligned_source)
208
+ source.divs.each do |div|
209
+ target_sentences =
210
+ div.sentences.map do |sentence|
211
+ target_tokens =
212
+ sentence.tokens.select(&:alignment_id).map do |token|
213
+ # Check that target token exists in aligned source
214
+ aligned_token = aligned_source.treebank.find_token(token.alignment_id)
215
+
216
+ if aligned_token
217
+ aligned_token
218
+ else
219
+ errors << "Token #{token.id}: aligned to token #{aligned_source.id}:#{token.alignment_id} which does not exist"
220
+ nil
221
+ end
222
+ end
223
+
224
+ inferred_target_sentences = target_tokens.compact.map(&:sentence).sort_by(&:id).uniq
225
+
226
+ if sentence.alignment_id
227
+ a = sentence.alignment_id.to_s.split(',').sort.join(',')
228
+ i = inferred_target_sentences.map(&:id).sort.join(',')
229
+
230
+ # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
231
+ if a != i
232
+ errors << "Sentence #{sentence.id}: aligned to sentence #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
233
+ end
234
+ end
235
+
236
+ inferred_target_sentences
237
+ end
238
+
239
+ inferred_target_divs = target_sentences.flatten.compact.map(&:div).uniq
240
+
241
+ if div.alignment_id
242
+ a = div.alignment_id.to_s.split(',').sort.join(',')
243
+ i = inferred_target_divs.map(&:id).sort.join(',')
244
+
245
+ # FIXME: handle i.empty? case, in which we have to use a and check that the objects exist
246
+ if a != i
247
+ errors << "Div #{div.id}: aligned to div #{aligned_source.id}:#{a} but inferred alignment is #{aligned_source.id}:#{i}"
248
+ end
249
+ end
250
+ end
251
+ end
185
252
  end
186
253
  end
187
254
  end
@@ -57,10 +57,14 @@ module PROIEL
57
57
  raise ArgumentError, 'integer or nil expected' unless alignment_id.nil? or alignment_id.is_a?(Integer)
58
58
  @alignment_id = alignment_id
59
59
 
60
- raise ArgumentError, 'XML schema date time or nil expected' unless annotated_at.nil? or PROIEL::Utilities.xmlschema_datetime?(annotated_at)
60
+ unless annotated_at.nil? or PROIEL::Utilities.xmlschema_datetime?(annotated_at)
61
+ raise ArgumentError, 'XML schema date time or nil expected'
62
+ end
61
63
  @annotated_at = annotated_at ? DateTime.xmlschema(annotated_at).freeze : nil
62
64
 
63
- raise ArgumentError, 'XML schema date time or nil expected' unless reviewed_at.nil? or PROIEL::Utilities.xmlschema_datetime?(reviewed_at)
65
+ unless reviewed_at.nil? or PROIEL::Utilities.xmlschema_datetime?(reviewed_at)
66
+ raise ArgumentError, 'XML schema date time or nil expected'
67
+ end
64
68
  @reviewed_at = reviewed_at ? DateTime.xmlschema(reviewed_at).freeze : nil
65
69
 
66
70
  raise ArgumentError, 'string or nil expected' unless annotated_by.nil? or annotated_by.is_a?(String)
@@ -113,10 +117,13 @@ module PROIEL
113
117
  # Returns the printable form of the sentence with all token forms and any
114
118
  # presentation data.
115
119
  #
120
+ # @param custom_token_formatter [Lambda] formatting function for tokens
121
+ # which is passed the token as its sole argument
122
+ #
116
123
  # @return [String] the printable form of the sentence
117
- def printable_form(options = {})
124
+ def printable_form(custom_token_formatter: nil)
118
125
  [presentation_before,
119
- @children.map { |t| t.printable_form(options) },
126
+ @children.reject(&:is_empty?).map { |t| t.printable_form(custom_token_formatter: custom_token_formatter) },
120
127
  presentation_after].compact.join
121
128
  end
122
129
 
@@ -217,5 +224,21 @@ module PROIEL
217
224
  def tokens
218
225
  @children.to_enum
219
226
  end
227
+
228
+ # Returns the aligned sentence if any.
229
+ #
230
+ # @return [Sentence, NilClass] aligned sentence
231
+ def alignment(aligned_source)
232
+ alignment_id ? aligned_source.treebank.find_sentence(alignment_id) : nil
233
+ end
234
+
235
+ # Returns inferred aligned sentences if any.
236
+ #
237
+ # @return [Array<Sentence>] inferred aligned sentences
238
+ def inferred_alignment(aligned_source)
239
+ tokens.select(&:alignment_id).map do |token|
240
+ token.alignment(aligned_source)
241
+ end.flatten.compact.map(&:sentence).uniq
242
+ end
220
243
  end
221
244
  end
@@ -15,6 +15,9 @@ module PROIEL
15
15
  # @return [String] language of the source as an ISO 639-3 language tag
16
16
  attr_reader :language
17
17
 
18
+ # @return [String] dialect of the source
19
+ attr_reader :dialect
20
+
18
21
  # @return [DateTime] export time for the source
19
22
  attr_reader :export_time
20
23
 
@@ -26,11 +29,15 @@ module PROIEL
26
29
  attr_reader :alignment_id
27
30
 
28
31
  # Creates a new source object.
29
- def initialize(parent, id, export_time, language, metadata, alignment_id, &block)
32
+ def initialize(parent, id, export_time, language, dialect, metadata, alignment_id, &block)
30
33
  @treebank = parent
31
34
  @id = id.freeze
32
- @export_time = DateTime.parse(export_time).freeze
35
+
36
+ raise ArgumentError, 'string or nil expected' unless export_time.nil? or export_time.is_a?(String)
37
+ @export_time = export_time.nil? ? nil : DateTime.parse(export_time).freeze
38
+
33
39
  @language = language.freeze
40
+ @dialect = dialect ? dialect.freeze : nil
34
41
  @metadata = metadata.freeze
35
42
 
36
43
  raise ArgumentError, 'string or nil expected' unless alignment_id.nil? or alignment_id.is_a?(String)
@@ -47,9 +54,12 @@ module PROIEL
47
54
  # Returns the printable form of the source with all token forms and any
48
55
  # presentation data.
49
56
  #
57
+ # @param custom_token_formatter [Lambda] formatting function for tokens
58
+ # which is passed the token as its sole argument
59
+ #
50
60
  # @return [String] the printable form of the source
51
- def printable_form(options = {})
52
- @children.map { |d| d.printable_form(options) }.compact.join
61
+ def printable_form(custom_token_formatter: nil)
62
+ @children.map { |d| d.printable_form(custom_token_formatter: custom_token_formatter) }.compact.join
53
63
  end
54
64
 
55
65
  # Accesses metadata fields.
@@ -26,11 +26,11 @@ module PROIEL
26
26
 
27
27
  x_mean = x.reduce(&:+).to_f / x.size
28
28
  y_mean = y.reduce(&:+).to_f / y.size
29
- x_sqsum = x.reduce(0.0) { |sum, n| sum + n ** 2 }
29
+ x_sqsum = x.reduce(0.0) { |sum, n| sum + n**2 }
30
30
  xy_sum = x.zip(y).reduce(0.0) { |sum, (m, n)| sum + m * n }
31
31
 
32
32
  sxy = xy_sum - x.length * x_mean * y_mean
33
- sx2 = x_sqsum - x.length * (x_mean ** 2)
33
+ sx2 = x_sqsum - x.length * (x_mean**2)
34
34
 
35
35
  beta = sxy / sx2
36
36
  alfa = y_mean - beta * x_mean
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -160,12 +160,13 @@ module PROIEL
160
160
  # Returns the printable form of the token with any presentation data.
161
161
  #
162
162
  # @param custom_token_formatter [Lambda] formatting function for tokens
163
+ # which is passed the token as its sole argument
163
164
  #
164
165
  # @return [String] the printable form of the token
165
166
  def printable_form(custom_token_formatter: nil)
166
167
  printable_form =
167
168
  if custom_token_formatter
168
- custom_token_formatter.call(id, form)
169
+ custom_token_formatter.call(self)
169
170
  else
170
171
  form
171
172
  end
@@ -281,7 +282,7 @@ module PROIEL
281
282
  #
282
283
  # @return [Array<Token>] descendents
283
284
  def descendents
284
- dependents.map { |dependent| [dependent ] + dependent.descendents }.flatten
285
+ dependents.map { |dependent| [dependent] + dependent.descendents }.flatten
285
286
  end
286
287
 
287
288
  memoize :descendents
@@ -393,22 +394,29 @@ module PROIEL
393
394
  common_ancestors(other_token, inclusive: inclusive).first
394
395
  end
395
396
 
397
+ # Returns the aligned token if any.
398
+ #
399
+ # @return [Token, NilClass] aligned token
400
+ def alignment(aligned_source)
401
+ alignment_id ? aligned_source.treebank.find_token(alignment_id) : nil
402
+ end
403
+
396
404
  private
397
405
 
398
406
  # FIXME: extract this from the header of the PROIEL XML file instead and
399
407
  # subclass PositionalTag
400
- POS_POSITIONAL_TAG_SEQUENCE = %i(major minor)
408
+ POS_POSITIONAL_TAG_SEQUENCE = %i(major minor).freeze
401
409
 
402
410
  # FIXME: extract this from the header of the PROIEL XML file instead and
403
411
  # subclass PositionalTag
404
412
  MORPHOLOGY_POSITIONAL_TAG_SEQUENCE = %i(
405
413
  person number tense mood voice gender case degree strength inflection
406
- )
414
+ ).freeze
407
415
 
408
416
  NULL_PARTS_OF_SPEECH = {
409
417
  'V' => 'V-',
410
418
  'C' => 'C-',
411
419
  'P' => 'Pp',
412
- }
420
+ }.freeze
413
421
  end
414
422
  end
@@ -24,7 +24,7 @@ module PROIEL
24
24
 
25
25
  patterns = JSON.parse(File.read(filename))
26
26
 
27
- regexes = patterns.map { |l, p| [l, self.make_regex(p)] }.to_h
27
+ regexes = patterns.map { |l, p| [l, make_regex(p)] }.to_h
28
28
 
29
29
  @@regexes ||= {}
30
30
  @@regexes.merge!(regexes)
@@ -59,6 +59,8 @@ module PROIEL
59
59
  form and form.length > 1
60
60
  end
61
61
 
62
+ WORD_PATTERN = /([^[\u{E000}-\u{F8FF}][[:word:]]]+)/.freeze
63
+
62
64
  # Splits a token form using the tokenization patterns that apply for
63
65
  # the specified language. Tokenization patterns must already have been
64
66
  # loaded.
@@ -73,9 +75,9 @@ module PROIEL
73
75
  raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
74
76
  raise ArgumentError, 'invalid form' unless form.is_a?(String)
75
77
 
76
- if form[/[^[:word:]]+/]
78
+ if form[WORD_PATTERN]
77
79
  # Split on any non-word character like a space or punctuation
78
- form.split(/([^[:word:]]+)/)
80
+ form.split(WORD_PATTERN)
79
81
  elsif @@regexes.key?(language_tag) and form[@@regexes[language_tag]]
80
82
  # Apply language-specific pattern
81
83
  form.match(@@regexes[language_tag]).captures
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -23,9 +23,13 @@ module PROIEL
23
23
  # @return [Array<Source>] sources in the treebank
24
24
  attr_reader :sources
25
25
 
26
+ # @return [Array<Dictionary>] dictionaries in the treebank
27
+ attr_reader :dictionaries
28
+
26
29
  # Available metadata elements for sources.
27
30
  METADATA_ELEMENTS = %i(
28
31
  title
32
+ alternative_title
29
33
  author
30
34
  citation_part
31
35
  principal
@@ -55,13 +59,16 @@ module PROIEL
55
59
  printed_text_publisher
56
60
  printed_text_place
57
61
  printed_text_date
58
- )
62
+ chronology_composition
63
+ chronology_manuscript
64
+ ).freeze
59
65
 
60
66
  # Creates a new treebank object.
61
67
  def initialize
62
68
  @annotation_schema = nil
63
69
  @schema_version = nil
64
70
  @sources = []
71
+ @dictionaries = []
65
72
 
66
73
  @source_index = {}
67
74
  @div_index = {}
@@ -85,12 +92,18 @@ module PROIEL
85
92
  tf = PROIELXML::Reader.parse_io(f)
86
93
 
87
94
  tf.proiel.sources.each do |s|
88
- @sources << Source.new(self, s.id, tf.proiel.export_time, s.language,
95
+ @sources << Source.new(self, s.id, tf.proiel.export_time, s.language, s.dialect,
89
96
  bundle_metadata(s), s.alignment_id) do |source|
90
97
  build_divs(s, source)
91
98
  end
92
99
 
93
- index_objects!(@sources.last)
100
+ index_source_objects!(@sources.last)
101
+ end
102
+
103
+ tf.proiel.dictionaries.each do |s|
104
+ @dictionaries << Dictionary.new(self, tf.proiel.export_time, s.language, s.dialect, s)
105
+
106
+ index_dictionary_objects!(@dictionaries.last)
94
107
  end
95
108
 
96
109
  annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
@@ -103,7 +116,7 @@ module PROIEL
103
116
  # FIXME: consolidate export times? This is a design flaw in PROIEL XML
104
117
  # 2.0: export time ought to be per source not per PROIEL XML file, so
105
118
  # not clear what to do here. Pass it down to the source object?
106
- #@export_time = tf.proiel.export_time
119
+ # @export_time = tf.proiel.export_time
107
120
  else
108
121
  raise SchemaMismatch
109
122
  end
@@ -198,7 +211,7 @@ module PROIEL
198
211
  end
199
212
  end
200
213
 
201
- def index_objects!(source)
214
+ def index_source_objects!(source)
202
215
  @source_index[source.id] = source
203
216
 
204
217
  source.divs.each do |div|
@@ -213,5 +226,9 @@ module PROIEL
213
226
  end
214
227
  end
215
228
  end
229
+
230
+ def index_dictionary_objects!(dictionary)
231
+ # TODO
232
+ end
216
233
  end
217
234
  end
@@ -13,4 +13,3 @@ module PROIEL
13
13
  end
14
14
  end
15
15
  end
16
-
@@ -0,0 +1,5 @@
1
+ module PROIEL::Valency; end
2
+
3
+ require 'proiel/valency/obliqueness'
4
+ require 'proiel/valency/arguments'
5
+ require 'proiel/valency/lexicon'