proiel 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +5 -5
  2. data/LICENSE +1 -1
  3. data/README.md +2 -2
  4. data/lib/proiel.rb +16 -1
  5. data/lib/proiel/alignment.rb +3 -0
  6. data/lib/proiel/alignment/builder.rb +220 -0
  7. data/lib/proiel/annotation_schema.rb +11 -4
  8. data/lib/proiel/chronology.rb +80 -0
  9. data/lib/proiel/dictionary.rb +79 -0
  10. data/lib/proiel/dictionary/builder.rb +224 -0
  11. data/lib/proiel/div.rb +22 -3
  12. data/lib/proiel/language.rb +108 -0
  13. data/lib/proiel/lemma.rb +77 -0
  14. data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
  15. data/lib/proiel/proiel_xml/reader.rb +138 -2
  16. data/lib/proiel/proiel_xml/schema.rb +4 -2
  17. data/lib/proiel/proiel_xml/validator.rb +76 -9
  18. data/lib/proiel/sentence.rb +27 -4
  19. data/lib/proiel/source.rb +14 -4
  20. data/lib/proiel/statistics.rb +2 -2
  21. data/lib/proiel/token.rb +14 -6
  22. data/lib/proiel/tokenization.rb +5 -3
  23. data/lib/proiel/treebank.rb +23 -6
  24. data/lib/proiel/utils.rb +0 -1
  25. data/lib/proiel/valency.rb +5 -0
  26. data/lib/proiel/valency/arguments.rb +151 -0
  27. data/lib/proiel/valency/lexicon.rb +59 -0
  28. data/lib/proiel/valency/obliqueness.rb +31 -0
  29. data/lib/proiel/version.rb +2 -3
  30. data/lib/proiel/visualization.rb +1 -0
  31. data/lib/proiel/visualization/graphviz.rb +111 -0
  32. data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
  33. data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
  34. data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
  35. data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
  36. data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
  37. metadata +76 -31
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 39313c422eb3b2d2f3ad565c0cde3cbd3ddb5271
4
- data.tar.gz: 1edadad95bbaad82d4d7ab1a9cc409f8e80d3a74
2
+ SHA256:
3
+ metadata.gz: b0df361b20a949a5a7c51f1055470507b8b152af79737a896762ac269ac62c20
4
+ data.tar.gz: 2346771429bd177c4233e470c8e1830871001b2a511e311ef4cb70520f19687f
5
5
  SHA512:
6
- metadata.gz: 41bf1b5bcb3c8d8318128ea146b2609d02942d711553876d71c29cafc948312e79e8cd2e448fef751ca25c685c3f0d57a924004a46bdb3496a8f9913772e3e48
7
- data.tar.gz: add1511098c62bdd4ee59fdd53e55b4b331595a5a5e02320e97dadff194e8b0b96fb24bd48511a02933d70a83208e0f5d40093a49eec959917177bf59589cbb7
6
+ metadata.gz: 4c2195b08451ee0208aec4f80d7c6edfba1b0ecd2c0405d8a797a5e9fc8a8b135c6a9997d7a1b7b5a52519a3d8275e2be1d0b9389bd698b371372180254202a9
7
+ data.tar.gz: eb23ab51a1e7607dd4453e84820aa558636e5ebef7c7caf96824cb0f6dd17af7edaa34ad03efc663d1e66b60b3d63ea2d8053726c86b6435efe3a27f50ea53ba
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2015 Marius L. Jøhndal
1
+ Copyright (c) 2015-2016 Marius L. Jøhndal
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy
4
4
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -12,7 +12,7 @@ PROIEL annotation scheme and the PROIEL XML-based interchange format.
12
12
 
13
13
  ## Installation
14
14
 
15
- To install this library you need Ruby 2.1 or newer.
15
+ This library requires Ruby >= 2.4. Install as
16
16
 
17
17
  ```shell
18
18
  gem install proiel
@@ -35,7 +35,7 @@ bundle
35
35
  ```
36
36
 
37
37
  To download a sample treebank, initialize a new git repository and add the
38
- [PROIEL treebank](http://proiel.github.io) as a submodule:
38
+ [PROIEL treebank](https://proiel.github.io) as a submodule:
39
39
 
40
40
  ```shell
41
41
  git init
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -10,6 +10,12 @@ require 'ostruct'
10
10
  require 'sax-machine'
11
11
  require 'memoist'
12
12
  require 'nokogiri'
13
+ require 'singleton'
14
+ require 'erb'
15
+ require 'open3'
16
+ require 'set'
17
+ require 'builder'
18
+ require 'csv'
13
19
 
14
20
  require 'proiel/version'
15
21
  require 'proiel/utils'
@@ -27,3 +33,12 @@ require 'proiel/source'
27
33
  require 'proiel/div'
28
34
  require 'proiel/sentence'
29
35
  require 'proiel/token'
36
+ require 'proiel/dictionary'
37
+ require 'proiel/dictionary/builder'
38
+ require 'proiel/lemma'
39
+ require 'proiel/visualization'
40
+ require 'proiel/chronology'
41
+ require 'proiel/valency'
42
+ require 'proiel/dictionary/builder'
43
+ require 'proiel/alignment'
44
+ require 'proiel/language'
@@ -0,0 +1,3 @@
1
+ module PROIEL::Alignment; end
2
+
3
+ require 'proiel/alignment/builder'
@@ -0,0 +1,220 @@
1
module PROIEL
  module Alignment
    # Builds sentence-level alignment matrices between an original text and a
    # translation. A matrix is an array of rows; each row is a hash of the form
    # `{ original: [sentence IDs], translation: [sentence IDs] }`.
    module Builder
      # Largest number of additional rows (beyond the current one) that may be
      # merged on either side when trying to reconcile the two groupings.
      MAX_REPAIR_DELTA = 4

      # Pairs of (delta1, delta2) in the order in which repairs are attempted:
      # [0, 0], [1, 0], [0, 1], [1, 1], [2, 0], [0, 2], [2, 1], [1, 2], [2, 2], ...
      REPAIR_DELTAS = (0..MAX_REPAIR_DELTA).flat_map do |k|
        (0...k).flat_map { |j| [[k, j], [j, k]] } + [[k, k]]
      end.freeze

      # This computes a matrix of original and translation sentences that are
      # aligned. For now, this function does not handle translation sentences that
      # are unaligned (this is tricky to handle robustly!). As the current treebank
      # collection stands this is an issue that *should* not arise so this is for
      # now a reasonable approximation.
      #
      # @param alignment [#sentences] the original text; its sentences respond to `id`
      # @param source [#sentences, #id] the translation; its sentences respond to
      #   `id` and `inferred_alignment(alignment)`
      # @param blacklist [Array] IDs of translation sentences whose alignments
      #   are ignored
      # @param log_directory [String, nil] if given, the intermediate matrices
      #   and the result are written to files named `<source.id>1`,
      #   `<source.id>2` and `<source.id>3` in this directory
      #
      # @return [Array<Hash>] rows of the form `{ original: [...], translation: [...] }`
      #
      # @raise [RuntimeError] if the sentence sequence of either text is not
      #   preserved or the two groupings cannot be reconciled
      def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
        original_ids = alignment.sentences.map(&:id)
        translation_ids = source.sentences.map(&:id)

        matrix1 = group_backwards(alignment, source, blacklist)
        unless ids_in(matrix1, :original) == original_ids
          raise 'backward grouping does not preserve the sequence of original sentences'
        end

        matrix2 = group_forwards(alignment, source, blacklist)
        unless ids_in(matrix2, :translation) == translation_ids
          raise 'forward grouping does not preserve the sequence of translation sentences'
        end

        if log_directory
          # Verify that both texts are still in the correct sequence
          write_log(log_directory, "#{source.id}1", matrix1)
          write_log(log_directory, "#{source.id}2", matrix2)
        end

        matrix = []
        iter1 = { i: 0, m: matrix1 }
        iter2 = { i: 0, m: matrix2 }

        loop do
          # Take from matrix1 unless we have a translation
          take_unaligned(matrix, iter1, :translation)

          # Take from matrix2 unless we have an original
          take_unaligned(matrix, iter2, :original)

          unless iter1[:i] < matrix1.length && iter2[:i] < matrix2.length
            unless iter1[:i] == matrix1.length && iter2[:i] == matrix2.length
              raise 'one grouping has rows left over after the other was exhausted'
            end

            break
          end

          # Now the two should match provided alignments are sorted the same way,
          # so take one from each. If they don't match outright, we may have a case
          # of swapped sentence orders or a gap (one sentence unaligned in one of
          # the texts surrounded by two sentences that are aligned to the same
          # sentence in the other text). We'll try to repair this by merging bits
          # from the next rows in various combinations.
          #
          # When adding to the new matrix, pick original from matrix1 and
          # translation from matrix2 so that the original textual order is
          # preserved
          next if REPAIR_DELTAS.any? { |delta1, delta2| repair(matrix, iter1, delta1, iter2, delta2) }

          STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
          STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
          raise "unable to reconcile row #{iter1[:i]} of the backward grouping " \
                "with row #{iter2[:i]} of the forward grouping"
        end

        write_log(log_directory, "#{source.id}3", matrix) if log_directory

        unless ids_in(matrix, :original) == original_ids
          raise 'alignment matrix does not preserve the sequence of original sentences'
        end

        unless ids_in(matrix, :translation) == translation_ids
          raise 'alignment matrix does not preserve the sequence of translation sentences'
        end

        matrix
      end

      # NOTE: a bare `private` has no effect on methods defined with
      # `def self.`, so the helpers below have always been callable from
      # outside. They stay callable for backward compatibility but are not
      # part of the supported interface.

      # Groups translation sentences that share at least one original sentence.
      #
      # @api private
      # @return [Array<Hash>] rows in the order of `source.sentences`
      def self.group_forwards(alignment, source, blacklist = [])
        # Make a translation to original ID mapping; blacklisted sentences are
        # kept but treated as unaligned
        mapping = {}

        source.sentences.each do |sentence|
          mapping[sentence.id] =
            if blacklist.include?(sentence.id)
              []
            else
              sentence.inferred_alignment(alignment).map(&:id)
            end
        end

        # Translate to pairs of ID arrays, chunk consecutive translation IDs that
        # share at least one original ID, then reduce the result so we get an
        # array of m-to-n relations
        rows = mapping.map do |translation_id, original_ids|
          { original: original_ids, translation: [translation_id] }
        end

        rows.chunk_while { |x, y| !(x[:original] & y[:original]).empty? }.map do |chunk|
          {
            original: chunk.flat_map { |row| row[:original] }.uniq,
            translation: chunk.flat_map { |row| row[:translation] }
          }
        end
      end

      # Groups original sentences that share at least one translation sentence.
      #
      # @api private
      # @return [Array<Hash>] rows in the order of `alignment.sentences`
      def self.group_backwards(alignment, source, blacklist = [])
        # Make an original to translation ID mapping
        mapping = {}

        alignment.sentences.each do |sentence|
          mapping[sentence.id] = []
        end

        source.sentences.each do |sentence|
          next if blacklist.include?(sentence.id)

          sentence.inferred_alignment(alignment).each do |original|
            mapping[original.id] << sentence.id
          end
        end

        # Translate to pairs of ID arrays, chunk consecutive original IDs that
        # share at least one translation ID, then reduce the result so we get an
        # array of m-to-n relations
        rows = mapping.map do |original_id, translation_ids|
          { original: [original_id], translation: translation_ids }
        end

        rows.chunk_while { |x, y| !(x[:translation] & y[:translation]).empty? }.map do |chunk|
          {
            original: chunk.flat_map { |row| row[:original] },
            translation: chunk.flat_map { |row| row[:translation] }.uniq
          }
        end
      end

      # Concatenates the values of `field` in the current row and the `delta`
      # rows following it.
      #
      # @api private
      def self.repair_merge_cells(iter, delta, field)
        iter[:m][iter[:i], delta + 1].flat_map { |row| row[field] }
      end

      # Collects the values of `field` from those rows, among the current row
      # and the `delta` rows following it, whose `check_field` is empty.
      #
      # @api private
      def self.select_unaligned(iter, delta, field, check_field)
        window = iter[:m][iter[:i], delta + 1]
        window.select { |row| row[check_field].empty? }.map { |row| row[field] }.flatten
      end

      # Tries to reconcile the next `delta1 + 1` rows of `iter1` with the next
      # `delta2 + 1` rows of `iter2`. On success a merged row is appended to
      # `matrix`, both iterators are advanced past the merged rows and `true`
      # is returned; otherwise nothing is modified and `false` is returned.
      #
      # @api private
      def self.repair(matrix, iter1, delta1, iter2, delta2)
        # A window that extends past the end of either matrix cannot match.
        # Without this check the merge helpers would index a missing row.
        return false unless window_available?(iter1, delta1) && window_available?(iter2, delta2)

        o1 = repair_merge_cells(iter1, delta1, :original)
        o2 = repair_merge_cells(iter2, delta2, :original)

        t1 = repair_merge_cells(iter1, delta1, :translation)
        t2 = repair_merge_cells(iter2, delta2, :translation)

        u1 = select_unaligned(iter1, delta1, :original, :translation)
        u2 = select_unaligned(iter2, delta2, :translation, :original)

        # Ignoring sentences that are unaligned on their own side, both windows
        # must cover the same originals and the same translations
        return false unless o1.sort - u1 == o2.sort.uniq && t1.sort.uniq == t2.sort - u2

        unless delta1.zero? && delta2.zero?
          STDERR.puts "Assuming #{delta1 + 1}/#{delta2 + 1} swapped sentence order:"
          STDERR.puts ' * ' + (0..delta1).map { |j| iter1[:m][iter1[:i] + j].inspect }.join(' + ')
          STDERR.puts ' * ' + (0..delta2).map { |j| iter2[:m][iter2[:i] + j].inspect }.join(' + ')
        end

        matrix << { original: o1, translation: t2 }

        iter1[:i] += delta1 + 1
        iter2[:i] += delta2 + 1

        true
      end

      # Returns the IDs stored under `field` in all rows, in row order.
      #
      # @api private
      def self.ids_in(matrix, field)
        matrix.map { |row| row[field] }.flatten.compact
      end

      # Moves rows from `iter` to `matrix` for as long as the current row has
      # nothing in `field`.
      #
      # @api private
      def self.take_unaligned(matrix, iter, field)
        while iter[:i] < iter[:m].length && iter[:m][iter[:i]][field].empty?
          matrix << iter[:m][iter[:i]]
          iter[:i] += 1
        end
      end

      # Tests if the current row and the `delta` rows following it all exist.
      #
      # @api private
      def self.window_available?(iter, delta)
        iter[:i] + delta < iter[:m].length
      end

      # Writes one inspected row per line to the file `name` in `log_directory`.
      #
      # @api private
      def self.write_log(log_directory, name, rows)
        File.open(File.join(log_directory, name), 'w') do |f|
          rows.each { |row| f.puts row.inspect }
        end
      end
    end
  end
end
@@ -22,10 +22,17 @@ module PROIEL
22
22
 
23
23
  # Creates a new annotation schema object.
24
24
  def initialize(xml_object)
25
- @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
26
- @relation_tags = make_relation_tags(xml_object).freeze
27
- @morphology_tags = make_morphology_tags(xml_object).freeze
28
- @information_status_tags = make_information_status_tags(xml_object).freeze
25
+ if xml_object
26
+ @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
27
+ @relation_tags = make_relation_tags(xml_object).freeze
28
+ @morphology_tags = make_morphology_tags(xml_object).freeze
29
+ @information_status_tags = make_information_status_tags(xml_object).freeze
30
+ else
31
+ @part_of_speech_tags = {}.freeze
32
+ @relation_tags = {}.freeze
33
+ @morphology_tags = {}.freeze
34
+ @information_status_tags = {}.freeze
35
+ end
29
36
  end
30
37
 
31
38
  # @return [Hash<String,RelationTagDefinition>] definition of primary relation tags
@@ -0,0 +1,80 @@
1
+ #--
2
+ # Copyright (c) 2016-2017 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+
7
module PROIEL
  # Methods for parsing chronological descriptions. Extra care is taken to get
  # the interpretation of centuries and ranges involving the transition between 1
  # BC and AD 1 correct.
  module Chronology
    # A single year, optionally preceded by `c.` and followed by `BC`.
    YEAR_PATTERN = /\A\s*(?:c\.\s+)?(\d+)(\s+BC)?\s*\z/

    # A century AD, e.g. `1st c.`, `12th c.` or `21st c.`.
    CENTURY_PATTERN = /\A\s*(\d*1st|\d*2nd|\d*3rd|\d+th)\s+c\.\s*\z/

    # A century BC, e.g. `2nd c. BC`.
    CENTURY_BC_PATTERN = /\A\s*(\d*1st|\d*2nd|\d*3rd|\d+th)\s+c\.\s+BC\s*\z/

    # A range of two single years separated by a hyphen.
    RANGE_PATTERN = /\A\s*(?:c\.\s+)?\d+(?:\s+BC)?\s*-\s*(?:c\.\s+)?\d+(?:\s+BC)?\s*\z/

    private_constant :YEAR_PATTERN, :CENTURY_PATTERN, :CENTURY_BC_PATTERN, :RANGE_PATTERN

    # Computes the chronological midpoint of a chronological description.
    #
    # @param s [String] chronological description
    #
    # @return [Integer]
    #
    # @raise [ArgumentError] if the description cannot be parsed
    #
    # @example
    #   midpoint('1000') # => 1000
    #   midpoint('1000 BC') # => -1000
    #   midpoint('1000-1020') # => 1010
    def self.midpoint(s)
      interval = parse(s)

      case interval
      when Integer
        interval
      when Array
        from = interval.first
        to = interval.last

        if from < 0 && to > 0
          # Handle missing Julian year 0 by shifting years after 1 BC down by 1
          # and then shifting the midpoint back up again unless negative
          shifted = from + to - 1
          half = shifted.div(2)
          shifted < 0 ? half : half + 1
        else
          # A non-integer midpoint is within the year of the integer part;
          # Integer#div rounds towards negative infinity
          (from + to).div(2)
        end
      else
        raise ArgumentError, 'integer or array expected'
      end
    end

    # Parses a chronological description. The syntax of chronological
    # descriptions is explained in the [PROIEL XML
    # documentation](http://proiel.github.io/handbook/developer/proielxml.html#chronological-data).
    #
    # The whole string has to match: text on additional lines is rejected.
    #
    # @param s [String] chronological description
    #
    # @return [Integer, Array<Integer,Integer>]
    #
    # @raise [ArgumentError] if the format is not recognised, a year or
    #   century is zero, or a range does not run from an earlier to a later year
    #
    # @example
    #   parse('1000') # => 1000
    #   parse('1000 BC') # => -1000
    #   parse('1000-1020') # => [1000,1020]
    #   parse('1000 BC-1020') # => [-1000,1020]
    #   parse('21st c.') # => [2001,2100]
    def self.parse(s)
      case s
      when YEAR_PATTERN
        year = Regexp.last_match(1).to_i
        # There is no year zero in the Julian calendar
        raise ArgumentError, 'invalid year' if year.zero?

        Regexp.last_match(2) ? -year : year
      when CENTURY_PATTERN
        last_year = century_number(Regexp.last_match(1)) * 100
        [last_year - 99, last_year]
      when CENTURY_BC_PATTERN
        first_year = -century_number(Regexp.last_match(1)) * 100
        [first_year, first_year + 99]
      when RANGE_PATTERN
        # The pattern admits exactly one hyphen so this yields two years
        from, to = s.split('-').map { |part| parse(part) }
        raise ArgumentError, 'invalid range' unless from < to

        [from, to]
      else
        raise ArgumentError, 'unexpected format'
      end
    end

    # Converts an ordinal such as `21st` to its number. A century numbered
    # zero is rejected as it would include the non-existent year zero.
    def self.century_number(ordinal)
      number = ordinal.to_i
      raise ArgumentError, 'invalid century' if number.zero?

      number
    end
    private_class_method :century_number
  end
end
@@ -0,0 +1,79 @@
1
+ #--
2
+ # Copyright (c) 2018 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
module PROIEL
  # A dictionary: a collection of lemmata for one language together with the
  # sources that the lemmata were collected from.
  class Dictionary < TreebankObject
    # @return [Treebank] treebank that this dictionary belongs to
    attr_reader :treebank

    # @return [String] language of the dictionary as an ISO 639-3 language tag
    attr_reader :language

    # @return [String, nil] dialect of the dictionary
    attr_reader :dialect

    # @return [DateTime, nil] export time for the dictionary
    attr_reader :export_time

    # @return [Hash] all lemmata in the dictionary, keyed by lemma form and
    #   then by part of speech
    attr_reader :lemmata

    # @return [Integer] number of lemma entries read into the dictionary
    attr_reader :n

    # @return [Hash] all sources in the dictionary, keyed by source ID
    #   reference; each value is a hash with the keys `:license` and `:n`
    attr_reader :sources

    # Creates a new dictionary object.
    #
    # @param parent [Treebank] treebank that the dictionary belongs to
    # @param export_time [String, nil] export time in a format that
    #   `DateTime.parse` accepts
    # @param language [String] language tag; the string is frozen
    # @param dialect [String, nil] dialect; the string is frozen if given
    # @param xml [Object, nil] parsed XML object responding to `sources` and
    #   `lemmata`; if given, the dictionary is populated from it
    #
    # @raise [ArgumentError] if `export_time` is neither a string nor `nil`
    def initialize(parent, export_time, language, dialect, xml = nil)
      @treebank = parent

      unless export_time.nil? || export_time.is_a?(String)
        raise ArgumentError, 'string or nil expected'
      end

      @export_time = export_time && DateTime.parse(export_time).freeze

      @language = language.freeze
      @dialect = (dialect.freeze if dialect)

      @lemmata = {}
      @sources = {}
      @n = 0

      from_xml(xml) if xml
    end

    # FIXME
    #
    # @return [String] the language tag, which serves as the identifier
    def id
      @language
    end

    private

    # Populates the source and lemma tables from a parsed XML object.
    def from_xml(xml)
      xml.sources.each do |source|
        @sources[source.idref] = {
          license: nullify(source.license),
          n: nullify(source.n, :int)
        }
      end

      xml.lemmata.each do |entry|
        (@lemmata[entry.lemma] ||= {})[entry.part_of_speech] = Lemma.new(self, entry)
        @n += 1
      end
    end

    # Maps `nil` and blank strings to `nil`; any other value is converted to
    # an integer if `type` is `:int` and to a string otherwise.
    def nullify(s, type = nil)
      return nil if s.nil? || /^\s*$/ === s

      type == :int ? s.to_i : s.to_s
    end
  end
end
+ end