proiel 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
- data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
+ metadata.gz: ccdb00c28a352d6f6481a76b5adf4bef5a426e98738c3ed4241134e202302aef
+ data.tar.gz: 299fde59d6c773a9f1246263f66ab3d37b4216f0b1dd873a552eb4c8d1cd6ef7
  SHA512:
- metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
- data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309
+ metadata.gz: 105c8c89b0d3df2491fb51a03dbf96797af8e195edcdb1b901b12e81e0a632dac1e8b2ae6b398606fbc0b18b856134c338410094a5d03efd3783a7fed6b756e1
+ data.tar.gz: ce513c17bfa2301928551a49c81147f6da693c38a733b2cc749705f5f96dc2798c6dd48729e2929a3202f1061ba458c306470aedf6598c684744dfc2b74acfd4
data/lib/proiel.rb CHANGED
@@ -1,5 +1,5 @@
  #--
- # Copyright (c) 2015-2017 Marius L. Jøhndal
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
  #
  # See LICENSE in the top-level source directory for licensing terms.
  #++
@@ -15,6 +15,7 @@ require 'erb'
  require 'open3'
  require 'set'
  require 'builder'
+ require 'csv'
 
  require 'proiel/version'
  require 'proiel/utils'
@@ -32,7 +33,12 @@ require 'proiel/source'
  require 'proiel/div'
  require 'proiel/sentence'
  require 'proiel/token'
+ require 'proiel/dictionary'
+ require 'proiel/dictionary/builder'
+ require 'proiel/lemma'
  require 'proiel/visualization'
  require 'proiel/chronology'
  require 'proiel/valency'
- require 'proiel/dictionary'
+ require 'proiel/dictionary/builder'
+ require 'proiel/alignment'
+ require 'proiel/language'
data/lib/proiel/alignment.rb ADDED
@@ -0,0 +1,3 @@
+ module PROIEL::Alignment; end
+
+ require 'proiel/alignment/builder'
data/lib/proiel/alignment/builder.rb ADDED
@@ -0,0 +1,220 @@
+ module PROIEL
+ module Alignment
+ module Builder
+ # This computes a matrix of original and translation sentences that are
+ # aligned. For now, this function does not handle translation sentences that
+ # are unaligned (this is tricky to handle robustly!). As the current treebank
+ # collection stands, this is an issue that *should* not arise, so this is for
+ # now a reasonable approximation.
+ def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
+ matrix1 = group_backwards(alignment, source, blacklist)
+ raise unless matrix1.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
+
+ matrix2 = group_forwards(alignment, source, blacklist)
+ raise unless matrix2.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
+
+ if log_directory
+ # Verify that both texts are still in the correct sequence
+ File.open(File.join(log_directory, "#{source.id}1"), 'w') do |f|
+ matrix1.map do |x|
+ f.puts x.inspect
+ end
+ end
+
+ File.open(File.join(log_directory, "#{source.id}2"), 'w') do |f|
+ matrix2.map do |x|
+ f.puts x.inspect
+ end
+ end
+ end
+
+ matrix = []
+ iter1 = { i: 0, m: matrix1 }
+ iter2 = { i: 0, m: matrix2 }
+
+ loop do
+ # Take from matrix1 unless we have a translation
+ while iter1[:i] < iter1[:m].length and iter1[:m][iter1[:i]][:translation].empty?
+ matrix << iter1[:m][iter1[:i]]
+ iter1[:i] += 1
+ end
+
+ # Take from matrix2 unless we have an original
+ while iter2[:i] < iter2[:m].length and iter2[:m][iter2[:i]][:original].empty?
+ matrix << iter2[:m][iter2[:i]]
+ iter2[:i] += 1
+ end
+
+ if iter1[:i] < iter1[:m].length and iter2[:i] < iter2[:m].length
+ # Now the two should match provided alignments are sorted the same way,
+ # so take one from each. If they don't match outright, we may have a case
+ # of swapped sentence orders or a gap (one sentence unaligned in one of
+ # the texts surrounded by two sentences that are aligned to the same
+ # sentence in the other text). We'll try to repair this by merging bits
+ # from the next row in various combinations.
+ #
+ # When adding to the new matrix, pick original from matrix1 and
+ # translation from matrix2 so that the original textual order is
+ # preserved.
+ if repair(matrix, iter1, 0, iter2, 0) or
+
+ repair(matrix, iter1, 1, iter2, 0) or
+ repair(matrix, iter1, 0, iter2, 1) or
+ repair(matrix, iter1, 1, iter2, 1) or
+
+ repair(matrix, iter1, 2, iter2, 0) or
+ repair(matrix, iter1, 0, iter2, 2) or
+ repair(matrix, iter1, 2, iter2, 1) or
+ repair(matrix, iter1, 1, iter2, 2) or
+ repair(matrix, iter1, 2, iter2, 2) or
+
+ repair(matrix, iter1, 3, iter2, 0) or
+ repair(matrix, iter1, 0, iter2, 3) or
+ repair(matrix, iter1, 3, iter2, 1) or
+ repair(matrix, iter1, 1, iter2, 3) or
+ repair(matrix, iter1, 3, iter2, 2) or
+ repair(matrix, iter1, 2, iter2, 3) or
+ repair(matrix, iter1, 3, iter2, 3) or
+
+ repair(matrix, iter1, 4, iter2, 0) or
+ repair(matrix, iter1, 0, iter2, 4) or
+ repair(matrix, iter1, 4, iter2, 1) or
+ repair(matrix, iter1, 1, iter2, 4) or
+ repair(matrix, iter1, 4, iter2, 2) or
+ repair(matrix, iter1, 2, iter2, 4) or
+ repair(matrix, iter1, 4, iter2, 3) or
+ repair(matrix, iter1, 3, iter2, 4) or
+ repair(matrix, iter1, 4, iter2, 4)
+ else
+ STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
+ STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
+ raise
+ end
+ else
+ raise unless iter1[:i] == iter1[:m].length and iter2[:i] == iter2[:m].length
+ break
+ end
+ end
+
+ if log_directory
+ File.open(File.join(log_directory, "#{source.id}3"), 'w') do |f|
+ matrix.map do |x|
+ f.puts x.inspect
+ end
+ end
+ end
+
+ raise unless matrix.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
+ raise unless matrix.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
+
+ matrix
+ end
+
+ private
+
+ def self.group_forwards(alignment, source, blacklist = [])
+ # Make a translation to original ID mapping
+ mapping = {}
+
+ source.sentences.each do |sentence|
+ mapping[sentence.id] = []
+
+ next if blacklist.include?(sentence.id)
+
+ mapping[sentence.id] = sentence.inferred_alignment(alignment).map(&:id)
+ end
+
+ # Translate to pairs of ID arrays, chunk original IDs that share at least
+ # one translation ID, then reduce the result so we get an array of m-to-n
+ # relations.
+ mapping.map do |v, k|
+ { original: k, translation: [v] }
+ end.chunk_while do |x, y|
+ !(x[:original] & y[:original]).empty?
+ end.map do |chunk|
+ chunk.inject do |a, v|
+ a[:original] += v[:original]
+ a[:translation] += v[:translation]
+ a
+ end
+ end.map do |row|
+ { original: row[:original].uniq, translation: row[:translation] }
+ end
+ end
+
+ def self.group_backwards(alignment, source, blacklist = [])
+ # Make an original to translation ID mapping
+ mapping = {}
+
+ alignment.sentences.each do |sentence|
+ mapping[sentence.id] = []
+ end
+
+ source.sentences.each do |sentence|
+ next if blacklist.include?(sentence.id)
+
+ original_ids = sentence.inferred_alignment(alignment).map(&:id)
+
+ original_ids.each do |original_id|
+ mapping[original_id] << sentence.id
+ end
+ end
+
+ # Translate to pairs of ID arrays, chunk original IDs that share at least
+ # one translation ID, then reduce the result so we get an array of m-to-n
+ # relations.
+ mapping.map do |k, v|
+ { original: [k], translation: v }
+ end.chunk_while do |x, y|
+ !(x[:translation] & y[:translation]).empty?
+ end.map do |chunk|
+ chunk.inject do |a, v|
+ a[:original] += v[:original]
+ a[:translation] += v[:translation]
+ a
+ end
+ end.map do |row|
+ { original: row[:original], translation: row[:translation].uniq }
+ end
+ end
+
+ def self.repair_merge_cells(iter, delta, field)
+ matrix, i = iter[:m], iter[:i]
+ (0..delta).map { |j| matrix[i + j][field] }.inject(&:+)
+ end
+
+ def self.select_unaligned(iter, delta, field, check_field)
+ matrix, i = iter[:m], iter[:i]
+ (0..delta).select { |j| matrix[i + j][check_field].empty? }.map { |j| matrix[i + j][field] }.flatten
+ end
+
+ def self.repair(matrix, iter1, delta1, iter2, delta2)
+ o1 = repair_merge_cells(iter1, delta1, :original)
+ o2 = repair_merge_cells(iter2, delta2, :original)
+
+ t1 = repair_merge_cells(iter1, delta1, :translation)
+ t2 = repair_merge_cells(iter2, delta2, :translation)
+
+ u1 = select_unaligned(iter1, delta1, :original, :translation)
+ u2 = select_unaligned(iter2, delta2, :translation, :original)
+
+ if o1.sort - u1 == o2.sort.uniq and t1.sort.uniq == t2.sort - u2
+ unless delta1.zero? and delta2.zero?
+ STDERR.puts "Assuming #{delta1 + 1}/#{delta2 + 1} swapped sentence order:"
+ STDERR.puts ' * ' + (0..delta1).map { |j| iter1[:m][iter1[:i] + j].inspect }.join(' + ')
+ STDERR.puts ' * ' + (0..delta2).map { |j| iter2[:m][iter2[:i] + j].inspect }.join(' + ')
+ end
+
+ matrix << { original: o1, translation: t2 }
+
+ iter1[:i] += delta1 + 1
+ iter2[:i] += delta2 + 1
+
+ true
+ else
+ false
+ end
+ end
+ end
+ end
+ end
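
A minimal usage sketch of the new aligner (not part of the diff). The treebank filename is hypothetical, and it is an assumption here that the treebank contains exactly two sources with the original text first; Treebank#load_from_xml and Treebank#sources are part of the existing gem API. compute_matrix takes the original source first and the translation source second.

  require 'proiel'

  tb = PROIEL::Treebank.new
  tb.load_from_xml('treebank.xml')

  original, translation = tb.sources.first(2)
  matrix = PROIEL::Alignment::Builder.compute_matrix(original, translation)

  # Each row is an m-to-n mapping between sentence IDs in the two texts.
  matrix.each do |row|
    puts "#{row[:original].join(',')} <-> #{row[:translation].join(',')}"
  end
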
data/lib/proiel/annotation_schema.rb CHANGED
@@ -22,10 +22,17 @@ module PROIEL
 
  # Creates a new annotation schema object.
  def initialize(xml_object)
- @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
- @relation_tags = make_relation_tags(xml_object).freeze
- @morphology_tags = make_morphology_tags(xml_object).freeze
- @information_status_tags = make_information_status_tags(xml_object).freeze
+ if xml_object
+ @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
+ @relation_tags = make_relation_tags(xml_object).freeze
+ @morphology_tags = make_morphology_tags(xml_object).freeze
+ @information_status_tags = make_information_status_tags(xml_object).freeze
+ else
+ @part_of_speech_tags = {}.freeze
+ @relation_tags = {}.freeze
+ @morphology_tags = {}.freeze
+ @information_status_tags = {}.freeze
+ end
  end
 
  # @return [Hash<String,RelationTagDefinition>] definition of primary relation tags
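
In effect, the change above makes the xml_object argument optional. A short sketch of the nil case (not part of the diff), assuming the tag sets are exposed as readers, as their @return documentation suggests:

  # With no XML object, every tag set is an empty, frozen hash.
  schema = PROIEL::AnnotationSchema.new(nil)
  schema.part_of_speech_tags      # => {}
  schema.information_status_tags  # => {}
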
data/lib/proiel/dictionary.rb CHANGED
@@ -1,3 +1,79 @@
- module PROIEL::Dictionary; end
+ #--
+ # Copyright (c) 2018 Marius L. Jøhndal
+ #
+ # See LICENSE in the top-level source directory for licensing terms.
+ #++
+ module PROIEL
+ class Dictionary < TreebankObject
+ # @return [Treebank] treebank that this dictionary belongs to
+ attr_reader :treebank
 
- require 'proiel/dictionary/builder'
+ # @return [String] language of the dictionary as an ISO 639-3 language tag
+ attr_reader :language
+
+ # @return [String] dialect of the dictionary
+ attr_reader :dialect
+
+ # @return [DateTime] export time for the dictionary
+ attr_reader :export_time
+
+ # @return [Hash] all lemmata in the dictionary
+ attr_reader :lemmata
+
+ # @return [Integer] number of lemmata in the dictionary
+ attr_reader :n
+
+ # @return [Hash] all sources in the dictionary
+ attr_reader :sources
+
+ # Creates a new dictionary object.
+ def initialize(parent, export_time, language, dialect, xml = nil)
+ @treebank = parent
+
+ raise ArgumentError, 'string or nil expected' unless export_time.nil? or export_time.is_a?(String)
+ @export_time = export_time.nil? ? nil : DateTime.parse(export_time).freeze
+
+ @language = language.freeze
+ @dialect = dialect ? dialect.freeze : nil
+
+ @lemmata = {}
+ @sources = {}
+ @n = 0
+
+ from_xml(xml) if xml
+ end
+
+ # FIXME
+ def id
+ @language
+ end
+
+ private
+
+ def from_xml(xml)
+ xml.sources.each do |s|
+ @sources[s.idref] = { license: nullify(s.license), n: nullify(s.n, :int) }
+ end
+
+ xml.lemmata.each do |l|
+ @lemmata[l.lemma] ||= {}
+ @lemmata[l.lemma][l.part_of_speech] = Lemma.new(self, l)
+ @n += 1
+ end
+ end
+
+ def nullify(s, type = nil)
+ case s
+ when NilClass, /^\s*$/
+ nil
+ else
+ case type
+ when :int
+ s.to_i
+ else
+ s.to_s
+ end
+ end
+ end
+ end
+ end
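
A short sketch of the constructor contract shown above (not part of the diff); tb stands for an existing PROIEL::Treebank instance:

  dictionary = PROIEL::Dictionary.new(tb, '2018-06-01T12:00:00', 'lat', nil)
  dictionary.language     # => "lat"
  dictionary.export_time  # => DateTime for 2018-06-01T12:00:00
  dictionary.n            # => 0; lemmata are only populated from an XML payload

  # With an XML payload, from_xml indexes lemmata by lemma form, then by
  # part-of-speech tag, e.g. (keys hypothetical):
  # dictionary.lemmata['sum']['V-'] # => PROIEL::Lemma
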
data/lib/proiel/dictionary/builder.rb CHANGED
@@ -1,12 +1,12 @@
  #--
- # Copyright (c) 2016-2017 Marius L. Jøhndal
+ # Copyright (c) 2016-2018 Marius L. Jøhndal
  #
  # See LICENSE in the top-level source directory for licensing terms.
  #++
 
  # Methods for synthesising and manipulating dictionaries from treebank data.
- module PROIEL::Dictionary
- class Builder
+ module PROIEL
+ class DictionaryBuilder
  attr_reader :license
  attr_reader :language
  attr_reader :sources
@@ -43,12 +43,13 @@ module PROIEL::Dictionary
  builder.dictionary(language: @language) do
  builder.sources do
  @sources.each do |source|
- builder.source(id: source.id, license: source.license)
+ builder.source(idref: source.id, license: source.license)
  end
  end
 
- builder.lemmata(n: @lemmata.count) do
- @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form, data|
+ builder.lemmata do
+ @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
+ form, _ = form_and_pos.split(',')
  lemma_to_xml(builder, form, data)
  end
  end
@@ -56,10 +57,41 @@ module PROIEL::Dictionary
  end
  end
 
+ def add_external_glosses!(filename, languages = %i(eng))
+ raise ArgumentError, 'filename expected' unless filename.is_a?(String)
+ raise ArgumentError, 'file not found' unless File.exists?(filename)
+
+ CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
+ header_converters: :symbol, quote_char: "\b") do |row|
+ h = row.to_h
+ data = languages.map { |l| [l, h[l]] }.to_h
+
+ lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
+ lemma[:glosses] ||= {}
+ lemma[:glosses].merge!(data)
+ end
+ end
+
  private
 
+ def initialize_lemma!(lemma, part_of_speech)
+ encoded_lemma = [lemma, part_of_speech].join(',')
+
+ @lemmata[encoded_lemma] ||= {}
+ @lemmata[encoded_lemma][:lemma] ||= lemma
+ @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
+ @lemmata[encoded_lemma][:homographs] ||= []
+ @lemmata[encoded_lemma][:n] ||= 0
+
+ %i(distribution glosses paradigm valency).each do |k|
+ @lemmata[encoded_lemma][k] ||= {}
+ end
+
+ @lemmata[encoded_lemma]
+ end
+
  def lemma_to_xml(builder, form, data)
- builder.lemma(form: form, part_of_speech: data[:part_of_speech], n: data[:n]) do
+ builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
  distribution_to_xml(builder, data)
  glosses_to_xml(builder, data)
  homographs_to_xml(builder, data)
@@ -69,17 +101,21 @@ module PROIEL::Dictionary
  end
 
  def distribution_to_xml(builder, data)
- builder.distribution do
- data[:distribution].sort_by(&:first).each do |source_id, n|
- builder.source(id: source_id, n: n)
+ unless data[:distribution].empty?
+ builder.distribution do
+ data[:distribution].sort_by(&:first).each do |source_id, n|
+ builder.source(idref: source_id, n: n)
+ end
  end
  end
  end
 
  def glosses_to_xml(builder, data)
- if data[:glosses].count > 0
+ unless data[:glosses].empty?
  builder.glosses do
- # TODO
+ data[:glosses].each do |language, value|
+ builder.gloss(value, language: language)
+ end
  end
  end
  end
@@ -88,7 +124,8 @@ module PROIEL::Dictionary
  if data[:homographs].count > 0
  builder.homographs do
  data[:homographs].each do |homograph|
- builder.lemma form: homograph
+ lemma, part_of_speech = homograph.split(',')
+ builder.homograph lemma: lemma, "part-of-speech": part_of_speech
  end
  end
  end
@@ -120,22 +157,21 @@ module PROIEL::Dictionary
  builder.frame do
  builder.arguments do
  frame[:arguments].each do |argument|
+ # FIXME: deal with this in a better way
+ argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
+ argument.delete(:part_of_speech)
  builder.argument argument
  end
  end
 
- if frame[:tokens][:a].count > 0
- builder.tokens flags: 'a', n: frame[:tokens][:a].count do
+ if frame[:tokens][:a].count > 0 or frame[:tokens][:r].count > 0
+ builder.tokens do
  frame[:tokens][:a].each do |token_id|
- builder.token id: token_id
+ builder.token(flags: 'a', idref: token_id)
  end
- end
- end
 
- if frame[:tokens][:r].count > 0
- builder.tokens flags: 'r', n: frame[:tokens][:r].count do
  frame[:tokens][:r].each do |token_id|
- builder.token id: token_id
+ builder.token(flags: 'r', idref: token_id)
  end
  end
  end
@@ -146,7 +182,7 @@ module PROIEL::Dictionary
  end
 
  def index_homographs!
- @lemmata.keys.group_by { |l| l.split(',').first }.each do |m, homographs|
+ @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |m, homographs|
  if homographs.count > 1
  homographs.each do |form|
  @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
@@ -157,20 +193,9 @@ module PROIEL::Dictionary
 
  def index_token!(token)
  if token.lemma and token.part_of_speech
- encoded_lemma = [token.lemma, token.part_of_speech].join(',')
-
- @lemmata[encoded_lemma] ||= {
- lemma: token.lemma,
- part_of_speech: token.part_of_speech,
- distribution: {},
- glosses: {},
- homographs: [],
- paradigm: {},
- n: 0,
- valency: {},
- }
+ lemma = initialize_lemma!(token.lemma, token.part_of_speech)
 
- lemma = @lemmata[encoded_lemma]
+ lemma[:n] += 1
 
  lemma[:distribution][token.source.id] ||= 0
  lemma[:distribution][token.source.id] += 1
@@ -179,7 +204,6 @@ module PROIEL::Dictionary
  lemma[:paradigm][token.morphology][token.form] ||= 0
  lemma[:paradigm][token.morphology][token.form] += 1
 
- lemma[:n] += 1
 
  # Find verbal nodes
  if token.part_of_speech[/^V/]
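
The gloss file read by add_external_glosses! follows from the CSV options above: tab-separated with a header row that is converted to symbol keys, so it needs lemma and part_of_speech columns plus one column per gloss language. A hypothetical file and call (not part of the diff; it assumes builder is an already-populated PROIEL::DictionaryBuilder, whose construction is not shown in this diff):

  # glosses.tsv, with literal tab characters separating the columns:
  #
  #   lemma  part_of_speech  eng  fra
  #   sum    V-              be   être
  #   dico   V-              say  dire

  builder.add_external_glosses!('glosses.tsv', %i(eng fra))
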