proiel 1.2.1 → 1.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 10affa8825a31d3bcb810a5dbc41a7869c4fe7d7cb15b1c361cc8c13947d3c4a
-   data.tar.gz: 43145ff2225e521599bdc96983c295b2ccdef1a9b642849523f3852fb68b4d8d
+   metadata.gz: ccdb00c28a352d6f6481a76b5adf4bef5a426e98738c3ed4241134e202302aef
+   data.tar.gz: 299fde59d6c773a9f1246263f66ab3d37b4216f0b1dd873a552eb4c8d1cd6ef7
  SHA512:
-   metadata.gz: cc4b7b78021b97304c93429bab8fbe44f38a2e4740c280c5085a86ecb6c43a4e44c55936a0192196d5b769a3f54169ff8dfe64eb31305c07abd791d1e6ea0a17
-   data.tar.gz: cfcadba2ef52a4d81c6aa432549618c5c9dfef55876ae313f7cdd15704a825cb82be06b1fda0f53ef5983f17470aa443bf5be1d70d659fb066b1a3bbd57ea309
+   metadata.gz: 105c8c89b0d3df2491fb51a03dbf96797af8e195edcdb1b901b12e81e0a632dac1e8b2ae6b398606fbc0b18b856134c338410094a5d03efd3783a7fed6b756e1
+   data.tar.gz: ce513c17bfa2301928551a49c81147f6da693c38a733b2cc749705f5f96dc2798c6dd48729e2929a3202f1061ba458c306470aedf6598c684744dfc2b74acfd4
data/lib/proiel.rb CHANGED
@@ -1,5 +1,5 @@
  #--
- # Copyright (c) 2015-2017 Marius L. Jøhndal
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
  #
  # See LICENSE in the top-level source directory for licensing terms.
  #++
@@ -15,6 +15,7 @@ require 'erb'
  require 'open3'
  require 'set'
  require 'builder'
+ require 'csv'

  require 'proiel/version'
  require 'proiel/utils'
@@ -32,7 +33,12 @@ require 'proiel/source'
  require 'proiel/div'
  require 'proiel/sentence'
  require 'proiel/token'
+ require 'proiel/dictionary'
+ require 'proiel/dictionary/builder'
+ require 'proiel/lemma'
  require 'proiel/visualization'
  require 'proiel/chronology'
  require 'proiel/valency'
- require 'proiel/dictionary'
+ require 'proiel/dictionary/builder'
+ require 'proiel/alignment'
+ require 'proiel/language'
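With this change a single require 'proiel' also pulls in Ruby's csv library and the new dictionary, lemma, alignment and language files. A minimal sketch of what becomes available (the treebank file name is hypothetical; Treebank#load_from_xml is the gem's existing loader):

    require 'proiel'

    tb = PROIEL::Treebank.new
    tb.load_from_xml('treebank.xml')  # hypothetical file name

    # Constants introduced or reorganised in 1.3.0:
    PROIEL::Alignment::Builder   # sentence-alignment matrices (see the new file below)
    PROIEL::DictionaryBuilder    # synthesises dictionaries from treebank data
    PROIEL::Lemma                # a single dictionary entry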
data/lib/proiel/alignment.rb ADDED
@@ -0,0 +1,3 @@
+ module PROIEL::Alignment; end
+
+ require 'proiel/alignment/builder'
data/lib/proiel/alignment/builder.rb ADDED
@@ -0,0 +1,220 @@
+ module PROIEL
+   module Alignment
+     module Builder
+       # This computes a matrix of original and translation sentences that are
+       # aligned. For now, this function does not handle translation sentences that
+       # are unaligned (this is tricky to handle robustly!). As the current treebank
+       # collection stands this is an issue that *should* not arise so this is for
+       # now a reasonable approximation.
+       def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
+         matrix1 = group_backwards(alignment, source, blacklist)
+         raise unless matrix1.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
+
+         matrix2 = group_forwards(alignment, source, blacklist)
+         raise unless matrix2.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
+
+         if log_directory
+           # Verify that both texts are still in the correct sequence
+           File.open(File.join(log_directory, "#{source.id}1"), 'w') do |f|
+             matrix1.map do |x|
+               f.puts x.inspect
+             end
+           end
+
+           File.open(File.join(log_directory, "#{source.id}2"), 'w') do |f|
+             matrix2.map do |x|
+               f.puts x.inspect
+             end
+           end
+         end
+
+         matrix = []
+         iter1 = { i: 0, m: matrix1 }
+         iter2 = { i: 0, m: matrix2 }
+
+         loop do
+           # Take from matrix1 unless we have a translation
+           while iter1[:i] < iter1[:m].length and iter1[:m][iter1[:i]][:translation].empty?
+             matrix << iter1[:m][iter1[:i]]
+             iter1[:i] += 1
+           end
+
+           # Take from matrix2 unless we have an original
+           while iter2[:i] < iter2[:m].length and iter2[:m][iter2[:i]][:original].empty?
+             matrix << iter2[:m][iter2[:i]]
+             iter2[:i] += 1
+           end
+
+           if iter1[:i] < iter1[:m].length and iter2[:i] < iter2[:m].length
+             # Now the two should match provided alignments are sorted the same way,
+             # so take one from each. If they don't match outright, we may have a case
+             # of swapped sentence orders or a gap (one sentence unaligned in one of
+             # the texts surrounded by two sentences that are aligned to the same
+             # sentence in the other text). We'll try to repair this by merging bits
+             # from the next row in various combinations.
+             #
+             # When adding to the new matrix, pick original from matrix1 and
+             # translation from matrix2 so that the original textual order is
+             # preserved
+             if repair(matrix, iter1, 0, iter2, 0) or
+
+                repair(matrix, iter1, 1, iter2, 0) or
+                repair(matrix, iter1, 0, iter2, 1) or
+                repair(matrix, iter1, 1, iter2, 1) or
+
+                repair(matrix, iter1, 2, iter2, 0) or
+                repair(matrix, iter1, 0, iter2, 2) or
+                repair(matrix, iter1, 2, iter2, 1) or
+                repair(matrix, iter1, 1, iter2, 2) or
+                repair(matrix, iter1, 2, iter2, 2) or
+
+                repair(matrix, iter1, 3, iter2, 0) or
+                repair(matrix, iter1, 0, iter2, 3) or
+                repair(matrix, iter1, 3, iter2, 1) or
+                repair(matrix, iter1, 1, iter2, 3) or
+                repair(matrix, iter1, 3, iter2, 2) or
+                repair(matrix, iter1, 2, iter2, 3) or
+                repair(matrix, iter1, 3, iter2, 3) or
+
+                repair(matrix, iter1, 4, iter2, 0) or
+                repair(matrix, iter1, 0, iter2, 4) or
+                repair(matrix, iter1, 4, iter2, 1) or
+                repair(matrix, iter1, 1, iter2, 4) or
+                repair(matrix, iter1, 4, iter2, 2) or
+                repair(matrix, iter1, 2, iter2, 4) or
+                repair(matrix, iter1, 4, iter2, 3) or
+                repair(matrix, iter1, 3, iter2, 4) or
+                repair(matrix, iter1, 4, iter2, 4)
+             else
+               STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
+               STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
+               raise
+             end
+           else
+             raise unless iter1[:i] == iter1[:m].length and iter2[:i] == iter2[:m].length
+             break
+           end
+         end
+
+         if log_directory
+           File.open(File.join(log_directory, "#{source.id}3"), 'w') do |f|
+             matrix.map do |x|
+               f.puts x.inspect
+             end
+           end
+         end
+
+         raise unless matrix.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
+         raise unless matrix.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
+
+         matrix
+       end
+
+       private
+
+       def self.group_forwards(alignment, source, blacklist = [])
+         # Make an original to translation ID mapping
+         mapping = {}
+
+         source.sentences.each do |sentence|
+           mapping[sentence.id] = []
+
+           next if blacklist.include?(sentence.id)
+
+           mapping[sentence.id] = sentence.inferred_alignment(alignment).map(&:id)
+         end
+
+         # Translate to pairs of ID arrays, chunk original IDs that share at least
+         # one translation ID, then reduce the result so we get an array of m-to-n
+         # relations
+         mapping.map do |v, k|
+           { original: k, translation: [v] }
+         end.chunk_while do |x, y|
+           !(x[:original] & y[:original]).empty?
+         end.map do |chunk|
+           chunk.inject do |a, v|
+             a[:original] += v[:original]
+             a[:translation] += v[:translation]
+             a
+           end
+         end.map do |row|
+           { original: row[:original].uniq, translation: row[:translation] }
+         end
+       end
+
+       def self.group_backwards(alignment, source, blacklist = [])
+         # Make an original to translation ID mapping
+         mapping = {}
+
+         alignment.sentences.each do |sentence|
+           mapping[sentence.id] = []
+         end
+
+         source.sentences.each do |sentence|
+           next if blacklist.include?(sentence.id)
+
+           original_ids = sentence.inferred_alignment(alignment).map(&:id)
+
+           original_ids.each do |original_id|
+             mapping[original_id] << sentence.id
+           end
+         end
+
+         # Translate to pairs of ID arrays, chunk original IDs that share at least
+         # one translation ID, then reduce the result so we get an array of m-to-n
+         # relations
+         mapping.map do |k, v|
+           { original: [k], translation: v }
+         end.chunk_while do |x, y|
+           !(x[:translation] & y[:translation]).empty?
+         end.map do |chunk|
+           chunk.inject do |a, v|
+             a[:original] += v[:original]
+             a[:translation] += v[:translation]
+             a
+           end
+         end.map do |row|
+           { original: row[:original], translation: row[:translation].uniq }
+         end
+       end
+
+       def self.repair_merge_cells(iter, delta, field)
+         matrix, i = iter[:m], iter[:i]
+         (0..delta).map { |j| matrix[i + j][field] }.inject(&:+)
+       end
+
+       def self.select_unaligned(iter, delta, field, check_field)
+         matrix, i = iter[:m], iter[:i]
+         (0..delta).select { |j| matrix[i + j][check_field].empty? }.map { |j| matrix[i + j][field] }.flatten
+       end
+
+       def self.repair(matrix, iter1, delta1, iter2, delta2)
+         o1 = repair_merge_cells(iter1, delta1, :original)
+         o2 = repair_merge_cells(iter2, delta2, :original)
+
+         t1 = repair_merge_cells(iter1, delta1, :translation)
+         t2 = repair_merge_cells(iter2, delta2, :translation)
+
+         u1 = select_unaligned(iter1, delta1, :original, :translation)
+         u2 = select_unaligned(iter2, delta2, :translation, :original)
+
+         if o1.sort - u1 == o2.sort.uniq and t1.sort.uniq == t2.sort - u2
+           unless delta1.zero? and delta2.zero?
+             STDERR.puts "Assuming #{delta1 + 1}/#{delta2 + 1} swapped sentence order:"
+             STDERR.puts ' * ' + (0..delta1).map { |j| iter1[:m][iter1[:i] + j].inspect }.join(' + ')
+             STDERR.puts ' * ' + (0..delta2).map { |j| iter2[:m][iter2[:i] + j].inspect }.join(' + ')
+           end
+
+           matrix << { original: o1, translation: t2 }
+
+           iter1[:i] += delta1 + 1
+           iter2[:i] += delta2 + 1
+
+           true
+         else
+           false
+         end
+       end
+     end
+   end
+ end
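The entry point is compute_matrix: the first argument is the original source and the second the translated source whose sentences carry the alignment (via inferred_alignment). A minimal sketch, assuming both sources live in one treebank, that the translation is aligned to the original, and that the file names are hypothetical:

    require 'proiel'

    tb = PROIEL::Treebank.new
    tb.load_from_xml('orig.xml')    # the original text
    tb.load_from_xml('trans.xml')   # the translation aligned to it

    original, translation = tb.sources

    matrix = PROIEL::Alignment::Builder.compute_matrix(original, translation)
    # Each row pairs sentence IDs from the two texts, e.g.
    #   { original: [<original sentence IDs>], translation: [<translation sentence IDs>] }
    matrix.each { |row| p row }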
data/lib/proiel/annotation_schema.rb CHANGED
@@ -22,10 +22,17 @@ module PROIEL

      # Creates a new annotation schema object.
      def initialize(xml_object)
-       @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
-       @relation_tags = make_relation_tags(xml_object).freeze
-       @morphology_tags = make_morphology_tags(xml_object).freeze
-       @information_status_tags = make_information_status_tags(xml_object).freeze
+       if xml_object
+         @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
+         @relation_tags = make_relation_tags(xml_object).freeze
+         @morphology_tags = make_morphology_tags(xml_object).freeze
+         @information_status_tags = make_information_status_tags(xml_object).freeze
+       else
+         @part_of_speech_tags = {}.freeze
+         @relation_tags = {}.freeze
+         @morphology_tags = {}.freeze
+         @information_status_tags = {}.freeze
+       end
      end

      # @return [Hash<String,RelationTagDefinition>] definition of primary relation tags
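With this change the constructor tolerates a missing annotation element: passing nil now yields empty, frozen tag tables instead of raising. A minimal sketch, assuming the class is the gem's PROIEL::AnnotationSchema and that it exposes readers for these hashes as the surrounding documentation suggests:

    schema = PROIEL::AnnotationSchema.new(nil)

    schema.part_of_speech_tags      # => {}
    schema.relation_tags            # => {}
    schema.morphology_tags          # => {}
    schema.information_status_tags  # => {}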
data/lib/proiel/dictionary.rb CHANGED
@@ -1,3 +1,79 @@
- module PROIEL::Dictionary; end
+ #--
+ # Copyright (c) 2018 Marius L. Jøhndal
+ #
+ # See LICENSE in the top-level source directory for licensing terms.
+ #++
+ module PROIEL
+   class Dictionary < TreebankObject
+     # @return [Treebank] treebank that this source belongs to
+     attr_reader :treebank

- require 'proiel/dictionary/builder'
+     # @return [String] language of the source as an ISO 639-3 language tag
+     attr_reader :language
+
+     # @return [String] dialect of the source
+     attr_reader :dialect
+
+     # @return [DateTime] export time for the dictionary
+     attr_reader :export_time
+
+     # @return [Hash] all lemmata in the dictionary
+     attr_reader :lemmata
+
+     # @return [Integer] number of lemmata in the dictionary
+     attr_reader :n
+
+     # @return [Hash] all sources in the dictionary
+     attr_reader :sources
+
+     # Creates a new dictionary object.
+     def initialize(parent, export_time, language, dialect, xml = nil)
+       @treebank = parent
+
+       raise ArgumentError, 'string or nil expected' unless export_time.nil? or export_time.is_a?(String)
+       @export_time = export_time.nil? ? nil : DateTime.parse(export_time).freeze
+
+       @language = language.freeze
+       @dialect = dialect ? dialect.freeze : nil
+
+       @lemmata = {}
+       @sources = {}
+       @n = 0
+
+       from_xml(xml) if xml
+     end
+
+     # FIXME
+     def id
+       @language
+     end
+
+     private
+
+     def from_xml(xml)
+       xml.sources.each do |s|
+         @sources[s.idref] = { license: nullify(s.license), n: nullify(s.n, :int) }
+       end
+
+       xml.lemmata.each do |l|
+         @lemmata[l.lemma] ||= {}
+         @lemmata[l.lemma][l.part_of_speech] = Lemma.new(self, l)
+         @n += 1
+       end
+     end
+
+     def nullify(s, type = nil)
+       case s
+       when NilClass, /^\s*$/
+         nil
+       else
+         case type
+         when :int
+           s.to_i
+         else
+           s.to_s
+         end
+       end
+     end
+   end
+ end
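Lemmata are stored in a two-level hash, first by lemma form and then by part-of-speech tag, with PROIEL::Lemma objects as the leaves. A sketch of reading from an already loaded dictionary (the lemma form, tag and source ID are only illustrative):

    # dictionary is assumed to be a PROIEL::Dictionary built from a dictionary XML file
    dictionary.language      # => "lat" (ISO 639-3 tag)
    dictionary.n             # total number of lemmata
    dictionary.sources       # => { "caes-gal" => { license: "...", n: 1234 }, ... }

    entries = dictionary.lemmata['sum']  # all parts of speech recorded for the form "sum"
    lemma   = entries && entries['V-']   # => the PROIEL::Lemma for the verb, or nil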
data/lib/proiel/dictionary/builder.rb CHANGED
@@ -1,12 +1,12 @@
  #--
- # Copyright (c) 2016-2017 Marius L. Jøhndal
+ # Copyright (c) 2016-2018 Marius L. Jøhndal
  #
  # See LICENSE in the top-level source directory for licensing terms.
  #++

  # Methods for synthesising and manipulating dictionaries from treebank data.
- module PROIEL::Dictionary
-   class Builder
+ module PROIEL
+   class DictionaryBuilder
      attr_reader :license
      attr_reader :language
      attr_reader :sources
@@ -43,12 +43,13 @@ module PROIEL::Dictionary
        builder.dictionary(language: @language) do
          builder.sources do
            @sources.each do |source|
-             builder.source(id: source.id, license: source.license)
+             builder.source(idref: source.id, license: source.license)
            end
          end

-         builder.lemmata(n: @lemmata.count) do
-           @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form, data|
+         builder.lemmata do
+           @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
+             form, _ = form_and_pos.split(',')
              lemma_to_xml(builder, form, data)
            end
          end
@@ -56,10 +57,41 @@ module PROIEL::Dictionary
        end
      end

+     def add_external_glosses!(filename, languages = %i(eng))
+       raise ArgumentError, 'filename expected' unless filename.is_a?(String)
+       raise ArgumentError, 'file not found' unless File.exists?(filename)
+
+       CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
+                   header_converters: :symbol, quote_char: "\b") do |row|
+         h = row.to_h
+         data = languages.map { |l| [l, h[l]] }.to_h
+
+         lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
+         lemma[:glosses] ||= {}
+         lemma[:glosses].merge!(data)
+       end
+     end
+
      private

+     def initialize_lemma!(lemma, part_of_speech)
+       encoded_lemma = [lemma, part_of_speech].join(',')
+
+       @lemmata[encoded_lemma] ||= {}
+       @lemmata[encoded_lemma][:lemma] ||= lemma
+       @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
+       @lemmata[encoded_lemma][:homographs] ||= []
+       @lemmata[encoded_lemma][:n] ||= 0
+
+       %i(distribution glosses paradigm valency).each do |k|
+         @lemmata[encoded_lemma][k] ||= {}
+       end
+
+       @lemmata[encoded_lemma]
+     end
+
      def lemma_to_xml(builder, form, data)
-       builder.lemma(form: form, part_of_speech: data[:part_of_speech], n: data[:n]) do
+       builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
          distribution_to_xml(builder, data)
          glosses_to_xml(builder, data)
          homographs_to_xml(builder, data)
@@ -69,17 +101,21 @@ module PROIEL::Dictionary
      end

      def distribution_to_xml(builder, data)
-       builder.distribution do
-         data[:distribution].sort_by(&:first).each do |source_id, n|
-           builder.source(id: source_id, n: n)
+       unless data[:distribution].empty?
+         builder.distribution do
+           data[:distribution].sort_by(&:first).each do |source_id, n|
+             builder.source(idref: source_id, n: n)
+           end
          end
        end
      end

      def glosses_to_xml(builder, data)
-       if data[:glosses].count > 0
+       unless data[:glosses].empty?
          builder.glosses do
-           # TODO
+           data[:glosses].each do |language, value|
+             builder.gloss(value, language: language)
+           end
          end
        end
      end
@@ -88,7 +124,8 @@ module PROIEL::Dictionary
        if data[:homographs].count > 0
          builder.homographs do
            data[:homographs].each do |homograph|
-             builder.lemma form: homograph
+             lemma, part_of_speech = homograph.split(',')
+             builder.homograph lemma: lemma, "part-of-speech": part_of_speech
            end
          end
        end
@@ -120,22 +157,21 @@ module PROIEL::Dictionary
          builder.frame do
            builder.arguments do
              frame[:arguments].each do |argument|
+               # FIXME: deal with in a better way
+               argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
+               argument.delete(:part_of_speech)
                builder.argument argument
              end
            end

-           if frame[:tokens][:a].count > 0
-             builder.tokens flags: 'a', n: frame[:tokens][:a].count do
+           if frame[:tokens][:a].count > 0 or frame[:tokens][:r].count > 0
+             builder.tokens do
                frame[:tokens][:a].each do |token_id|
-                 builder.token id: token_id
+                 builder.token(flags: 'a', idref: token_id)
                end
-             end
-           end

-           if frame[:tokens][:r].count > 0
-             builder.tokens flags: 'r', n: frame[:tokens][:r].count do
                frame[:tokens][:r].each do |token_id|
-                 builder.token id: token_id
+                 builder.token(flags: 'r', idref: token_id)
                end
              end
            end
@@ -146,7 +182,7 @@ module PROIEL::Dictionary
      end

      def index_homographs!
-       @lemmata.keys.group_by { |l| l.split(',').first }.each do |m, homographs|
+       @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |m, homographs|
          if homographs.count > 1
            homographs.each do |form|
              @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
@@ -157,20 +193,9 @@ module PROIEL::Dictionary

      def index_token!(token)
        if token.lemma and token.part_of_speech
-         encoded_lemma = [token.lemma, token.part_of_speech].join(',')
-
-         @lemmata[encoded_lemma] ||= {
-           lemma: token.lemma,
-           part_of_speech: token.part_of_speech,
-           distribution: {},
-           glosses: {},
-           homographs: [],
-           paradigm: {},
-           n: 0,
-           valency: {},
-         }
+         lemma = initialize_lemma!(token.lemma, token.part_of_speech)

-         lemma = @lemmata[encoded_lemma]
+         lemma[:n] += 1

          lemma[:distribution][token.source.id] ||= 0
          lemma[:distribution][token.source.id] += 1
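initialize_lemma! and index_token! keep one record per "lemma,part_of_speech" key; a sketch of the shape such a record takes after indexing (all values are illustrative):

    @lemmata['sum,V-']
    # => {
    #      lemma: 'sum',
    #      part_of_speech: 'V-',
    #      homographs: [],                      # other "lemma,pos" keys sharing the form
    #      n: 123,                              # tokens indexed for this lemma
    #      distribution: { 'caes-gal' => 42 },  # token counts per source ID
    #      glosses: { eng: 'be' },              # filled in by add_external_glosses!
    #      paradigm: { '<morphology tag>' => { 'est' => 10 } },  # counts per morphology and form
    #      valency: {}                          # verbal frames, collected further down
    #    }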
@@ -179,7 +204,6 @@ module PROIEL::Dictionary
          lemma[:paradigm][token.morphology][token.form] ||= 0
          lemma[:paradigm][token.morphology][token.form] += 1

-         lemma[:n] += 1

          # Find verbal nodes
          if token.part_of_speech[/^V/]