proiel 1.2.1 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/proiel.rb +8 -2
- data/lib/proiel/alignment.rb +3 -0
- data/lib/proiel/alignment/builder.rb +220 -0
- data/lib/proiel/annotation_schema.rb +11 -4
- data/lib/proiel/dictionary.rb +78 -2
- data/lib/proiel/dictionary/builder.rb +60 -36
- data/lib/proiel/div.rb +5 -2
- data/lib/proiel/language.rb +108 -0
- data/lib/proiel/lemma.rb +78 -0
- data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
- data/lib/proiel/proiel_xml/reader.rb +138 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/sentence.rb +5 -2
- data/lib/proiel/source.rb +10 -3
- data/lib/proiel/treebank.rb +21 -4
- data/lib/proiel/version.rb +1 -1
- data/lib/proiel/visualization/graphviz.rb +9 -5
- data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
- data/lib/proiel/visualization/graphviz/classic.dot.erb +2 -1
- data/lib/proiel/visualization/graphviz/linearized.dot.erb +7 -4
- data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
- data/lib/proiel/visualization/graphviz/packed.dot.erb +5 -3
- metadata +22 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccdb00c28a352d6f6481a76b5adf4bef5a426e98738c3ed4241134e202302aef
|
4
|
+
data.tar.gz: 299fde59d6c773a9f1246263f66ab3d37b4216f0b1dd873a552eb4c8d1cd6ef7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 105c8c89b0d3df2491fb51a03dbf96797af8e195edcdb1b901b12e81e0a632dac1e8b2ae6b398606fbc0b18b856134c338410094a5d03efd3783a7fed6b756e1
|
7
|
+
data.tar.gz: ce513c17bfa2301928551a49c81147f6da693c38a733b2cc749705f5f96dc2798c6dd48729e2929a3202f1061ba458c306470aedf6598c684744dfc2b74acfd4
|
data/lib/proiel.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2018 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -15,6 +15,7 @@ require 'erb'
|
|
15
15
|
require 'open3'
|
16
16
|
require 'set'
|
17
17
|
require 'builder'
|
18
|
+
require 'csv'
|
18
19
|
|
19
20
|
require 'proiel/version'
|
20
21
|
require 'proiel/utils'
|
@@ -32,7 +33,12 @@ require 'proiel/source'
|
|
32
33
|
require 'proiel/div'
|
33
34
|
require 'proiel/sentence'
|
34
35
|
require 'proiel/token'
|
36
|
+
require 'proiel/dictionary'
|
37
|
+
require 'proiel/dictionary/builder'
|
38
|
+
require 'proiel/lemma'
|
35
39
|
require 'proiel/visualization'
|
36
40
|
require 'proiel/chronology'
|
37
41
|
require 'proiel/valency'
|
38
|
-
require 'proiel/dictionary'
|
42
|
+
require 'proiel/dictionary/builder'
|
43
|
+
require 'proiel/alignment'
|
44
|
+
require 'proiel/language'
|
@@ -0,0 +1,220 @@
|
|
1
|
+
module PROIEL
|
2
|
+
module Alignment
|
3
|
+
module Builder
|
4
|
+
# This computes a matrix of original and translation sentences that are
|
5
|
+
# aligned. For now, this function does not handle translation sentences that
|
6
|
+
# are unaligned (this is tricky to handle robustly!). As the current treebank
|
7
|
+
# collection stands this is an issue that *should* not arise so this is for
|
8
|
+
# now a reasonable approximation.
|
9
|
+
def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
|
10
|
+
matrix1 = group_backwards(alignment, source, blacklist)
|
11
|
+
raise unless matrix1.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
|
12
|
+
|
13
|
+
matrix2 = group_forwards(alignment, source, blacklist)
|
14
|
+
raise unless matrix2.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
|
15
|
+
|
16
|
+
if log_directory
|
17
|
+
# Verify that both texts are still in the correct sequence
|
18
|
+
File.open(File.join(log_directory, "#{source.id}1"), 'w') do |f|
|
19
|
+
matrix1.map do |x|
|
20
|
+
f.puts x.inspect
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
File.open(File.join(log_directory, "#{source.id}2"), 'w') do |f|
|
25
|
+
matrix2.map do |x|
|
26
|
+
f.puts x.inspect
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
matrix = []
|
32
|
+
iter1 = { i: 0, m: matrix1 }
|
33
|
+
iter2 = { i: 0, m: matrix2 }
|
34
|
+
|
35
|
+
loop do
|
36
|
+
# Take from matrix1 unless we have a translation
|
37
|
+
while iter1[:i] < iter1[:m].length and iter1[:m][iter1[:i]][:translation].empty?
|
38
|
+
matrix << iter1[:m][iter1[:i]]
|
39
|
+
iter1[:i] += 1
|
40
|
+
end
|
41
|
+
|
42
|
+
# Take from matrix2 unless we have an original
|
43
|
+
while iter2[:i] < iter2[:m].length and iter2[:m][iter2[:i]][:original].empty?
|
44
|
+
matrix << iter2[:m][iter2[:i]]
|
45
|
+
iter2[:i] += 1
|
46
|
+
end
|
47
|
+
|
48
|
+
if iter1[:i] < iter1[:m].length and iter2[:i] < iter2[:m].length
|
49
|
+
# Now the two should match provided alignments are sorted the same way,
|
50
|
+
# so take one from each. If they don't match outright, we may have a case
|
51
|
+
# of swapped sentence orders or a gap (one sentence unaligned in one of
|
52
|
+
# the texts surrounded by two sentences that are aligned to the same
|
53
|
+
# sentence in the other text). We'll try to repair this by merging bits
|
54
|
+
# from the next row in various combinations.
|
55
|
+
#
|
56
|
+
# When adding to the new matrix, pick original from matrix1 and
|
57
|
+
# translation from matrix2 so that the original textual order is
|
58
|
+
# preserved
|
59
|
+
if repair(matrix, iter1, 0, iter2, 0) or
|
60
|
+
|
61
|
+
repair(matrix, iter1, 1, iter2, 0) or
|
62
|
+
repair(matrix, iter1, 0, iter2, 1) or
|
63
|
+
repair(matrix, iter1, 1, iter2, 1) or
|
64
|
+
|
65
|
+
repair(matrix, iter1, 2, iter2, 0) or
|
66
|
+
repair(matrix, iter1, 0, iter2, 2) or
|
67
|
+
repair(matrix, iter1, 2, iter2, 1) or
|
68
|
+
repair(matrix, iter1, 1, iter2, 2) or
|
69
|
+
repair(matrix, iter1, 2, iter2, 2) or
|
70
|
+
|
71
|
+
repair(matrix, iter1, 3, iter2, 0) or
|
72
|
+
repair(matrix, iter1, 0, iter2, 3) or
|
73
|
+
repair(matrix, iter1, 3, iter2, 1) or
|
74
|
+
repair(matrix, iter1, 1, iter2, 3) or
|
75
|
+
repair(matrix, iter1, 3, iter2, 2) or
|
76
|
+
repair(matrix, iter1, 2, iter2, 3) or
|
77
|
+
repair(matrix, iter1, 3, iter2, 3) or
|
78
|
+
|
79
|
+
repair(matrix, iter1, 4, iter2, 0) or
|
80
|
+
repair(matrix, iter1, 0, iter2, 4) or
|
81
|
+
repair(matrix, iter1, 4, iter2, 1) or
|
82
|
+
repair(matrix, iter1, 1, iter2, 4) or
|
83
|
+
repair(matrix, iter1, 4, iter2, 2) or
|
84
|
+
repair(matrix, iter1, 2, iter2, 4) or
|
85
|
+
repair(matrix, iter1, 4, iter2, 3) or
|
86
|
+
repair(matrix, iter1, 3, iter2, 4) or
|
87
|
+
repair(matrix, iter1, 4, iter2, 4)
|
88
|
+
else
|
89
|
+
STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
|
90
|
+
STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
|
91
|
+
raise
|
92
|
+
end
|
93
|
+
else
|
94
|
+
raise unless iter1[:i] == iter1[:m].length and iter2[:i] == iter2[:m].length
|
95
|
+
break
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
if log_directory
|
100
|
+
File.open(File.join(log_directory, "#{source.id}3"), 'w') do |f|
|
101
|
+
matrix.map do |x|
|
102
|
+
f.puts x.inspect
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
raise unless matrix.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
|
108
|
+
raise unless matrix.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
|
109
|
+
|
110
|
+
matrix
|
111
|
+
end
|
112
|
+
|
113
|
+
private
|
114
|
+
|
115
|
+
def self.group_forwards(alignment, source, blacklist = [])
|
116
|
+
# Make a translation to original ID mapping
|
117
|
+
mapping = {}
|
118
|
+
|
119
|
+
source.sentences.each do |sentence|
|
120
|
+
mapping[sentence.id] = []
|
121
|
+
|
122
|
+
next if blacklist.include?(sentence.id)
|
123
|
+
|
124
|
+
mapping[sentence.id] = sentence.inferred_alignment(alignment).map(&:id)
|
125
|
+
end
|
126
|
+
|
127
|
+
# Translate to pairs of ID arrays, chunk translation IDs that share at least
|
128
|
+
# one original ID, then reduce the result so we get an array of m-to-n
|
129
|
+
# relations
|
130
|
+
mapping.map do |v, k|
|
131
|
+
{ original: k, translation: [v] }
|
132
|
+
end.chunk_while do |x, y|
|
133
|
+
!(x[:original] & y[:original]).empty?
|
134
|
+
end.map do |chunk|
|
135
|
+
chunk.inject do |a, v|
|
136
|
+
a[:original] += v[:original]
|
137
|
+
a[:translation] += v[:translation]
|
138
|
+
a
|
139
|
+
end
|
140
|
+
end.map do |row|
|
141
|
+
{ original: row[:original].uniq, translation: row[:translation] }
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
def self.group_backwards(alignment, source, blacklist = [])
|
146
|
+
# Make an original to translation ID mapping
|
147
|
+
mapping = {}
|
148
|
+
|
149
|
+
alignment.sentences.each do |sentence|
|
150
|
+
mapping[sentence.id] = []
|
151
|
+
end
|
152
|
+
|
153
|
+
source.sentences.each do |sentence|
|
154
|
+
next if blacklist.include?(sentence.id)
|
155
|
+
|
156
|
+
original_ids = sentence.inferred_alignment(alignment).map(&:id)
|
157
|
+
|
158
|
+
original_ids.each do |original_id|
|
159
|
+
mapping[original_id] << sentence.id
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
# Translate to pairs of ID arrays, chunk original IDs that share at least
|
164
|
+
# one translation ID, then reduce the result so we get an array of m-to-n
|
165
|
+
# relations
|
166
|
+
mapping.map do |k, v|
|
167
|
+
{ original: [k], translation: v }
|
168
|
+
end.chunk_while do |x, y|
|
169
|
+
!(x[:translation] & y[:translation]).empty?
|
170
|
+
end.map do |chunk|
|
171
|
+
chunk.inject do |a, v|
|
172
|
+
a[:original] += v[:original]
|
173
|
+
a[:translation] += v[:translation]
|
174
|
+
a
|
175
|
+
end
|
176
|
+
end.map do |row|
|
177
|
+
{ original: row[:original], translation: row[:translation].uniq }
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
def self.repair_merge_cells(iter, delta, field)
|
182
|
+
matrix, i = iter[:m], iter[:i]
|
183
|
+
(0..delta).map { |j| matrix[i + j][field] }.inject(&:+)
|
184
|
+
end
|
185
|
+
|
186
|
+
def self.select_unaligned(iter, delta, field, check_field)
|
187
|
+
matrix, i = iter[:m], iter[:i]
|
188
|
+
(0..delta).select { |j| matrix[i + j][check_field].empty? }.map { |j| matrix[i + j][field] }.flatten
|
189
|
+
end
|
190
|
+
|
191
|
+
def self.repair(matrix, iter1, delta1, iter2, delta2)
|
192
|
+
o1 = repair_merge_cells(iter1, delta1, :original)
|
193
|
+
o2 = repair_merge_cells(iter2, delta2, :original)
|
194
|
+
|
195
|
+
t1 = repair_merge_cells(iter1, delta1, :translation)
|
196
|
+
t2 = repair_merge_cells(iter2, delta2, :translation)
|
197
|
+
|
198
|
+
u1 = select_unaligned(iter1, delta1, :original, :translation)
|
199
|
+
u2 = select_unaligned(iter2, delta2, :translation, :original)
|
200
|
+
|
201
|
+
if o1.sort - u1 == o2.sort.uniq and t1.sort.uniq == t2.sort - u2
|
202
|
+
unless delta1.zero? and delta2.zero?
|
203
|
+
STDERR.puts "Assuming #{delta1 + 1}/#{delta2 + 1} swapped sentence order:"
|
204
|
+
STDERR.puts ' * ' + (0..delta1).map { |j| iter1[:m][iter1[:i] + j].inspect }.join(' + ')
|
205
|
+
STDERR.puts ' * ' + (0..delta2).map { |j| iter2[:m][iter2[:i] + j].inspect }.join(' + ')
|
206
|
+
end
|
207
|
+
|
208
|
+
matrix << { original: o1, translation: t2 }
|
209
|
+
|
210
|
+
iter1[:i] += delta1 + 1
|
211
|
+
iter2[:i] += delta2 + 1
|
212
|
+
|
213
|
+
true
|
214
|
+
else
|
215
|
+
false
|
216
|
+
end
|
217
|
+
end
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
@@ -22,10 +22,17 @@ module PROIEL
|
|
22
22
|
|
23
23
|
# Creates a new annotation schema object.
|
24
24
|
def initialize(xml_object)
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
25
|
+
if xml_object
|
26
|
+
@part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
|
27
|
+
@relation_tags = make_relation_tags(xml_object).freeze
|
28
|
+
@morphology_tags = make_morphology_tags(xml_object).freeze
|
29
|
+
@information_status_tags = make_information_status_tags(xml_object).freeze
|
30
|
+
else
|
31
|
+
@part_of_speech_tags = {}.freeze
|
32
|
+
@relation_tags = {}.freeze
|
33
|
+
@morphology_tags = {}.freeze
|
34
|
+
@information_status_tags = {}.freeze
|
35
|
+
end
|
29
36
|
end
|
30
37
|
|
31
38
|
# @return [Hash<String,RelationTagDefinition>] definition of primary relation tags
|
data/lib/proiel/dictionary.rb
CHANGED
@@ -1,3 +1,79 @@
|
|
1
|
-
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2018 Marius L. Jøhndal
|
3
|
+
#
|
4
|
+
# See LICENSE in the top-level source directory for licensing terms.
|
5
|
+
#++
|
6
|
+
module PROIEL
|
7
|
+
class Dictionary < TreebankObject
|
8
|
+
# @return [Treebank] treebank that this dictionary belongs to
|
9
|
+
attr_reader :treebank
|
2
10
|
|
3
|
-
|
11
|
+
# @return [String] language of the dictionary as an ISO 639-3 language tag
|
12
|
+
attr_reader :language
|
13
|
+
|
14
|
+
# @return [String] dialect of the dictionary
|
15
|
+
attr_reader :dialect
|
16
|
+
|
17
|
+
# @return [DateTime] export time for the dictionary
|
18
|
+
attr_reader :export_time
|
19
|
+
|
20
|
+
# @return [Hash] all lemmata in the dictionary
|
21
|
+
attr_reader :lemmata
|
22
|
+
|
23
|
+
# @return [Integer] number of lemmata in the dictionary
|
24
|
+
attr_reader :n
|
25
|
+
|
26
|
+
# @return [Hash] all sources in the dictionary
|
27
|
+
attr_reader :sources
|
28
|
+
|
29
|
+
# Creates a new dictionary object.
|
30
|
+
def initialize(parent, export_time, language, dialect, xml = nil)
|
31
|
+
@treebank = parent
|
32
|
+
|
33
|
+
raise ArgumentError, 'string or nil expected' unless export_time.nil? or export_time.is_a?(String)
|
34
|
+
@export_time = export_time.nil? ? nil : DateTime.parse(export_time).freeze
|
35
|
+
|
36
|
+
@language = language.freeze
|
37
|
+
@dialect = dialect ? dialect.freeze : nil
|
38
|
+
|
39
|
+
@lemmata = {}
|
40
|
+
@sources = {}
|
41
|
+
@n = 0
|
42
|
+
|
43
|
+
from_xml(xml) if xml
|
44
|
+
end
|
45
|
+
|
46
|
+
# FIXME
|
47
|
+
def id
|
48
|
+
@language
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def from_xml(xml)
|
54
|
+
xml.sources.each do |s|
|
55
|
+
@sources[s.idref] = { license: nullify(s.license), n: nullify(s.n, :int) }
|
56
|
+
end
|
57
|
+
|
58
|
+
xml.lemmata.each do |l|
|
59
|
+
@lemmata[l.lemma] ||= {}
|
60
|
+
@lemmata[l.lemma][l.part_of_speech] = Lemma.new(self, l)
|
61
|
+
@n += 1
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def nullify(s, type = nil)
|
66
|
+
case s
|
67
|
+
when NilClass, /^\s*$/
|
68
|
+
nil
|
69
|
+
else
|
70
|
+
case type
|
71
|
+
when :int
|
72
|
+
s.to_i
|
73
|
+
else
|
74
|
+
s.to_s
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -1,12 +1,12 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2016-
|
2
|
+
# Copyright (c) 2016-2018 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
6
6
|
|
7
7
|
# Methods for synthesising and manipulating dictionaries from treebank data.
|
8
|
-
module PROIEL
|
9
|
-
class
|
8
|
+
module PROIEL
|
9
|
+
class DictionaryBuilder
|
10
10
|
attr_reader :license
|
11
11
|
attr_reader :language
|
12
12
|
attr_reader :sources
|
@@ -43,12 +43,13 @@ module PROIEL::Dictionary
|
|
43
43
|
builder.dictionary(language: @language) do
|
44
44
|
builder.sources do
|
45
45
|
@sources.each do |source|
|
46
|
-
builder.source(
|
46
|
+
builder.source(idref: source.id, license: source.license)
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
builder.lemmata
|
51
|
-
@lemmata.sort_by { |lemma, _| lemma.downcase }.each do |
|
50
|
+
builder.lemmata do
|
51
|
+
@lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
|
52
|
+
form, _ = form_and_pos.split(',')
|
52
53
|
lemma_to_xml(builder, form, data)
|
53
54
|
end
|
54
55
|
end
|
@@ -56,10 +57,41 @@ module PROIEL::Dictionary
|
|
56
57
|
end
|
57
58
|
end
|
58
59
|
|
60
|
+
def add_external_glosses!(filename, languages = %i(eng))
|
61
|
+
raise ArgumentError, 'filename expected' unless filename.is_a?(String)
|
62
|
+
raise ArgumentError, 'file not found' unless File.exists?(filename)
|
63
|
+
|
64
|
+
CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
|
65
|
+
header_converters: :symbol, quote_char: "\b") do |row|
|
66
|
+
h = row.to_h
|
67
|
+
data = languages.map { |l| [l, h[l]] }.to_h
|
68
|
+
|
69
|
+
lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
|
70
|
+
lemma[:glosses] ||= {}
|
71
|
+
lemma[:glosses].merge!(data)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
59
75
|
private
|
60
76
|
|
77
|
+
def initialize_lemma!(lemma, part_of_speech)
|
78
|
+
encoded_lemma = [lemma, part_of_speech].join(',')
|
79
|
+
|
80
|
+
@lemmata[encoded_lemma] ||= {}
|
81
|
+
@lemmata[encoded_lemma][:lemma] ||= lemma
|
82
|
+
@lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
|
83
|
+
@lemmata[encoded_lemma][:homographs] ||= []
|
84
|
+
@lemmata[encoded_lemma][:n] ||= 0
|
85
|
+
|
86
|
+
%i(distribution glosses paradigm valency).each do |k|
|
87
|
+
@lemmata[encoded_lemma][k] ||= {}
|
88
|
+
end
|
89
|
+
|
90
|
+
@lemmata[encoded_lemma]
|
91
|
+
end
|
92
|
+
|
61
93
|
def lemma_to_xml(builder, form, data)
|
62
|
-
builder.lemma(
|
94
|
+
builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
|
63
95
|
distribution_to_xml(builder, data)
|
64
96
|
glosses_to_xml(builder, data)
|
65
97
|
homographs_to_xml(builder, data)
|
@@ -69,17 +101,21 @@ module PROIEL::Dictionary
|
|
69
101
|
end
|
70
102
|
|
71
103
|
def distribution_to_xml(builder, data)
|
72
|
-
|
73
|
-
|
74
|
-
|
104
|
+
unless data[:distribution].empty?
|
105
|
+
builder.distribution do
|
106
|
+
data[:distribution].sort_by(&:first).each do |source_id, n|
|
107
|
+
builder.source(idref: source_id, n: n)
|
108
|
+
end
|
75
109
|
end
|
76
110
|
end
|
77
111
|
end
|
78
112
|
|
79
113
|
def glosses_to_xml(builder, data)
|
80
|
-
|
114
|
+
unless data[:glosses].empty?
|
81
115
|
builder.glosses do
|
82
|
-
|
116
|
+
data[:glosses].each do |language, value|
|
117
|
+
builder.gloss(value, language: language)
|
118
|
+
end
|
83
119
|
end
|
84
120
|
end
|
85
121
|
end
|
@@ -88,7 +124,8 @@ module PROIEL::Dictionary
|
|
88
124
|
if data[:homographs].count > 0
|
89
125
|
builder.homographs do
|
90
126
|
data[:homographs].each do |homograph|
|
91
|
-
|
127
|
+
lemma, part_of_speech = homograph.split(',')
|
128
|
+
builder.homograph lemma: lemma, "part-of-speech": part_of_speech
|
92
129
|
end
|
93
130
|
end
|
94
131
|
end
|
@@ -120,22 +157,21 @@ module PROIEL::Dictionary
|
|
120
157
|
builder.frame do
|
121
158
|
builder.arguments do
|
122
159
|
frame[:arguments].each do |argument|
|
160
|
+
# FIXME: deal with in a better way
|
161
|
+
argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
|
162
|
+
argument.delete(:part_of_speech)
|
123
163
|
builder.argument argument
|
124
164
|
end
|
125
165
|
end
|
126
166
|
|
127
|
-
if frame[:tokens][:a].count > 0
|
128
|
-
builder.tokens
|
167
|
+
if frame[:tokens][:a].count > 0 or frame[:tokens][:r].count > 0
|
168
|
+
builder.tokens do
|
129
169
|
frame[:tokens][:a].each do |token_id|
|
130
|
-
builder.token
|
170
|
+
builder.token(flags: 'a', idref: token_id)
|
131
171
|
end
|
132
|
-
end
|
133
|
-
end
|
134
172
|
|
135
|
-
if frame[:tokens][:r].count > 0
|
136
|
-
builder.tokens flags: 'r', n: frame[:tokens][:r].count do
|
137
173
|
frame[:tokens][:r].each do |token_id|
|
138
|
-
builder.token
|
174
|
+
builder.token(flags: 'r', idref: token_id)
|
139
175
|
end
|
140
176
|
end
|
141
177
|
end
|
@@ -146,7 +182,7 @@ module PROIEL::Dictionary
|
|
146
182
|
end
|
147
183
|
|
148
184
|
def index_homographs!
|
149
|
-
@lemmata.keys.group_by { |l| l.split(
|
185
|
+
@lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |m, homographs|
|
150
186
|
if homographs.count > 1
|
151
187
|
homographs.each do |form|
|
152
188
|
@lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
|
@@ -157,20 +193,9 @@ module PROIEL::Dictionary
|
|
157
193
|
|
158
194
|
def index_token!(token)
|
159
195
|
if token.lemma and token.part_of_speech
|
160
|
-
|
161
|
-
|
162
|
-
@lemmata[encoded_lemma] ||= {
|
163
|
-
lemma: token.lemma,
|
164
|
-
part_of_speech: token.part_of_speech,
|
165
|
-
distribution: {},
|
166
|
-
glosses: {},
|
167
|
-
homographs: [],
|
168
|
-
paradigm: {},
|
169
|
-
n: 0,
|
170
|
-
valency: {},
|
171
|
-
}
|
196
|
+
lemma = initialize_lemma!(token.lemma, token.part_of_speech)
|
172
197
|
|
173
|
-
lemma
|
198
|
+
lemma[:n] += 1
|
174
199
|
|
175
200
|
lemma[:distribution][token.source.id] ||= 0
|
176
201
|
lemma[:distribution][token.source.id] += 1
|
@@ -179,7 +204,6 @@ module PROIEL::Dictionary
|
|
179
204
|
lemma[:paradigm][token.morphology][token.form] ||= 0
|
180
205
|
lemma[:paradigm][token.morphology][token.form] += 1
|
181
206
|
|
182
|
-
lemma[:n] += 1
|
183
207
|
|
184
208
|
# Find verbal nodes
|
185
209
|
if token.part_of_speech[/^V/]
|