proiel 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/proiel.rb +8 -2
- data/lib/proiel/alignment.rb +3 -0
- data/lib/proiel/alignment/builder.rb +220 -0
- data/lib/proiel/annotation_schema.rb +11 -4
- data/lib/proiel/dictionary.rb +78 -2
- data/lib/proiel/dictionary/builder.rb +60 -36
- data/lib/proiel/div.rb +5 -2
- data/lib/proiel/language.rb +108 -0
- data/lib/proiel/lemma.rb +78 -0
- data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
- data/lib/proiel/proiel_xml/reader.rb +138 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/sentence.rb +5 -2
- data/lib/proiel/source.rb +10 -3
- data/lib/proiel/treebank.rb +21 -4
- data/lib/proiel/version.rb +1 -1
- data/lib/proiel/visualization/graphviz.rb +9 -5
- data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
- data/lib/proiel/visualization/graphviz/classic.dot.erb +2 -1
- data/lib/proiel/visualization/graphviz/linearized.dot.erb +7 -4
- data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
- data/lib/proiel/visualization/graphviz/packed.dot.erb +5 -3
- metadata +22 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ccdb00c28a352d6f6481a76b5adf4bef5a426e98738c3ed4241134e202302aef
+  data.tar.gz: 299fde59d6c773a9f1246263f66ab3d37b4216f0b1dd873a552eb4c8d1cd6ef7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 105c8c89b0d3df2491fb51a03dbf96797af8e195edcdb1b901b12e81e0a632dac1e8b2ae6b398606fbc0b18b856134c338410094a5d03efd3783a7fed6b756e1
+  data.tar.gz: ce513c17bfa2301928551a49c81147f6da693c38a733b2cc749705f5f96dc2798c6dd48729e2929a3202f1061ba458c306470aedf6598c684744dfc2b74acfd4
data/lib/proiel.rb
CHANGED
@@ -1,5 +1,5 @@
 #--
-# Copyright (c) 2015-
+# Copyright (c) 2015-2018 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++
@@ -15,6 +15,7 @@ require 'erb'
 require 'open3'
 require 'set'
 require 'builder'
+require 'csv'

 require 'proiel/version'
 require 'proiel/utils'
@@ -32,7 +33,12 @@ require 'proiel/source'
 require 'proiel/div'
 require 'proiel/sentence'
 require 'proiel/token'
+require 'proiel/dictionary'
+require 'proiel/dictionary/builder'
+require 'proiel/lemma'
 require 'proiel/visualization'
 require 'proiel/chronology'
 require 'proiel/valency'
-require 'proiel/dictionary'
+require 'proiel/dictionary/builder'
+require 'proiel/alignment'
+require 'proiel/language'
data/lib/proiel/alignment/builder.rb
ADDED
@@ -0,0 +1,220 @@
+module PROIEL
+  module Alignment
+    module Builder
+      # This computes a matrix of original and translation sentences that are
+      # aligned. For now, this function does not handle translation sentences that
+      # are unaligned (this is tricky to handle robustly!). As the current treebank
+      # collection stands this is an issue that *should* not arise so this is for
+      # now a reasonable approximation.
+      def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
+        matrix1 = group_backwards(alignment, source, blacklist)
+        raise unless matrix1.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
+
+        matrix2 = group_forwards(alignment, source, blacklist)
+        raise unless matrix2.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
+
+        if log_directory
+          # Verify that both texts are still in the correct sequence
+          File.open(File.join(log_directory, "#{source.id}1"), 'w') do |f|
+            matrix1.map do |x|
+              f.puts x.inspect
+            end
+          end
+
+          File.open(File.join(log_directory, "#{source.id}2"), 'w') do |f|
+            matrix2.map do |x|
+              f.puts x.inspect
+            end
+          end
+        end
+
+        matrix = []
+        iter1 = { i: 0, m: matrix1 }
+        iter2 = { i: 0, m: matrix2 }
+
+        loop do
+          # Take from matrix1 unless we have a translation
+          while iter1[:i] < iter1[:m].length and iter1[:m][iter1[:i]][:translation].empty?
+            matrix << iter1[:m][iter1[:i]]
+            iter1[:i] += 1
+          end
+
+          # Take from matrix2 unless we have an original
+          while iter2[:i] < iter2[:m].length and iter2[:m][iter2[:i]][:original].empty?
+            matrix << iter2[:m][iter2[:i]]
+            iter2[:i] += 1
+          end
+
+          if iter1[:i] < iter1[:m].length and iter2[:i] < iter2[:m].length
+            # Now the two should match provided alignments are sorted the same way,
+            # so take one from each. If they don't match outright, we may have a case
+            # of swapped sentence orders or a gap (one sentence unaligned in one of
+            # the texts surrounded by two sentences that are aligned to the same
+            # sentence in the other text). We'll try to repair this by merging bits
+            # from the next row in various combinations.
+            #
+            # When adding to the new matrix, pick original from matrix1 and
+            # translation from matrix2 so that the original textual order is
+            # preserved
+            if repair(matrix, iter1, 0, iter2, 0) or
+
+               repair(matrix, iter1, 1, iter2, 0) or
+               repair(matrix, iter1, 0, iter2, 1) or
+               repair(matrix, iter1, 1, iter2, 1) or
+
+               repair(matrix, iter1, 2, iter2, 0) or
+               repair(matrix, iter1, 0, iter2, 2) or
+               repair(matrix, iter1, 2, iter2, 1) or
+               repair(matrix, iter1, 1, iter2, 2) or
+               repair(matrix, iter1, 2, iter2, 2) or
+
+               repair(matrix, iter1, 3, iter2, 0) or
+               repair(matrix, iter1, 0, iter2, 3) or
+               repair(matrix, iter1, 3, iter2, 1) or
+               repair(matrix, iter1, 1, iter2, 3) or
+               repair(matrix, iter1, 3, iter2, 2) or
+               repair(matrix, iter1, 2, iter2, 3) or
+               repair(matrix, iter1, 3, iter2, 3) or
+
+               repair(matrix, iter1, 4, iter2, 0) or
+               repair(matrix, iter1, 0, iter2, 4) or
+               repair(matrix, iter1, 4, iter2, 1) or
+               repair(matrix, iter1, 1, iter2, 4) or
+               repair(matrix, iter1, 4, iter2, 2) or
+               repair(matrix, iter1, 2, iter2, 4) or
+               repair(matrix, iter1, 4, iter2, 3) or
+               repair(matrix, iter1, 3, iter2, 4) or
+               repair(matrix, iter1, 4, iter2, 4)
+            else
+              STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
+              STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
+              raise
+            end
+          else
+            raise unless iter1[:i] == iter1[:m].length and iter2[:i] == iter2[:m].length
+            break
+          end
+        end
+
+        if log_directory
+          File.open(File.join(log_directory, "#{source.id}3"), 'w') do |f|
+            matrix.map do |x|
+              f.puts x.inspect
+            end
+          end
+        end
+
+        raise unless matrix.map { |r| r[:original] }.flatten.compact == alignment.sentences.map(&:id)
+        raise unless matrix.map { |r| r[:translation] }.flatten.compact == source.sentences.map(&:id)
+
+        matrix
+      end
+
+      private
+
+      def self.group_forwards(alignment, source, blacklist = [])
+        # Make an original to translation ID mapping
+        mapping = {}
+
+        source.sentences.each do |sentence|
+          mapping[sentence.id] = []
+
+          next if blacklist.include?(sentence.id)
+
+          mapping[sentence.id] = sentence.inferred_alignment(alignment).map(&:id)
+        end
+
+        # Translate to a pairs of ID arrays, chunk original IDs that share at least
+        # one translation ID, then reduce the result so we get an array of m-to-n
+        # relations
+        mapping.map do |v, k|
+          { original: k, translation: [v] }
+        end.chunk_while do |x, y|
+          !(x[:original] & y[:original]).empty?
+        end.map do |chunk|
+          chunk.inject do |a, v|
+            a[:original] += v[:original]
+            a[:translation] += v[:translation]
+            a
+          end
+        end.map do |row|
+          { original: row[:original].uniq, translation: row[:translation] }
+        end
+      end
+
+      def self.group_backwards(alignment, source, blacklist = [])
+        # Make an original to translation ID mapping
+        mapping = {}
+
+        alignment.sentences.each do |sentence|
+          mapping[sentence.id] = []
+        end
+
+        source.sentences.each do |sentence|
+          next if blacklist.include?(sentence.id)
+
+          original_ids = sentence.inferred_alignment(alignment).map(&:id)
+
+          original_ids.each do |original_id|
+            mapping[original_id] << sentence.id
+          end
+        end
+
+        # Translate to a pairs of ID arrays, chunk original IDs that share at least
+        # one translation ID, then reduce the result so we get an array of m-to-n
+        # relations
+        mapping.map do |k, v|
+          { original: [k], translation: v }
+        end.chunk_while do |x, y|
+          !(x[:translation] & y[:translation]).empty?
+        end.map do |chunk|
+          chunk.inject do |a, v|
+            a[:original] += v[:original]
+            a[:translation] += v[:translation]
+            a
+          end
+        end.map do |row|
+          { original: row[:original], translation: row[:translation].uniq }
+        end
+      end
+
+      def self.repair_merge_cells(iter, delta, field)
+        matrix, i = iter[:m], iter[:i]
+        (0..delta).map { |j| matrix[i + j][field] }.inject(&:+)
+      end
+
+      def self.select_unaligned(iter, delta, field, check_field)
+        matrix, i = iter[:m], iter[:i]
+        (0..delta).select { |j| matrix[i + j][check_field].empty? }.map { |j| matrix[i + j][field] }.flatten
+      end
+
+      def self.repair(matrix, iter1, delta1, iter2, delta2)
+        o1 = repair_merge_cells(iter1, delta1, :original)
+        o2 = repair_merge_cells(iter2, delta2, :original)
+
+        t1 = repair_merge_cells(iter1, delta1, :translation)
+        t2 = repair_merge_cells(iter2, delta2, :translation)
+
+        u1 = select_unaligned(iter1, delta1, :original, :translation)
+        u2 = select_unaligned(iter2, delta2, :translation, :original)
+
+        if o1.sort - u1 == o2.sort.uniq and t1.sort.uniq == t2.sort - u2
+          unless delta1.zero? and delta2.zero?
+            STDERR.puts "Assuming #{delta1 + 1}/#{delta2 + 1} swapped sentence order:"
+            STDERR.puts ' * ' + (0..delta1).map { |j| iter1[:m][iter1[:i] + j].inspect }.join(' + ')
+            STDERR.puts ' * ' + (0..delta2).map { |j| iter2[:m][iter2[:i] + j].inspect }.join(' + ')
+          end
+
+          matrix << { original: o1, translation: t2 }
+
+          iter1[:i] += delta1 + 1
+          iter2[:i] += delta2 + 1
+
+          true
+        else
+          false
+        end
+      end
+    end
+  end
+end
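A hedged sketch of driving compute_matrix from a loaded treebank. The file names and source ids below are placeholders, and it assumes the translation source declares an alignment to the original source, which is what inferred_alignment relies on:

    require 'proiel'

    tb = PROIEL::Treebank.new
    tb.load_from_xml('original.xml')      # placeholder file names
    tb.load_from_xml('translation.xml')

    original    = tb.find_source('orig-id')   # placeholder source ids
    translation = tb.find_source('trans-id')

    # Each row is { original: [...], translation: [...] }: one m-to-n group of
    # mutually aligned sentence ids, kept in the original's textual order.
    matrix = PROIEL::Alignment::Builder.compute_matrix(original, translation)
    matrix.each { |row| puts row.inspect }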
data/lib/proiel/annotation_schema.rb
CHANGED
@@ -22,10 +22,17 @@ module PROIEL

    # Creates a new annotation schema object.
    def initialize(xml_object)
-      @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
-      @relation_tags = make_relation_tags(xml_object).freeze
-      @morphology_tags = make_morphology_tags(xml_object).freeze
-      @information_status_tags = make_information_status_tags(xml_object).freeze
+      if xml_object
+        @part_of_speech_tags = make_part_of_speech_tags(xml_object).freeze
+        @relation_tags = make_relation_tags(xml_object).freeze
+        @morphology_tags = make_morphology_tags(xml_object).freeze
+        @information_status_tags = make_information_status_tags(xml_object).freeze
+      else
+        @part_of_speech_tags = {}.freeze
+        @relation_tags = {}.freeze
+        @morphology_tags = {}.freeze
+        @information_status_tags = {}.freeze
+      end
    end

    # @return [Hash<String,RelationTagDefinition>] definition of primary relation tags
data/lib/proiel/dictionary.rb
CHANGED
@@ -1,3 +1,79 @@
-
+#--
+# Copyright (c) 2018 Marius L. Jøhndal
+#
+# See LICENSE in the top-level source directory for licensing terms.
+#++
+module PROIEL
+  class Dictionary < TreebankObject
+    # @return [Treebank] treebank that this source belongs to
+    attr_reader :treebank

-
+    # @return [String] language of the source as an ISO 639-3 language tag
+    attr_reader :language
+
+    # @return [String] dialect of the source
+    attr_reader :dialect
+
+    # @return [DateTime] export time for the dictionary
+    attr_reader :export_time
+
+    # @return [Hash] all lemmata in the dictionary
+    attr_reader :lemmata
+
+    # @return [Integer] number of lemmata in the dictionary
+    attr_reader :n
+
+    # @return [Hash] all sources in the dictionary
+    attr_reader :sources
+
+    # Creates a new dictionary object.
+    def initialize(parent, export_time, language, dialect, xml = nil)
+      @treebank = parent
+
+      raise ArgumentError, 'string or nil expected' unless export_time.nil? or export_time.is_a?(String)
+      @export_time = export_time.nil? ? nil : DateTime.parse(export_time).freeze
+
+      @language = language.freeze
+      @dialect = dialect ? dialect.freeze : nil
+
+      @lemmata = {}
+      @sources = {}
+      @n = 0
+
+      from_xml(xml) if xml
+    end
+
+    # FIXME
+    def id
+      @language
+    end
+
+    private
+
+    def from_xml(xml)
+      xml.sources.each do |s|
+        @sources[s.idref] = { license: nullify(s.license), n: nullify(s.n, :int) }
+      end
+
+      xml.lemmata.each do |l|
+        @lemmata[l.lemma] ||= {}
+        @lemmata[l.lemma][l.part_of_speech] = Lemma.new(self, l)
+        @n += 1
+      end
+    end
+
+    def nullify(s, type = nil)
+      case s
+      when NilClass, /^\s*$/
+        nil
+      else
+        case type
+        when :int
+          s.to_i
+        else
+          s.to_s
+        end
+      end
+    end
+  end
+end
data/lib/proiel/dictionary/builder.rb
CHANGED
@@ -1,12 +1,12 @@
 #--
-# Copyright (c) 2016-
+# Copyright (c) 2016-2018 Marius L. Jøhndal
 #
 # See LICENSE in the top-level source directory for licensing terms.
 #++

 # Methods for synthesising and manipulating dictionaries from treebank data.
-module PROIEL
-  class
+module PROIEL
+  class DictionaryBuilder
     attr_reader :license
     attr_reader :language
     attr_reader :sources
@@ -43,12 +43,13 @@ module PROIEL::Dictionary
       builder.dictionary(language: @language) do
         builder.sources do
           @sources.each do |source|
-            builder.source(
+            builder.source(idref: source.id, license: source.license)
           end
         end

-        builder.lemmata
-        @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |
+        builder.lemmata do
+          @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
+            form, _ = form_and_pos.split(',')
             lemma_to_xml(builder, form, data)
           end
         end
@@ -56,10 +57,41 @@ module PROIEL::Dictionary
       end
     end

+    def add_external_glosses!(filename, languages = %i(eng))
+      raise ArgumentError, 'filename expected' unless filename.is_a?(String)
+      raise ArgumentError, 'file not found' unless File.exists?(filename)
+
+      CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
+                  header_converters: :symbol, quote_char: "\b") do |row|
+        h = row.to_h
+        data = languages.map { |l| [l, h[l]] }.to_h
+
+        lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
+        lemma[:glosses] ||= {}
+        lemma[:glosses].merge!(data)
+      end
+    end
+
     private

+    def initialize_lemma!(lemma, part_of_speech)
+      encoded_lemma = [lemma, part_of_speech].join(',')
+
+      @lemmata[encoded_lemma] ||= {}
+      @lemmata[encoded_lemma][:lemma] ||= lemma
+      @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
+      @lemmata[encoded_lemma][:homographs] ||= []
+      @lemmata[encoded_lemma][:n] ||= 0
+
+      %i(distribution glosses paradigm valency).each do |k|
+        @lemmata[encoded_lemma][k] ||= {}
+      end
+
+      @lemmata[encoded_lemma]
+    end
+
     def lemma_to_xml(builder, form, data)
-      builder.lemma(
+      builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
         distribution_to_xml(builder, data)
         glosses_to_xml(builder, data)
         homographs_to_xml(builder, data)
@@ -69,17 +101,21 @@ module PROIEL::Dictionary
       end
     end

     def distribution_to_xml(builder, data)
-
-
-
+      unless data[:distribution].empty?
+        builder.distribution do
+          data[:distribution].sort_by(&:first).each do |source_id, n|
+            builder.source(idref: source_id, n: n)
+          end
         end
       end
     end

     def glosses_to_xml(builder, data)
-
+      unless data[:glosses].empty?
         builder.glosses do
-
+          data[:glosses].each do |language, value|
+            builder.gloss(value, language: language)
+          end
         end
       end
     end
@@ -88,7 +124,8 @@ module PROIEL::Dictionary
       if data[:homographs].count > 0
         builder.homographs do
           data[:homographs].each do |homograph|
-
+            lemma, part_of_speech = homograph.split(',')
+            builder.homograph lemma: lemma, "part-of-speech": part_of_speech
           end
         end
       end
@@ -120,22 +157,21 @@ module PROIEL::Dictionary
         builder.frame do
           builder.arguments do
             frame[:arguments].each do |argument|
+              # FIXME: deal with in a better way
+              argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
+              argument.delete(:part_of_speech)
               builder.argument argument
             end
           end

-          if frame[:tokens][:a].count > 0
-            builder.tokens
+          if frame[:tokens][:a].count > 0 or frame[:tokens][:r].count > 0
+            builder.tokens do
               frame[:tokens][:a].each do |token_id|
-                builder.token
+                builder.token(flags: 'a', idref: token_id)
               end
-            end
-          end

-          if frame[:tokens][:r].count > 0
-            builder.tokens flags: 'r', n: frame[:tokens][:r].count do
               frame[:tokens][:r].each do |token_id|
-                builder.token
+                builder.token(flags: 'r', idref: token_id)
               end
             end
           end
@@ -146,7 +182,7 @@ module PROIEL::Dictionary
     end

     def index_homographs!
-      @lemmata.keys.group_by { |l| l.split(
+      @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |m, homographs|
         if homographs.count > 1
           homographs.each do |form|
             @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
@@ -157,20 +193,9 @@ module PROIEL::Dictionary

     def index_token!(token)
       if token.lemma and token.part_of_speech
-
-
-        @lemmata[encoded_lemma] ||= {
-          lemma: token.lemma,
-          part_of_speech: token.part_of_speech,
-          distribution: {},
-          glosses: {},
-          homographs: [],
-          paradigm: {},
-          n: 0,
-          valency: {},
-        }
+        lemma = initialize_lemma!(token.lemma, token.part_of_speech)

-        lemma
+        lemma[:n] += 1

         lemma[:distribution][token.source.id] ||= 0
         lemma[:distribution][token.source.id] += 1
@@ -179,7 +204,6 @@ module PROIEL::Dictionary
         lemma[:paradigm][token.morphology][token.form] ||= 0
         lemma[:paradigm][token.morphology][token.form] += 1

-        lemma[:n] += 1

         # Find verbal nodes
         if token.part_of_speech[/^V/]