proiel 1.1.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +1 -1
- data/README.md +2 -2
- data/lib/proiel.rb +16 -1
- data/lib/proiel/alignment.rb +3 -0
- data/lib/proiel/alignment/builder.rb +220 -0
- data/lib/proiel/annotation_schema.rb +11 -4
- data/lib/proiel/chronology.rb +80 -0
- data/lib/proiel/dictionary.rb +79 -0
- data/lib/proiel/dictionary/builder.rb +224 -0
- data/lib/proiel/div.rb +22 -3
- data/lib/proiel/language.rb +108 -0
- data/lib/proiel/lemma.rb +77 -0
- data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
- data/lib/proiel/proiel_xml/reader.rb +138 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/proiel_xml/validator.rb +76 -9
- data/lib/proiel/sentence.rb +27 -4
- data/lib/proiel/source.rb +14 -4
- data/lib/proiel/statistics.rb +2 -2
- data/lib/proiel/token.rb +14 -6
- data/lib/proiel/tokenization.rb +5 -3
- data/lib/proiel/treebank.rb +23 -6
- data/lib/proiel/utils.rb +0 -1
- data/lib/proiel/valency.rb +5 -0
- data/lib/proiel/valency/arguments.rb +151 -0
- data/lib/proiel/valency/lexicon.rb +59 -0
- data/lib/proiel/valency/obliqueness.rb +31 -0
- data/lib/proiel/version.rb +2 -3
- data/lib/proiel/visualization.rb +1 -0
- data/lib/proiel/visualization/graphviz.rb +111 -0
- data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
- data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
- data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
- data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
- data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
- metadata +76 -31
#--
# Copyright (c) 2016-2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++

# Methods for synthesising and manipulating dictionaries from treebank data.
module PROIEL
  # Builds a dictionary from one or more treebank sources by indexing each
  # token's lemma and part of speech, tracking per-source frequency,
  # paradigms, homographs and (for verbs) valency frames, and serialising
  # the result as PROIEL dictionary XML.
  class DictionaryBuilder
    # @return [String, NilClass] license shared by all added sources
    attr_reader :license

    # @return [String, NilClass] language of all added sources
    attr_reader :language

    # @return [Array<PROIEL::Source>] sources indexed so far
    attr_reader :sources

    # @return [Hash{String => Hash}] lemma data keyed by "lemma,part_of_speech"
    attr_reader :lemmata

    # Schema version written to the exported XML document.
    CURRENT_SCHEMA_VERSION = '3.0'.freeze

    def initialize
      @language = nil
      @license = nil
      @sources = []
      @lemmata = {}
      @valency = PROIEL::Valency::Lexicon.new
    end

    # Indexes all tokens of a source and recomputes the homograph index.
    #
    # @param source [PROIEL::Source] source to add to the dictionary
    #
    # @raise [ArgumentError] if the argument is not a PROIEL::Source, or if
    #   its language or license differs from that of previously added sources
    def add_source!(source)
      raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
      raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
      raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

      @language ||= source.language
      @license ||= source.license
      @sources << source

      source.tokens.each { |token| index_token!(token) }

      index_homographs!
    end

    # Writes the dictionary as a PROIEL XML document.
    #
    # @param io [IO] stream that the XML document is written to
    def to_xml(io)
      builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
      builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
      builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
        builder.dictionary(language: @language) do
          builder.sources do
            @sources.each do |source|
              builder.source(idref: source.id, license: source.license)
            end
          end

          builder.lemmata do
            # Sort case-insensitively on the encoded "lemma,pos" key for a
            # stable, human-friendly ordering of entries.
            @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
              form, _ = form_and_pos.split(',')
              lemma_to_xml(builder, form, data)
            end
          end
        end
      end
    end

    # Merges glosses from a tab-separated file into the dictionary.
    #
    # @param filename [String] name of a TSV file with a header row; the file
    #   must contain a lemma and a part_of_speech column plus one column per
    #   requested gloss language
    # @param languages [Array<Symbol>] language tags of the gloss columns to
    #   import
    #
    # @raise [ArgumentError] if the filename is not a string or the file does
    #   not exist
    def add_external_glosses!(filename, languages = %i(eng))
      raise ArgumentError, 'filename expected' unless filename.is_a?(String)
      # File.exists? was deprecated for years and removed in Ruby 3.2; use
      # File.exist? instead.
      raise ArgumentError, 'file not found' unless File.exist?(filename)

      # quote_char "\b" effectively disables quoting so literal quotes in
      # glosses survive parsing.
      CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
                  header_converters: :symbol, quote_char: "\b") do |row|
        h = row.to_h
        data = languages.map { |l| [l, h[l]] }.to_h

        lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
        lemma[:glosses] ||= {}
        lemma[:glosses].merge!(data)
      end
    end

    private

    # Fetches the data hash for a lemma/part-of-speech pair, creating and
    # initialising it on first access.
    def initialize_lemma!(lemma, part_of_speech)
      encoded_lemma = [lemma, part_of_speech].join(',')

      @lemmata[encoded_lemma] ||= {}
      @lemmata[encoded_lemma][:lemma] ||= lemma
      @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
      @lemmata[encoded_lemma][:homographs] ||= []
      @lemmata[encoded_lemma][:n] ||= 0

      %i(distribution glosses paradigm valency).each do |k|
        @lemmata[encoded_lemma][k] ||= {}
      end

      @lemmata[encoded_lemma]
    end

    # Emits one <lemma> element with all its subsections.
    def lemma_to_xml(builder, form, data)
      builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
        distribution_to_xml(builder, data)
        glosses_to_xml(builder, data)
        homographs_to_xml(builder, data)
        paradigm_to_xml(builder, data)
        valency_to_xml(builder, data)
      end
    end

    # Emits the per-source frequency distribution of a lemma, if any.
    def distribution_to_xml(builder, data)
      unless data[:distribution].empty?
        builder.distribution do
          data[:distribution].sort_by(&:first).each do |source_id, n|
            builder.source(idref: source_id, n: n)
          end
        end
      end
    end

    # Emits the glosses of a lemma, if any.
    def glosses_to_xml(builder, data)
      unless data[:glosses].empty?
        builder.glosses do
          data[:glosses].each do |language, value|
            builder.gloss(value, language: language)
          end
        end
      end
    end

    # Emits the homographs of a lemma, if any.
    def homographs_to_xml(builder, data)
      if data[:homographs].count > 0
        builder.homographs do
          data[:homographs].each do |homograph|
            lemma, part_of_speech = homograph.split(',')
            builder.homograph lemma: lemma, "part-of-speech": part_of_speech
          end
        end
      end
    end

    # Emits the attested paradigm of a lemma (morphology -> form -> count).
    def paradigm_to_xml(builder, data)
      unless data[:paradigm].empty?
        builder.paradigm do
          data[:paradigm].sort_by(&:first).each do |morphology, d|
            builder.slot1 morphology: morphology do
              d.sort_by(&:first).each do |form, n|
                builder.slot2 form: form, n: n
              end
            end
          end
        end
      end
    end

    # Emits the valency frames of a lemma together with the tokens attesting
    # each frame ('a' = active attestation, 'r' = reflexive attestation).
    def valency_to_xml(builder, data)
      unless data[:valency].empty?
        builder.valency do
          frames =
            data[:valency].map do |arguments, token_ids|
              { arguments: arguments, tokens: token_ids }
            end

          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
            builder.frame do
              builder.arguments do
                frame[:arguments].each do |argument|
                  # FIXME: deal with in a better way
                  argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
                  argument.delete(:part_of_speech)
                  builder.argument argument
                end
              end

              if frame[:tokens][:a].count > 0 || frame[:tokens][:r].count > 0
                builder.tokens do
                  frame[:tokens][:a].each do |token_id|
                    builder.token(flags: 'a', idref: token_id)
                  end

                  frame[:tokens][:r].each do |token_id|
                    builder.token(flags: 'r', idref: token_id)
                  end
                end
              end
            end
          end
        end
      end
    end

    # Marks lemmata that share a base form (ignoring variant numbers after
    # '#') but differ in part of speech as homographs of each other.
    def index_homographs!
      @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |_, homographs|
        if homographs.count > 1
          homographs.each do |form|
            @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
          end
        end
      end
    end

    # Indexes a single token: frequency, distribution, paradigm and, for
    # verbal tokens, the valency frame.
    def index_token!(token)
      if token.lemma and token.part_of_speech
        lemma = initialize_lemma!(token.lemma, token.part_of_speech)

        lemma[:n] += 1

        lemma[:distribution][token.source.id] ||= 0
        lemma[:distribution][token.source.id] += 1

        lemma[:paradigm][token.morphology] ||= {}
        lemma[:paradigm][token.morphology][token.form] ||= 0
        lemma[:paradigm][token.morphology][token.form] += 1

        # Find verbal nodes
        if token.part_of_speech[/^V/]
          frame = PROIEL::Valency::Arguments.get_argument_frame(token)

          lemma[:valency][frame] ||= { a: [], r: [] }

          entry = lemma[:valency][frame]

          # A dependent 'aux' of part of speech 'Pk' marks a reflexive use.
          if token.dependents.any? { |d| d.relation == 'aux' and d.part_of_speech == 'Pk' }
            entry[:r] << token.id
          else
            entry[:a] << token.id
          end
        end
      end
    end
  end
end
data/lib/proiel/div.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -89,10 +89,13 @@ module PROIEL
|
|
89
89
|
# Returns the printable form of the div with all token forms and any
|
90
90
|
# presentation data.
|
91
91
|
#
|
92
|
+
# @param custom_token_formatter [Lambda] formatting function for tokens
|
93
|
+
# which is passed the token as its sole argument
|
94
|
+
#
|
92
95
|
# @return [String] the printable form of the div
|
93
|
-
def printable_form(
|
96
|
+
def printable_form(custom_token_formatter: nil)
|
94
97
|
[presentation_before,
|
95
|
-
@children.map { |s| s.printable_form(
|
98
|
+
@children.map { |s| s.printable_form(custom_token_formatter: custom_token_formatter) },
|
96
99
|
presentation_after].compact.join
|
97
100
|
end
|
98
101
|
|
@@ -135,5 +138,21 @@ module PROIEL
|
|
135
138
|
end
|
136
139
|
end
|
137
140
|
end
|
141
|
+
|
142
|
+
# Returns the aligned div if any.
|
143
|
+
#
|
144
|
+
# @return [Div, NilClass] aligned div
|
145
|
+
# Returns the aligned div if any.
#
# @return [Div, NilClass] aligned div
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_div(alignment_id)
end
|
148
|
+
|
149
|
+
# Returns inferred aligned divs if any.
|
150
|
+
#
|
151
|
+
# @return [Array<Div>] inferred aligned divs
|
152
|
+
# Returns inferred aligned divs if any.
#
# @return [Array<Div>] inferred aligned divs
def inferred_alignment(aligned_source)
  aligned_tokens = sentences.map { |sentence| sentence.inferred_alignment(aligned_source) }
  aligned_tokens.flatten.compact.map(&:div).uniq
end
|
138
157
|
end
|
139
158
|
end
|
#--
# Copyright (c) 2019 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # Registry of languages recognised by the library, keyed by language tag.
  module Language
    SUPPORTED_LANGUAGES = {
      # This is a subset of language codes from ISO 639-3 and Glottolog.
      ang: 'Old English (ca. 450-1100)',
      ave: 'Avestan',
      axm: 'Middle Armenian',
      chu: 'Church Slavic',
      cms: 'Messapic',
      cnx: 'Middle Cornish',
      dum: 'Middle Dutch',
      enm: 'Middle English',
      frk: 'Old Frankish',
      frm: 'Middle French',
      fro: 'Old French (842-ca. 1400)',
      ghc: 'Hiberno-Scottish Gaelic',
      gmh: 'Middle High German',
      gml: 'Middle Low German',
      gmy: 'Mycenaean Greek',
      goh: 'Old High German (ca. 750-1050)',
      got: 'Gothic',
      grc: 'Ancient Greek (to 1453)',
      hit: 'Hittite',
      hlu: 'Hieroglyphic Luwian',
      htx: 'Middle Hittite',
      lat: 'Latin',
      lng: 'Langobardic',
      mga: 'Middle Irish (10-12th century)',
      non: 'Old Norse',
      nrp: 'North Picene',
      obt: 'Old Breton',
      oco: 'Old Cornish',
      odt: 'Old Dutch-Old Frankish',
      ofs: 'Old Frisian',
      oht: 'Old Hittite',
      olt: 'Old Lithuanian',
      orv: 'Old Russian',
      osc: 'Oscan',
      osp: 'Old Spanish',
      osx: 'Old Saxon',
      owl: 'Old-Middle Welsh',
      peo: 'Old Persian (ca. 600-400 B.C.)',
      pka: 'Ardhamāgadhī Prākrit',
      pmh: 'Maharastri Prakrit',
      por: 'Portuguese',
      pro: 'Old Provençal',
      psu: 'Sauraseni Prakrit',
      rus: 'Russian',
      san: 'Sanskrit',
      sga: 'Early Irish',
      sog: 'Sogdian',
      spa: 'Spanish',
      spx: 'South Picene',
      txb: 'Tokharian B',
      txh: 'Thracian',
      wlm: 'Middle Welsh',
      xbm: 'Middle Breton',
      xcb: 'Cumbric',
      xce: 'Celtiberian',
      xcg: 'Cisalpine Gaulish',
      xcl: 'Classical Armenian',
      xum: 'Umbrian',
      xve: 'Venetic',
    }.freeze

    # Checks if a language is supported.
    #
    # @param language_tag [String, Symbol] language tag of language to check
    #
    # @return [Boolean]
    #
    # @raise [ArgumentError] if the tag is neither a symbol nor a string
    #
    # @example
    #   language_supported?(:lat) # => true
    #   language_supported?('grc') # => true
    def self.language_supported?(language_tag)
      # Use || rather than the low-precedence `or` keyword in boolean logic.
      raise ArgumentError unless language_tag.is_a?(Symbol) || language_tag.is_a?(String)

      SUPPORTED_LANGUAGES.key?(language_tag.to_sym)
    end

    # Returns the display name for a language.
    #
    # @param language_tag [String, Symbol] language tag of language
    #
    # @return [String]
    #
    # @raise [ArgumentError] if the tag has the wrong type or is unsupported
    #
    # @example
    #   get_display_name(:lat) # => "Latin"
    def self.get_display_name(language_tag)
      raise ArgumentError unless language_tag.is_a?(Symbol) || language_tag.is_a?(String)
      raise ArgumentError, 'unsupported language' unless language_supported?(language_tag)

      SUPPORTED_LANGUAGES[language_tag.to_sym]
    end

    # Returns tag of all supported languages
    #
    # @return [Array<Symbol>]
    def self.supported_language_tags
      SUPPORTED_LANGUAGES.keys
    end
  end
end
#--
# Copyright (c) 2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A dictionary entry for a single lemma, optionally populated from XML.
  class Lemma < TreebankObject
    # @return [Dictionary] source that the lemma belongs to
    attr_reader :dictionary

    # @return [Integer, NilClass] total frequency of the lemma
    attr_reader :n

    # @return [Hash{String, Integer}] distribution of lemmata in sources. The
    #   keys are IDs of sources, the values give the frequency of the lemma
    #   per source.
    attr_reader :distribution

    # @return [Array<[String, String]>] identified homographs of this lemma.
    #   The array contains pairs of lemma form (which will be homographs of
    #   this lemma form under the orthographic conventions of the language)
    #   and parts of speech.
    attr_reader :homographs

    # @return [Hash{Symbol, String}] glosses for the current lemma. The keys
    #   are language tags and the values the glosses.
    attr_reader :glosses

    # @return [Hash] attested paradigm (morphology -> form -> frequency)
    attr_reader :paradigm

    # @return [Array<Hash>] valency frames with their attesting tokens
    attr_reader :valency

    # Creates a new lemma object.
    def initialize(parent, xml = nil)
      @dictionary = parent

      @n = nil

      @distribution = {}
      @homographs = []
      @glosses = {}
      @paradigm = {}
      @valency = []

      from_xml(xml) if xml
    end

    private

    # Populates the entry from a parsed XML lemma element.
    def from_xml(xml)
      @n = nullify(xml.n, :int)

      @distribution = {}
      xml.distribution.each { |entry| @distribution[entry.idref] = nullify(entry.n, :int) }

      @glosses = {}
      xml.glosses.each { |gloss| @glosses[gloss.language.to_sym] = gloss.gloss }

      @homographs = xml.homographs.map { |homograph| [homograph.lemma, homograph.part_of_speech] }

      @paradigm =
        xml.paradigm.map do |slot1|
          forms = slot1.slot2s.map { |slot2| [slot2.form, nullify(slot2.n, :int)] }.to_h
          [slot1.morphology, forms]
        end.to_h

      @valency =
        xml.valency.map do |frame|
          arguments =
            frame.arguments.map do |a|
              { relation: a.relation, lemma: a.lemma, part_of_speech: a.part_of_speech, mood: a.mood, case: a.case }
            end
          tokens = frame.tokens.map { |t| { flags: t.flags, idref: t.idref } }
          { arguments: arguments, tokens: tokens }
        end
    end

    # Maps nil or blank strings to nil; otherwise coerces to the given type
    # (:int for integers, string by default).
    def nullify(value, type = nil)
      # NOTE: keep /^\s*$/ (not \A...\z) to preserve the original per-line
      # matching behavior on multiline strings.
      return nil if value.nil? || value =~ /^\s*$/

      type == :int ? value.to_i : value.to_s
    end
  end
end