proiel 1.1.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/LICENSE +1 -1
- data/README.md +2 -2
- data/lib/proiel.rb +16 -1
- data/lib/proiel/alignment.rb +3 -0
- data/lib/proiel/alignment/builder.rb +220 -0
- data/lib/proiel/annotation_schema.rb +11 -4
- data/lib/proiel/chronology.rb +80 -0
- data/lib/proiel/dictionary.rb +79 -0
- data/lib/proiel/dictionary/builder.rb +224 -0
- data/lib/proiel/div.rb +22 -3
- data/lib/proiel/language.rb +108 -0
- data/lib/proiel/lemma.rb +77 -0
- data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
- data/lib/proiel/proiel_xml/reader.rb +138 -2
- data/lib/proiel/proiel_xml/schema.rb +4 -2
- data/lib/proiel/proiel_xml/validator.rb +76 -9
- data/lib/proiel/sentence.rb +27 -4
- data/lib/proiel/source.rb +14 -4
- data/lib/proiel/statistics.rb +2 -2
- data/lib/proiel/token.rb +14 -6
- data/lib/proiel/tokenization.rb +5 -3
- data/lib/proiel/treebank.rb +23 -6
- data/lib/proiel/utils.rb +0 -1
- data/lib/proiel/valency.rb +5 -0
- data/lib/proiel/valency/arguments.rb +151 -0
- data/lib/proiel/valency/lexicon.rb +59 -0
- data/lib/proiel/valency/obliqueness.rb +31 -0
- data/lib/proiel/version.rb +2 -3
- data/lib/proiel/visualization.rb +1 -0
- data/lib/proiel/visualization/graphviz.rb +111 -0
- data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
- data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
- data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
- data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
- data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
- metadata +76 -31
@@ -0,0 +1,224 @@
|
|
1
|
+
#--
# Copyright (c) 2016-2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++

# Methods for synthesising and manipulating dictionaries from treebank data.
module PROIEL
  # Builds a dictionary from one or more treebank sources by indexing each
  # lemma's frequency, per-source distribution, paradigm, homographs and (for
  # verbs) valency frames, and serialises the result as PROIEL dictionary XML.
  class DictionaryBuilder
    # @return [String, nil] license shared by all added sources
    attr_reader :license

    # @return [String, nil] language shared by all added sources
    attr_reader :language

    # @return [Array<PROIEL::Source>] sources indexed so far
    attr_reader :sources

    # @return [Hash{String => Hash}] per-lemma data keyed by the string
    #   "lemma,part_of_speech"
    attr_reader :lemmata

    # Version of the PROIEL XML schema emitted by +to_xml+.
    CURRENT_SCHEMA_VERSION = '3.0'.freeze

    def initialize
      @language = nil
      @license = nil
      @sources = []
      @lemmata = {}
      @valency = PROIEL::Valency::Lexicon.new
    end

    # Indexes all tokens of a source and recomputes homographs.
    #
    # @param source [PROIEL::Source] source to index
    #
    # @raise [ArgumentError] if +source+ is not a +PROIEL::Source+, or if its
    #   language or license conflicts with sources that were already added
    def add_source!(source)
      raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
      raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
      raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

      @language ||= source.language
      @license ||= source.license
      @sources << source

      source.tokens.each { |token| index_token!(token) }

      index_homographs!
    end

    # Writes the dictionary as PROIEL dictionary XML.
    #
    # @param io [IO] stream that the XML is written to
    def to_xml(io)
      builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
      builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
      builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
        builder.dictionary(language: @language) do
          builder.sources do
            @sources.each do |source|
              builder.source(idref: source.id, license: source.license)
            end
          end

          builder.lemmata do
            # Sort on the full "lemma,part_of_speech" key so homographs with
            # different parts of speech appear in a stable order.
            @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
              form, _ = form_and_pos.split(',')
              lemma_to_xml(builder, form, data)
            end
          end
        end
      end
    end

    # Merges glosses from a tab-separated file into the indexed lemmata. The
    # file must have a header row; the +lemma+ and +part_of_speech+ columns
    # identify the lemma and each language-tag column holds a gloss.
    #
    # @param filename [String] name of the TSV file to read
    # @param languages [Array<Symbol>] language tags of gloss columns to import
    #
    # @raise [ArgumentError] if +filename+ is not a string or no such file exists
    def add_external_glosses!(filename, languages = %i(eng))
      raise ArgumentError, 'filename expected' unless filename.is_a?(String)
      # File.exists? was deprecated and removed in Ruby 3.2; use File.exist?.
      raise ArgumentError, 'file not found' unless File.exist?(filename)

      CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
                  header_converters: :symbol, quote_char: "\b") do |row|
        h = row.to_h
        data = languages.map { |l| [l, h[l]] }.to_h

        lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
        lemma[:glosses] ||= {}
        lemma[:glosses].merge!(data)
      end
    end

    private

    # Fetches (creating on first use) the data hash for a lemma/part-of-speech
    # pair, ensuring all expected keys are present.
    #
    # @return [Hash] the (possibly newly created) lemma entry
    def initialize_lemma!(lemma, part_of_speech)
      encoded_lemma = [lemma, part_of_speech].join(',')

      @lemmata[encoded_lemma] ||= {}
      @lemmata[encoded_lemma][:lemma] ||= lemma
      @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
      @lemmata[encoded_lemma][:homographs] ||= []
      @lemmata[encoded_lemma][:n] ||= 0

      %i(distribution glosses paradigm valency).each do |k|
        @lemmata[encoded_lemma][k] ||= {}
      end

      @lemmata[encoded_lemma]
    end

    # Serialises one lemma entry as a +lemma+ element.
    def lemma_to_xml(builder, form, data)
      builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
        distribution_to_xml(builder, data)
        glosses_to_xml(builder, data)
        homographs_to_xml(builder, data)
        paradigm_to_xml(builder, data)
        valency_to_xml(builder, data)
      end
    end

    # Serialises the lemma's per-source frequency distribution.
    def distribution_to_xml(builder, data)
      unless data[:distribution].empty?
        builder.distribution do
          data[:distribution].sort_by(&:first).each do |source_id, n|
            builder.source(idref: source_id, n: n)
          end
        end
      end
    end

    # Serialises the lemma's glosses, one element per language.
    def glosses_to_xml(builder, data)
      unless data[:glosses].empty?
        builder.glosses do
          data[:glosses].each do |language, value|
            builder.gloss(value, language: language)
          end
        end
      end
    end

    # Serialises the lemma's homographs, if any were identified.
    def homographs_to_xml(builder, data)
      if data[:homographs].count > 0
        builder.homographs do
          data[:homographs].each do |homograph|
            lemma, part_of_speech = homograph.split(',')
            builder.homograph lemma: lemma, "part-of-speech": part_of_speech
          end
        end
      end
    end

    # Serialises the lemma's paradigm: one +slot1+ per morphology, one +slot2+
    # per attested form with its frequency.
    def paradigm_to_xml(builder, data)
      unless data[:paradigm].empty?
        builder.paradigm do
          data[:paradigm].sort_by(&:first).each do |morphology, d|
            builder.slot1 morphology: morphology do
              d.sort_by(&:first).each do |form, n|
                builder.slot2 form: form, n: n
              end
            end
          end
        end
      end
    end

    # Serialises the lemma's valency frames. Token attestations carry flag 'a'
    # or 'r' as assigned by +index_token!+.
    def valency_to_xml(builder, data)
      unless data[:valency].empty?
        builder.valency do
          frames =
            data[:valency].map do |arguments, token_ids|
              { arguments: arguments, tokens: token_ids }
            end

          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
            builder.frame do
              builder.arguments do
                frame[:arguments].each do |argument|
                  # FIXME: deal with in a better way
                  argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
                  argument.delete(:part_of_speech)
                  builder.argument argument
                end
              end

              if frame[:tokens][:a].count > 0 || frame[:tokens][:r].count > 0
                builder.tokens do
                  frame[:tokens][:a].each do |token_id|
                    builder.token(flags: 'a', idref: token_id)
                  end

                  frame[:tokens][:r].each do |token_id|
                    builder.token(flags: 'r', idref: token_id)
                  end
                end
              end
            end
          end
        end
      end
    end

    # Recomputes the homograph lists: two lemmata are homographs when their
    # keys share the same form before the first ',' or '#' separator.
    def index_homographs!
      @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |_, homographs|
        if homographs.count > 1
          homographs.each do |form|
            @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
          end
        end
      end
    end

    # Indexes a single token: bumps the lemma's total and per-source counts and
    # its paradigm; for verbal tokens (part of speech starting with 'V') also
    # records the token under its valency frame, flagged 'r' when the token has
    # an 'aux' dependent with part of speech 'Pk', otherwise 'a'.
    def index_token!(token)
      if token.lemma && token.part_of_speech
        lemma = initialize_lemma!(token.lemma, token.part_of_speech)

        lemma[:n] += 1

        lemma[:distribution][token.source.id] ||= 0
        lemma[:distribution][token.source.id] += 1

        lemma[:paradigm][token.morphology] ||= {}
        lemma[:paradigm][token.morphology][token.form] ||= 0
        lemma[:paradigm][token.morphology][token.form] += 1

        # Find verbal nodes
        if token.part_of_speech[/^V/]
          frame = PROIEL::Valency::Arguments.get_argument_frame(token)

          lemma[:valency][frame] ||= { a: [], r: [] }

          entry = lemma[:valency][frame]

          if token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
            entry[:r] << token.id
          else
            entry[:a] << token.id
          end
        end
      end
    end
  end
end
data/lib/proiel/div.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c) 2015-
|
2
|
+
# Copyright (c) 2015-2017 Marius L. Jøhndal
|
3
3
|
#
|
4
4
|
# See LICENSE in the top-level source directory for licensing terms.
|
5
5
|
#++
|
@@ -89,10 +89,13 @@ module PROIEL
|
|
89
89
|
# Returns the printable form of the div with all token forms and any
|
90
90
|
# presentation data.
|
91
91
|
#
|
92
|
+
# @param custom_token_formatter [Lambda] formatting function for tokens
|
93
|
+
# which is passed the token as its sole argument
|
94
|
+
#
|
92
95
|
# @return [String] the printable form of the div
|
93
|
-
def printable_form(
|
96
|
+
def printable_form(custom_token_formatter: nil)
|
94
97
|
[presentation_before,
|
95
|
-
@children.map { |s| s.printable_form(
|
98
|
+
@children.map { |s| s.printable_form(custom_token_formatter: custom_token_formatter) },
|
96
99
|
presentation_after].compact.join
|
97
100
|
end
|
98
101
|
|
@@ -135,5 +138,21 @@ module PROIEL
|
|
135
138
|
end
|
136
139
|
end
|
137
140
|
end
|
141
|
+
|
142
|
+
# Returns the aligned div if any.
|
143
|
+
#
|
144
|
+
# @return [Div, NilClass] aligned div
|
145
|
+
def alignment(aligned_source)
|
146
|
+
alignment_id ? aligned_source.treebank.find_div(alignment_id) : nil
|
147
|
+
end
|
148
|
+
|
149
|
+
# Returns inferred aligned divs if any.
|
150
|
+
#
|
151
|
+
# @return [Array<Div>] inferred aligned divs
|
152
|
+
def inferred_alignment(aligned_source)
|
153
|
+
sentences.map do |sentence|
|
154
|
+
sentence.inferred_alignment(aligned_source)
|
155
|
+
end.flatten.compact.map(&:div).uniq
|
156
|
+
end
|
138
157
|
end
|
139
158
|
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
#--
# Copyright (c) 2019 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # Helpers for the set of language tags that PROIEL treebanks may declare.
  module Language
    # Language tags mapped to display names.
    SUPPORTED_LANGUAGES = {
      # This is a subset of language codes from ISO 639-3 and Glottolog.
      ang: 'Old English (ca. 450-1100)',
      ave: 'Avestan',
      axm: 'Middle Armenian',
      chu: 'Church Slavic',
      cms: 'Messapic',
      cnx: 'Middle Cornish',
      dum: 'Middle Dutch',
      enm: 'Middle English',
      frk: 'Old Frankish',
      frm: 'Middle French',
      fro: 'Old French (842-ca. 1400)',
      ghc: 'Hiberno-Scottish Gaelic',
      gmh: 'Middle High German',
      gml: 'Middle Low German',
      gmy: 'Mycenaean Greek',
      goh: 'Old High German (ca. 750-1050)',
      got: 'Gothic',
      grc: 'Ancient Greek (to 1453)',
      hit: 'Hittite',
      hlu: 'Hieroglyphic Luwian',
      htx: 'Middle Hittite',
      lat: 'Latin',
      lng: 'Langobardic',
      mga: 'Middle Irish (10-12th century)',
      non: 'Old Norse',
      nrp: 'North Picene',
      obt: 'Old Breton',
      oco: 'Old Cornish',
      odt: 'Old Dutch-Old Frankish',
      ofs: 'Old Frisian',
      oht: 'Old Hittite',
      olt: 'Old Lithuanian',
      orv: 'Old Russian',
      osc: 'Oscan',
      osp: 'Old Spanish',
      osx: 'Old Saxon',
      owl: 'Old-Middle Welsh',
      peo: 'Old Persian (ca. 600-400 B.C.)',
      pka: 'Ardhamāgadhī Prākrit',
      pmh: 'Maharastri Prakrit',
      por: 'Portuguese',
      pro: 'Old Provençal',
      psu: 'Sauraseni Prakrit',
      rus: 'Russian',
      san: 'Sanskrit',
      sga: 'Early Irish',
      sog: 'Sogdian',
      spa: 'Spanish',
      spx: 'South Picene',
      txb: 'Tokharian B',
      txh: 'Thracian',
      wlm: 'Middle Welsh',
      xbm: 'Middle Breton',
      xcb: 'Cumbric',
      xce: 'Celtiberian',
      xcg: 'Cisalpine Gaulish',
      xcl: 'Classical Armenian',
      xum: 'Umbrian',
      xve: 'Venetic',
    }.freeze

    # Checks if a language is supported.
    #
    # @param language_tag [String, Symbol] language tag of language to check
    #
    # @return [Boolean]
    #
    # @example
    #   language_supported?(:lat) # => true
    #   language_supported?('grc') # => true
    def self.language_supported?(language_tag)
      unless language_tag.is_a?(Symbol) || language_tag.is_a?(String)
        raise ArgumentError
      end

      SUPPORTED_LANGUAGES.key?(language_tag.to_sym)
    end

    # Returns the display name for a language.
    #
    # @param language_tag [String, Symbol] language tag of language
    #
    # @return [String]
    #
    # @example
    #   get_display_name(:lat) # => "Latin"
    def self.get_display_name(language_tag)
      unless language_tag.is_a?(Symbol) || language_tag.is_a?(String)
        raise ArgumentError
      end
      raise ArgumentError, 'unsupported language' unless language_supported?(language_tag)

      SUPPORTED_LANGUAGES.fetch(language_tag.to_sym)
    end

    # Returns tag of all supported languages
    #
    # @return [Array<Symbol>]
    def self.supported_language_tags
      SUPPORTED_LANGUAGES.keys
    end
  end
end
data/lib/proiel/lemma.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
#--
# Copyright (c) 2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A dictionary entry for a single lemma, optionally populated from a
  # PROIEL dictionary XML element.
  class Lemma < TreebankObject
    # @return [Dictionary] source that the lemma belongs to
    attr_reader :dictionary

    # @return [Integer, nil] total frequency of the lemma
    attr_reader :n

    # @return [Hash{String, Integer}] distribution of lemmata in sources. The
    # keys are IDs of sources, the values give the frequency of the lemma per
    # source.
    attr_reader :distribution

    # @return [Array<[String, String]> identified homographs of this lemma. The
    # array contains pairs of lemma form (which will be homographs of this
    # lemma form under the orthographic conventions of the language) and parts
    # of speech.
    attr_reader :homographs

    # @return [Hash{Symbol, String}] glosses for the current lemma. The keys
    # are language tags and the values the glosses.
    attr_reader :glosses

    # @return [Hash] attested paradigm, morphology → form → frequency
    attr_reader :paradigm

    # @return [Array<Hash>] valency frames with argument and token data
    attr_reader :valency

    # Creates a new lemma object.
    def initialize(parent, xml = nil)
      @dictionary = parent

      @n = nil

      @distribution = {}
      @homographs = []
      @glosses = {}
      @paradigm = {}
      @valency = []

      from_xml(xml) if xml
    end

    private

    # Populates the lemma's attributes from a parsed XML element.
    # NOTE(review): assumes +xml+ exposes n/distribution/glosses/homographs/
    # paradigm/valency accessors as produced by the PROIEL XML reader — the
    # element's schema is not visible here.
    def from_xml(xml)
      @n = nullify(xml.n, :int)

      @distribution = xml.distribution.map { |entry| [entry.idref, nullify(entry.n, :int)] }.to_h
      @glosses = xml.glosses.map { |entry| [entry.language.to_sym, entry.gloss] }.to_h
      @homographs = xml.homographs.map { |entry| [entry.lemma, entry.part_of_speech] }
      @paradigm =
        xml.paradigm.map do |slot1|
          forms = slot1.slot2s.map { |slot2| [slot2.form, nullify(slot2.n, :int)] }.to_h
          [slot1.morphology, forms]
        end.to_h
      @valency =
        xml.valency.map do |frame|
          {
            arguments: frame.arguments.map { |a| { relation: a.relation, lemma: a.lemma, part_of_speech: a.part_of_speech, mood: a.mood, case: a.case } },
            tokens: frame.tokens.map { |t| { flags: t.flags, idref: t.idref } },
          }
        end
    end

    # Maps nil or blank values to nil; otherwise coerces to integer (when
    # +type+ is +:int+) or string.
    def nullify(raw, type = nil)
      case raw
      when NilClass, /^\s*$/
        nil
      else
        type == :int ? raw.to_i : raw.to_s
      end
    end
  end
end