proiel 1.1.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37)
  1. checksums.yaml +5 -5
  2. data/LICENSE +1 -1
  3. data/README.md +2 -2
  4. data/lib/proiel.rb +16 -1
  5. data/lib/proiel/alignment.rb +3 -0
  6. data/lib/proiel/alignment/builder.rb +220 -0
  7. data/lib/proiel/annotation_schema.rb +11 -4
  8. data/lib/proiel/chronology.rb +80 -0
  9. data/lib/proiel/dictionary.rb +79 -0
  10. data/lib/proiel/dictionary/builder.rb +224 -0
  11. data/lib/proiel/div.rb +22 -3
  12. data/lib/proiel/language.rb +108 -0
  13. data/lib/proiel/lemma.rb +77 -0
  14. data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
  15. data/lib/proiel/proiel_xml/reader.rb +138 -2
  16. data/lib/proiel/proiel_xml/schema.rb +4 -2
  17. data/lib/proiel/proiel_xml/validator.rb +76 -9
  18. data/lib/proiel/sentence.rb +27 -4
  19. data/lib/proiel/source.rb +14 -4
  20. data/lib/proiel/statistics.rb +2 -2
  21. data/lib/proiel/token.rb +14 -6
  22. data/lib/proiel/tokenization.rb +5 -3
  23. data/lib/proiel/treebank.rb +23 -6
  24. data/lib/proiel/utils.rb +0 -1
  25. data/lib/proiel/valency.rb +5 -0
  26. data/lib/proiel/valency/arguments.rb +151 -0
  27. data/lib/proiel/valency/lexicon.rb +59 -0
  28. data/lib/proiel/valency/obliqueness.rb +31 -0
  29. data/lib/proiel/version.rb +2 -3
  30. data/lib/proiel/visualization.rb +1 -0
  31. data/lib/proiel/visualization/graphviz.rb +111 -0
  32. data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
  33. data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
  34. data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
  35. data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
  36. data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
  37. metadata +76 -31
#--
# Copyright (c) 2016-2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++

# Methods for synthesising and manipulating dictionaries from treebank data.
module PROIEL
  # Builds a dictionary by indexing lemmata, morphological paradigms and
  # valency frames from one or more treebank sources, and serialises the
  # result as PROIEL dictionary XML.
  class DictionaryBuilder
    # @return [String, nil] license shared by all added sources (nil until the
    #   first source has been added)
    attr_reader :license

    # @return [String, nil] language tag shared by all added sources
    attr_reader :language

    # @return [Array<PROIEL::Source>] sources indexed so far
    attr_reader :sources

    # @return [Hash{String => Hash}] lemma data keyed by "form,part_of_speech"
    attr_reader :lemmata

    def initialize
      @language = nil
      @license = nil
      @sources = []
      @lemmata = {}
      @valency = PROIEL::Valency::Lexicon.new
    end

    # Indexes all tokens of a source into the dictionary.
    #
    # @param source [PROIEL::Source] source to index
    #
    # @raise [ArgumentError] if the argument is not a PROIEL::Source or its
    #   language or license differs from previously added sources
    def add_source!(source)
      raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
      raise ArgumentError, 'incompatible language' unless @language.nil? || @language == source.language
      raise ArgumentError, 'incompatible license' unless @license.nil? || @license == source.license

      @language ||= source.language
      @license ||= source.license
      @sources << source

      source.tokens.each { |token| index_token!(token) }

      index_homographs!
    end

    # Version of the PROIEL XML schema that the generated dictionary targets.
    CURRENT_SCHEMA_VERSION = '3.0'.freeze

    # Writes the dictionary as PROIEL XML to a stream.
    #
    # @param io [IO] stream to write the XML document to
    def to_xml(io)
      builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
      builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
      builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
        builder.dictionary(language: @language) do
          builder.sources do
            @sources.each do |source|
              builder.source(idref: source.id, license: source.license)
            end
          end

          builder.lemmata do
            # Sort case-insensitively on the encoded "form,part_of_speech" key.
            @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
              form, _ = form_and_pos.split(',')
              lemma_to_xml(builder, form, data)
            end
          end
        end
      end
    end

    # Merges glosses from a tab-separated file into the dictionary.
    #
    # The file must have a header row with at least the columns +lemma+ and
    # +part_of_speech+ plus one column per requested gloss language.
    #
    # @param filename [String] name of TSV file to read
    # @param languages [Array<Symbol>] tags of gloss-language columns to import
    #
    # @raise [ArgumentError] if the filename is not a string or no such file exists
    def add_external_glosses!(filename, languages = %i(eng))
      raise ArgumentError, 'filename expected' unless filename.is_a?(String)
      # File.exists? is a deprecated alias that was removed in Ruby 3.2.
      raise ArgumentError, 'file not found' unless File.exist?(filename)

      # quote_char "\b" effectively disables quoting so literal quotes in
      # glosses survive.
      CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
                  header_converters: :symbol, quote_char: "\b") do |row|
        h = row.to_h
        data = languages.map { |l| [l, h[l]] }.to_h

        lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
        lemma[:glosses] ||= {}
        lemma[:glosses].merge!(data)
      end
    end

    private

    # Ensures an entry exists for the lemma/part-of-speech pair and returns it.
    def initialize_lemma!(lemma, part_of_speech)
      encoded_lemma = [lemma, part_of_speech].join(',')

      @lemmata[encoded_lemma] ||= {}
      @lemmata[encoded_lemma][:lemma] ||= lemma
      @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
      @lemmata[encoded_lemma][:homographs] ||= []
      @lemmata[encoded_lemma][:n] ||= 0

      %i(distribution glosses paradigm valency).each do |k|
        @lemmata[encoded_lemma][k] ||= {}
      end

      @lemmata[encoded_lemma]
    end

    # Emits one <lemma> element with all its subsections.
    def lemma_to_xml(builder, form, data)
      builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
        distribution_to_xml(builder, data)
        glosses_to_xml(builder, data)
        homographs_to_xml(builder, data)
        paradigm_to_xml(builder, data)
        valency_to_xml(builder, data)
      end
    end

    # Emits per-source frequency counts, sorted by source ID.
    def distribution_to_xml(builder, data)
      unless data[:distribution].empty?
        builder.distribution do
          data[:distribution].sort_by(&:first).each do |source_id, n|
            builder.source(idref: source_id, n: n)
          end
        end
      end
    end

    # Emits one <gloss> element per gloss language.
    def glosses_to_xml(builder, data)
      unless data[:glosses].empty?
        builder.glosses do
          data[:glosses].each do |language, value|
            builder.gloss(value, language: language)
          end
        end
      end
    end

    # Emits homographs; each is stored as an encoded "lemma,part_of_speech" string.
    def homographs_to_xml(builder, data)
      if data[:homographs].count > 0
        builder.homographs do
          data[:homographs].each do |homograph|
            lemma, part_of_speech = homograph.split(',')
            builder.homograph lemma: lemma, "part-of-speech": part_of_speech
          end
        end
      end
    end

    # Emits the paradigm: morphology (slot1) → form (slot2) → frequency.
    def paradigm_to_xml(builder, data)
      unless data[:paradigm].empty?
        builder.paradigm do
          data[:paradigm].sort_by(&:first).each do |morphology, d|
            builder.slot1 morphology: morphology do
              d.sort_by(&:first).each do |form, n|
                builder.slot2 form: form, n: n
              end
            end
          end
        end
      end
    end

    # Emits valency frames with their argument descriptions and the IDs of the
    # tokens that attest each frame ('a' = active attestation, 'r' = reviewed/
    # partial — flag semantics inferred from index_token!; confirm upstream).
    def valency_to_xml(builder, data)
      unless data[:valency].empty?
        builder.valency do
          frames =
            data[:valency].map do |arguments, token_ids|
              { arguments: arguments, tokens: token_ids }
            end

          PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
            builder.frame do
              builder.arguments do
                frame[:arguments].each do |argument|
                  # FIXME: deal with in a better way
                  argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
                  argument.delete(:part_of_speech)
                  builder.argument argument
                end
              end

              if frame[:tokens][:a].count > 0 || frame[:tokens][:r].count > 0
                builder.tokens do
                  frame[:tokens][:a].each do |token_id|
                    builder.token(flags: 'a', idref: token_id)
                  end

                  frame[:tokens][:r].each do |token_id|
                    builder.token(flags: 'r', idref: token_id)
                  end
                end
              end
            end
          end
        end
      end
    end

    # Groups encoded lemma keys by bare form (stripping ",pos" and "#variant")
    # and cross-links entries that share a form.
    def index_homographs!
      @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |_, homographs|
        if homographs.count > 1
          homographs.each do |form|
            @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
          end
        end
      end
    end

    # Indexes a single token: frequency, per-source distribution, paradigm and
    # (for verbs) valency frames.
    def index_token!(token)
      if token.lemma and token.part_of_speech
        lemma = initialize_lemma!(token.lemma, token.part_of_speech)

        lemma[:n] += 1

        lemma[:distribution][token.source.id] ||= 0
        lemma[:distribution][token.source.id] += 1

        lemma[:paradigm][token.morphology] ||= {}
        lemma[:paradigm][token.morphology][token.form] ||= 0
        lemma[:paradigm][token.morphology][token.form] += 1

        # Find verbal nodes (part-of-speech tags starting with 'V')
        if token.part_of_speech[/^V/]
          frame = PROIEL::Valency::Arguments.get_argument_frame(token)

          lemma[:valency][frame] ||= { a: [], r: [] }

          entry = lemma[:valency][frame]

          # Tokens with a 'Pk' auxiliary dependent are flagged 'r', others 'a'.
          if token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
            entry[:r] << token.id
          else
            entry[:a] << token.id
          end
        end
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
# Returns the printable form of the div with all token forms and any
# presentation data.
#
# @param custom_token_formatter [Lambda] formatting function for tokens
#   which is passed the token as its sole argument
#
# @return [String] the printable form of the div
def printable_form(custom_token_formatter: nil)
  child_forms = @children.map do |child|
    child.printable_form(custom_token_formatter: custom_token_formatter)
  end

  [presentation_before, child_forms, presentation_after].compact.join
end
98
101
 
# Returns the aligned div if any.
#
# @return [Div, NilClass] aligned div
def alignment(aligned_source)
  return nil unless alignment_id

  aligned_source.treebank.find_div(alignment_id)
end

# Returns inferred aligned divs if any.
#
# @return [Array<Div>] inferred aligned divs
def inferred_alignment(aligned_source)
  per_sentence = sentences.map do |sentence|
    sentence.inferred_alignment(aligned_source)
  end

  per_sentence.flatten.compact.map(&:div).uniq
end
138
157
  end
139
158
  end
#--
# Copyright (c) 2019 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # Registry of languages supported by this library, keyed by language tag.
  module Language
    # Map from language tag to English display name.
    SUPPORTED_LANGUAGES = {
      # This is a subset of language codes from ISO 639-3 and Glottolog.
      ang: 'Old English (ca. 450-1100)',
      ave: 'Avestan',
      axm: 'Middle Armenian',
      chu: 'Church Slavic',
      cms: 'Messapic',
      cnx: 'Middle Cornish',
      dum: 'Middle Dutch',
      enm: 'Middle English',
      frk: 'Old Frankish',
      frm: 'Middle French',
      fro: 'Old French (842-ca. 1400)',
      ghc: 'Hiberno-Scottish Gaelic',
      gmh: 'Middle High German',
      gml: 'Middle Low German',
      gmy: 'Mycenaean Greek',
      goh: 'Old High German (ca. 750-1050)',
      got: 'Gothic',
      grc: 'Ancient Greek (to 1453)',
      hit: 'Hittite',
      hlu: 'Hieroglyphic Luwian',
      htx: 'Middle Hittite',
      lat: 'Latin',
      lng: 'Langobardic',
      mga: 'Middle Irish (10-12th century)',
      non: 'Old Norse',
      nrp: 'North Picene',
      obt: 'Old Breton',
      oco: 'Old Cornish',
      odt: 'Old Dutch-Old Frankish',
      ofs: 'Old Frisian',
      oht: 'Old Hittite',
      olt: 'Old Lithuanian',
      orv: 'Old Russian',
      osc: 'Oscan',
      osp: 'Old Spanish',
      osx: 'Old Saxon',
      owl: 'Old-Middle Welsh',
      peo: 'Old Persian (ca. 600-400 B.C.)',
      pka: 'Ardhamāgadhī Prākrit',
      pmh: 'Maharastri Prakrit',
      por: 'Portuguese',
      pro: 'Old Provençal',
      psu: 'Sauraseni Prakrit',
      rus: 'Russian',
      san: 'Sanskrit',
      sga: 'Early Irish',
      sog: 'Sogdian',
      spa: 'Spanish',
      spx: 'South Picene',
      txb: 'Tokharian B',
      txh: 'Thracian',
      wlm: 'Middle Welsh',
      xbm: 'Middle Breton',
      xcb: 'Cumbric',
      xce: 'Celtiberian',
      xcg: 'Cisalpine Gaulish',
      xcl: 'Classical Armenian',
      xum: 'Umbrian',
      xve: 'Venetic',
    }.freeze

    # Checks if a language is supported.
    #
    # @param language_tag [String, Symbol] language tag of language to check
    #
    # @return [Boolean]
    #
    # @raise [ArgumentError] if the tag is not a String or Symbol
    #
    # @example
    #   language_supported?(:lat) # => true
    #   language_supported?('grc') # => true
    def self.language_supported?(language_tag)
      # || rather than `or`: `or` binds weaker than assignment and is reserved
      # for control flow by the community style guide.
      raise ArgumentError unless language_tag.is_a?(Symbol) || language_tag.is_a?(String)

      SUPPORTED_LANGUAGES.key?(language_tag.to_sym)
    end

    # Returns the display name for a language.
    #
    # @param language_tag [String, Symbol] language tag of language
    #
    # @return [String]
    #
    # @raise [ArgumentError] if the tag is not a String or Symbol, or if the
    #   language is not supported
    #
    # @example
    #   get_display_name(:lat) # => "Latin"
    def self.get_display_name(language_tag)
      raise ArgumentError unless language_tag.is_a?(Symbol) || language_tag.is_a?(String)
      raise ArgumentError, 'unsupported language' unless language_supported?(language_tag)

      SUPPORTED_LANGUAGES[language_tag.to_sym]
    end

    # Returns tags of all supported languages.
    #
    # @return [Array<Symbol>]
    def self.supported_language_tags
      SUPPORTED_LANGUAGES.keys
    end
  end
end
#--
# Copyright (c) 2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A single dictionary entry, optionally initialised from a parsed PROIEL
  # dictionary XML element.
  class Lemma < TreebankObject
    # @return [Dictionary] source that the lemma belongs to
    attr_reader :dictionary

    # @return [Integer, nil] total frequency of the lemma
    attr_reader :n

    # @return [Hash{String, Integer}] distribution of lemmata in sources. The
    # keys are IDs of sources, the values give the frequency of the lemma per
    # source.
    attr_reader :distribution

    # @return [Array<[String, String]>] identified homographs of this lemma. The
    # array contains pairs of lemma form (which will be homographs of this
    # lemma form under the orthographic conventions of the language) and parts
    # of speech.
    attr_reader :homographs

    # @return [Hash{Symbol, String}] glosses for the current lemma. The keys
    # are language tags and the values the glosses.
    attr_reader :glosses

    # @return [Hash{String => Hash{String => Integer}}] paradigm of the lemma:
    # morphology tag → attested form → frequency
    attr_reader :paradigm

    # @return [Array<Hash>] valency frames, each a hash with :arguments and
    # :tokens arrays
    attr_reader :valency

    # Creates a new lemma object.
    #
    # @param parent [Dictionary] dictionary the lemma belongs to
    # @param xml [Object, nil] parsed XML element to populate the lemma from
    def initialize(parent, xml = nil)
      @dictionary = parent

      @n = nil

      @distribution = {}
      @homographs = []
      @glosses = {}
      @paradigm = {}
      @valency = []

      from_xml(xml) if xml
    end

    private

    # Populates all attributes from a parsed XML element.
    def from_xml(xml)
      @n = nullify(xml.n, :int)

      @distribution = xml.distribution.map { |h| [h.idref, nullify(h.n, :int)] }.to_h
      @glosses = xml.glosses.map { |h| [h.language.to_sym, h.gloss] }.to_h
      @homographs = xml.homographs.map { |h| [h.lemma, h.part_of_speech] }
      @paradigm = xml.paradigm.map { |slot1| [slot1.morphology, slot1.slot2s.map { |slot2| [slot2.form, nullify(slot2.n, :int)] }.to_h] }.to_h
      @valency =
        xml.valency.map do |frame|
          {
            arguments: frame.arguments.map { |a| { relation: a.relation, lemma: a.lemma, part_of_speech: a.part_of_speech, mood: a.mood, case: a.case } },
            tokens: frame.tokens.map { |t| { flags: t.flags, idref: t.idref } },
          }
        end
    end

    # Maps nil or blank strings to nil; otherwise coerces to Integer (:int) or
    # String.
    def nullify(s, type = nil)
      case s
      # \A..\z anchors the whole string. The previous /^\s*$/ matched any
      # string *containing* a blank line (e.g. "a\n\nb"), wrongly nullifying
      # non-blank values.
      when NilClass, /\A\s*\z/
        nil
      else
        case type
        when :int
          s.to_i
        else
          s.to_s
        end
      end
    end
  end
end