proiel 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +5 -5
  2. data/LICENSE +1 -1
  3. data/README.md +2 -2
  4. data/lib/proiel.rb +16 -1
  5. data/lib/proiel/alignment.rb +3 -0
  6. data/lib/proiel/alignment/builder.rb +220 -0
  7. data/lib/proiel/annotation_schema.rb +11 -4
  8. data/lib/proiel/chronology.rb +80 -0
  9. data/lib/proiel/dictionary.rb +79 -0
  10. data/lib/proiel/dictionary/builder.rb +224 -0
  11. data/lib/proiel/div.rb +22 -3
  12. data/lib/proiel/language.rb +108 -0
  13. data/lib/proiel/lemma.rb +77 -0
  14. data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
  15. data/lib/proiel/proiel_xml/reader.rb +138 -2
  16. data/lib/proiel/proiel_xml/schema.rb +4 -2
  17. data/lib/proiel/proiel_xml/validator.rb +76 -9
  18. data/lib/proiel/sentence.rb +27 -4
  19. data/lib/proiel/source.rb +14 -4
  20. data/lib/proiel/statistics.rb +2 -2
  21. data/lib/proiel/token.rb +14 -6
  22. data/lib/proiel/tokenization.rb +5 -3
  23. data/lib/proiel/treebank.rb +23 -6
  24. data/lib/proiel/utils.rb +0 -1
  25. data/lib/proiel/valency.rb +5 -0
  26. data/lib/proiel/valency/arguments.rb +151 -0
  27. data/lib/proiel/valency/lexicon.rb +59 -0
  28. data/lib/proiel/valency/obliqueness.rb +31 -0
  29. data/lib/proiel/version.rb +2 -3
  30. data/lib/proiel/visualization.rb +1 -0
  31. data/lib/proiel/visualization/graphviz.rb +111 -0
  32. data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
  33. data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
  34. data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
  35. data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
  36. data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
  37. metadata +76 -31
@@ -0,0 +1,224 @@
1
+ #--
2
+ # Copyright (c) 2016-2018 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+
7
+ # Methods for synthesising and manipulating dictionaries from treebank data.
8
+ module PROIEL
9
+ class DictionaryBuilder
10
+ attr_reader :license
11
+ attr_reader :language
12
+ attr_reader :sources
13
+ attr_reader :lemmata
14
+
15
+ def initialize
16
+ @language = nil
17
+ @license = nil
18
+ @sources = []
19
+ @lemmata = {}
20
+ @valency = PROIEL::Valency::Lexicon.new
21
+ end
22
+
23
+ def add_source!(source)
24
+ raise ArgumentError, 'source expected' unless source.is_a?(PROIEL::Source)
25
+ raise ArgumentError, 'incompatible language' unless @language.nil? or @language == source.language
26
+ raise ArgumentError, 'incompatible license' unless @license.nil? or @license == source.license
27
+
28
+ @language ||= source.language
29
+ @license ||= source.license
30
+ @sources << source
31
+
32
+ source.tokens.each { |token| index_token!(token) }
33
+
34
+ index_homographs!
35
+ end
36
+
37
+ CURRENT_SCHEMA_VERSION = '3.0'.freeze
38
+
39
+ def to_xml(io)
40
+ builder = ::Builder::XmlMarkup.new(target: io, indent: 2)
41
+ builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
42
+ builder.proiel('export-time': DateTime.now.xmlschema, 'schema-version': CURRENT_SCHEMA_VERSION) do
43
+ builder.dictionary(language: @language) do
44
+ builder.sources do
45
+ @sources.each do |source|
46
+ builder.source(idref: source.id, license: source.license)
47
+ end
48
+ end
49
+
50
+ builder.lemmata do
51
+ @lemmata.sort_by { |lemma, _| lemma.downcase }.each do |form_and_pos, data|
52
+ form, _ = form_and_pos.split(',')
53
+ lemma_to_xml(builder, form, data)
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
59
+
60
+ def add_external_glosses!(filename, languages = %i(eng))
61
+ raise ArgumentError, 'filename expected' unless filename.is_a?(String)
62
+ raise ArgumentError, 'file not found' unless File.exists?(filename)
63
+
64
+ CSV.foreach(filename, headers: true, encoding: 'utf-8', col_sep: "\t",
65
+ header_converters: :symbol, quote_char: "\b") do |row|
66
+ h = row.to_h
67
+ data = languages.map { |l| [l, h[l]] }.to_h
68
+
69
+ lemma = initialize_lemma!(row[:lemma], row[:part_of_speech])
70
+ lemma[:glosses] ||= {}
71
+ lemma[:glosses].merge!(data)
72
+ end
73
+ end
74
+
75
+ private
76
+
77
+ def initialize_lemma!(lemma, part_of_speech)
78
+ encoded_lemma = [lemma, part_of_speech].join(',')
79
+
80
+ @lemmata[encoded_lemma] ||= {}
81
+ @lemmata[encoded_lemma][:lemma] ||= lemma
82
+ @lemmata[encoded_lemma][:part_of_speech] ||= part_of_speech
83
+ @lemmata[encoded_lemma][:homographs] ||= []
84
+ @lemmata[encoded_lemma][:n] ||= 0
85
+
86
+ %i(distribution glosses paradigm valency).each do |k|
87
+ @lemmata[encoded_lemma][k] ||= {}
88
+ end
89
+
90
+ @lemmata[encoded_lemma]
91
+ end
92
+
93
+ def lemma_to_xml(builder, form, data)
94
+ builder.lemma(lemma: form, "part-of-speech": data[:part_of_speech]) do
95
+ distribution_to_xml(builder, data)
96
+ glosses_to_xml(builder, data)
97
+ homographs_to_xml(builder, data)
98
+ paradigm_to_xml(builder, data)
99
+ valency_to_xml(builder, data)
100
+ end
101
+ end
102
+
103
+ def distribution_to_xml(builder, data)
104
+ unless data[:distribution].empty?
105
+ builder.distribution do
106
+ data[:distribution].sort_by(&:first).each do |source_id, n|
107
+ builder.source(idref: source_id, n: n)
108
+ end
109
+ end
110
+ end
111
+ end
112
+
113
+ def glosses_to_xml(builder, data)
114
+ unless data[:glosses].empty?
115
+ builder.glosses do
116
+ data[:glosses].each do |language, value|
117
+ builder.gloss(value, language: language)
118
+ end
119
+ end
120
+ end
121
+ end
122
+
123
+ def homographs_to_xml(builder, data)
124
+ if data[:homographs].count > 0
125
+ builder.homographs do
126
+ data[:homographs].each do |homograph|
127
+ lemma, part_of_speech = homograph.split(',')
128
+ builder.homograph lemma: lemma, "part-of-speech": part_of_speech
129
+ end
130
+ end
131
+ end
132
+ end
133
+
134
+ def paradigm_to_xml(builder, data)
135
+ unless data[:paradigm].empty?
136
+ builder.paradigm do
137
+ data[:paradigm].sort_by(&:first).each do |morphology, d|
138
+ builder.slot1 morphology: morphology do
139
+ d.sort_by(&:first).each do |form, n|
140
+ builder.slot2 form: form, n: n
141
+ end
142
+ end
143
+ end
144
+ end
145
+ end
146
+ end
147
+
148
+ def valency_to_xml(builder, data)
149
+ unless data[:valency].empty?
150
+ builder.valency do
151
+ frames =
152
+ data[:valency].map do |arguments, token_ids|
153
+ { arguments: arguments, tokens: token_ids }
154
+ end
155
+
156
+ PROIEL::Valency::Obliqueness.sort_frames(frames).each do |frame|
157
+ builder.frame do
158
+ builder.arguments do
159
+ frame[:arguments].each do |argument|
160
+ # FIXME: deal with in a better way
161
+ argument[:"part-of-speech"] = argument[:part_of_speech] if argument[:part_of_speech]
162
+ argument.delete(:part_of_speech)
163
+ builder.argument argument
164
+ end
165
+ end
166
+
167
+ if frame[:tokens][:a].count > 0 or frame[:tokens][:r].count > 0
168
+ builder.tokens do
169
+ frame[:tokens][:a].each do |token_id|
170
+ builder.token(flags: 'a', idref: token_id)
171
+ end
172
+
173
+ frame[:tokens][:r].each do |token_id|
174
+ builder.token(flags: 'r', idref: token_id)
175
+ end
176
+ end
177
+ end
178
+ end
179
+ end
180
+ end
181
+ end
182
+ end
183
+
184
+ def index_homographs!
185
+ @lemmata.keys.group_by { |l| l.split(/[,#]/).first }.each do |_, homographs|
186
+ if homographs.count > 1
187
+ homographs.each do |form|
188
+ @lemmata[form][:homographs] = homographs.reject { |homograph| homograph == form }
189
+ end
190
+ end
191
+ end
192
+ end
193
+
194
+ def index_token!(token)
195
+ if token.lemma and token.part_of_speech
196
+ lemma = initialize_lemma!(token.lemma, token.part_of_speech)
197
+
198
+ lemma[:n] += 1
199
+
200
+ lemma[:distribution][token.source.id] ||= 0
201
+ lemma[:distribution][token.source.id] += 1
202
+
203
+ lemma[:paradigm][token.morphology] ||= {}
204
+ lemma[:paradigm][token.morphology][token.form] ||= 0
205
+ lemma[:paradigm][token.morphology][token.form] += 1
206
+
207
+ # Find verbal nodes
208
+ if token.part_of_speech[/^V/]
209
+ frame = PROIEL::Valency::Arguments.get_argument_frame(token)
210
+
211
+ lemma[:valency][frame] ||= { a: [], r: [] }
212
+
213
+ entry = lemma[:valency][frame]
214
+
215
+ if token.dependents.any? { |d| d.relation == 'aux' and d.part_of_speech == 'Pk' }
216
+ entry[:r] << token.id
217
+ else
218
+ entry[:a] << token.id
219
+ end
220
+ end
221
+ end
222
+ end
223
+ end
224
+ end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2017 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
@@ -89,10 +89,13 @@ module PROIEL
89
89
  # Returns the printable form of the div with all token forms and any
90
90
  # presentation data.
91
91
  #
92
+ # @param custom_token_formatter [Lambda] formatting function for tokens
93
+ # which is passed the token as its sole argument
94
+ #
92
95
  # @return [String] the printable form of the div
93
- def printable_form(options = {})
96
+ def printable_form(custom_token_formatter: nil)
94
97
  [presentation_before,
95
- @children.map { |s| s.printable_form(options) },
98
+ @children.map { |s| s.printable_form(custom_token_formatter: custom_token_formatter) },
96
99
  presentation_after].compact.join
97
100
  end
98
101
 
@@ -135,5 +138,21 @@ module PROIEL
135
138
  end
136
139
  end
137
140
  end
141
+
142
+ # Returns the aligned div if any.
143
+ #
144
+ # @return [Div, NilClass] aligned div
145
+ def alignment(aligned_source)
146
+ alignment_id ? aligned_source.treebank.find_div(alignment_id) : nil
147
+ end
148
+
149
+ # Returns inferred aligned divs if any.
150
+ #
151
+ # @return [Array<Div>] inferred aligned divs
152
+ def inferred_alignment(aligned_source)
153
+ sentences.map do |sentence|
154
+ sentence.inferred_alignment(aligned_source)
155
+ end.flatten.compact.map(&:div).uniq
156
+ end
138
157
  end
139
158
  end
@@ -0,0 +1,108 @@
1
+ #--
2
+ # Copyright (c) 2019 Marius L. Jøhndal
3
+ #
4
+ # See LICENSE in the top-level source directory for licensing terms.
5
+ #++
6
+ module PROIEL
7
+ module Language
8
+ SUPPORTED_LANGUAGES = {
9
+ # This is a subset of language codes from ISO 639-3 and Glottolog.
10
+ ang: 'Old English (ca. 450-1100)',
11
+ ave: 'Avestan',
12
+ axm: 'Middle Armenian',
13
+ chu: 'Church Slavic',
14
+ cms: 'Messapic',
15
+ cnx: 'Middle Cornish',
16
+ dum: 'Middle Dutch',
17
+ enm: 'Middle English',
18
+ frk: 'Old Frankish',
19
+ frm: 'Middle French',
20
+ fro: 'Old French (842-ca. 1400)',
21
+ ghc: 'Hiberno-Scottish Gaelic',
22
+ gmh: 'Middle High German',
23
+ gml: 'Middle Low German',
24
+ gmy: 'Mycenaean Greek',
25
+ goh: 'Old High German (ca. 750-1050)',
26
+ got: 'Gothic',
27
+ grc: 'Ancient Greek (to 1453)',
28
+ hit: 'Hittite',
29
+ hlu: 'Hieroglyphic Luwian',
30
+ htx: 'Middle Hittite',
31
+ lat: 'Latin',
32
+ lng: 'Langobardic',
33
+ mga: 'Middle Irish (10-12th century)',
34
+ non: 'Old Norse',
35
+ nrp: 'North Picene',
36
+ obt: 'Old Breton',
37
+ oco: 'Old Cornish',
38
+ odt: 'Old Dutch-Old Frankish',
39
+ ofs: 'Old Frisian',
40
+ oht: 'Old Hittite',
41
+ olt: 'Old Lithuanian',
42
+ orv: 'Old Russian',
43
+ osc: 'Oscan',
44
+ osp: 'Old Spanish',
45
+ osx: 'Old Saxon',
46
+ owl: 'Old-Middle Welsh',
47
+ peo: 'Old Persian (ca. 600-400 B.C.)',
48
+ pka: 'Ardhamāgadhī Prākrit',
49
+ pmh: 'Maharastri Prakrit',
50
+ por: 'Portuguese',
51
+ pro: 'Old Provençal',
52
+ psu: 'Sauraseni Prakrit',
53
+ rus: 'Russian',
54
+ san: 'Sanskrit',
55
+ sga: 'Early Irish',
56
+ sog: 'Sogdian',
57
+ spa: 'Spanish',
58
+ spx: 'South Picene',
59
+ txb: 'Tokharian B',
60
+ txh: 'Thracian',
61
+ wlm: 'Middle Welsh',
62
+ xbm: 'Middle Breton',
63
+ xcb: 'Cumbric',
64
+ xce: 'Celtiberian',
65
+ xcg: 'Cisalpine Gaulish',
66
+ xcl: 'Classical Armenian',
67
+ xum: 'Umbrian',
68
+ xve: 'Venetic',
69
+ }.freeze
70
+
71
+ # Checks if a language is supported.
72
+ #
73
+ # @param language_tag [String, Symbol] language tag of language to check
74
+ #
75
+ # @return [Boolean]
76
+ #
77
+ # @example
78
+ # language_supported?(:lat) # => true
79
+ # language_supported?('grc') # => true
80
+ def self.language_supported?(language_tag)
81
+ raise ArgumentError unless language_tag.is_a?(Symbol) or language_tag.is_a?(String)
82
+
83
+ SUPPORTED_LANGUAGES.key?(language_tag.to_sym)
84
+ end
85
+
86
+ # Returns the display name for a language.
87
+ #
88
+ # @param language_tag [String, Symbol] language tag of language
89
+ #
90
+ # @return [String]
91
+ #
92
+ # @example
93
+ # get_display_name(:lat) # => "Latin"
94
+ def self.get_display_name(language_tag)
95
+ raise ArgumentError unless language_tag.is_a?(Symbol) or language_tag.is_a?(String)
96
+ raise ArgumentError, 'unsupported language' unless language_supported?(language_tag)
97
+
98
+ SUPPORTED_LANGUAGES[language_tag.to_sym]
99
+ end
100
+
101
+ # Returns tag of all supported languages
102
+ #
103
+ # @return [Array<Symbol>]
104
+ def self.supported_language_tags
105
+ SUPPORTED_LANGUAGES.keys
106
+ end
107
+ end
108
+ end
@@ -0,0 +1,77 @@
1
#--
# Copyright (c) 2018 Marius L. Jøhndal
#
# See LICENSE in the top-level source directory for licensing terms.
#++
module PROIEL
  # A single lemma entry read from a PROIEL dictionary.
  class Lemma < TreebankObject
    # @return [Dictionary] dictionary that this lemma belongs to
    attr_reader :dictionary

    # @return [Integer, nil] frequency of the lemma
    attr_reader :n

    # @return [Hash{String, Integer}] distribution of the lemma across
    #   sources: source IDs mapped to the lemma's frequency in that source
    attr_reader :distribution

    # @return [Array<[String, String]>] homographs of this lemma, given as
    #   pairs of lemma form (homographic with this lemma's form under the
    #   language's orthographic conventions) and part of speech
    attr_reader :homographs

    # @return [Hash{Symbol, String}] glosses for the lemma keyed by language tag
    attr_reader :glosses

    # @return [Hash] paradigm keyed by morphology, then by form, with frequencies
    attr_reader :paradigm

    # @return [Array<Hash>] valency frames, each with :arguments and :tokens
    attr_reader :valency

    # Creates a new lemma object, optionally populated from dictionary XML.
    def initialize(parent, xml = nil)
      @dictionary = parent

      @n = nil

      @distribution = {}
      @homographs = []
      @glosses = {}
      @paradigm = {}
      @valency = []

      from_xml(xml) if xml
    end

    private

    # Populates all fields from a parsed dictionary XML element.
    def from_xml(xml)
      @n = nullify(xml.n, :int)

      @distribution = xml.distribution.each_with_object({}) do |entry, acc|
        acc[entry.idref] = nullify(entry.n, :int)
      end

      @glosses = xml.glosses.each_with_object({}) do |gloss, acc|
        acc[gloss.language.to_sym] = gloss.gloss
      end

      @homographs = xml.homographs.map { |entry| [entry.lemma, entry.part_of_speech] }

      @paradigm = xml.paradigm.each_with_object({}) do |slot1, outer|
        outer[slot1.morphology] = slot1.slot2s.each_with_object({}) do |slot2, inner|
          inner[slot2.form] = nullify(slot2.n, :int)
        end
      end

      @valency = xml.valency.map do |frame|
        arguments = frame.arguments.map do |a|
          { relation: a.relation, lemma: a.lemma, part_of_speech: a.part_of_speech, mood: a.mood, case: a.case }
        end
        tokens = frame.tokens.map { |t| { flags: t.flags, idref: t.idref } }

        { arguments: arguments, tokens: tokens }
      end
    end

    # Maps nil and blank strings to nil; any other value is coerced to an
    # Integer when +type+ is :int, otherwise to a String.
    def nullify(raw, type = nil)
      return nil if raw.nil? || /^\s*$/ === raw

      type == :int ? raw.to_i : raw.to_s
    end
  end
end