tinycus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/tinycus.rb +1514 -0
  3. metadata +46 -0
data/tinycus.rb ADDED
@@ -0,0 +1,1514 @@
1
+ # coding: utf-8
2
+ require 'digest'
3
+ require 'json'
4
+
5
+ module Tinycus
6
+
7
+ # The four "alpha_" functions work on Greek and English, also most Latin characters; see comments in Tr.get_greek_collation_tr.
8
+
9
# Returns a copy of the list l sorted alphabetically with the Greek/Latin collation table
# (see Tinycus.alpha_collation); the sorting itself is delegated to Tinycus.sort.
# l -- list of strings
# n -- if true, each string is NFC-normalized before its sort key is computed
def Tinycus.alpha_sort(l,n:false)
  # Forward the caller's n. This used to be hard-coded to n:false, so asking for
  # normalization with n:true had no effect.
  return Tinycus.sort(l,Tinycus.alpha_collation,n:n)
end
12
+
13
# True when a and b have the same collation form, i.e. when Tinycus.alpha_compare gives 0.
# n -- handed on to alpha_compare (NFC-normalize before comparing); defaults to true
def Tinycus.alpha_equal(a,b,n:true)
  order = Tinycus.alpha_compare(a,b,n:n)
  order==0
end
16
+
17
# Three-way comparison (<=>) of a and b after both have been run through the collation table.
# n -- if true, each string is NFC-normalized before the table is applied; defaults to true
# (An earlier version compared accent-stripped, downcased strings instead.)
def Tinycus.alpha_compare(a,b,n:true)
  table = Tinycus.alpha_collation
  key_a, key_b = [a,b].map { |str| table.apply(str,n:n) }
  key_a <=> key_b
end
22
+
23
# The Tr object used by the alpha_ functions to turn a string into its collation form.
def Tinycus.alpha_collation
  Tinycus::Tr.get_greek_collation_tr
end
26
+
27
# Alias of Tinycus.contains_vowel that reads better when c is a single character.
def Tinycus.is_vowel(c)
  Tinycus.contains_vowel(c)
end
31
+
32
# Returns true if s contains a Greek or Latin vowel, false otherwise.
# Accents are stripped and the string is downcased before testing.
# y counts as a vowel; things like Welsh w are not handled.
def Tinycus.contains_vowel(s)
  bare = Tinycus::Tr.remove_accents_from_greek(s).downcase
  bare.match?(/[αειουηωaeiouyæ]/)
end
36
+
37
+ # ---
38
+
39
# Returns a sorted copy of the list l; l itself keeps its order.
# collation_tr -- object whose apply(string,n:) method yields the sort key for a string
# n -- handed on to collation_tr.apply
def Tinycus.sort(l,collation_tr,n:false)
  sorted = l.clone # separate list object, so sorting in place below does not touch l
  # sort_by! evaluates the block once per element to build the keys, then sorts on those.
  sorted.sort_by! { |word| collation_tr.apply(word,n:n) }
  sorted
end
44
+
45
# Self-test for the module. Raises a RuntimeError describing the first failing case.
# Prints one line per passing macronized_to_underbar_style case; the other sections are silent.
# Reads test_input/bad_combining_character.txt, resolved against the current directory.
def Tinycus.run_tests
  # removing accents:
  [
    ['',''],
    ['ἔχω','εχω'],
  ].each do |x,y|
    z = Tinycus::Tr.remove_accents_from_greek(x)
    raise "error in test, removing accents from '#{x}' gives '#{z}', expected '#{y}'" if z!=y
  end
  #----
  # sorting: split each input into words, sort them, and compare with the expected order
  [
    [ "Μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος οὐλομένην", "ἄειδε, Ἀχιλῆος θεά, Μῆνιν οὐλομένην Πηληϊάδεω" ],
    [ "ὠμοφάγος ᾍδης", "ᾍδης ὠμοφάγος" ],
  ].each do |x,y|
    z = Tinycus.alpha_sort(x.split(/\s+/)).join(' ')
    raise "error in test, sorting words on #{x} gives #{z}, expected #{y}" if z!=y
  end
  #----
  filename = "test_input/bad_combining_character.txt"
  s = File.open(filename,'r') { |f| f.gets(nil) } # nil means read whole file
  raise "wtf?" if s.nil? || s.length<10
  s2 = Cleanup.clean_up_greek_combining_characters(s,allow_latin:true) # allow_latin is so we can put in stuff like xml fragments
  raise "error in test, no change with Cleanup.clean_up_greek_combining_characters" if s2==s
  Cleanup.clean_up_grotty_greek_one_word(s,allow_latin:true) # result unused; only checks that it runs without raising
  #----
  # Beta code/unicode conversion. There can be more than one beta code representation of the
  # same unicode character, so when the third element is true we only test beta -> unicode.
  beta_cases = [
    ["ἐν","E)N"],
    ["Ἄλφα","*)/alfa"], # breathing and tonal accent are supposed to come first for uppercase
    ["προϊέναι","PROI+E/NAI"],
    ["πρός","pro/j"],
    ["πρός","pro/s",true],
    ["πρός","pro/s2",true],
    ["σῖτον","si=ton"],
    ["σῖτον","s1i=ton",true],
    ["μάχαιρᾰ","MA/XAIRA'"],
    ["μαχαίρᾱς","MAXAI/RA&j"],
    ["ἤφυσ’","h)/fus'"],
    ["·—",":_"],
    ["ἴκτινος","i/)ktinos",true], # probably should be )/, but we should deal with it anyway
    ["ἴκτινος","i)/ktinos",true],
    ["βοΐ","boi/+"],
    ["βοΐ","boi+/",true],
    ["ᾇ","a(=|"],
    ["ᾇτε","a(=|te"],
    #["",""],
  ]
  beta_cases.each do |unicode,beta_code,no_uni_to_beta|
    beta_code = beta_code.downcase
    one_way_only = no_uni_to_beta ? true : false # a missing third element means "test both directions"
    if !one_way_only && Tinycus.greek_unicode_to_beta_code(unicode)!=beta_code
      raise "error in Tinycus.greek_unicode_to_beta_code, #{unicode} converts to #{Tinycus.greek_unicode_to_beta_code(unicode)}, expected #{beta_code}"
    end
    if Tinycus.greek_beta_code_to_unicode(beta_code)!=unicode
      raise "error in Tinycus.greek_beta_code_to_unicode, #{beta_code} converts to #{Tinycus.greek_beta_code_to_unicode(beta_code)}, expected #{unicode}"
    end
  end

  # macron notation -> underbar notation
  [
    ["ἀντῑκρύ̄","ἀντι_κρύ_"],
    ["ἀνῑ̆άζω","ἀνιάζω"], # iota with macron, followed by a combining breve, i.e., can be either short or long
    ["ἀ̄ϊ̄κή","ἀ_ϊ_κή"], # iota with diaresis, followed by a combining macron
  ].each do |x|
    a,b = x
    b_actual = Tinycus::Tr.macronized_to_underbar_style(a)
    raise "failed, #{x}, actual = #{b_actual}" unless b_actual==b
    print "passed test of macronized_to_underbar_style, #{a} -> #{b}\n"
  end
end
130
+
131
+ # fixme:
132
+ # Determine byte order and make sure we convert to native (i.e., BE rather than LE if we're on a big-endian machine).
133
# Shared converters between UTF-8 and UTF-32LE. Little-endian is hard-coded here;
# see the fixme above about picking the machine's native byte order.
@@shrinker = Encoding::Converter.new('UTF-32LE','UTF-8')
@@bloater = Encoding::Converter.new('UTF-8','UTF-32LE')

# Converts s from UTF-8 to UTF-32LE. Meant for internal use only (not enforced).
def Tinycus.bloat(s) # private method
  @@bloater.convert(s)
end

# Converts s from UTF-32LE back to UTF-8. Meant for internal use only (not enforced).
def Tinycus.shrink(s) # private method
  @@shrinker.convert(s)
end
142
+
143
# Assorted helpers for Greek words: a rough syllable count, accent detection, and
# generation of the accent/capitalization variants a dictionary form can take in a text.
class Tinycus::MiscGreek
  # Self-test for add_second_accent. Prints every case and raises on the first mismatch.
  def self.run_tests
    print "testing MiscGreek.add_second_accent...\n"
    cases = [
      ['θεμείλια','θεμείλιά'],
      ['πόλεμονδε','πόλεμόνδε'],
      ['οἶκονδε','οἶκόνδε'],
      ['τῆσδε','τῆσδέ']
    ]
    cases.each do |single,two|
      y = MiscGreek.add_second_accent(single)
      print " #{single} #{two} #{y}\n"
      raise "expected #{two}, got #{y}" if y!=two
    end
  end

  # Rough syllable count for x, for when perfect precision isn't needed and either
  # Ransom's greek/syllab.rb isn't available or its performance cost isn't wanted.
  # Counts vowels after collapsing the diphthongs αι, ει, οι, ου into one placeholder.
  def self.estimate_syll_count(x)
    # ϊ is turned into the placeholder first, so that once the diaresis is stripped
    # it can't be mistaken for the second half of a diphthong.
    # NOTE(review): only plain ϊ is protected this way; accented forms such as ΐ are not -- confirm whether that is intended.
    lowered = x.downcase.gsub(/[ϊ]/,'e')
    collapsed = Tr.remove_accents_from_greek(lowered).gsub(/(αι|ει|οι|ου)/,'e')
    collapsed.scan(/[αειουηωe]/).length
  end

  # True if removing acute accents changes x, i.e. x has at least one acute known to Tr.
  def self.has_acute(x)
    Tr.remove_acute_from_greek(x)!=x
  end

  # True if removing grave accents changes x.
  def self.has_grave(x)
    Tr.remove_grave_from_greek(x)!=x
  end

  # True if removing circumflexes changes x.
  def self.has_circumflex(x)
    Tr.remove_circumflex_from_greek(x)!=x
  end

  # True if removing all tonal accents (acute, grave, circumflex) changes x.
  def self.has_tonal_accent(x)
    Tr.remove_tonal_accents_from_greek(x)!=x
  end

  # Returns a copy of the word with an acute placed on its final vowel,
  # e.g. θεμείλια becomes θεμείλιά. The argument is not modified.
  # For a word ending in δε whose stem is long enough (3+ estimated syllables, or 2+ with a
  # circumflex), the accent goes on the stem instead and δε is re-attached.
  def self.add_second_accent(w_orig)
    # The w[i]= assignment below edits the string in place, so work on a copy;
    # otherwise the caller's string would change too.
    w = w_orig.clone
    suffixed = /(.*)δε$/.match(w)
    if suffixed
      stem = suffixed[1]
      nsyll = MiscGreek.estimate_syll_count(stem)
      has_circumflex = MiscGreek.has_circumflex(stem)
      return MiscGreek.add_second_accent(stem)+"δε" if nsyll>=3 || (has_circumflex && nsyll>=2)
    end
    # Locate the final vowel, scanning from the end; index 0 is never examined.
    x = Tr.remove_accents_from_greek(w).downcase
    i = (x.length-1).downto(1).find { |k| x[k]=~/[αειουηω]/ }
    w[i] = Tr.add_acute_to_greek(Tr.remove_acute_and_grave_from_greek(w[i])) unless i.nil?
    w
  end

  # For a given word, tries to predict every form it could take in a text: both
  # capitalizations, acute or grave, and one or two accents. Returns a list without duplicates.
  # w should already be in canonical dictionary form (typically a single acute accent).
  # Not 100% perfect, mainly because the rules for multiple accents are complicated and
  # Tinycus doesn't include a full syllabification algorithm. Tested as a round trip on all
  # multiply accented words occurring in Homer; the only three failures were
  # κάλλίον, σταφύλῇ, ὕπὸ.
  def self.all_cases_and_accents(w)
    variations = [
      lambda { |x| Tr.greek_acute_to_grave(x) },
      lambda { |x| MiscGreek.add_second_accent(x) },
      lambda { |x| x.capitalize },
    ]
    forms = [w.downcase]
    # Each step doubles the list: everything so far, plus the same forms with the variation applied.
    variations.each { |vary| forms = forms+forms.map(&vary) }
    forms.uniq
  end
end
220
+
221
+ class Tinycus::Tr
222
+
223
# Lazily-built Tr objects, one per class-level helper below. Each starts as nil and is
# filled in the first time the corresponding helper is called.

# adding a mark
@@prep_add_acute_to_greek = nil
@@prep_add_grave_to_greek = nil
@@prep_add_circumflex_to_greek = nil
@@prep_add_diar_to_greek = nil

# removing a mark
@@prep_remove_acute_from_greek = nil
@@prep_remove_grave_from_greek = nil
@@prep_remove_acute_and_grave_from_greek = nil
@@prep_remove_circumflex_from_greek = nil
@@prep_remove_diar_from_greek = nil
@@prep_remove_breathing_from_greek = nil
@@prep_remove_accents_from_greek = nil

# swapping one accent for another
@@greek_grave_to_acute = nil
@@greek_acute_to_grave = nil

# collation
@@prep_greek_to_collation_form = nil
237
+
238
# Initializes a data structure that represents an action equivalent to string.tr(a,b), but faster.
# a -- string of characters to translate from
# b -- string of replacements; the i-th character of b replaces the i-th character of a
# Listing a character more than once in a is harmless: the last pair wins, and the repeats
# cause no performance hit when the object is actually used.
# Raises RuntimeError if a and b differ in length.
def initialize(a,b)
  if a.length!=b.length then raise "lengths unequal, #{a.length} and #{b.length}" end
  @l = a.length
  @orig_tables = [a.clone,b.clone] # stash them away for testing purposes
  # The class declares attr_reader :a,:b, but nothing ever assigned @a or @b, so both
  # readers always returned nil. Expose the stashed copies through them.
  @a,@b = @orig_tables
  # Pair the characters up in one pass; to_h keeps the last value for a repeated key,
  # which is what the previous index-by-index loop did.
  @h = a.chars.zip(b.chars).to_h.freeze
end
254
+
255
+ attr_reader :l,:a,:b
256
+
257
# Returns a new string in which every character of s that has an entry in the table is
# replaced by its translation; all other characters are copied unchanged. s is not modified.
# n -- if true, s is NFC-normalized first (about a 30% performance hit, and not necessary
#      if the input has already been normalized)
# This function tends to be a bottleneck for performance in real-world applications;
# see the notes in the comments at the top of scripts/benchmark.rb.
def apply(s,n:false)
  if n then s = s.unicode_normalize(:nfc) end

  # Walk the characters once and append in place. Indexing with s[i] has to scan from the
  # start of a multibyte string on every access, and += allocates a new string per
  # character, so the previous loop did quadratic work on long Greek text.
  result = ''.dup
  s.each_char { |c| result << (@h[c] || c) }
  return result
end
273
+
274
# Consistency check of this object against the tables it was built from.
# Raises a RuntimeError if it fails; otherwise just returns silently.
# alphabet -- if not nil, a string listing the only characters that are allowed to
#             appear in the output
def self_test(alphabet)
  a,b = @orig_tables
  if apply(a)!=b then raise "error in self_test, applying me to original a does not give original b" end
  if apply(b)!=b then raise "error in self_test, applying me to original b does not give original b" end
  if apply(apply(a))!=b then raise "error in self_test, fails idempotency" end
  if !alphabet.nil? then
    s = a+alphabet
    ss = apply(s)
    # Escape the alphabet so that characters such as - ] ^ are taken literally inside the
    # character class, and anchor on the whole string: ^ and $ only anchor a line, so a
    # multi-line result could pass as long as one of its lines was clean.
    unless ss.match?(/\A[#{Regexp.escape(alphabet)}]+\z/) then
      raise "error in self_test, applying me to #{s} gives #{ss}, which contains characters not in the alphabet #{alphabet}"
    end
  end
end
289
+
290
# Returns a copy of s in which each grave-accented character listed in the table is
# replaced by the same character with an acute.
# n -- if true, NFC-normalize s first
def Tr.greek_grave_to_acute(s,n:false)
  @@greek_grave_to_acute ||= Tinycus::Tr.new(
    "ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῒῢῸῂ",
    "ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώΐΰΌῄ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@greek_grave_to_acute.apply(s)
end
299
+
300
# Returns a copy of s in which each acute-accented character listed in the table is
# replaced by the same character with a grave.
# n -- if true, NFC-normalize s first
def Tr.greek_acute_to_grave(s,n:false)
  @@greek_acute_to_grave ||= Tinycus::Tr.new(
    "ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώΐΰΌ",
    "ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῒῢῸ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@greek_acute_to_grave.apply(s)
end
309
+
310
# Removes acute, grave, and circumflex accents from s, in two passes.
# n -- if true, s is NFC-normalized before the first pass
# to do: make this more efficient by creating a single-pass tr
def Tr.remove_tonal_accents_from_greek(s,n:false)
  without_acute_or_grave = Tr.remove_acute_and_grave_from_greek(s,n:n)
  Tr.remove_circumflex_from_greek(without_acute_or_grave)
end
314
+
315
# Returns a copy of s with acute and grave accents removed from the characters listed in
# the table; breathings, iota subscripts, and diaresis are kept.
# n -- if true, NFC-normalize s first
def Tr.remove_acute_and_grave_from_greek(s,n:false)
  @@prep_remove_acute_and_grave_from_greek ||= Tinycus::Tr.new(
    "ÀÁàáÈÉèéÌÍìíÒÓòóÙÚùúÝýΆάἂἃἄἅἊἌἍὰᾴΈέἒἓἔἕἜἝὲήἢἣἤἥἫἬἭὴᾓᾔᾕῂῄΊΐίἲἳἴἵἼἽὶῒΌόὂὃὄὅὊὋὌὍὸῸΰύὓὔὕὝὺῢώὢὣὤὥὫὬὭὼᾤῴᾍ",
    "AAaaEEeeIIiiOOooUUuuYyΑαἀἁἀἁἈἈἉαᾳΕεἐἑἐἑἘἙεηἠἡἠἡἩἨἩηᾑᾐᾑῃῃΙϊιἰἱἰἱἸἹιϊΟοὀὁὀὁὈὉὈὉοΟϋυὑὐὑὙυϋωὠὡὠὡὩὨὩωᾠῳᾉ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_acute_and_grave_from_greek.apply(s)
end
326
+
327
# Returns a copy of s with circumflexes removed. Most entries of the table map a character
# to itself; only the circumflexed ones (and Latin letters with circumflex or tilde) change.
# n -- if true, NFC-normalize s first
def Tr.remove_circumflex_from_greek(s,n:false)
  @@prep_remove_circumflex_from_greek ||= Tinycus::Tr.new(
    "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸ",
    "ÀÁAAÄÅÆÇÈÉEËÌÍIÏNÒÓOOÖØÙÚUÜÝàáaaäåæçèéeëìíiïnòóooöøùúuüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἀἈἉἊἌἍἈἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἠἡἨἩἫἬἭἨἩἰἱἲἳἴἵἰἱἸἹἼἽἸὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὐὑὙὝὠὡὢὣὤὥὠὡὨὩὫὬὭὨὩὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾐᾑᾠᾤᾠᾡᾰᾱᾳᾴαᾳᾸᾹῂῃῄηῃῐῑῒιϊῘῙῠῡῢῥυῨῩῬῳῴωῳῸ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_circumflex_from_greek.apply(s)
end
338
+
339
# Returns a copy of s in which every character listed in the table gets a circumflex.
# Some Latin letters (A, O, a, o) are listed twice in the source table; the later entry
# wins, so those come out with a tilde.
# n -- if true, NFC-normalize s first
def Tr.add_circumflex_to_greek(s,n:false)
  @@prep_add_circumflex_to_greek ||= Tinycus::Tr.new(
    "AAEINOOUaaeinoouἀἈἠἡἨἩἰἱἸὐὑὠὡὨὩᾐᾑᾠᾡαᾳηῃιϊυωῳ",
    "ÂÃÊÎÑÔÕÛâãêîñôõûἆἎἦἧἮἯἶἷἾὖὗὦὧὮὯᾖᾗᾦᾧᾶᾷῆῇῖῗῦῶῷ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_add_circumflex_to_greek.apply(s)
end
350
+
351
# Returns a copy of s with the acute removed from every character listed in the table.
# n -- if true, NFC-normalize s first
def Tr.remove_acute_from_greek(s,n:false)
  @@prep_remove_acute_from_greek ||= Tinycus::Tr.new(
    "ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώῄῴΐΰΌ",
    "AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃῳϊϋΟ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_acute_from_greek.apply(s)
end
362
+
363
# Returns a copy of s with the grave removed from every character listed in the table.
# n -- if true, NFC-normalize s first
def Tr.remove_grave_from_greek(s,n:false)
  @@prep_remove_grave_from_greek ||= Tinycus::Tr.new(
    "ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῂῒῢῸ",
    "AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃϊϋΟ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_grave_from_greek.apply(s)
end
374
+
375
# Returns a copy of s in which every character listed in the table gets a grave accent.
# n -- if true, NFC-normalize s first
def Tr.add_grave_to_greek(s,n:false)
  @@prep_add_grave_to_greek ||= Tinycus::Tr.new(
    "AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃϊϋΟ",
    "ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῂῒῢῸ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_add_grave_to_greek.apply(s)
end
386
+
387
# Returns a copy of s in which every character listed in the table gets an acute accent.
# n -- if true, NFC-normalize s first
def Tr.add_acute_to_greek(s,n:false)
  @@prep_add_acute_to_greek ||= Tinycus::Tr.new(
    "AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃῳϊϋΟ",
    "ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώῄῴΐΰΌ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_add_acute_to_greek.apply(s)
end
398
+
399
# Returns a copy of s with the diaresis removed from lowercase iota and upsilon; any
# accent on the letter is kept.
# Capitals with diaresis aren't handled, since they only exist as combining characters.
# n -- if true, NFC-normalize s first
def Tr.remove_diar_from_greek(s,n:false)
  @@prep_remove_diar_from_greek ||= Tinycus::Tr.new(
    "ϊΐῒῗϋΰῢ",
    "ιίὶῖυύὺ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_diar_from_greek.apply(s)
end
411
+
412
# Returns a copy of s with a diaresis added to lowercase iota and upsilon; any accent on
# the letter is kept.
# Capitals with diaresis aren't handled, since they only exist as combining characters.
# n -- if true, NFC-normalize s first
def Tr.add_diar_to_greek(s,n:false)
  @@prep_add_diar_to_greek ||= Tinycus::Tr.new(
    "ιίὶῖυύὺ",
    "ϊΐῒῗϋΰῢ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_add_diar_to_greek.apply(s)
end
424
+
425
# Returns a copy of s with breathing marks removed from the characters listed in the
# table; accents and iota subscripts on those characters are kept. The table contains
# many repeated entries, which is harmless (see the constructor).
# n -- if true, NFC-normalize s first
def Tr.remove_breathing_from_greek(s,n:false)
  @@prep_remove_breathing_from_greek ||= Tinycus::Tr.new(
    "ἄἌἈἈἀἈἁἉἂἊἅἍἆἉἉἃἋᾇἔἜἑἙἐἘἕἝἘἘἙἙἓἛἒἚἣἫἡἩἠἨἦἥἭἢἪἤἬᾔἬΙἧᾗἨἨᾕἭΙἩἩᾐἨΙᾖᾑἩΙἰἸἱἹἴἼἶἳἻἵἽἷἸἸἲἺἹἹὃὋὄὌὀὈὈὈὅὍὁὉὉὉῥῬῬῬῤὐὗὕὝὑὙὖὔὓὛὙὙὥὭὣὫὤὬᾤὬΙὠὨὦᾧὡὩὧᾠὨΙὨὨὢὪᾦὩὩ",
    "άΆΑΑαΑαΑὰᾺάΆᾶΑΑὰᾺᾷέΈεΕεΕέΈΕΕΕΕὲῈὲῈὴῊηΗηΗῆήΉὴῊήΉῄΉΙῆῇΗΗῄΉΙΗΗῃΗΙῇῃΗΙιΙιΙίΊῖὶῚίΊῖΙΙὶῚΙΙὸῸόΌοΟΟΟόΌοΟΟΟρΡΡΡρυῦύΎυΥῦύὺῪΥΥώΏὼῺώΏῴΏΙωΩῶῷωΩῶῳΩΙΩΩὼῺῷΩΩ"
  )
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_breathing_from_greek.apply(s)
end
436
+
437
# Returns a copy of s with accents stripped, using the table built by Tr.remove_accents('el').
# The table is built on first use and then reused.
# n -- if true, NFC-normalize s first
def Tr.remove_accents_from_greek(s,n:false)
  @@prep_remove_accents_from_greek ||= Tr.remove_accents('el')
  s = s.unicode_normalize(:nfc) if n
  @@prep_remove_accents_from_greek.apply(s)
end
444
+
445
+ def Tr.remove_macrons_and_breves(s)
445
+ # Returns s with macrons and breves (vowel-length marks) removed from Greek vowels and from the Latin letters listed below; accents, breathings, and diaresis on the same letter are kept. This can't be implemented using my fast method, because most of these are composed characters.
446
+ if !(s.kind_of?(String)) then return s end
447
+ # ...convenience feature for stuff like parsing json data, which may include integers: anything that is not a String is returned unchanged. Won't work for arrays containing strings.
448
+ # ---
449
+ # Letters with both a macron and a breve. The ruby script to generate this is in the commented out code below.
450
+ if s=~/ᾱ̆|ῑ̆|ῡ̆|Ᾱ̆|Ῑ̆|Ῡ̆/ then
451
+ s = s.gsub(/ᾱ̆/,'α')
452
+ s = s.gsub(/ῑ̆/,'ι')
453
+ s = s.gsub(/ῡ̆/,'υ')
454
+ s = s.gsub(/Ᾱ̆/,'Α')
455
+ s = s.gsub(/Ῑ̆/,'Ι')
456
+ s = s.gsub(/Ῡ̆/,'Υ')
457
+ end
458
+ # ---
459
+ s = s.gsub(/ϊ̄/,'ϊ') # iota with macron and diaresis; happens in Cunliffe for ἀϊκη, ὠΐετο, ὁμοιΐου, ὁμοίϊος
460
+ s = s.gsub(/ΐ̄/,'ΐ') # iota with macron, diaresis, and acute; happens in Cunliffe for ὀΐομαι
461
+ # ---
462
+ # The following is kludgy, probably not the right way to handle this. Should this just be in the loop at the end?
463
+ s = s.gsub(/ῐ/,"ῐ")
464
+ # ῐ = iota + combining breve
465
+ # ῐ = iota with vrachy
466
+ s = s.gsub(/ῑ/,"ι")
467
+ s = s.gsub(/ᾰ/,"α")
468
+ s = s.gsub(/ᾱ̆/,"α") # alpha with both a macron and a breve!?
469
+ # ---
470
+ s = Tinycus::Util.safe_normalize(s)
471
+ s = s.tr("āēīōūӯ","aeiouy") # latin
472
+ s = s.tr("ᾰᾱᾸᾹῐῑῘῙῠῡῨῩ","ααΑΑιιΙΙυυΥΥ")
473
+ # Accent combined with macron. The monospaced fonts I'm using for coding display these incorrectly, and I also don't know how to type them.
474
+ # Furthermore, these seem to be represented as multiple characters, so that tr won't work. The following will be slow on short strings,
475
+ # but should perform well on long ones.
476
+ # The following isn't really an exhaustive list of vowels. Each listed vowel followed by a combining macron or breve is replaced by the vowel alone.
477
+ "άίύὰὶὺΆΊΎᾺῚῪἀἐἰὀὐἠὠἁἑἱὁὑἡὡἄἔἴὄὔἤὤἂἒἲὂὒἢὢἅἕἵὅὕἥὥἃἓἳὃὓἣὣΐῒ".chars.each { |c|
478
+ [772,774].each { |combining| # 772=combining macron, 774=combining breve (773=combining overline, presumably used for math)
479
+ m = [c.ord, combining].pack("U*") # is not a single character
480
+ s = s.gsub(/#{m}/,c)
481
+ }
482
+ }
483
+ # ---
484
+ s = s.unicode_normalize(:nfc) # found empirically that this was necessary, don't remove
485
+ # ---
486
+ return s
487
+ =begin
488
+ --------------------------------------------------------
489
+ m = [0x1fb1,0x1FD1,0x1fe1]
490
+ a = m.map { |u| [u, 0x0306].pack('U*') }
491
+ a = a + a.map { |s| s.upcase }
492
+ i=0
493
+ a.each { |c|
494
+ cc = ['α','ι','υ','Α','Ι','Υ'][i]
495
+ i += 1
496
+ print " s = s.gsub(/#{c}/,'#{cc}')\n"
497
+ }
498
+ --------------------------------------------------------
499
+ =end
500
+ end
502
+
503
+ def Tr.macronized_to_underbar_style(s)
503
+ # Changes a macronized string to one that looks like this: ἕννυ_μι. That is, each macronized vowel is replaced by the same vowel without the macron, followed by an underbar. Works on a copy; s itself is not modified.
504
+ # The lists in the regexes are generated by the commented-out scripts below, and are not actually totally comprehensive.
505
+ # We don't handle grave and circumflex accents, but those don't occur in dictionary headers with macrons.
506
+ # For an IfMows object in Ifthimos, this can be done using stringify(macronization:'underbar').
507
+ # ---
508
+ x = s.clone
509
+ x = x.gsub(/(ϊ̄)/) { "#{Tinycus::Tr.remove_macrons_and_breves($1)}_" } # iota with diaresis and macron, occurs in ἀϊκή
510
+ # First handle letters that have both a macron and a breve, treating them as if they weren't macronized at all (both marks are dropped and no underbar is added):
511
+ x = x.gsub(/(ᾱ̆|ῑ̆|ῡ̆|Ᾱ̆|Ῑ̆|Ῡ̆)/) { Tinycus::Tr.remove_macrons_and_breves($1) }
512
+ # Next handle the ones that have macrons only:
513
+ x = x.gsub(/(ᾱ|ῑ|ῡ|ά̄|ί̄|ύ̄|ἀ̄|ἁ̄|ἄ̄|ἅ̄|ἰ̄|ἱ̄|ἴ̄|ἵ̄|ὐ̄|ὑ̄|ὔ̄|ὕ̄|Ᾱ|Ῑ|Ῡ|Ά̄|Ί̄|Ύ̄|Ἀ̄|Ἁ̄|Ἄ̄|Ἅ̄|Ἰ̄|Ἱ̄|Ἴ̄|Ἵ̄|Υ̓̄|Ὑ̄|Υ̓́̄|Ὕ̄)/) { "#{Tinycus::Tr.remove_macrons_and_breves($1)}_" }
514
+ # Finally, remove all breves:
515
+ return Tinycus::Tr.remove_macrons_and_breves(x)
516
+ =begin
517
+ ------------------------------------
518
+ m = [0x1fb1,0x1FD1,0x1fe1]
519
+ a = m.map { |u| [u, 0x0306].pack('U*') }
520
+ a = a + a.map { |s| s.upcase }
521
+ print a.join('|'),"--\n"
522
+ ------------------------------------
523
+ a = ['ᾱ','ῑ','ῡ']
524
+ [
525
+ 0x03ac,0x03af,0x03cd
526
+ ].each { |i|
527
+ x = [i, 0x0304].pack('U*')
528
+ a.push(x)
529
+ }
530
+ [
531
+ 0x1f00,0x1f30,0x1f50
532
+ ].each { |i|
533
+ [0,1,4,5].each { |j|
534
+ x = [i+j, 0x0304].pack('U*')
535
+ a.push(x)
536
+ }
537
+ }
538
+ a = a + a.map { |s| s.upcase }
539
+ print a.join('|'),"--\n"
540
+ ------------------------------------
541
+ =end
542
+ end
544
+
545
# Returns the shared Tr object for the 'el' collation form, building it on first use.
def Tr.get_greek_collation_tr
  @@prep_greek_to_collation_form ||= Tr.collation_form('el')
end
551
+
552
# Returns the collation form of s, i.e. s run through the 'el' collation table
# (which is built on first use and then reused).
# n -- if true, NFC-normalize s first
def Tr.greek_to_collation_form(s,n:false)
  @@prep_greek_to_collation_form ||= Tr.collation_form('el')
  s = s.unicode_normalize(:nfc) if n
  @@prep_greek_to_collation_form.apply(s)
end
559
+
560
# Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method.
# The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The
# object constructed with this locale will also remove most accents and macrons from Latin
# characters, but will miss some cases like Czech, and will not handle Cyrillic.
# Raises RuntimeError for an unknown locale.
def Tr.remove_accents(locale)
  # Each pair is [characters to replace, their replacements], kept side by side so the
  # alignment can be checked by eye; the pairs are joined into the two tables for Tr.new.
  el_pairs = [
    ["ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝ", "AAAAAAÆCEEEEIIIINOOOOOOUUUUY"],
    ["àáâãäåæçèéêëìíîïñòóôõöøùúûüýÿ", "aaaaaaæceeeeiiiinoooooouuuuyy"],
    ["ΆΈΊΌΐάέήίΰϊϋόύώỏ", "ΑΕΙΟιαεηιυιυουωo"],
    ["ἀἁἂἃἄἅἆ", "α"*7], ["ἈἉἊἌἍἎ", "Α"*6],
    ["ἐἑἒἓἔἕ", "ε"*6], ["ἘἙἜἝ", "Ε"*4],
    ["ἠἡἢἣἤἥἦἧ", "η"*8], ["ἨἩἫἬἭἮἯ", "Η"*7],
    ["ἰἱἲἳἴἵἶἷ", "ι"*8], ["ἸἹἼἽἾ", "Ι"*5],
    ["ὀὁὂὃὄὅ", "ο"*6], ["ὈὉὊὋὌὍ", "Ο"*6],
    ["ὐὑὓὔὕὖὗ", "υ"*7], ["ὙὝ", "ΥΥ"],
    ["ὠὡὢὣὤὥὦὧ", "ω"*8], ["ὨὩὫὬὭὮὯ", "Ω"*7],
    ["ὰὲὴὶὸὺὼ", "αεηιουω"],
    ["ᾐᾑᾓᾔᾕᾖᾗ", "η"*7], ["ᾠᾤᾦᾧ", "ω"*4],
    ["ᾰᾱᾳᾴᾶᾷ", "α"*6], ["ᾸᾹ", "ΑΑ"],
    ["ῂῃῄῆῇ", "η"*5],
    ["ῐῑῒῖῗ", "ι"*5], ["ῘῙ", "ΙΙ"],
    ["ῠῡῢῥῦῨῩῬ", "υυυρυΥΥΡ"],
    ["ῳῴῶῷ", "ω"*4],
    ["Ὸῤ", "Ορ"],
    ["ᾆᾄᾂᾁᾇᾅᾃ", "α"*7],
    ["ᾍᾡ", "Αω"],
  ]
  t = {
    "el"=>[el_pairs.map(&:first).join, el_pairs.map(&:last).join]
  }
  from,to = t.fetch(locale) { raise "unknown locale: #{locale}, known locales are: #{t.keys.join(' ')}" }
  Tinycus::Tr.new(from,to)
end
575
+
576
# Returns a Tinycus::Tr object which can then be used to act on strings using the apply()
# method. Gives a form that can be alphabetized properly: accents, breathings, and iota
# subscripts are dropped, Greek capitals are lowercased, and final sigma becomes σ.
# The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The
# object constructed with this locale will also produce correct results for most
# Latin-script words, will miss some cases like Czech, and will not handle Cyrillic.
# Raises RuntimeError for an unknown locale.
def Tr.collation_form(locale)
  # Each pair is [characters to replace, their replacements], kept side by side so the
  # alignment can be checked by eye; the pairs are joined into the two tables for Tr.new.
  el_pairs = [
    ["ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝ", "aaaaaaæceeeeiiiinoooooouuuuy"],
    ["àáâãäåæçèéêëìíîïñòóôõöøùúûüýÿ", "aaaaaaæceeeeiiiinoooooouuuuyy"],
    ["ΆΈΊΌΐάέήίΰϊϋόύώỏ", "αειοιαεηιυιυουωo"],
    ["ἀἁἂἃἄἅἆἈἉἊἌἍἎ", "α"*13],
    ["ἐἑἒἓἔἕἘἙἜἝ", "ε"*10],
    ["ἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯ", "η"*15],
    ["ἰἱἲἳἴἵἶἷἸἹἼἽἾ", "ι"*13],
    ["ὀὁὂὃὄὅὈὉὊὋὌὍ", "ο"*12],
    ["ὐὑὓὔὕὖὗὙὝ", "υ"*9],
    ["ὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯ", "ω"*15],
    ["ὰὲὴὶὸὺὼ", "αεηιουω"],
    ["ᾐᾑᾓᾔᾕᾖᾗ", "η"*7],
    ["ᾠᾤᾦᾧ", "ω"*4],
    ["ᾰᾱᾳᾴᾶᾷᾸᾹ", "α"*8],
    ["ῂῃῄῆῇ", "η"*5],
    ["ῐῑῒῖῗῘῙ", "ι"*7],
    ["ῠῡῢῥῦῨῩῬ", "υυυρυυυρ"],
    ["ῳῴῶῷ", "ω"*4],
    ["Ὸῤ", "ορ"],
    ["ᾆᾄᾂᾁᾇᾅᾃ", "α"*7],
    ["ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩς", "αβγδεζηθικλμνξοπρστυφχψωσ"],
    # This entry used to map to the capital letter. Every other key maps to lowercase, so a
    # word starting with ᾍ compared as "less than" every lowercase α- word no matter what
    # followed, and never compared equal to its lowercase spelling.
    ["ᾍ", "α"],
  ]
  t = {
    "el"=>[el_pairs.map(&:first).join, el_pairs.map(&:last).join]
  }
  tables = t[locale]
  if tables.nil? then raise "unknown locale: #{locale}, known locales are: #{t.keys.join(' ')}" end
  return Tinycus::Tr.new(tables[0],tables[1])
end
592
+
593
# Returns the character c with its breathing set to what: c is taken apart with
# Tinycus.disassemble_greek_char, the 'breathing' entry is overwritten, and the result is
# put back together with Tinycus.assemble_greek_char.
def Tr.add_breathing_to_character(c,what)
  base,marks = Tinycus.disassemble_greek_char(c)
  marks['breathing'] = what
  Tinycus.assemble_greek_char(base,marks)
end
598
+
599
+
600
+
601
# Runs self_test on the accent-removal table of every known locale, printing a line for
# each one that passes. To execute this, do a "make test_tr".
def Tr.run_tests
  ['el'].each do |locale|
    tr = Tinycus::Tr.remove_accents(locale)
    # For 'el' the output must consist of Greek and Latin letters plus Æ and æ only.
    alphabet = (locale=='el') ? Script.alphabet('greek')+Script.alphabet('latin')+'Ææ' : nil
    tr.self_test(alphabet)
    print "Passed self-test on locale #{locale}.\n"
  end
end
610
+
611
+
612
+ end # class Tr
613
+
614
# Alphabets of the scripts Tinycus knows about.
class Tinycus::Script

  # Returns the alphabet of a script as a single string of letters.
  # script -- 'latin', 'greek', or 'hebrew'
  # c -- 'both' (lowercase followed by uppercase), 'lowercase', or 'uppercase';
  #      ignored for scripts that don't have case
  # Raises RuntimeError if the script is unknown.
  def Script.alphabet(script,c:'both')
    t = {
      'latin'=>{'has_case'=>true},
      'greek'=>{'has_case'=>true},
      'hebrew'=>{'has_case'=>false},
    }
    data = t[script]
    if data.nil? then raise "unknown script: #{script}, possible values are: #{t.keys.join(' ')}" end
    has_case = data['has_case']
    if !has_case then return Script.alphabet_helper(script,nil) end
    if c=='both' then return Script.alphabet(script,c:"lowercase")+Script.alphabet(script,c:"uppercase") end
    # If we fall through to here, then we're doing a single case of an alphabet that has two cases.
    if c=='lowercase' then return Script.alphabet_helper(script,true) end
    if c=='uppercase' then return Script.alphabet_helper(script,false).upcase end
    # NOTE(review): die is not defined in this class; confirm that it exists elsewhere,
    # otherwise this line fails with NoMethodError instead of reporting the message.
    die("illegal value of c=#{c}, must be both, lowercase, or uppercase")
  end

  # Returns the lowercase letters of a script (or all letters, for a script without case).
  # include_lc_only_chars -- if true, characters that exist only in lowercase (Greek
  #                          final sigma) are appended
  # Raises RuntimeError if no alphabet is available for the script.
  def Script.alphabet_helper(script,include_lc_only_chars)
    if script=='latin' then return 'abcdefghijklmnopqrstuvwxyz' end
    if script=='greek' then
      result = 'αβγδεζηθικλμνξοπρστυφχψω'
      if include_lc_only_chars then result = result+'ς' end
      return result.unicode_normalize(:nfc)
    end
    # This used to test self.name, which is the name of this class and never 'hebrew', so
    # asking for the Hebrew alphabet always ended in the raise below. The list was also
    # missing kaf and final kaf, which are now included.
    if script=='hebrew' then return 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'.unicode_normalize(:nfc) end
    # ... Word-final forms are all at the end.
    # To edit the Hebrew list, use mg, not emacs. Emacs tries to be smart about RTL but freaks out and gets it
    # wrong on a line that mixes RTL and LTR.
    raise "no alphabet available for script #{script}"
  end

end
651
+
652
+ class Tinycus::Cleanup
653
+
654
# Cleans up a string of Greek text from an external data source, word by word.
# Designed for sources that can have all kinds of nasty crap in them. Slow, thorough,
# silent, and brutal. Whitespace between words is preserved as is.
# allow_latin, clean_perseus, standardize_punctuation -- handed on to
#   clean_up_grotty_greek_one_word; standardize_punctuation additionally triggers a pass
#   over the whole reassembled string
def Cleanup.clean_up_grotty_greek(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
  greek_letter = /[α-ωΑ-Ως]/
  # Splitting on a captured group keeps the separators: even indices are words, odd indices are whitespace.
  pieces = s.split(/(\s+)/).each_with_index.map do |w,i|
    next w if i.odd?
    # For speed and reliability: if it contains Latin letters, it shouldn't be a Greek word.
    next w if w=~/[a-zA-Z]/
    # Otherwise it counts as Greek if it has a plain Greek letter, either as written or
    # once normalized and stripped of accents.
    looks_greek = (w=~greek_letter) ||
                  (Tinycus::Tr.remove_accents_from_greek(w.unicode_normalize(:nfc))=~greek_letter)
    next w unless looks_greek
    Cleanup.clean_up_grotty_greek_one_word(w,allow_latin:allow_latin,clean_perseus:clean_perseus,standardize_punctuation:standardize_punctuation)
  end
  s = pieces.join('')
  s = Cleanup.standardize_greek_punctuation(s) if standardize_punctuation
  # NFC does stuff like changing deprecated 8059 (upsilon with oxia) to 973 (upsilon with tonos).
  s.unicode_normalize(:nfc)
end
678
+
679
# Cleans up a single Greek word and returns the result.
# allow_latin -- if false, raises RuntimeError when the cleaned word still contains a
#                character outside the expected set
# clean_perseus -- if true, strips artifacts seen in Perseus data (brackets, braces,
#                  hyphens, a garbled '&apos;')
# standardize_punctuation -- if true, runs standardize_greek_punctuation on the word
def Cleanup.clean_up_grotty_greek_one_word(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
  s = s.unicode_normalize(:nfc)
  s = Cleanup.clean_up_greek_combining_characters(s,allow_latin:allow_latin)
  # In Perseus's Polybius, they have bracketed text sometimes. In their system, this should probably be a separate punctuation token.
  if clean_perseus
    # Only the first occurrence of each is removed.
    # [ and ]: Hesiod, δεσμὸ]ν; also occurs sometimes like [word]
    # { and }: Hesiod, Ι{ππώ
    [/\[/, /\]/, /^\(/, /\)$/, /\}/, /\{/].each { |bracket| s = s.sub(bracket,'') }
    s = s.sub(/&?απο[σς];/,"᾽") # software bug in perseus, is '&apos;' transliterated into Greek
    s = s.sub(/\-$/,"᾽") # e.g., in Thucydides, Perseus has δοκεῖ δέ μοι, οὐδὲ τοὔνομα with τοὔνομα written as τ- οὔνομα
    s = s.sub(/\-/,'') # e.g., in Thucydides, Perseus has ἀντίσχουσαν lemmatized as ἀντί-ἴσχω
  end
  s = s.sub(/σ$/,'ς') # this won't work if there's trailing punct; what is the right way to spell a word that ends with a sigma, then an elision mark?
  s = Cleanup.clean_up_greek_beta_code(s)
  # A leading koronis happens in perseus for the lemma ἀθήνη, which they have encoded as
  # 787 7936 952 ..., i.e., the breathing mark is there twice, once as a combining comma
  # above and once as part of the composed character ἀ
  # https://github.com/PerseusDL/treebank_data/issues/37
  greek_koronis = [8125].pack('U')
  s = s[1..-1] if s.start_with?(greek_koronis)
  if standardize_punctuation then s = Cleanup.standardize_greek_punctuation(s) end
  unexpected = /[^[:alpha:]᾽[0-9]\?;,.··«»’᾽—“”]/
  if !allow_latin && s=~unexpected
    raise "word #{s} contains unexpected characters; unicode=#{s.chars.map { |x| x.ord}}"
  end
  s
end
707
+
708
# Repairs misused combining characters and a few one-off encoding errors found in Perseus
# texts, and returns the repaired string. Each repair is applied to the first occurrence only.
# allow_latin -- accepted for symmetry with the other Cleanup methods; not used here
def Cleanup.clean_up_greek_combining_characters(s,allow_latin:false)
  combining_comma_above = [787].pack('U')
  combining_acute_accent = [769].pack('U')
  greek_koronis = [8125].pack('U')
  # s = s.sub(/α#{combining_comma_above}#{combining_acute_accent}/,'ἄ') # my error introduced in Lemming; no longer needed
  s = s.sub(/#{combining_comma_above}/,greek_koronis)
  # ... mistaken use of combining comma above rather than the spacing version
  # https://github.com/PerseusDL/treebank_data/issues/31
  s = s.sub(/#{combining_acute_accent}/,'') # occurs once in Perseus's Plutarch, in a place where it's redundant
  s = s.sub(/#{[788].pack('U')}/,'') # combining reversed comma above; occurs once in Perseus's Polybius, where it's on a capital Ι
  # seeming one-off errors in perseus:
  # The template has to be 'U*' for both code points to be packed. With plain 'U' only the
  # first one was, so the pattern matched the stray mark alone, the replacement landed next
  # to the letter that was already there, and the result was a doubled letter.
  s2 = s
  s2 = s2.sub(/#{[8158, 7973].pack('U*')}/,"ἥ") # dasia and oxia combining char with eta
  s2 = s2.sub(/#{[8142, 7940].pack('U*')}/,"ἄ") # psili and oxia combining char with alpha
  s2 = s2.sub(/#{[8142, 7988].pack('U*')}/,"ἴ")
  # These two patched up the doubling described above; kept in case doubled letters still occur in the data.
  s2 = s2.sub(/ἄἄ/,'ἄ')
  s2 = s2.sub(/ἥἥ/,'ἥ')
  s2 = s2.sub(/#{[769].pack('U')}([μτ])/) {$1} # accent on a mu or tau, obvious error
  s2 = s2.sub(/#{[769].pack('U')}ε/) {'έ'}
  s2 = s2.sub(/#{[180].pack('U')}([κ])/) {$1} # accent on a kappa, obvious error
  s2 = s2.sub(/#{[834].pack('U')}/,'') # what the heck is this?
  s2 = s2.sub(/ʽ([ἁἑἱὁὑἡὡ])/) {$1} # redundant rough breathing mark
  # another repeating error:
  s2 = s2.sub(/(?<=[[:alpha:]][[:alpha:]])([ἀἐἰὀὐἠὠ])(?![[:alpha:]])/) { $1.tr("ἀἐἰὀὐἠὠ","αειουηω")+"᾽" }
  # ... smooth breathing on the last character of a long word; this is a mistake in representation of elision
  # https://github.com/PerseusDL/treebank_data/issues/31
  s = s2
  return s
end
737
+
738
def Cleanup.clean_up_greek_beta_code(s)
  # Converts leftover beta-code notation in s to Unicode Greek and returns the result as a new string.
  # Written for the old beta-code version of Project Perseus; even Perseus 2.1 still lets some
  # beta-code-like text through, e.g., ἀργει~ος. https://github.com/PerseusDL/treebank_data/issues/30
  # Each rule is applied with sub, i.e., to its first occurrence only.
  vowels = "αειουηω"
  out = s.sub(/\((.)/) { Regexp.last_match(1).tr(vowels,"ἁἑἱὁὑἡὡ") }    # ( before a vowel: rough breathing
  out = out.sub(/\)(.)/) { Regexp.last_match(1).tr(vowels,"ἀἐἰὀὐἠὠ") }  # ) before a vowel: smooth breathing
  out = out.sub(/(.)~/) { Regexp.last_match(1).tr("αιυηω","ᾶῖῦῆῶ") }    # ~ after a vowel: circumflex
  out = out.sub(/\|/,'ϊ')
  out = out.sub(%r{/(.)}) { Regexp.last_match(1).tr(vowels,"άέίόύήώ") } # / before a vowel: acute
  # One-off repairs of specific strings:
  out = out.sub(/&θυοτ;/,'')
  out = out.sub(/θεοισ=ν/,'θεοῖσιν')
  out = out.sub(/ὀ=νοψ1/,'οἴνοπα1')
  out.sub(/π=ας/,'πᾶς')
end
753
+
754
def Cleanup.standardize_greek_punctuation(s)
  # Standardizes the elision character and the middle dot (ano teleia) throughout s, which may be any
  # string, not just a single word. Returns a new string.
  #
  # Perseus and Monro/Allen write ρ with a breathing mark instead of ρ᾽ when there is elision. The
  # elision mark has to be reinserted, or the information needed for accurate lemmatization is lost
  # (cf. the Spelling module).
  out = s.gsub(/(?<=[[:alpha:]])[ῤῥ](?![[:alpha:]])/,'ρ᾽')
  # Wikisource has ῤῥ in the middle of words, e.g., χείμαῤῥοι, which OCT and Perseus don't have.
  out = out.gsub(/(?<=[[:alpha:]])ῤῥ(?=[[:alpha:]])/,'ρρ')
  # One elision character. Other variants exist (see the comments in contains_greek_elision), but
  # those should already have been taken care of in Lemming.
  out = out.gsub(/[᾽’'](?![[:alpha:]])/,'᾽')
  # Ano teleia has two code points, B7=183 and 387=903; GFS Porson and Olga only have the latter.
  out.tr([183].pack('U'),[903].pack('U'))
end
767
+
768
+ end
769
+
770
+ class Tinycus::Util
771
def Util.is_valid_utf8(s)
  # Returns true if the bytes of s form valid UTF-8, false otherwise. s itself is not modified.
  # dup is used rather than clone because clone preserves the frozen state, and force_encoding
  # raises FrozenError on a frozen string.
  return s.dup.force_encoding("UTF-8").valid_encoding?
end
774
+
775
def Util.explain_how_strings_differ(x,y)
  # Builds a human-readable, newline-terminated report on how the strings x and y differ:
  # either that they are equal, that their lengths differ, or every position at which they differ
  # together with the code points of the two characters.
  report = ["comparing '#{x}' to '#{y}'"]
  if x==y
    report << " strings are equal"
  elsif x.length!=y.length
    report << " strings differ in length, #{x.length} and #{y.length}"
  else
    x.each_char.with_index do |cx,i|
      next if cx==y[i]
      report << " strings differ at position #{i}, #{cx}!=#{y[i]}, codes are #{cx.ord} and #{y[i].ord}"
    end
  end
  report.map { |line| "#{line}\n" }.join('')
end
793
+
794
def Util.explain_chars_in_string(s)
  # Returns a newline-terminated listing of every character of s with its index and code point.
  header = "explaining the characters in the string '#{s}':"
  rows = s.each_char.with_index.map { |ch,i| " [#{i}] = '#{ch}', code = #{ch.ord}" }
  ([header]+rows).map { |line| "#{line}\n" }.join('')
end
802
+
803
def Util.longest_common_initial_substring(aa,bb,script)
  # Returns the longest string that both aa and bb start with.
  # When script is 'greek', a final sigma (ς) at the end of either input is treated as a normal sigma (σ)
  # for the comparison, and a sigma at the end of the result is written as final sigma.
  longer,shorter = aa.length<bb.length ? [bb,aa] : [aa,bb]
  greek = (script=='greek')
  if greek
    longer = longer.gsub(/ς$/,'σ')
    shorter = shorter.gsub(/ς$/,'σ')
  end
  # Count how many leading characters agree; the shorter string bounds the count.
  shared = 0
  shared += 1 while shared<shorter.length && longer[shared]==shorter[shared]
  prefix = longer[0,shared]
  greek ? prefix.sub(/σ$/,'ς') : prefix
end
816
+
817
def Util.words(s)
  # Returns the words of s as an array, dropping whitespace and interword punctuation.
  # A word is a run of alphabetic characters, the English apostrophes and the Greek elision character.
  # Not meant for word-by-word running hashes; split_string_at_whitespace() exists for that.
  # For a better-engineered version of this, see genos.string_to_words().
  word_pattern = /[[:alpha:]'’᾽]+/
  s.scan(word_pattern)
end
824
+
825
def Util.strip_whitespace(s)
  # Returns a copy of s without leading and trailing whitespace.
  # \A and \z anchor to the start and end of the whole string. The line anchors ^ and $ would instead
  # match at the first line boundary that has whitespace next to it, so in a multi-line string they
  # would remove interior whitespace.
  return s.sub(/\A\s+/,'').sub(/\s+\z/,'')
end
829
+
830
def Util.split_string_at_whitespace(text)
  # Returns an array of [word, whitespace] pairs such as [['The',' '],['quick',' '],...]; every element
  # is a two-element list. In the final pair the whitespace is a null string if the text doesn't end
  # with whitespace.
  # Intended for simple, reproducible word-by-word hashing (WhereAt.auto_hash), not for human-readable
  # text processing, so its output must stay stable. For extracting words without punctuation, see words().
  pieces = text.split(/(\s+)/)
  pieces << '' if pieces.length.odd?
  pieces.each_slice(2).to_a
end
842
+
843
def Util.split_string_into_paragraphs(text)
  # Returns a flat list alternating paragraphs and the delimiters between them, e.g.
  # ["This is a paragraph.","\n\n","Another paragraph.","\n \n\t\n",...].
  # Even indices hold paragraphs, odd indices hold delimiters. The list always has even length, so the
  # final element is a null string when the input doesn't end with a delimiter.
  # Like split_string_at_whitespace(), this is meant for reproducible creation of hashes.
  chunks = text.split(/(\s*(?:\n[ \t]*){2,}\s*)/)
  chunks.length.odd? ? chunks+[''] : chunks
end
851
+
852
def Util.substr(x,i,len)
  # Returns the len characters of x starting at index i, built one character at a time.
  # Basically x[i..(i+len-1)], but well-behaved in cases like i=0, len=0, where it returns a null string.
  (i...(i+len)).reduce('') { |collected,m| collected+x[m] }
end
860
+
861
def Util.texify_quotes(s)
  # Converts straight ASCII quotation marks in s to TeX-style quotes (` ' `` '') and returns the result.
  # Works in three phases: protect apostrophes inside English words, replace matched quote pairs by
  # named markers, then replace the markers by their TeX spelling.
  #
  # Phase 1. Only a-zA-Z is used here rather than [[:alpha:]], because Greek doesn't use mid-word
  # apostrophes, and elision marked by an ASCII apostrophe must not be mistaken for one.
  text = s.gsub(/((?<=[a-zA-Z]))'(?=[a-zA-Z])/,'__ENGLISH_INTERNAL_APOSTROPHE__')
  # Phase 2. Nested quotes are handled from the inside out, up to three levels deep. The negative
  # lookbehind and lookahead keep quotes that touch a letter from being paired up.
  3.times do
    { %q(') => 'SINGLE', %q(") => 'DOUBLE' }.each do |char,kind|
      text = text.gsub(/(?<![[:alpha:]])#{char}([^'"]+)#{char}(?![[:alpha:]])/) do
        "__OPEN_#{kind}_QUOTES__#{Regexp.last_match(1)}__CLOSE_#{kind}_QUOTES__"
      end
    end
  end
  # Phase 3.
  tex_for_marker = {
    '__OPEN_SINGLE_QUOTES__' => %q(`),
    '__CLOSE_SINGLE_QUOTES__' => %q('),
    '__OPEN_DOUBLE_QUOTES__' => %q(``),
    '__CLOSE_DOUBLE_QUOTES__' => %q(''),
    '__ENGLISH_INTERNAL_APOSTROPHE__' => %q(')
  }
  tex_for_marker.each { |marker,tex| text = text.gsub(/#{marker}/,tex) }
  text
end
881
+
882
def Util.canonicalize_greek_word(w,n:false)
  # Puts a single Greek word (not an entire string) into canonical form: at most one accent, and
  # standardized elision/punctuation characters.
  # A word that doesn't look Greek is returned unchanged, so calling this on Latin text is fast and harmless.
  # The n argument has the same definition as in Tinycus::Tr.remove_accents_from_greek().
  return w unless looks_greek(w) # is fast on Latin script
  word = n ? w.unicode_normalize(:nfc) : w
  Tinycus::Cleanup.standardize_greek_punctuation(to_single_accent(word))
end
892
+
893
def Util.looks_greek(w,depth:0)
  # Returns true if w appears to be written in Greek script. Designed to be fast:
  # any a-z/A-Z character means "not Greek", any unaccented Greek letter means "Greek".
  # If neither is found, the accents are stripped and the test is retried; depth counts those retries.
  return false if w==''
  return false if w.match?(/[a-zA-Z]/)
  return true if w.match?(/[α-ωΑ-Ως]/)
  return false if depth>=2 # happens if the string contains only punctuation, etc.
  looks_greek(Tinycus::Tr.remove_accents_from_greek(w),depth:depth+1) # slow fallback, almost never needed
end
901
+
902
def Util.mixes_scripts(s)
  # Returns true if s contains both Latin letters (a-z, A-Z) and Greek letters.
  # Not designed to be super fast or super accurate, just a quick check.
  return false if s==''
  has_latin = !!(s=~/[a-zA-Z]/)
  has_greek = !!(Tinycus::Tr.remove_accents_from_greek(s)=~/[α-ωΑ-Ως]/)
  has_latin && has_greek
end
909
+
910
def Util.to_single_accent(w,grave_to_acute:true,n:false)
  # Reduces the acute/grave accents of the word w to a single one and returns the result.
  #   - no accents at all: w is returned as is;
  #   - a circumflex is present: all acutes and graves are removed;
  #   - more than one acute/grave: every one but the first is removed;
  #   - otherwise: a grave is changed to an acute, unless grave_to_acute is false.
  # In most cases it's better to use canonicalize_greek_word() rather than this.
  # This is used e.g. in LemmaUtil.make_inflected_form_flavored_like_lemma.
  # Testing: ruby -e "require './lib/string_util'; print to_single_accent('χεῖράς')"
  # The n argument has the same definition as in Tinycus::Tr.remove_accents_from_greek().
  return w if Tinycus::Tr.remove_accents_from_greek(w,n:n)==w # for efficiency
  return Tinycus::Tr.remove_acute_and_grave_from_greek(w) if Tinycus::MiscGreek.has_circumflex(w)
  stripped = Tinycus::Tr.remove_acute_and_grave_from_greek(w)
  # Indices of the characters that carry an acute or grave.
  accented_at = (0...w.chars.length).select { |i| stripped[i]!=w[i] }
  if accented_at.length>1
    result = w.dup
    accented_at.drop(1).each do |i|
      result[i] = Tinycus::Tr.remove_acute_and_grave_from_greek(result[i])
    end
    return result
  end
  grave_to_acute ? Tinycus::Tr.greek_grave_to_acute(w) : w
end
939
+
940
def Util.remove_punctuation(s)
  # Returns s with every non-alphabetic character removed; this includes whitespace and digits
  # as well as punctuation.
  non_letters = /[^[:alpha:]]+/
  s.gsub(non_letters,'')
end
944
+
945
def Util.safe_normalize(s)
  # Returns s transcoded to UTF-8 and normalized to NFC.
  # If that raises (the input is probably 8-bit ascii/ISO-8859-1), s is returned unchanged.
  s.encode("UTF-8").unicode_normalize(:nfc)
rescue
  s
end
953
+
954
+
955
def Util.lc_underbar(s)
  # Returns s in lowercase with every space replaced by an underscore.
  s.downcase.tr(' ','_')
end
958
+
959
def Util.clean_up_greek(s,thorough:false,allow_latin:false,strip_punctuation:false)
  # Cleans up Greek text in s, which can contain any script or mix of scripts and more than one word.
  # thorough          - run the full Tinycus::Cleanup.clean_up_grotty_greek pass; meant for external
  #                     sources like raw Perseus xml files. This option is slow.
  # allow_latin       - passed on to clean_up_grotty_greek when thorough is set.
  # strip_punctuation - first strip punctuation that shouldn't be in a word (the elision mark is kept);
  #                     needed because PROIEL Herodotus has a few errors where punct is included in a word.
  # As a convenience, anything that is not a String is returned unchanged.
  return s unless s.kind_of?(String)
  text = strip_punctuation ? s.gsub(/[·,;«».]/,'') : s
  if thorough
    Tinycus::Cleanup.clean_up_grotty_greek(text,allow_latin:allow_latin) # standardizes punctuation by default
  else
    Tinycus::Cleanup.standardize_greek_punctuation(text) # elision character and middle dot/ano teleia
  end
end
973
+
974
def Util.contains_greek_elision(s)
  # Returns true if s shows Greek elision: either it contains an elision character, or it is at least
  # two characters long and ends in rho with a breathing mark.
  # The elision characters tested are koronis (8125=1fbd hex) and apostrophe (8217=2019 hex);
  # see http://www.opoudjis.net/unicode/gkdiacritics.html
  # Perseus sometimes has 787=313 hex, which is combining comma above, the non-spacing version of
  # koronis. This seems to be a mistake on their part:
  # https://github.com/PerseusDL/treebank_data/issues/31
  # One could also have 700=2bc hex, spacing smooth breathing, which seems like an error, or 39=27 hex,
  # the ascii apostrophe. None of these three is detected here.
  # The checks are ordered so as to give best performance.
  return false unless s=~/[᾽’ῤῥ]/
  return true if s=~/[᾽’]/
  (s.length>=2 && s=~/[ῤῥ]$/) ? true : false
end
987
+
988
def Util.escape_double_quotes(s)
  # Returns s with a backslash inserted before every double quote. Existing backslashes are left alone.
  escaped_quote = '\\"' # two characters: backslash, double quote
  s.gsub('"') { escaped_quote }
end
991
+
992
def Util.reverse_if_rtl(s)
  # Returns s reversed if its first character is right-to-left according to char_is_rtl, and s itself
  # otherwise. A null string is returned as is.
  # NOTE(review): char_is_rtl is not defined in this part of the file; presumably it is provided
  # elsewhere. Confirm.
  return s if s==''
  char_is_rtl(s[0]) ? Util.reverse_string(s) : s
end
996
+
997
def Util.reverse_string(s)
  # Returns a new string containing the characters of s in reverse order; s is not modified.
  # The earlier implementation accumulated into the Integer 0, which raised TypeError on the first
  # character and returned 0 for a null string.
  return s.reverse
end
1002
+
1003
def Util.console(*x)
  # Prints all of its arguments to standard error, without adding a newline.
  $stderr.print(*x)
end
1006
+
1007
+ end
1008
+
1009
+ # The following table is output by scripts/generate_beta_code_tables.rb in Ifthimos.
1010
+ @@beta_code_conversion_json =
1011
+ <<-'JSON'
1012
+ [{"ἄ":"a)/","ἄ̄":"a)/&","ἄ̆":"a)/'","ἄ̄̆":"a)/&","ὰ":"a\\","ὰ̄":"a\\'","ὰ̆":"a\\'","ὰ̄̆":"a\\'","ά":"a/","ά̄":"a/&","ά̆":"a/'","ά̄̆":"a/&","Ἀ":"*)a","Ἀ̄":"*ā","Ἀ̆":"*ă","Ἀ̄̆":"*ā̆","α":"a","ᾱ":"a&","ᾰ":"a'","ᾱ̆":"a&'","ᾶ":"a=","ᾶ̄":"ā","ᾶ̆":"ă","ᾶ̄̆":"ā̆","ἀ":"a)","ἀ̄":"a)&","ἀ̆":"a)'","ἀ̄̆":"a)&","ἁ":"a(","ἁ̄":"a(&","ἁ̆":"a('","ἁ̄̆":"a(&","Ἄ":"*)/a","Ἄ̄":"*ā","Ἄ̆":"*ă","Ἄ̄̆":"*ā̆","ἂ":"a)\\","ἂ̄":"a)\\'","ἂ̆":"a)\\'","ἂ̄̆":"a)\\'","ᾷ":"a=|","ᾷ̄":"ā","ᾷ̆":"ă","ᾷ̄̆":"ā̆","Α":"*a","Ᾱ":"*a&","Ᾰ":"*a'","Ᾱ̆":"*a&'","ἅ":"a(/","ἅ̄":"a(/&","ἅ̆":"a(/'","ἅ̄̆":"a(/&","ἆ":"a)=","ἆ̄":"ā","ἆ̆":"ă","ἆ̄̆":"ā̆","ᾳ":"a|","ᾱͅ":"a|&","ᾰͅ":"a|'","ᾱ̆ͅ":"a|&'","Ἅ":"*(/a","Ἅ̄":"*ā","Ἅ̆":"*ă","Ἅ̄̆":"*ā̆","Ἁ":"*(a","Ἁ̄":"*ā","Ἁ̆":"*ă","Ἁ̄̆":"*ā̆","Ἆ":"*)=a","Ἆ̄":"*ā","Ἆ̆":"*ă","Ἆ̄̆":"*ā̆","ἃ":"a(\\","ἃ̄":"a(\\'","ἃ̆":"a(\\'","ἃ̄̆":"a(\\'","Ἂ":"*)\\a","Ἂ̄":"*ā","Ἂ̆":"*ă","Ἂ̄̆":"*ā̆","ᾇ":"a(=|","ᾇ̄":"ā","ᾇ̆":"ă","ᾇ̄̆":"ā̆","β":"b","Β":"*b","γ":"g","Γ":"*g","δ":"d","Δ":"*d","ε":"e","έ":"e/","ἔ":"e)/","ὲ":"e\\","ἑ":"e(","ἐ":"e)","ἕ":"e(/","Ἕ":"*(/e","Ἐ":"*)e","Ε":"*e","Ἑ":"*(e","Ἔ":"*)/e","ἓ":"e(\\","ἒ":"e)\\","ζ":"z","Ζ":"*z","ῆ":"h=","η":"h","ἣ":"h(\\","ἡ":"h(","ή":"h/","ὴ":"h\\","ἠ":"h)","ἦ":"h)=","ῃ":"h|","ἥ":"h(/","ἢ":"h)\\","ἤ":"h)/","ῇ":"h=|","Ἥ":"*(/h","ᾔ":"h)/|","ἧ":"h(=","ᾗ":"h(=|","Ἠ":"*)h","ῄ":"h/|","ᾕ":"h(/|","Ἡ":"*(h","Ἤ":"*)/h","ῂ":"h\\|","ᾐ":"h)|","ᾖ":"h)=|","ᾑ":"h(|","Η":"*h","Ἦ":"*)=h","Ἣ":"*(\\h","Ἧ":"*(=h","θ":"q","Θ":"*q","ι":"i","ῑ":"i&","ῐ":"i'","ῑ̆":"i&'","ϊ":"i+","ϊ̄":"i+&","ϊ̆":"ĭ","ϊ̄̆":"ī̆&","ί":"i/","ί̄":"i/&","ί̆":"i/'","ί̄̆":"i/&","ῖ":"i=","ῖ̄":"ī","ῖ̆":"ĭ","ῖ̄̆":"ī̆","ἰ":"i)","ἰ̄":"i)&","ἰ̆":"i)'","ἰ̄̆":"i)&","ΐ":"i/+","ΐ̄":"i/+'","ΐ̆":"i/+'","ΐ̄̆":"i/+'","ὶ":"i\\","ὶ̄":"i\\'","ὶ̆":"i\\'","ὶ̄̆":"i\\'","ἱ":"i(","ἱ̄":"i(&","ἱ̆":"i('","ἱ̄̆":"i(&","ἴ":"i)/","ἴ̄":"i)/&","ἴ̆":"i)/'","ἴ̄̆":"i)/&","ἶ":"i)=","ἶ̄":"ī","ἶ̆":"ĭ","ἶ̄̆":"ī̆","ἳ":"i(\\","ἳ̄":"i(\\'","ἳ̆":"i(\\'","ἳ̄̆":"i(\\'","Ἴ":"*)/i","Ἴ̄":"*ī","Ἴ̆":"*ĭ","Ἴ̄̆":"*ī̆","ῒ":"i\\+","ῒ̄":"i\\+'","ῒ̆":"i\\+'","ῒ̄̆"
:"i\\+'","ἵ":"i(/","ἵ̄":"i(/&","ἵ̆":"i(/'","ἵ̄̆":"i(/&","ἷ":"i(=","ἷ̄":"ī","ἷ̆":"ĭ","ἷ̄̆":"ī̆","Ἰ":"*)i","Ἰ̄":"*ī","Ἰ̆":"*ĭ","Ἰ̄̆":"*ī̆","ἲ":"i)\\","ἲ̄":"i)\\'","ἲ̆":"i)\\'","ἲ̄̆":"i)\\'","Ἱ":"*(i","Ἱ̄":"*ī","Ἱ̆":"*ĭ","Ἱ̄̆":"*ī̆","Ἶ":"*)=i","Ἶ̄":"*ī","Ἶ̆":"*ĭ","Ἶ̄̆":"*ī̆","ῗ":"i=+","ῗ̄":"ī","ῗ̆":"ĭ","ῗ̄̆":"ī̆","Ι":"*i","Ῑ":"*i&","Ῐ":"*i'","Ῑ̆":"*i&'","Ἵ":"*(/i","Ἵ̄":"*ī","Ἵ̆":"*ĭ","Ἵ̄̆":"*ī̆","κ":"k","Κ":"*k","λ":"l","Λ":"*l","μ":"m","Μ":"*m","ν":"n","Ν":"*n","ξ":"c","Ξ":"*c","ο":"o","ὸ":"o\\","ό":"o/","ὃ":"o(\\","ὄ":"o)/","ὀ":"o)","Ὀ":"*)o","Ο":"*o","ὅ":"o(/","ὁ":"o(","Ὄ":"*)/o","Ὅ":"*(/o","Ὃ":"*(\\o","Ὁ":"*(o","Π":"*p","π":"p","ρ":"r","ῥ":"r(","Ῥ":"*(r","ῤ":"r)","Ρ":"*r","ς":"j","σ":"s","Σ":"*s","τ":"t","Τ":"*t","ὐ":"u)","ὐ̄":"u)&","ὐ̆":"u)'","ὐ̄̆":"u)&","υ":"u","ῡ":"u&","ῠ":"u'","ῡ̆":"u&'","ὺ":"u\\","ὺ̄":"u\\'","ὺ̆":"u\\'","ὺ̄̆":"u\\'","ῦ":"u=","ῦ̄":"ū","ῦ̆":"ŭ","ῦ̄̆":"ū̆","ύ":"u/","ύ̄":"u/&","ύ̆":"u/'","ύ̄̆":"u/&","ὗ":"u(=","ὗ̄":"ū","ὗ̆":"ŭ","ὗ̄̆":"ū̆","ὕ":"u(/","ὕ̄":"u(/&","ὕ̆":"u(/'","ὕ̄̆":"u(/&","ϋ":"u+","ϋ̄":"ū","ϋ̆":"ŭ","ϋ̄̆":"ū̆","ὑ":"u(","ὑ̄":"u(&","ὑ̆":"u('","ὑ̄̆":"u(&","ὖ":"u)=","ὖ̄":"ū","ὖ̆":"ŭ","ὖ̄̆":"ū̆","ΰ":"u/+","ΰ̄":"ū","ΰ̆":"ŭ","ΰ̄̆":"ū̆","ὔ":"u)/","ὔ̄":"u)/&","ὔ̆":"u)/'","ὔ̄̆":"u)/&","ῢ":"u\\+","ῢ̄":"ū","ῢ̆":"ŭ","ῢ̄̆":"ū̆","ὓ":"u(\\","ὓ̄":"u(\\'","ὓ̆":"u(\\'","ὓ̄̆":"u(\\'","Ὑ":"*(u","Ὑ̄":"*ū","Ὑ̆":"*ŭ","Ὑ̄̆":"*ū̆","Ὕ":"*(/u","Ὕ̄":"*ū","Ὕ̆":"*ŭ","Ὕ̄̆":"*ū̆","Υ":"*u","Ῡ":"*u&","Ῠ":"*u'","Ῡ̆":"*u&'","φ":"f","Φ":"*f","χ":"x","Χ":"*x","ψ":"y","Ψ":"*y","ω":"w","ώ":"w/","ῶ":"w=","ῳ":"w|","ῷ":"w=|","ὼ":"w\\","ὥ":"w(/","ὣ":"w(\\","ὤ":"w)/","ῴ":"w/|","ᾤ":"w)/|","ὠ":"w)","ὦ":"w)=","ᾧ":"w(=|","ὡ":"w(","ὧ":"w(=","ᾠ":"w)|","Ὠ":"*)w","ὢ":"w)\\","Ὤ":"*)/w","Ὦ":"*)=w","Ὧ":"*(=w","ᾦ":"w)=|","Ω":"*w","Ὥ":"*(/w","Ὣ":"*(\\w","Ὡ":"*(w","Ϝ":"*v","ϝ":"v"},{"a)/":"ἄ","a)/&":"ἄ̄̆","a)/'":"ἄ̆","a\\":"ὰ","a\\'":"ὰ̄̆","a/":"ά","a/&":"ά̄̆","a/'":"ά̆","*)a":"Ἀ","*ā":"Ἂ̄","*ă":"Ἂ̆","*ā̆":"Ἂ̄̆","a":"α","a&":"ᾱ","a'":"ᾰ","a&'":"ᾱ̆","a=":"ᾶ","ā":"ᾇ̄","ă":"ᾇ̆","ā̆":"ᾇ̄̆","a)":"ἀ",
"a)&":"ἀ̄̆","a)'":"ἀ̆","a(":"ἁ","a(&":"ἁ̄̆","a('":"ἁ̆","*)/a":"Ἄ","a)\\":"ἂ","a)\\'":"ἂ̄̆","a=|":"ᾷ","*a":"Α","*a&":"Ᾱ","*a'":"Ᾰ","*a&'":"Ᾱ̆","a(/":"ἅ","a(/&":"ἅ̄̆","a(/'":"ἅ̆","a)=":"ἆ","a|":"ᾳ","a|&":"ᾱͅ","a|'":"ᾰͅ","a|&'":"ᾱ̆ͅ","*(/a":"Ἅ","*(a":"Ἁ","*)=a":"Ἆ","a(\\":"ἃ","a(\\'":"ἃ̄̆","*)\\a":"Ἂ","a(=|":"ᾇ","b":"β","*b":"Β","g":"γ","*g":"Γ","d":"δ","*d":"Δ","e":"ε","e/":"έ","e)/":"ἔ","e\\":"ὲ","e(":"ἑ","e)":"ἐ","e(/":"ἕ","*(/e":"Ἕ","*)e":"Ἐ","*e":"Ε","*(e":"Ἑ","*)/e":"Ἔ","e(\\":"ἓ","e)\\":"ἒ","z":"ζ","*z":"Ζ","h=":"ῆ","h":"η","h(\\":"ἣ","h(":"ἡ","h/":"ή","h\\":"ὴ","h)":"ἠ","h)=":"ἦ","h|":"ῃ","h(/":"ἥ","h)\\":"ἢ","h)/":"ἤ","h=|":"ῇ","*(/h":"Ἥ","h)/|":"ᾔ","h(=":"ἧ","h(=|":"ᾗ","*)h":"Ἠ","h/|":"ῄ","h(/|":"ᾕ","*(h":"Ἡ","*)/h":"Ἤ","h\\|":"ῂ","h)|":"ᾐ","h)=|":"ᾖ","h(|":"ᾑ","*h":"Η","*)=h":"Ἦ","*(\\h":"Ἣ","*(=h":"Ἧ","q":"θ","*q":"Θ","i":"ι","i&":"ῑ","i'":"ῐ","i&'":"ῑ̆","i+":"ϊ","i+&":"ϊ̄","ĭ":"ῗ̆","ī̆&":"ϊ̄̆","i/":"ί","i/&":"ί̄̆","i/'":"ί̆","i=":"ῖ","ī":"ῗ̄","ī̆":"ῗ̄̆","i)":"ἰ","i)&":"ἰ̄̆","i)'":"ἰ̆","i/+":"ΐ","i/+'":"ΐ̄̆","i\\":"ὶ","i\\'":"ὶ̄̆","i(":"ἱ","i(&":"ἱ̄̆","i('":"ἱ̆","i)/":"ἴ","i)/&":"ἴ̄̆","i)/'":"ἴ̆","i)=":"ἶ","i(\\":"ἳ","i(\\'":"ἳ̄̆","*)/i":"Ἴ","*ī":"Ἵ̄","*ĭ":"Ἵ̆","*ī̆":"Ἵ̄̆","i\\+":"ῒ","i\\+'":"ῒ̄̆","i(/":"ἵ","i(/&":"ἵ̄̆","i(/'":"ἵ̆","i(=":"ἷ","*)i":"Ἰ","i)\\":"ἲ","i)\\'":"ἲ̄̆","*(i":"Ἱ","*)=i":"Ἶ","i=+":"ῗ","*i":"Ι","*i&":"Ῑ","*i'":"Ῐ","*i&'":"Ῑ̆","*(/i":"Ἵ","k":"κ","*k":"Κ","l":"λ","*l":"Λ","m":"μ","*m":"Μ","n":"ν","*n":"Ν","c":"ξ","*c":"Ξ","o":"ο","o\\":"ὸ","o/":"ό","o(\\":"ὃ","o)/":"ὄ","o)":"ὀ","*)o":"Ὀ","*o":"Ο","o(/":"ὅ","o(":"ὁ","*)/o":"Ὄ","*(/o":"Ὅ","*(\\o":"Ὃ","*(o":"Ὁ","*p":"Π","p":"π","r":"ρ","r(":"ῥ","*(r":"Ῥ","r)":"ῤ","*r":"Ρ","j":"ς","s":"σ","*s":"Σ","t":"τ","*t":"Τ","u)":"ὐ","u)&":"ὐ̄̆","u)'":"ὐ̆","u":"υ","u&":"ῡ","u'":"ῠ","u&'":"ῡ̆","u\\":"ὺ","u\\'":"ὺ̄̆","u=":"ῦ","ū":"ῢ̄","ŭ":"ῢ̆","ū̆":"ῢ̄̆","u/":"ύ","u/&":"ύ̄̆","u/'":"ύ̆","u(=":"ὗ","u(/":"ὕ","u(/&":"ὕ̄̆","u(/'":"ὕ̆","u+":"ϋ","u(":"ὑ","u(&":"ὑ̄̆","u('":"ὑ̆","u)=":"ὖ","u/+":"ΰ","u)/":"ὔ","u)/&
":"ὔ̄̆","u)/'":"ὔ̆","u\\+":"ῢ","u(\\":"ὓ","u(\\'":"ὓ̄̆","*(u":"Ὑ","*ū":"Ὕ̄","*ŭ":"Ὕ̆","*ū̆":"Ὕ̄̆","*(/u":"Ὕ","*u":"Υ","*u&":"Ῡ","*u'":"Ῠ","*u&'":"Ῡ̆","f":"φ","*f":"Φ","x":"χ","*x":"Χ","y":"ψ","*y":"Ψ","w":"ω","w/":"ώ","w=":"ῶ","w|":"ῳ","w=|":"ῷ","w\\":"ὼ","w(/":"ὥ","w(\\":"ὣ","w)/":"ὤ","w/|":"ῴ","w)/|":"ᾤ","w)":"ὠ","w)=":"ὦ","w(=|":"ᾧ","w(":"ὡ","w(=":"ὧ","w)|":"ᾠ","*)w":"Ὠ","w)\\":"ὢ","*)/w":"Ὤ","*)=w":"Ὦ","*(=w":"Ὧ","w)=|":"ᾦ","*w":"Ω","*(/w":"Ὥ","*(\\w":"Ὣ","*(w":"Ὡ","*v":"Ϝ","v":"ϝ"}]
1013
+ JSON
1014
+ @@beta_code_conversion = nil
1015
+
1016
def Tinycus.beta_code_conversion_table
  # Returns the beta code conversion tables as a two-element array:
  # [0] maps unicode to beta code, [1] maps beta code to unicode.
  # The JSON text is parsed on the first call and the result is cached for later calls.
  @@beta_code_conversion ||= JSON.parse(@@beta_code_conversion_json)
end
1020
+
1021
def Tinycus.greek_char_unicode_to_beta_code(u)
  # Converts one unicode character to beta code. A character that is not in the table is returned
  # unchanged; most such cases will just be whitespace, punctuation, etc.
  beta = Tinycus.beta_code_conversion_table()[0][u]
  beta.nil? ? u : beta
end
1025
+
1026
def Tinycus.greek_char_beta_code_to_unicode(b)
  # Converts the beta code for one character to unicode. The beta code is first put into canonical
  # order; if the canonical form is not in the table, that canonical form is returned.
  canonical = Tinycus.canonicalize_char_greek_beta_code(b)
  unicode = Tinycus.beta_code_conversion_table()[1][canonical]
  unicode.nil? ? canonical : unicode
end
1031
+
1032
def Tinycus.canonicalize_char_greek_beta_code(b)
  # Puts the diacritic characters in the beta code for a single character into a fixed order and
  # returns the result as a new string; b is never modified, so frozen strings are accepted.
  # Breathing normally comes after accent, but sometimes you see things in the wild where it's reversed.
  # I can't find any documentation for any preferred or canonical order. What circumstantial evidence I could find I put into the WP article.
  # Note that the order of |+ doesn't matter, because the same letter can't have both.
  ")(/\\=|+&'".chars.each { |c|
    # Move the (last) occurrence of c to the end. Doing this for each diacritic in turn leaves them
    # in the order of the list above.
    if b=~/(.*)#{Regexp::quote(c)}(.*)/ then b = $1+$2+c end
  }
  # If it's capitalized, move tonal accents and breathing before the letter.
  return b.sub(/\*([a-zA-Z]\d?)([\/\\=)(]+)/) { '*'+$2+$1 }
end
1042
+
1043
def Tinycus.greek_unicode_to_beta_code(u)
  # Converts the unicode Greek string u to beta code.
  # The input is cleaned up first, because the character-by-character conversion will not work on
  # unicode that isn't done cleanly and according to modern standards.
  cleaned = Tinycus::Cleanup.clean_up_grotty_greek(u,allow_latin:true,clean_perseus:true,standardize_punctuation:false)
  beta = cleaned.each_char.reduce('') { |acc,uc| acc+Tinycus.greek_char_unicode_to_beta_code(uc) }
  beta.tr("·—’",":_'") # punctuation; keraia is not implemented
end
1053
+
1054
def Tinycus.greek_beta_code_to_unicode(b)
  # Converts the beta code string b to unicode Greek and returns the result; b is not modified.
  # This implementation will be kind of slow because it does regex replacements in place.
  # The working copy is made with dup rather than clone: clone preserves the frozen state, which
  # made the in-place replacements below raise FrozenError for frozen input.
  b = b.dup
  b.gsub!(/[sS][1-3]/,'s') # final sigma supported, actual lunate sigma (which looks like c) not supported
  # cons = "ϝβγδζθκλμνξπρσςτφχψh"
  b.gsub!(/(?<=[bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ])'/,"’") # after a consonant, is apostrophe, not breve
  b.gsub!(/\*?[sS](?![1-3a-zA-Z*])/,'j') # make non-final sigma into final sigma if it's at end of word
  b.tr!(":_","·—") # punctuation; keraia is not implemented
  b.gsub!(/(\*[)(\\\/=]*[a-zA-Z][)(\/=\\+|&'\´]*)/) { Tinycus.greek_char_beta_code_to_unicode($1) } # uppercase, expect breathing and tonal before vowel
  b.gsub!(/([a-zA-Z][)(\/=\\+|&'\´]*)/) { Tinycus.greek_char_beta_code_to_unicode($1) } # lowercase, breathing after vowel
  b.gsub!(/ς’/,'σ’') # a sigma followed by an apostrophe is not word-final
  return b
end
1067
+
1068
def Tinycus.disassemble_greek_char(c)
  # Returns [plain,d], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and d is
  # a hash with the following keys:
  #   uppercase, diar, iota_subscript - boolean values
  #   tonal - string value: none acute grave circumflex
  #   breathing - string value: none smooth rough
  # Returns nil if c is not in the lookup table.
  # Doesn't handle macrons and breves. I have a function IfMows.disassemble_char in Ifthimos that does that.
  entry = Tinycus.disassemble_greek_char_binary(c)
  return nil if entry.nil?
  plain,flags = entry
  tonal_names = ['none','acute','grave','circumflex']
  breathing_names = { 0b00=>'none', 0b01=>'smooth', 0b10=>'rough' }
  d = {
    'uppercase' => (flags & 0b1)!=0,
    'diar' => (flags & 0b10)!=0,
    'iota_subscript' => (flags & 0b100)!=0
  }
  accent = (flags & 0b11000)>>3
  d['tonal'] = tonal_names.fetch(accent) { raise "wtf? #{accent}" }
  breathing = (flags & 0b1100000)>>5
  # The bit pattern 0b11 has no meaning for breathing; in that case the key is left unset.
  d['breathing'] = breathing_names[breathing] if breathing_names.key?(breathing)
  [plain,d]
end
1094
+
1095
def Tinycus.assemble_greek_char(plain,d)
  # The inverse of Tinycus.disassemble_greek_char: given a plain letter and a hash d of properties,
  # returns the corresponding character, as looked up by Tinycus.assemble_greek_char_hex.
  # Doesn't handle macrons and breves. I have a function IfMows.assemble_char in Ifthimos that does that.
  tonal_bits = { 'acute'=>0b1000, 'grave'=>0b10000, 'circumflex'=>0b11000 }
  breathing_bits = { 'smooth'=>0b0100000, 'rough'=>0b1000000 }
  flags = 0
  flags |= 0b1 if d['uppercase']
  flags |= 0b10 if d['diar']
  flags |= 0b100 if d['iota_subscript']
  flags |= tonal_bits.fetch(d['tonal'],0)
  flags |= breathing_bits.fetch(d['breathing'],0)
  # The lookup key is the plain letter followed by the flags as two hex digits, e.g. 'α08'.
  Tinycus.assemble_greek_char_hex(plain+format("%.2x",flags))
end
1110
+
1111
def Tinycus.disassemble_greek_char_binary(c)
  # Looks up c in the disassembly table and returns [plain,b], or nil if c is not in the table.
  # plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and b is an integer holding a set of
  # flags encoded in binary:
  #   0b1        uppercase
  #   0b10       diaeresis
  #   0b100      iota subscript
  #   0b1000     acute      \
  #   0b10000    grave       > tonal accent
  #   0b11000    circumflex /
  #   0b0100000  smooth breathing
  #   0b1000000  rough breathing
  @@disassemble_greek_char_hash[c]
end
1124
+
1125
def Tinycus.assemble_greek_char_hex(x)
  # The inverse of the map in Tinycus.disassemble_greek_char_binary.
  # Accepts an input such as 'α08', where the second and third characters are the hex representation of
  # the set of flags described in the comments in the forward map.
  # Returns the assembled character, or nil if x is not in the table.
  @@assemble_greek_char_hash[x]
end
1131
+
1132
+ # The following are output by generating/assemble_disassemble.rb .
1133
+ @@disassemble_greek_char_hash = {
1134
+ 'α'=>['α',0b0],
1135
+ 'Α'=>['α',0b1],
1136
+ 'ᾳ'=>['α',0b100],
1137
+ 'ά'=>['α',0b1000],
1138
+ 'ὰ'=>['α',0b10000],
1139
+ 'ᾶ'=>['α',0b11000],
1140
+ 'ᾷ'=>['α',0b11100],
1141
+ 'ἀ'=>['α',0b100000],
1142
+ 'Ἀ'=>['α',0b100001],
1143
+ 'ἄ'=>['α',0b101000],
1144
+ 'Ἄ'=>['α',0b101001],
1145
+ 'ᾄ'=>['α',0b101100],
1146
+ 'ἂ'=>['α',0b110000],
1147
+ 'Ἂ'=>['α',0b110001],
1148
+ 'ἆ'=>['α',0b111000],
1149
+ 'Ἆ'=>['α',0b111001],
1150
+ 'ἁ'=>['α',0b1000000],
1151
+ 'Ἁ'=>['α',0b1000001],
1152
+ 'ἅ'=>['α',0b1001000],
1153
+ 'Ἅ'=>['α',0b1001001],
1154
+ 'ᾅ'=>['α',0b1001100],
1155
+ 'ἃ'=>['α',0b1010000],
1156
+ 'ᾇ'=>['α',0b1011100],
1157
+ 'β'=>['β',0b0],
1158
+ 'Β'=>['β',0b1],
1159
+ 'γ'=>['γ',0b0],
1160
+ 'Γ'=>['γ',0b1],
1161
+ 'δ'=>['δ',0b0],
1162
+ 'Δ'=>['δ',0b1],
1163
+ 'ε'=>['ε',0b0],
1164
+ 'Ε'=>['ε',0b1],
1165
+ 'έ'=>['ε',0b1000],
1166
+ 'ὲ'=>['ε',0b10000],
1167
+ 'ἐ'=>['ε',0b100000],
1168
+ 'Ἐ'=>['ε',0b100001],
1169
+ 'ἔ'=>['ε',0b101000],
1170
+ 'Ἔ'=>['ε',0b101001],
1171
+ 'ἒ'=>['ε',0b110000],
1172
+ 'ἑ'=>['ε',0b1000000],
1173
+ 'Ἑ'=>['ε',0b1000001],
1174
+ 'ἕ'=>['ε',0b1001000],
1175
+ 'Ἕ'=>['ε',0b1001001],
1176
+ 'ἓ'=>['ε',0b1010000],
1177
+ 'ζ'=>['ζ',0b0],
1178
+ 'Ζ'=>['ζ',0b1],
1179
+ 'η'=>['η',0b0],
1180
+ 'Η'=>['η',0b1],
1181
+ 'ῃ'=>['η',0b100],
1182
+ 'ή'=>['η',0b1000],
1183
+ 'ῄ'=>['η',0b1100],
1184
+ 'ὴ'=>['η',0b10000],
1185
+ 'ῂ'=>['η',0b10100],
1186
+ 'ῆ'=>['η',0b11000],
1187
+ 'ῇ'=>['η',0b11100],
1188
+ 'ἠ'=>['η',0b100000],
1189
+ 'Ἠ'=>['η',0b100001],
1190
+ 'ᾐ'=>['η',0b100100],
1191
+ 'ἤ'=>['η',0b101000],
1192
+ 'Ἤ'=>['η',0b101001],
1193
+ 'ᾔ'=>['η',0b101100],
1194
+ 'ἢ'=>['η',0b110000],
1195
+ 'ἦ'=>['η',0b111000],
1196
+ 'Ἦ'=>['η',0b111001],
1197
+ 'ᾖ'=>['η',0b111100],
1198
+ 'ἡ'=>['η',0b1000000],
1199
+ 'Ἡ'=>['η',0b1000001],
1200
+ 'ᾑ'=>['η',0b1000100],
1201
+ 'ἥ'=>['η',0b1001000],
1202
+ 'Ἥ'=>['η',0b1001001],
1203
+ 'ᾕ'=>['η',0b1001100],
1204
+ 'ἣ'=>['η',0b1010000],
1205
+ 'Ἣ'=>['η',0b1010001],
1206
+ 'ἧ'=>['η',0b1011000],
1207
+ 'Ἧ'=>['η',0b1011001],
1208
+ 'ᾗ'=>['η',0b1011100],
1209
+ 'θ'=>['θ',0b0],
1210
+ 'Θ'=>['θ',0b1],
1211
+ 'ι'=>['ι',0b0],
1212
+ 'Ι'=>['ι',0b1],
1213
+ 'ϊ'=>['ι',0b10],
1214
+ 'ί'=>['ι',0b1000],
1215
+ 'ΐ'=>['ι',0b1010],
1216
+ 'ὶ'=>['ι',0b10000],
1217
+ 'ῒ'=>['ι',0b10010],
1218
+ 'ῖ'=>['ι',0b11000],
1219
+ 'ῗ'=>['ι',0b11010],
1220
+ 'ἰ'=>['ι',0b100000],
1221
+ 'Ἰ'=>['ι',0b100001],
1222
+ 'ἴ'=>['ι',0b101000],
1223
+ 'Ἴ'=>['ι',0b101001],
1224
+ 'ἲ'=>['ι',0b110000],
1225
+ 'ἶ'=>['ι',0b111000],
1226
+ 'Ἶ'=>['ι',0b111001],
1227
+ 'ἱ'=>['ι',0b1000000],
1228
+ 'Ἱ'=>['ι',0b1000001],
1229
+ 'ἵ'=>['ι',0b1001000],
1230
+ 'Ἵ'=>['ι',0b1001001],
1231
+ 'ἳ'=>['ι',0b1010000],
1232
+ 'ἷ'=>['ι',0b1011000],
1233
+ 'κ'=>['κ',0b0],
1234
+ 'Κ'=>['κ',0b1],
1235
+ 'λ'=>['λ',0b0],
1236
+ 'Λ'=>['λ',0b1],
1237
+ 'μ'=>['μ',0b0],
1238
+ 'Μ'=>['μ',0b1],
1239
+ 'ν'=>['ν',0b0],
1240
+ 'Ν'=>['ν',0b1],
1241
+ 'ξ'=>['ξ',0b0],
1242
+ 'Ξ'=>['ξ',0b1],
1243
+ 'ο'=>['ο',0b0],
1244
+ 'Ο'=>['ο',0b1],
1245
+ 'ό'=>['ο',0b1000],
1246
+ 'ὸ'=>['ο',0b10000],
1247
+ 'ὀ'=>['ο',0b100000],
1248
+ 'Ὀ'=>['ο',0b100001],
1249
+ 'ὄ'=>['ο',0b101000],
1250
+ 'Ὄ'=>['ο',0b101001],
1251
+ 'ὁ'=>['ο',0b1000000],
1252
+ 'Ὁ'=>['ο',0b1000001],
1253
+ 'ὅ'=>['ο',0b1001000],
1254
+ 'Ὅ'=>['ο',0b1001001],
1255
+ 'ὃ'=>['ο',0b1010000],
1256
+ 'Ὃ'=>['ο',0b1010001],
1257
+ 'π'=>['π',0b0],
1258
+ 'Π'=>['π',0b1],
1259
+ 'ρ'=>['ρ',0b0],
1260
+ 'Ρ'=>['ρ',0b1],
1261
+ 'ῤ'=>['ρ',0b100000],
1262
+ 'ῥ'=>['ρ',0b1000000],
1263
+ 'Ῥ'=>['ρ',0b1000001],
1264
+ 'ς'=>['ς',0b0],
1265
+ 'σ'=>['σ',0b0],
1266
+ 'Σ'=>['σ',0b1],
1267
+ 'τ'=>['τ',0b0],
1268
+ 'Τ'=>['τ',0b1],
1269
+ 'υ'=>['υ',0b0],
1270
+ 'Υ'=>['υ',0b1],
1271
+ 'ϋ'=>['υ',0b10],
1272
+ 'ύ'=>['υ',0b1000],
1273
+ 'ΰ'=>['υ',0b1010],
1274
+ 'ὺ'=>['υ',0b10000],
1275
+ 'ῢ'=>['υ',0b10010],
1276
+ 'ῦ'=>['υ',0b11000],
1277
+ 'ὐ'=>['υ',0b100000],
1278
+ 'ὔ'=>['υ',0b101000],
1279
+ 'ὖ'=>['υ',0b111000],
1280
+ 'ὑ'=>['υ',0b1000000],
1281
+ 'Ὑ'=>['υ',0b1000001],
1282
+ 'ὕ'=>['υ',0b1001000],
1283
+ 'Ὕ'=>['υ',0b1001001],
1284
+ 'ὓ'=>['υ',0b1010000],
1285
+ 'ὗ'=>['υ',0b1011000],
1286
+ 'φ'=>['φ',0b0],
1287
+ 'Φ'=>['φ',0b1],
1288
+ 'χ'=>['χ',0b0],
1289
+ 'Χ'=>['χ',0b1],
1290
+ 'ψ'=>['ψ',0b0],
1291
+ 'Ψ'=>['ψ',0b1],
1292
+ 'ω'=>['ω',0b0],
1293
+ 'Ω'=>['ω',0b1],
1294
+ 'ῳ'=>['ω',0b100],
1295
+ 'ώ'=>['ω',0b1000],
1296
+ 'ῴ'=>['ω',0b1100],
1297
+ 'ὼ'=>['ω',0b10000],
1298
+ 'ῶ'=>['ω',0b11000],
1299
+ 'ῷ'=>['ω',0b11100],
1300
+ 'ὠ'=>['ω',0b100000],
1301
+ 'Ὠ'=>['ω',0b100001],
1302
+ 'ᾠ'=>['ω',0b100100],
1303
+ 'ὤ'=>['ω',0b101000],
1304
+ 'Ὤ'=>['ω',0b101001],
1305
+ 'ᾤ'=>['ω',0b101100],
1306
+ 'ὢ'=>['ω',0b110000],
1307
+ 'ὦ'=>['ω',0b111000],
1308
+ 'Ὦ'=>['ω',0b111001],
1309
+ 'ᾦ'=>['ω',0b111100],
1310
+ 'ὡ'=>['ω',0b1000000],
1311
+ 'Ὡ'=>['ω',0b1000001],
1312
+ 'ὥ'=>['ω',0b1001000],
1313
+ 'Ὥ'=>['ω',0b1001001],
1314
+ 'ὣ'=>['ω',0b1010000],
1315
+ 'Ὣ'=>['ω',0b1010001],
1316
+ 'ὧ'=>['ω',0b1011000],
1317
+ 'Ὧ'=>['ω',0b1011001],
1318
+ 'ᾧ'=>['ω',0b1011100],
1319
+ 'ϝ'=>['ϝ',0b0],
1320
+ 'Ϝ'=>['ϝ',0b1]
1321
+ }
1322
+ @@assemble_greek_char_hash = {
1323
+
1324
+ 'α00'=>'α',
1325
+ 'α01'=>'Α',
1326
+ 'α04'=>'ᾳ',
1327
+ 'α08'=>'ά',
1328
+ 'α10'=>'ὰ',
1329
+ 'α18'=>'ᾶ',
1330
+ 'α1c'=>'ᾷ',
1331
+ 'α20'=>'ἀ',
1332
+ 'α21'=>'Ἀ',
1333
+ 'α28'=>'ἄ',
1334
+ 'α29'=>'Ἄ',
1335
+ 'α2c'=>'ᾄ',
1336
+ 'α30'=>'ἂ',
1337
+ 'α31'=>'Ἂ',
1338
+ 'α38'=>'ἆ',
1339
+ 'α39'=>'Ἆ',
1340
+ 'α40'=>'ἁ',
1341
+ 'α41'=>'Ἁ',
1342
+ 'α48'=>'ἅ',
1343
+ 'α49'=>'Ἅ',
1344
+ 'α4c'=>'ᾅ',
1345
+ 'α50'=>'ἃ',
1346
+ 'α5c'=>'ᾇ',
1347
+ 'β00'=>'β',
1348
+ 'β01'=>'Β',
1349
+ 'γ00'=>'γ',
1350
+ 'γ01'=>'Γ',
1351
+ 'δ00'=>'δ',
1352
+ 'δ01'=>'Δ',
1353
+ 'ε00'=>'ε',
1354
+ 'ε01'=>'Ε',
1355
+ 'ε08'=>'έ',
1356
+ 'ε10'=>'ὲ',
1357
+ 'ε20'=>'ἐ',
1358
+ 'ε21'=>'Ἐ',
1359
+ 'ε28'=>'ἔ',
1360
+ 'ε29'=>'Ἔ',
1361
+ 'ε30'=>'ἒ',
1362
+ 'ε40'=>'ἑ',
1363
+ 'ε41'=>'Ἑ',
1364
+ 'ε48'=>'ἕ',
1365
+ 'ε49'=>'Ἕ',
1366
+ 'ε50'=>'ἓ',
1367
+ 'ζ00'=>'ζ',
1368
+ 'ζ01'=>'Ζ',
1369
+ 'η00'=>'η',
1370
+ 'η01'=>'Η',
1371
+ 'η04'=>'ῃ',
1372
+ 'η08'=>'ή',
1373
+ 'η0c'=>'ῄ',
1374
+ 'η10'=>'ὴ',
1375
+ 'η14'=>'ῂ',
1376
+ 'η18'=>'ῆ',
1377
+ 'η1c'=>'ῇ',
1378
+ 'η20'=>'ἠ',
1379
+ 'η21'=>'Ἠ',
1380
+ 'η24'=>'ᾐ',
1381
+ 'η28'=>'ἤ',
1382
+ 'η29'=>'Ἤ',
1383
+ 'η2c'=>'ᾔ',
1384
+ 'η30'=>'ἢ',
1385
+ 'η38'=>'ἦ',
1386
+ 'η39'=>'Ἦ',
1387
+ 'η3c'=>'ᾖ',
1388
+ 'η40'=>'ἡ',
1389
+ 'η41'=>'Ἡ',
1390
+ 'η44'=>'ᾑ',
1391
+ 'η48'=>'ἥ',
1392
+ 'η49'=>'Ἥ',
1393
+ 'η4c'=>'ᾕ',
1394
+ 'η50'=>'ἣ',
1395
+ 'η51'=>'Ἣ',
1396
+ 'η58'=>'ἧ',
1397
+ 'η59'=>'Ἧ',
1398
+ 'η5c'=>'ᾗ',
1399
+ 'θ00'=>'θ',
1400
+ 'θ01'=>'Θ',
1401
+ 'ι00'=>'ι',
1402
+ 'ι01'=>'Ι',
1403
+ 'ι02'=>'ϊ',
1404
+ 'ι08'=>'ί',
1405
+ 'ι0a'=>'ΐ',
1406
+ 'ι10'=>'ὶ',
1407
+ 'ι12'=>'ῒ',
1408
+ 'ι18'=>'ῖ',
1409
+ 'ι1a'=>'ῗ',
1410
+ 'ι20'=>'ἰ',
1411
+ 'ι21'=>'Ἰ',
1412
+ 'ι28'=>'ἴ',
1413
+ 'ι29'=>'Ἴ',
1414
+ 'ι30'=>'ἲ',
1415
+ 'ι38'=>'ἶ',
1416
+ 'ι39'=>'Ἶ',
1417
+ 'ι40'=>'ἱ',
1418
+ 'ι41'=>'Ἱ',
1419
+ 'ι48'=>'ἵ',
1420
+ 'ι49'=>'Ἵ',
1421
+ 'ι50'=>'ἳ',
1422
+ 'ι58'=>'ἷ',
1423
+ 'κ00'=>'κ',
1424
+ 'κ01'=>'Κ',
1425
+ 'λ00'=>'λ',
1426
+ 'λ01'=>'Λ',
1427
+ 'μ00'=>'μ',
1428
+ 'μ01'=>'Μ',
1429
+ 'ν00'=>'ν',
1430
+ 'ν01'=>'Ν',
1431
+ 'ξ00'=>'ξ',
1432
+ 'ξ01'=>'Ξ',
1433
+ 'ο00'=>'ο',
1434
+ 'ο01'=>'Ο',
1435
+ 'ο08'=>'ό',
1436
+ 'ο10'=>'ὸ',
1437
+ 'ο20'=>'ὀ',
1438
+ 'ο21'=>'Ὀ',
1439
+ 'ο28'=>'ὄ',
1440
+ 'ο29'=>'Ὄ',
1441
+ 'ο40'=>'ὁ',
1442
+ 'ο41'=>'Ὁ',
1443
+ 'ο48'=>'ὅ',
1444
+ 'ο49'=>'Ὅ',
1445
+ 'ο50'=>'ὃ',
1446
+ 'ο51'=>'Ὃ',
1447
+ 'π00'=>'π',
1448
+ 'π01'=>'Π',
1449
+ 'ρ00'=>'ρ',
1450
+ 'ρ01'=>'Ρ',
1451
+ 'ρ20'=>'ῤ',
1452
+ 'ρ40'=>'ῥ',
1453
+ 'ρ41'=>'Ῥ',
1454
+ 'ς00'=>'ς',
1455
+ 'σ00'=>'σ',
1456
+ 'σ01'=>'Σ',
1457
+ 'τ00'=>'τ',
1458
+ 'τ01'=>'Τ',
1459
+ 'υ00'=>'υ',
1460
+ 'υ01'=>'Υ',
1461
+ 'υ02'=>'ϋ',
1462
+ 'υ08'=>'ύ',
1463
+ 'υ0a'=>'ΰ',
1464
+ 'υ10'=>'ὺ',
1465
+ 'υ12'=>'ῢ',
1466
+ 'υ18'=>'ῦ',
1467
+ 'υ20'=>'ὐ',
1468
+ 'υ28'=>'ὔ',
1469
+ 'υ38'=>'ὖ',
1470
+ 'υ40'=>'ὑ',
1471
+ 'υ41'=>'Ὑ',
1472
+ 'υ48'=>'ὕ',
1473
+ 'υ49'=>'Ὕ',
1474
+ 'υ50'=>'ὓ',
1475
+ 'υ58'=>'ὗ',
1476
+ 'φ00'=>'φ',
1477
+ 'φ01'=>'Φ',
1478
+ 'χ00'=>'χ',
1479
+ 'χ01'=>'Χ',
1480
+ 'ψ00'=>'ψ',
1481
+ 'ψ01'=>'Ψ',
1482
+ 'ω00'=>'ω',
1483
+ 'ω01'=>'Ω',
1484
+ 'ω04'=>'ῳ',
1485
+ 'ω08'=>'ώ',
1486
+ 'ω0c'=>'ῴ',
1487
+ 'ω10'=>'ὼ',
1488
+ 'ω18'=>'ῶ',
1489
+ 'ω1c'=>'ῷ',
1490
+ 'ω20'=>'ὠ',
1491
+ 'ω21'=>'Ὠ',
1492
+ 'ω24'=>'ᾠ',
1493
+ 'ω28'=>'ὤ',
1494
+ 'ω29'=>'Ὤ',
1495
+ 'ω2c'=>'ᾤ',
1496
+ 'ω30'=>'ὢ',
1497
+ 'ω38'=>'ὦ',
1498
+ 'ω39'=>'Ὦ',
1499
+ 'ω3c'=>'ᾦ',
1500
+ 'ω40'=>'ὡ',
1501
+ 'ω41'=>'Ὡ',
1502
+ 'ω48'=>'ὥ',
1503
+ 'ω49'=>'Ὥ',
1504
+ 'ω50'=>'ὣ',
1505
+ 'ω51'=>'Ὣ',
1506
+ 'ω58'=>'ὧ',
1507
+ 'ω59'=>'Ὧ',
1508
+ 'ω5c'=>'ᾧ',
1509
+ 'ϝ00'=>'ϝ',
1510
+ 'ϝ01'=>'Ϝ'
1511
+ }
1512
+
1513
+ end # module Tinycus
1514
+