tinycus 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/tinycus.rb +1514 -0
- metadata +46 -0
data/tinycus.rb
ADDED
@@ -0,0 +1,1514 @@
# coding: utf-8
require 'digest'
require 'json'

module Tinycus

  # The four "alpha_" functions work on Greek and English, as well as most other Latin-script characters; see comments in Tr.get_greek_collation_tr.

  def Tinycus.alpha_sort(l,n:false)
    return Tinycus.sort(l,Tinycus.alpha_collation,n:n)
  end

  def Tinycus.alpha_equal(a,b,n:true)
    return (Tinycus.alpha_compare(a,b,n:n)==0)
  end

  def Tinycus.alpha_compare(a,b,n:true)
    # return (Tinycus::Tr.remove_accents_from_greek(a,n:n).downcase <=> Tinycus::Tr.remove_accents_from_greek(b,n:n).downcase)
    collation_tr = Tinycus.alpha_collation
    return collation_tr.apply(a,n:n) <=> collation_tr.apply(b,n:n)
  end

  def Tinycus.alpha_collation
    return Tinycus::Tr.get_greek_collation_tr
  end

  def Tinycus.is_vowel(c)
    # synonym of Tinycus.contains_vowel, for readability when using it on a single character
    return Tinycus.contains_vowel(c)
  end

  def Tinycus.contains_vowel(s)
    # works for Greek and Latin; considers y to be a vowel; doesn't handle stuff like Welsh w
    if Tinycus::Tr.remove_accents_from_greek(s).downcase.match?(/[αειουηωaeiouyæ]/) then return true else return false end
  end

  # ---

  def Tinycus.sort(l,collation_tr,n:false)
    p = l.clone # This does work on a list of strings: ruby -e "a=['p','q']; b=a.clone; b[1]='x'; print a"
    p.sort_by! { |x| collation_tr.apply(x,n:n) } # ruby's sort_by! only applies the block once to each element, to form an index
    return p
  end
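  # Usage sketch (added for illustration; not part of the original source). The expected results follow
  # from the collation logic above, which strips accents and folds case before comparing:
  #   Tinycus.alpha_sort(["Μῆνιν","ἄειδε","θεά"])   # => expected ["ἄειδε","θεά","Μῆνιν"]
  #   Tinycus.alpha_equal("θεά","Θεὰ")              # => expected true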
|
44
|
+
|
45
|
+
def Tinycus.run_tests
|
46
|
+
# removing accents:
|
47
|
+
tests = [
|
48
|
+
['',''],
|
49
|
+
['ἔχω','εχω'],
|
50
|
+
]
|
51
|
+
tests.each { |t|
|
52
|
+
x,y = t
|
53
|
+
z = Tinycus::Tr.remove_accents_from_greek(x)
|
54
|
+
if z!=y then raise "error in test, removing accents from '#{x}' gives '#{z}', expected '#{y}'" end
|
55
|
+
}
|
56
|
+
#----
|
57
|
+
# sorting:
|
58
|
+
tests = [
|
59
|
+
[ "Μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος οὐλομένην", "ἄειδε, Ἀχιλῆος θεά, Μῆνιν οὐλομένην Πηληϊάδεω" ],
|
60
|
+
[ "ὠμοφάγος ᾍδης", "ᾍδης ὠμοφάγος" ],
|
61
|
+
]
|
62
|
+
tests.each { |t|
|
63
|
+
x,y = t
|
64
|
+
a = x.split(/\s+/)
|
65
|
+
a = Tinycus.alpha_sort(a)
|
66
|
+
z = a.join(' ')
|
67
|
+
if z!=y then raise "error in test, sorting words on #{x} gives #{z}, expected #{y}" end
|
68
|
+
}
|
69
|
+
#----
|
70
|
+
filename = "test_input/bad_combining_character.txt"
|
71
|
+
s = nil
|
72
|
+
File.open(filename,'r') { |f|
|
73
|
+
s = f.gets(nil) # nil means read whole file
|
74
|
+
}
|
75
|
+
if s.nil? || s.length<10 then raise "wtf?" end
|
76
|
+
s2 = Cleanup.clean_up_greek_combining_characters(s,allow_latin:true) # allow_latin is so we can put in stuff like xml fragments
|
77
|
+
if s2==s then raise "error in test, no change with Cleanup.clean_up_greek_combining_characters" end
|
78
|
+
Cleanup.clean_up_grotty_greek_one_word(s,allow_latin:true)
|
79
|
+
#----
|
80
|
+
# Beta code/unicode conversion. In the following, there can actually be more than one beta code representation
|
81
|
+
# of the same unicode character, so if the third-element flag is true, we don't expect uni to beta to work.
|
82
|
+
[
|
83
|
+
["ἐν","E)N"],
|
84
|
+
["Ἄλφα","*)/alfa"], # breathing and tonal accent are supposed to come first for uppercase
|
85
|
+
["προϊέναι","PROI+E/NAI"],
|
86
|
+
["πρός","pro/j"],
|
87
|
+
["πρός","pro/s",true],
|
88
|
+
["πρός","pro/s2",true],
|
89
|
+
["σῖτον","si=ton"],
|
90
|
+
["σῖτον","s1i=ton",true],
|
91
|
+
["μάχαιρᾰ","MA/XAIRA'"],
|
92
|
+
["μαχαίρᾱς","MAXAI/RA&j"],
|
93
|
+
["ἤφυσ’","h)/fus'"],
|
94
|
+
["·—",":_"],
|
95
|
+
["ἴκτινος","i/)ktinos",true], # probably should be )/, but we should deal with it anyway
|
96
|
+
["ἴκτινος","i)/ktinos",true],
|
97
|
+
["βοΐ","boi/+"],
|
98
|
+
["βοΐ","boi+/",true],
|
99
|
+
["ᾇ","a(=|"],
|
100
|
+
["ᾇτε","a(=|te"],
|
101
|
+
#["",""],
|
102
|
+
].each { |t|
|
103
|
+
unicode,beta_code,no_uni_to_beta = t
|
104
|
+
beta_code = beta_code.downcase
|
105
|
+
if no_uni_to_beta.nil? then no_uni_to_beta=false end
|
106
|
+
if !no_uni_to_beta && Tinycus.greek_unicode_to_beta_code(unicode)!=beta_code then
|
107
|
+
raise "error in Tinycus.greek_unicode_to_beta_code, #{unicode} converts to #{Tinycus.greek_unicode_to_beta_code(unicode)}, expected #{beta_code}"
|
108
|
+
end
|
109
|
+
if Tinycus.greek_beta_code_to_unicode(beta_code)!=unicode then
|
110
|
+
raise "error in Tinycus.greek_beta_code_to_unicode, #{beta_code} converts to #{Tinycus.greek_beta_code_to_unicode(beta_code)}, expected #{unicode}"
|
111
|
+
end
|
112
|
+
}
|
113
|
+
|
114
|
+
[
|
115
|
+
["ἀντῑκρύ̄","ἀντι_κρύ_"],
|
116
|
+
["ἀνῑ̆άζω","ἀνιάζω"], # iota with macron, followed by a combining breve, i.e., can be either short or long
|
117
|
+
["ἀ̄ϊ̄κή","ἀ_ϊ_κή"], # iota with diaresis, followed by a combining macron
|
118
|
+
].each { |x|
|
119
|
+
a,b = x
|
120
|
+
b_actual = Tinycus::Tr.macronized_to_underbar_style(a)
|
121
|
+
if b_actual==b then
|
122
|
+
print "passed test of macronized_to_underbar_style, #{a} -> #{b}\n"
|
123
|
+
else
|
124
|
+
raise "failed, #{x}, actual = #{b_actual}"
|
125
|
+
end
|
126
|
+
}
|
127
|
+
|
128
|
+
|
129
|
+
end
|
130
|
+
|
131
|
+
# fixme:
|
132
|
+
# Determine byte order and make sure we convert to native (i.e., BE rather than LE if we're on a big-endian machine).
|
133
|
+
@@bloater = Encoding::Converter.new('UTF-8','UTF-32LE')
|
134
|
+
@@shrinker = Encoding::Converter.new('UTF-32LE','UTF-8')
|
135
|
+
|
136
|
+
def Tinycus.bloat(s) # private method
|
137
|
+
return @@bloater.convert(s)
|
138
|
+
end
|
139
|
+
def Tinycus.shrink(s) # private method
|
140
|
+
return @@shrinker.convert(s)
|
141
|
+
end
|
142
|
+
|
143
|
+
class Tinycus::MiscGreek
|
144
|
+
def MiscGreek.run_tests
|
145
|
+
print "testing MiscGreek.add_second_accent...\n"
|
146
|
+
[['θεμείλια','θεμείλιά'],
|
147
|
+
['πόλεμονδε','πόλεμόνδε'],
|
148
|
+
['οἶκονδε','οἶκόνδε'],
|
149
|
+
['τῆσδε','τῆσδέ']
|
150
|
+
].each { |x|
|
151
|
+
single,two = x
|
152
|
+
y = MiscGreek.add_second_accent(single)
|
153
|
+
print " #{single} #{two} #{y}\n"
|
154
|
+
if y!=two then raise "expected #{two}, got #{y}" end
|
155
|
+
}
|
156
|
+
end
|
157
|
+
|
158
|
+
def MiscGreek.estimate_syll_count(x)
|
159
|
+
# A rough approximation, for cases where we don't need perfect precision and either don't have Ransom's greek/syllab.rb
|
160
|
+
# or don't want the performance hit.
|
161
|
+
x = x.downcase
|
162
|
+
x = x.gsub(/[ϊ]/,'e') # prevent it from being misinterpreted as a diphthong after the diaeresis is stripped below
|
163
|
+
x = Tr.remove_accents_from_greek(x)
|
164
|
+
x = x.gsub(/(αι|ει|οι|ου)/,'e')
|
165
|
+
return x.scan(/[αειουηωe]/).length
|
166
|
+
end
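# Illustrative check of the approximation above (added; not part of the original source):
#   Tinycus::MiscGreek.estimate_syll_count("θάλασσα")    # => expected 3
#   Tinycus::MiscGreek.estimate_syll_count("οὐλομένην")  # => expected 4 (the diphthong ου counts once)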
|
167
|
+
|
168
|
+
def MiscGreek.has_acute(x)
|
169
|
+
return Tr.remove_acute_from_greek(x)!=x
|
170
|
+
end
|
171
|
+
|
172
|
+
def MiscGreek.has_grave(x)
|
173
|
+
return Tr.remove_grave_from_greek(x)!=x
|
174
|
+
end
|
175
|
+
|
176
|
+
def MiscGreek.has_circumflex(x)
|
177
|
+
return Tr.remove_circumflex_from_greek(x)!=x
|
178
|
+
end
|
179
|
+
|
180
|
+
def MiscGreek.has_tonal_accent(x)
|
181
|
+
return Tr.remove_tonal_accents_from_greek(x)!=x
|
182
|
+
end
|
183
|
+
|
184
|
+
def MiscGreek.add_second_accent(w_orig)
|
185
|
+
# e.g., if w is θεμείλια, returns θεμείλιά
|
186
|
+
w = w_orig.clone # shallow copy, works on a string; I'm not clear on why this is necessary, but it is; modification to w_orig is visible in
|
187
|
+
# output of make test_misc_greek
|
188
|
+
if w=~/(.*)δε$/ then
|
189
|
+
stem = $1
|
190
|
+
nsyll = MiscGreek.estimate_syll_count(stem)
|
191
|
+
has_circumflex = MiscGreek.has_circumflex(stem)
|
192
|
+
if nsyll>=3 || (has_circumflex && nsyll>=2) then return MiscGreek.add_second_accent(stem)+"δε" end
|
193
|
+
end
|
194
|
+
# Locate the final vowel:
|
195
|
+
x = Tr.remove_accents_from_greek(w).downcase
|
196
|
+
(x.length-1).downto(1) { |i|
|
197
|
+
if x[i]=~/[αειουηω]/ then
|
198
|
+
w[i] = Tr.add_acute_to_greek(Tr.remove_acute_and_grave_from_greek(w[i]))
|
199
|
+
return w
|
200
|
+
end
|
201
|
+
}
|
202
|
+
return w
|
203
|
+
end
|
204
|
+
|
205
|
+
def MiscGreek.all_cases_and_accents(w)
|
206
|
+
# For a given word, try to predict every possible form it could take in a text, including
|
207
|
+
# both possible capitalizations, acute/grave, and multiple accents.
|
208
|
+
# The word w should already have been converted into a canonical dictionary form (typically a single acute accent).
|
209
|
+
# This is not 100% perfect, mainly because the rules for multiple accents are complicated and Tinycus doesn't include a full
|
210
|
+
# syllabification algorithm.
|
211
|
+
# I tested this as a round-trip on all multiply accented words occurring in Homer. The following three words were the only
|
212
|
+
# ones where it failed: κάλλίον, σταφύλῇ, ὕπὸ.
|
213
|
+
forms = [w.downcase]
|
214
|
+
forms = forms+forms.map { |x| Tr.greek_acute_to_grave(x) }
|
215
|
+
forms = forms+forms.map { |x| MiscGreek.add_second_accent(x) }
|
216
|
+
forms = forms+forms.map { |x| x.capitalize }
|
217
|
+
return forms.uniq
|
218
|
+
end
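# Usage sketch (added for illustration; the exact output depends on the helpers above):
#   Tinycus::MiscGreek.all_cases_and_accents("θεά")
#   # => expected ["θεά","θεὰ","Θεά","Θεὰ"]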
|
219
|
+
end
|
220
|
+
|
221
|
+
class Tinycus::Tr

  @@prep_remove_acute_and_grave_from_greek = nil
  @@prep_remove_circumflex_from_greek = nil
  @@prep_add_circumflex_to_greek = nil
  @@prep_remove_grave_from_greek = nil
  @@prep_remove_acute_from_greek = nil
  @@prep_add_grave_to_greek = nil
  @@prep_add_acute_to_greek = nil
  @@prep_remove_diar_from_greek = nil
  @@prep_remove_breathing_from_greek = nil
  @@prep_add_diar_to_greek = nil
  @@prep_remove_accents_from_greek = nil
  @@greek_grave_to_acute = nil
  @@greek_acute_to_grave = nil
  @@prep_greek_to_collation_form = nil

  def initialize(a,b)
    # Initialize a data structure that represents an action equivalent to string.tr(a,b), but faster.
    # Including redundant characters is harmless and is fixed in this constructor; it does not cause
    # any performance hit when the object is actually used.
    if a.length!=b.length then raise "lengths unequal, #{a.length} and #{b.length}" end
    @l = a.length
    @orig_tables = [a.clone,b.clone] # stash them away for testing purposes

    @h = {}
    0.upto(@l-1) { |i|
      p,q = a[i],b[i]
      @h[p] = q
    }
    @h.freeze

  end

  attr_reader :l,:a,:b

  def apply(s,n:false)
    # This function tends to be a bottleneck for performance in real-world applications. I tried several algorithms.
    # See notes in comments at top of scripts/benchmark.rb.

    if n then s = s.unicode_normalize(:nfc) end # 30% performance hit, not necessary if input has already been normalized

    result = ''
    0.upto(s.length-1) { |i|
      p = s[i]
      q = @h[p]
      if q.nil? then c=p else c=q end
      result += c
    }
    return result

  end
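  # Usage sketch (added for illustration; the translation table below is hypothetical):
  #   rot = Tinycus::Tr.new("αβγ","βγα")  # maps α->β, β->γ, γ->α
  #   rot.apply("αγδ")                    # => expected "βαδ"; characters not in the table pass through unchanged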
|
273
|
+
|
274
|
+
def self_test(alphabet)
|
275
|
+
# Raises an exception if it fails. Otherwise just returns silently.
|
276
|
+
# If not nil, then the alphabet parameter gives a list of characters that are allowed to exist in the output.
|
277
|
+
a,b = @orig_tables
|
278
|
+
if self.apply(a)!=b then raise "error in self_test, applying me to original a does not give original b" end
|
279
|
+
if self.apply(b)!=b then raise "error in self_test, applying me to original b does not give original b" end
|
280
|
+
if self.apply(self.apply(a))!=b then raise "error in self_test, fails idempotency" end
|
281
|
+
if !alphabet.nil? then
|
282
|
+
s = a+alphabet
|
283
|
+
ss = self.apply(s)
|
284
|
+
unless ss=~/^[#{alphabet}]+$/ then
|
285
|
+
raise "error in self_test, applying me to #{s} gives #{ss}, which contains characters not in the alphabet #{alphabet}"
|
286
|
+
end
|
287
|
+
end
|
288
|
+
end
|
289
|
+
|
290
|
+
def Tr.greek_grave_to_acute(s,n:false)
|
291
|
+
if @@greek_grave_to_acute.nil? then
|
292
|
+
@@greek_grave_to_acute = Tinycus::Tr.new(
|
293
|
+
"ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῒῢῸῂ","ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώΐΰΌῄ"
|
294
|
+
)
|
295
|
+
end
|
296
|
+
if n then s=s.unicode_normalize(:nfc) end
|
297
|
+
return @@greek_grave_to_acute.apply(s)
|
298
|
+
end
|
299
|
+
|
300
|
+
def Tr.greek_acute_to_grave(s,n:false)
|
301
|
+
if @@greek_acute_to_grave.nil? then
|
302
|
+
@@greek_acute_to_grave = Tinycus::Tr.new(
|
303
|
+
"ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώΐΰΌ","ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῒῢῸ"
|
304
|
+
)
|
305
|
+
end
|
306
|
+
if n then s=s.unicode_normalize(:nfc) end
|
307
|
+
return @@greek_acute_to_grave.apply(s)
|
308
|
+
end
|
309
|
+
|
310
|
+
def Tr.remove_tonal_accents_from_greek(s,n:false)
|
311
|
+
# to do: make this more efficient by creating a single-pass tr
|
312
|
+
return Tr.remove_circumflex_from_greek(Tr.remove_acute_and_grave_from_greek(s,n:n))
|
313
|
+
end
|
314
|
+
|
315
|
+
def Tr.remove_acute_and_grave_from_greek(s,n:false)
|
316
|
+
if @@prep_remove_acute_and_grave_from_greek.nil? then
|
317
|
+
@@prep_remove_acute_and_grave_from_greek =
|
318
|
+
Tinycus::Tr.new(
|
319
|
+
"ÀÁàáÈÉèéÌÍìíÒÓòóÙÚùúÝýΆάἂἃἄἅἊἌἍὰᾴΈέἒἓἔἕἜἝὲήἢἣἤἥἫἬἭὴᾓᾔᾕῂῄΊΐίἲἳἴἵἼἽὶῒΌόὂὃὄὅὊὋὌὍὸῸΰύὓὔὕὝὺῢώὢὣὤὥὫὬὭὼᾤῴᾍ",
|
320
|
+
"AAaaEEeeIIiiOOooUUuuYyΑαἀἁἀἁἈἈἉαᾳΕεἐἑἐἑἘἙεηἠἡἠἡἩἨἩηᾑᾐᾑῃῃΙϊιἰἱἰἱἸἹιϊΟοὀὁὀὁὈὉὈὉοΟϋυὑὐὑὙυϋωὠὡὠὡὩὨὩωᾠῳᾉ"
|
321
|
+
)
|
322
|
+
end
|
323
|
+
if n then s=s.unicode_normalize(:nfc) end
|
324
|
+
return @@prep_remove_acute_and_grave_from_greek.apply(s)
|
325
|
+
end
|
326
|
+
|
327
|
+
def Tr.remove_circumflex_from_greek(s,n:false)
|
328
|
+
if @@prep_remove_circumflex_from_greek.nil? then
|
329
|
+
@@prep_remove_circumflex_from_greek =
|
330
|
+
Tinycus::Tr.new(
|
331
|
+
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸ",
|
332
|
+
"ÀÁAAÄÅÆÇÈÉEËÌÍIÏNÒÓOOÖØÙÚUÜÝàáaaäåæçèéeëìíiïnòóooöøùúuüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἀἈἉἊἌἍἈἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἠἡἨἩἫἬἭἨἩἰἱἲἳἴἵἰἱἸἹἼἽἸὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὐὑὙὝὠὡὢὣὤὥὠὡὨὩὫὬὭὨὩὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾐᾑᾠᾤᾠᾡᾰᾱᾳᾴαᾳᾸᾹῂῃῄηῃῐῑῒιϊῘῙῠῡῢῥυῨῩῬῳῴωῳῸ"
|
333
|
+
)
|
334
|
+
end
|
335
|
+
if n then s=s.unicode_normalize(:nfc) end
|
336
|
+
return @@prep_remove_circumflex_from_greek.apply(s)
|
337
|
+
end
|
338
|
+
|
339
|
+
def Tr.add_circumflex_to_greek(s,n:false)
|
340
|
+
if @@prep_add_circumflex_to_greek.nil? then
|
341
|
+
@@prep_add_circumflex_to_greek =
|
342
|
+
Tinycus::Tr.new(
|
343
|
+
"AAEINOOUaaeinoouἀἈἠἡἨἩἰἱἸὐὑὠὡὨὩᾐᾑᾠᾡαᾳηῃιϊυωῳ",
|
344
|
+
"ÂÃÊÎÑÔÕÛâãêîñôõûἆἎἦἧἮἯἶἷἾὖὗὦὧὮὯᾖᾗᾦᾧᾶᾷῆῇῖῗῦῶῷ"
|
345
|
+
)
|
346
|
+
end
|
347
|
+
if n then s=s.unicode_normalize(:nfc) end
|
348
|
+
return @@prep_add_circumflex_to_greek.apply(s)
|
349
|
+
end
|
350
|
+
|
351
|
+
def Tr.remove_acute_from_greek(s,n:false)
|
352
|
+
if @@prep_remove_acute_from_greek.nil? then
|
353
|
+
@@prep_remove_acute_from_greek =
|
354
|
+
Tinycus::Tr.new(
|
355
|
+
"ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώῄῴΐΰΌ",
|
356
|
+
"AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃῳϊϋΟ"
|
357
|
+
)
|
358
|
+
end
|
359
|
+
if n then s=s.unicode_normalize(:nfc) end
|
360
|
+
return @@prep_remove_acute_from_greek.apply(s)
|
361
|
+
end
|
362
|
+
|
363
|
+
def Tr.remove_grave_from_greek(s,n:false)
|
364
|
+
if @@prep_remove_grave_from_greek.nil? then
|
365
|
+
@@prep_remove_grave_from_greek =
|
366
|
+
Tinycus::Tr.new(
|
367
|
+
"ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῂῒῢῸ",
|
368
|
+
"AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃϊϋΟ"
|
369
|
+
)
|
370
|
+
end
|
371
|
+
if n then s=s.unicode_normalize(:nfc) end
|
372
|
+
return @@prep_remove_grave_from_greek.apply(s)
|
373
|
+
end
|
374
|
+
|
375
|
+
def Tr.add_grave_to_greek(s,n:false)
|
376
|
+
if @@prep_add_grave_to_greek.nil? then
|
377
|
+
@@prep_add_grave_to_greek =
|
378
|
+
Tinycus::Tr.new(
|
379
|
+
"AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃϊϋΟ",
|
380
|
+
"ÀÈÌÒÙàèìòùἂἃἊἒἓἢἣἫἲἳὂὃὊὋὓὢὣὫὰὲὴὶὸὺὼῂῒῢῸ"
|
381
|
+
)
|
382
|
+
end
|
383
|
+
if n then s=s.unicode_normalize(:nfc) end
|
384
|
+
return @@prep_add_grave_to_greek.apply(s)
|
385
|
+
end
|
386
|
+
|
387
|
+
def Tr.add_acute_to_greek(s,n:false)
|
388
|
+
if @@prep_add_acute_to_greek.nil? then
|
389
|
+
@@prep_add_acute_to_greek =
|
390
|
+
Tinycus::Tr.new(
|
391
|
+
"AEIOUaeiouἀἁἈἐἑἠἡἩἰἱὀὁὈὉὑὠὡὩαεηιουωῃῳϊϋΟ",
|
392
|
+
"ÁÉÍÓÚáéíóúἄἅἌἔἕἤἥἭἴἵὄὅὌὍὕὤὥὭάέήίόύώῄῴΐΰΌ"
|
393
|
+
)
|
394
|
+
end
|
395
|
+
if n then s=s.unicode_normalize(:nfc) end
|
396
|
+
return @@prep_add_acute_to_greek.apply(s)
|
397
|
+
end
|
398
|
+
|
399
|
+
def Tr.remove_diar_from_greek(s,n:false)
|
400
|
+
# Can't do caps with diaeresis, since they only exist as combining characters.
|
401
|
+
if @@prep_remove_diar_from_greek.nil? then
|
402
|
+
@@prep_remove_diar_from_greek =
|
403
|
+
Tinycus::Tr.new(
|
404
|
+
"ϊΐῒῗϋΰῢ",
|
405
|
+
"ιίὶῖυύὺ"
|
406
|
+
)
|
407
|
+
end
|
408
|
+
if n then s=s.unicode_normalize(:nfc) end
|
409
|
+
return @@prep_remove_diar_from_greek.apply(s)
|
410
|
+
end
|
411
|
+
|
412
|
+
def Tr.add_diar_to_greek(s,n:false)
|
413
|
+
# Can't do caps with diaeresis, since they only exist as combining characters.
|
414
|
+
if @@prep_add_diar_to_greek.nil? then
|
415
|
+
@@prep_add_diar_to_greek =
|
416
|
+
Tinycus::Tr.new(
|
417
|
+
"ιίὶῖυύὺ",
|
418
|
+
"ϊΐῒῗϋΰῢ"
|
419
|
+
)
|
420
|
+
end
|
421
|
+
if n then s=s.unicode_normalize(:nfc) end
|
422
|
+
return @@prep_add_diar_to_greek.apply(s)
|
423
|
+
end
|
424
|
+
|
425
|
+
def Tr.remove_breathing_from_greek(s,n:false)
|
426
|
+
if @@prep_remove_breathing_from_greek.nil? then
|
427
|
+
@@prep_remove_breathing_from_greek =
|
428
|
+
Tinycus::Tr.new(
|
429
|
+
"ἄἌἈἈἀἈἁἉἂἊἅἍἆἉἉἃἋᾇἔἜἑἙἐἘἕἝἘἘἙἙἓἛἒἚἣἫἡἩἠἨἦἥἭἢἪἤἬᾔἬΙἧᾗἨἨᾕἭΙἩἩᾐἨΙᾖᾑἩΙἰἸἱἹἴἼἶἳἻἵἽἷἸἸἲἺἹἹὃὋὄὌὀὈὈὈὅὍὁὉὉὉῥῬῬῬῤὐὗὕὝὑὙὖὔὓὛὙὙὥὭὣὫὤὬᾤὬΙὠὨὦᾧὡὩὧᾠὨΙὨὨὢὪᾦὩὩ",
|
430
|
+
"άΆΑΑαΑαΑὰᾺάΆᾶΑΑὰᾺᾷέΈεΕεΕέΈΕΕΕΕὲῈὲῈὴῊηΗηΗῆήΉὴῊήΉῄΉΙῆῇΗΗῄΉΙΗΗῃΗΙῇῃΗΙιΙιΙίΊῖὶῚίΊῖΙΙὶῚΙΙὸῸόΌοΟΟΟόΌοΟΟΟρΡΡΡρυῦύΎυΥῦύὺῪΥΥώΏὼῺώΏῴΏΙωΩῶῷωΩῶῳΩΙΩΩὼῺῷΩΩ"
|
431
|
+
)
|
432
|
+
end
|
433
|
+
if n then s=s.unicode_normalize(:nfc) end
|
434
|
+
return @@prep_remove_breathing_from_greek.apply(s)
|
435
|
+
end
|
436
|
+
|
437
|
+
def Tr.remove_accents_from_greek(s,n:false)
|
438
|
+
if @@prep_remove_accents_from_greek.nil? then
|
439
|
+
@@prep_remove_accents_from_greek = Tr.remove_accents('el')
|
440
|
+
end
|
441
|
+
if n then s=s.unicode_normalize(:nfc) end
|
442
|
+
return @@prep_remove_accents_from_greek.apply(s)
|
443
|
+
end
|
444
|
+
|
445
|
+
def Tr.remove_macrons_and_breves(s)
|
446
|
+
# This can't be implemented using my fast method, because most of these are multi-codepoint sequences (a base letter plus combining marks) rather than single precomposed characters.
|
447
|
+
if !(s.kind_of?(String)) then return s end
|
448
|
+
# ...convenience feature for stuff like parsing json data, which may include integers. Won't work for arrays containing strings.
|
449
|
+
# ---
|
450
|
+
# Letters with both a macron and a breve. The ruby script to generate this is in the commented out code below.
|
451
|
+
if s=~/ᾱ̆|ῑ̆|ῡ̆|Ᾱ̆|Ῑ̆|Ῡ̆/ then
|
452
|
+
s = s.gsub(/ᾱ̆/,'α')
|
453
|
+
s = s.gsub(/ῑ̆/,'ι')
|
454
|
+
s = s.gsub(/ῡ̆/,'υ')
|
455
|
+
s = s.gsub(/Ᾱ̆/,'Α')
|
456
|
+
s = s.gsub(/Ῑ̆/,'Ι')
|
457
|
+
s = s.gsub(/Ῡ̆/,'Υ')
|
458
|
+
end
|
459
|
+
# ---
|
460
|
+
s = s.gsub(/ϊ̄/,'ϊ') # iota with macron and diaresis; happens in Cunliffe for ἀϊκη, ὠΐετο, ὁμοιΐου, ὁμοίϊος
|
461
|
+
s = s.gsub(/ΐ̄/,'ΐ') # iota with macron, diaresis, and acute; happens in Cunliffe for ὀΐομαι
|
462
|
+
# ---
|
463
|
+
# The following is kludgy, probably not the right way to handle this. Should this just be in the loop at the end?
|
464
|
+
s = s.gsub(/ῐ/,"ῐ")
|
465
|
+
# ῐ = iota + combining breve
|
466
|
+
# ῐ = iota with vrachy
|
467
|
+
s = s.gsub(/ῑ/,"ι")
|
468
|
+
s = s.gsub(/ᾰ/,"α")
|
469
|
+
s = s.gsub(/ᾱ̆/,"α") # alpha with both a macron and a breve!?
|
470
|
+
# ---
|
471
|
+
s = Tinycus::Util.safe_normalize(s)
|
472
|
+
s = s.tr("āēīōūӯ","aeiouy") # latin
|
473
|
+
s = s.tr("ᾰᾱᾸᾹῐῑῘῙῠῡῨῩ","ααΑΑιιΙΙυυΥΥ")
|
474
|
+
# Accent combined with macron. The monospaced fonts I'm using for coding display these incorrectly, and I also don't know how to type them.
|
475
|
+
# Furthermore, these seem to be represented as multiple characters, so that tr won't work. The following will be slow on short strings,
|
476
|
+
# but should perform well on long ones.
|
477
|
+
# The following isn't really an exhaustive list of vowels.
|
478
|
+
"άίύὰὶὺΆΊΎᾺῚῪἀἐἰὀὐἠὠἁἑἱὁὑἡὡἄἔἴὄὔἤὤἂἒἲὂὒἢὢἅἕἵὅὕἥὥἃἓἳὃὓἣὣΐῒ".chars.each { |c|
|
479
|
+
[772,774].each { |combining| # 772=combining macron, 774=combining breve (773=combining overline, presumably used for math)
|
480
|
+
m = [c.ord, combining].pack("U*") # is not a single character
|
481
|
+
s = s.gsub(/#{m}/,c)
|
482
|
+
}
|
483
|
+
}
|
484
|
+
# ---
|
485
|
+
s = s.unicode_normalize(:nfc) # found empirically that this was necessary, don't remove
|
486
|
+
# ---
|
487
|
+
return s
|
488
|
+
=begin
|
489
|
+
--------------------------------------------------------
|
490
|
+
m = [0x1fb1,0x1FD1,0x1fe1]
|
491
|
+
a = m.map { |u| [u, 0x0306].pack('U*') }
|
492
|
+
a = a + a.map { |s| s.upcase }
|
493
|
+
i=0
|
494
|
+
a.each { |c|
|
495
|
+
cc = ['α','ι','υ','Α','Ι','Υ'][i]
|
496
|
+
i += 1
|
497
|
+
print " s = s.gsub(/#{c}/,'#{cc}')\n"
|
498
|
+
}
|
499
|
+
--------------------------------------------------------
|
500
|
+
=end
|
501
|
+
end
|
502
|
+
|
503
|
+
def Tr.macronized_to_underbar_style(s)
|
504
|
+
# Changes a macronized string to one that looks like this: ἕννυ_μι.
|
505
|
+
# The lists in the regexes are generated by the commented-out scripts below, and are not actually totally comprehensive.
|
506
|
+
# We don't handle grave and circumflex accents, but those don't occur in dictionary headers with macrons.
|
507
|
+
# For an IfMows object in Ifthimos, this can be done using stringify(macronization:'underbar').
|
508
|
+
# ---
|
509
|
+
x = s.clone
|
510
|
+
x = x.gsub(/(ϊ̄)/) { "#{Tinycus::Tr.remove_macrons_and_breves($1)}_" } # iota with diaresis and macron, occurs in ἀϊκή
|
511
|
+
# First handle letters that have both a macron and a breve, treating them as if they weren't macronized at all:
|
512
|
+
x = x.gsub(/(ᾱ̆|ῑ̆|ῡ̆|Ᾱ̆|Ῑ̆|Ῡ̆)/) { Tinycus::Tr.remove_macrons_and_breves($1) }
|
513
|
+
# Next handle the ones that have macrons only:
|
514
|
+
x = x.gsub(/(ᾱ|ῑ|ῡ|ά̄|ί̄|ύ̄|ἀ̄|ἁ̄|ἄ̄|ἅ̄|ἰ̄|ἱ̄|ἴ̄|ἵ̄|ὐ̄|ὑ̄|ὔ̄|ὕ̄|Ᾱ|Ῑ|Ῡ|Ά̄|Ί̄|Ύ̄|Ἀ̄|Ἁ̄|Ἄ̄|Ἅ̄|Ἰ̄|Ἱ̄|Ἴ̄|Ἵ̄|Υ̓̄|Ὑ̄|Υ̓́̄|Ὕ̄)/) { "#{Tinycus::Tr.remove_macrons_and_breves($1)}_" }
|
515
|
+
# Finally, remove all breves:
|
516
|
+
return Tinycus::Tr.remove_macrons_and_breves(x)
|
517
|
+
=begin
|
518
|
+
------------------------------------
|
519
|
+
m = [0x1fb1,0x1FD1,0x1fe1]
|
520
|
+
a = m.map { |u| [u, 0x0306].pack('U*') }
|
521
|
+
a = a + a.map { |s| s.upcase }
|
522
|
+
print a.join('|'),"--\n"
|
523
|
+
------------------------------------
|
524
|
+
a = ['ᾱ','ῑ','ῡ']
|
525
|
+
[
|
526
|
+
0x03ac,0x03af,0x03cd
|
527
|
+
].each { |i|
|
528
|
+
x = [i, 0x0304].pack('U*')
|
529
|
+
a.push(x)
|
530
|
+
}
|
531
|
+
[
|
532
|
+
0x1f00,0x1f30,0x1f50
|
533
|
+
].each { |i|
|
534
|
+
[0,1,4,5].each { |j|
|
535
|
+
x = [i+j, 0x0304].pack('U*')
|
536
|
+
a.push(x)
|
537
|
+
}
|
538
|
+
}
|
539
|
+
a = a + a.map { |s| s.upcase }
|
540
|
+
print a.join('|'),"--\n"
|
541
|
+
------------------------------------
|
542
|
+
=end
|
543
|
+
end
|
544
|
+
|
545
|
+
def Tr.get_greek_collation_tr
|
546
|
+
if @@prep_greek_to_collation_form.nil? then
|
547
|
+
@@prep_greek_to_collation_form = Tr.collation_form('el')
|
548
|
+
end
|
549
|
+
return @@prep_greek_to_collation_form
|
550
|
+
end
|
551
|
+
|
552
|
+
def Tr.greek_to_collation_form(s,n:false)
|
553
|
+
if @@prep_greek_to_collation_form.nil? then
|
554
|
+
@@prep_greek_to_collation_form = Tr.collation_form('el')
|
555
|
+
end
|
556
|
+
if n then s=s.unicode_normalize(:nfc) end
|
557
|
+
return @@prep_greek_to_collation_form.apply(s)
|
558
|
+
end
|
559
|
+
|
560
|
+
def Tr.remove_accents(locale)
|
561
|
+
# Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method.
|
562
|
+
# The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
|
563
|
+
# locale will also remove most accents and macrons from Latin characters, but will miss some cases like Czech, and will not handle Cyrillic.
|
564
|
+
t = {
|
565
|
+
"el"=>[
|
566
|
+
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸῤᾆᾄᾂᾁᾇᾅᾃᾍᾡ",
|
567
|
+
"AAAAAAÆCEEEEIIIINOOOOOOUUUUYaaaaaaæceeeeiiiinoooooouuuuyyΑΕΙΟιαεηιυιυουωoαααααααΑΑΑΑΑΑεεεεεεΕΕΕΕηηηηηηηηΗΗΗΗΗΗΗιιιιιιιιΙΙΙΙΙοοοοοοΟΟΟΟΟΟυυυυυυυΥΥωωωωωωωωΩΩΩΩΩΩΩαεηιουωηηηηηηηωωωωααααααΑΑηηηηηιιιιιΙΙυυυρυΥΥΡωωωωΟραααααααΑω"
|
568
|
+
]
|
569
|
+
}
|
570
|
+
tables = t[locale]
|
571
|
+
if tables.nil? then raise "unknown locale: #{locale}, known locales are: #{t.keys.join(' ')}" end
|
572
|
+
result = Tinycus::Tr.new(tables[0],tables[1])
|
573
|
+
return result
|
574
|
+
end
|
575
|
+
|
576
|
+
def Tr.collation_form(locale)
|
577
|
+
# Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method. Gives a form that
|
578
|
+
# can be alphabetized properly.
|
579
|
+
# The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
|
580
|
+
# locale will also produce correct results for most Latin-script words, will miss some cases like Czech, and will not handle Cyrillic.
|
581
|
+
t = {
|
582
|
+
"el"=>[
|
583
|
+
"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸῤᾆᾄᾂᾁᾇᾅᾃΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩςᾍ",
|
584
|
+
"aaaaaaæceeeeiiiinoooooouuuuyaaaaaaæceeeeiiiinoooooouuuuyyαειοιαεηιυιυουωoαααααααααααααεεεεεεεεεεηηηηηηηηηηηηηηηιιιιιιιιιιιιιοοοοοοοοοοοουυυυυυυυυωωωωωωωωωωωωωωωαεηιουωηηηηηηηωωωωααααααααηηηηηιιιιιιιυυυρυυυρωωωωορααααααααβγδεζηθικλμνξοπρστυφχψωσΑ"
|
585
|
+
]
|
586
|
+
}
|
587
|
+
tables = t[locale]
|
588
|
+
if tables.nil? then raise "unknown locale: #{locale}, known locales are: #{t.keys.join(' ')}" end
|
589
|
+
result = Tinycus::Tr.new(tables[0],tables[1])
|
590
|
+
return result
|
591
|
+
end
|
592
|
+
|
593
|
+
def Tr.add_breathing_to_character(c,what)
|
594
|
+
plain,d = Tinycus.disassemble_greek_char(c)
|
595
|
+
d['breathing'] = what
|
596
|
+
return Tinycus.assemble_greek_char(plain,d)
|
597
|
+
end
|
598
|
+
|
599
|
+
|
600
|
+
|
601
|
+
def Tr.run_tests
|
602
|
+
# to execute this, do a "make test_tr"
|
603
|
+
['el'].each { |locale|
|
604
|
+
tr = Tinycus::Tr.remove_accents(locale)
|
605
|
+
if locale=='el' then alphabet=Script.alphabet('greek')+Script.alphabet('latin')+'Ææ' else alphabet=nil end
|
606
|
+
tr.self_test(alphabet)
|
607
|
+
print "Passed self-test on locale #{locale}.\n"
|
608
|
+
}
|
609
|
+
end
|
610
|
+
|
611
|
+
|
612
|
+
end # class Tr
|
613
|
+
|
614
|
+
class Tinycus::Script
|
615
|
+
|
616
|
+
def Script.alphabet(script,c:'both')
|
617
|
+
# Script can be 'latin', 'greek', or 'hebrew'.
|
618
|
+
# C can be both, lowercase, or uppercase.
|
619
|
+
# For scripts that don't have case, c is ignored.
|
620
|
+
t = {
|
621
|
+
'latin'=>{'has_case'=>true},
|
622
|
+
'greek'=>{'has_case'=>true},
|
623
|
+
'hebrew'=>{'has_case'=>false},
|
624
|
+
}
|
625
|
+
data = t[script]
|
626
|
+
if data.nil? then raise "unknown script: #{script}, possible values are: #{t.keys.join(' ')}" end
|
627
|
+
has_case = data['has_case']
|
628
|
+
if !has_case then return Script.alphabet_helper(script,nil) end
|
629
|
+
if c=='both' then return Script.alphabet(script,c:"lowercase")+Script.alphabet(script,c:"uppercase") end
|
630
|
+
# If we fall through to here, then we're doing a single case of an alphabet that has two cases.
|
631
|
+
if c=='lowercase' then return Script.alphabet_helper(script,true) end
|
632
|
+
if c=='uppercase' then return Script.alphabet_helper(script,false).upcase end
|
633
|
+
die("illegal value of c=#{c}, must be both, lowercase, or uppercase")
|
634
|
+
end
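# Usage sketch (added for illustration):
#   Tinycus::Script.alphabet('greek',c:'lowercase')  # => expected "αβγδεζηθικλμνξοπρστυφχψως"
#   Tinycus::Script.alphabet('greek',c:'uppercase')  # => expected "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"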
|
635
|
+
|
636
|
+
def Script.alphabet_helper(script,include_lc_only_chars)
|
637
|
+
if script=='latin' then return 'abcdefghijklmnopqrstuvwxyz' end
|
638
|
+
if script=='greek' then
|
639
|
+
result = 'αβγδεζηθικλμνξοπρστυφχψω'
|
640
|
+
if include_lc_only_chars then result = result+'ς' end
|
641
|
+
return result.unicode_normalize(:nfc)
|
642
|
+
end
|
643
|
+
if script=='hebrew' then return 'אבגדהוזחטילמנסעפצקרשתםןףץ'.unicode_normalize(:nfc) end
|
644
|
+
# ... Word-final forms are all at the end.
|
645
|
+
# To edit the Hebrew list, use mg, not emacs. Emacs tries to be smart about RTL but freaks out and gets it
|
646
|
+
# wrong on a line that mixes RTL and LTR.
|
647
|
+
raise "no alphabet available for script #{script}"
|
648
|
+
end
|
649
|
+
|
650
|
+
end
|
651
|
+
|
652
|
+
class Tinycus::Cleanup
|
653
|
+
|
654
|
+
def Cleanup.clean_up_grotty_greek(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
|
655
|
+
# Designed for external data sources that can have all kinds of nasty crap in them. Slow, thorough, silent, and brutal.
|
656
|
+
a = s.split(/(\s+)/) # returns an array in which even indices are words, odd indices are whitespace
|
657
|
+
b = []
|
658
|
+
0.upto(a.length-1) { |i|
|
659
|
+
w = a[i]
|
660
|
+
if i%2==0 then
|
661
|
+
looks_greek = true
|
662
|
+
if w=~/[a-zA-Z]/ then looks_greek=false end # for speed and reliability; if it contains Latin letters, it shouldn't be a greek word
|
663
|
+
if looks_greek && !(w=~/[α-ωΑ-Ως]/) && \
|
664
|
+
!(Tinycus::Tr.remove_accents_from_greek(w.unicode_normalize(:nfc))=~/[α-ωΑ-Ως]/) then looks_greek=false end
|
665
|
+
if looks_greek then
|
666
|
+
w=Cleanup.clean_up_grotty_greek_one_word(w,allow_latin:allow_latin,clean_perseus:clean_perseus,standardize_punctuation:standardize_punctuation)
|
667
|
+
end
|
668
|
+
end
|
669
|
+
b.push(w)
|
670
|
+
}
|
671
|
+
s = b.join('')
|
672
|
+
if standardize_punctuation then
|
673
|
+
s = Cleanup.standardize_greek_punctuation(s)
|
674
|
+
end
|
675
|
+
s = s.unicode_normalize(:nfc) # does stuff like changing deprecated 8059 (upsilon with oxia) to 973 (upsilon with tonos)
|
676
|
+
return s
|
677
|
+
end
|
678
|
+
|
679
|
+
def Cleanup.clean_up_grotty_greek_one_word(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
|
680
|
+
# This works on a single word.
|
681
|
+
s = s.unicode_normalize(:nfc)
|
682
|
+
s = Cleanup.clean_up_greek_combining_characters(s,allow_latin:allow_latin)
|
683
|
+
# In Perseus's Polybius, they have bracketed text sometimes. In their system, this should probably be a separate punctuation token.
|
684
|
+
if clean_perseus then
|
685
|
+
s = s.sub(/\[/,'') # Hesiod, δεσμὸ]ν; also occurs sometimes like [word]
|
686
|
+
s = s.sub(/\]/,'')
|
687
|
+
s = s.sub(/^\(/,'')
|
688
|
+
s = s.sub(/\)$/,'')
|
689
|
+
s = s.sub(/\}/,'') # Hesiod, Ι{ππώ
|
690
|
+
s = s.sub(/\{/,'')
|
691
|
+
s = s.sub(/&?απο[σς];/,"᾽") # software bug in perseus: the entity &apos; with its Latin letters transliterated into Greek
|
692
|
+
s.sub!(/\-$/,"᾽") # e.g., in Thucydides, Perseus has δοκεῖ δέ μοι, οὐδὲ τοὔνομα with τοὔνομα written as τ- οὔνομα
|
693
|
+
s.sub!(/\-/,'') # e.g., in Thucydides, Perseus has ἀντίσχουσαν lemmatized as ἀντί-ἴσχω
|
694
|
+
end
|
695
|
+
s = s.sub(/σ$/,'ς') # this won't work if there's trailing punct; what is the right way to spell a word that ends with a sigma, then an elision mark?
|
696
|
+
s = Cleanup.clean_up_greek_beta_code(s)
|
697
|
+
greek_koronis = [8125].pack('U')
|
698
|
+
if s[0]==greek_koronis then
|
699
|
+
s = s[1..-1] # this happens in perseus for the lemma ἀθήνη, which they have encoded as 787 7936 952 ..., i.e., the
|
700
|
+
# breathing mark is there twice, once as a combining comma above and once as part of the composed character ἀ
|
701
|
+
# https://github.com/PerseusDL/treebank_data/issues/37
|
702
|
+
end
|
703
|
+
s = Cleanup.standardize_greek_punctuation(s) if standardize_punctuation
|
704
|
+
if !allow_latin && s=~/[^[:alpha:]᾽[0-9]\?;,.··«»’᾽—“”]/ then raise "word #{s} contains unexpected characters; unicode=#{s.chars.map { |x| x.ord}}" end
|
705
|
+
return s
|
706
|
+
end
|
707
|
+
|
708
|
+
def Cleanup.clean_up_greek_combining_characters(s,allow_latin:false)
|
709
|
+
combining_comma_above = [787].pack('U')
|
710
|
+
combining_acute_accent = [769].pack('U')
|
711
|
+
greek_koronis = [8125].pack('U')
|
712
|
+
# s = s.sub(/α#{combining_comma_above}#{combining_acute_accent}/,'ἄ') # my error introduced in Lemming; no longer needed
|
713
|
+
s = s.sub(/#{combining_comma_above}/,greek_koronis)
|
714
|
+
# ... mistaken use of combining comma above rather than the spacing version
|
715
|
+
# https://github.com/PerseusDL/treebank_data/issues/31
|
716
|
+
s = s.sub(/#{combining_acute_accent}/,'') # occurs once in Perseus's Plutarch, in a place where it's redundant
|
717
|
+
s = s.sub(/#{[788].pack('U')}/,'') # combining reversed comma above; occurs once in Perseus's Polybius, where it's on a capital Ι
|
718
|
+
# seeming one-off errors in perseus:
|
719
|
+
s2 = s
|
720
|
+
s2 = s2.sub(/#{[8158, 7973].pack('U')}/,"ἥ") # dasia and oxia combining char with eta
|
721
|
+
s2 = s2.sub(/#{[8142, 7940].pack('U')}/,"ἄ") # psili and oxia combining char with alpha
|
722
|
+
s2 = s2.sub(/#{[8142, 7988].pack('U')}/,"ἴ")
|
723
|
+
s2 = s2.sub(/ἄἄ/,'ἄ') # why is this necessary...??
|
724
|
+
s2 = s2.sub(/ἥἥ/,'ἥ') # why is this necessary...??
|
725
|
+
s2 = s2.sub(/#{[769].pack('U')}([μτ])/) {$1} # accent on a mu or tau, obvious error
|
726
|
+
s2 = s2.sub(/#{[769].pack('U')}ε/) {'έ'}
|
727
|
+
s2 = s2.sub(/#{[180].pack('U')}([κ])/) {$1} # accent on a kappa, obvious error
|
728
|
+
s2 = s2.sub(/#{[834].pack('U')}/,'') # what the heck is this?
|
729
|
+
s2 = s2.sub(/ʽ([ἁἑἱὁὑἡὡ])/) {$1} # redundant rough breathing mark
|
730
|
+
# another repeating error:
|
731
|
+
s2 = s2.sub(/(?<=[[:alpha:]][[:alpha:]])([ἀἐἰὀὐἠὠ])(?![[:alpha:]])/) { $1.tr("ἀἐἰὀὐἠὠ","αειουηω")+"᾽" }
|
732
|
+
# ... smooth breathing on the last character of a long word; this is a mistake in representation of elision
|
733
|
+
# https://github.com/PerseusDL/treebank_data/issues/31
|
734
|
+
s = s2
|
735
|
+
return s
|
736
|
+
end
|
737
|
+
|
738
|
+
def Cleanup.clean_up_greek_beta_code(s)
|
739
|
+
# This was for when I mistakenly used old beta code version of project perseus.
|
740
|
+
# Even with perseus 2.1, some stuff seems to come through that looks like beta code, e.g., ἀργει~ος.
|
741
|
+
# https://github.com/PerseusDL/treebank_data/issues/30
|
742
|
+
s = s.sub(/\((.)/) { $1.tr("αειουηω","ἁἑἱὁὑἡὡ") }
|
743
|
+
s = s.sub(/\)(.)/) { $1.tr("αειουηω","ἀἐἰὀὐἠὠ") }
|
744
|
+
s = s.sub(/(.)~/) { $1.tr("αιυηω","ᾶῖῦῆῶ") }
|
745
|
+
s = s.sub(/\|/,'ϊ')
|
746
|
+
s = s.sub(/\/(.)/) { $1.tr("αειουηω","άέίόύήώ") }
|
747
|
+
s = s.sub(/&θυοτ;/,'')
|
748
|
+
s = s.sub(/θεοισ=ν/,'θεοῖσιν')
|
749
|
+
s = s.sub(/ὀ=νοψ1/,'οἴνοπα1')
|
750
|
+
s = s.sub(/π=ας/,'πᾶς')
|
751
|
+
return s
|
752
|
+
end
|
753
|
+
|
754
|
+
def Cleanup.standardize_greek_punctuation(s)
|
755
|
+
# Works on any string, doesn't have to be a single word. Standardize elision character and middle dot/ano teleia.
|
756
|
+
# Perseus and Monro/Allen write ρ with breathing mark instead of ρ᾽ when there's elision:
|
757
|
+
s = s.gsub(/(?<=[[:alpha:]])[ῤῥ](?![[:alpha:]])/,'ρ᾽')
|
758
|
+
# ... Note that we do need to reinsert the breathing mark, or else we lose the info needed to do accurate lemmatization. Cf. Spelling module.
|
759
|
+
# Wikisource has ῤῥ in the middle of words, e.g., χείμαῤῥοι, which OCT and Perseus don't have:
|
760
|
+
s = s.gsub(/(?<=[[:alpha:]])ῤῥ(?=[[:alpha:]])/,'ρρ')
|
761
|
+
# Standardize the elision character:
|
762
|
+
s = s.gsub(/[᾽’'](?![[:alpha:]])/,'᾽')
|
763
|
+
# ... There are other possibilities (see comments in ransom/contains_greek_elision), but these should already have been taken care of in Lemming.
|
764
|
+
s = s.gsub(/#{[183].pack('U')}/,[903].pack('U')) # ano teleia has two forms, B7=183 and 387=903; GFS Porson and Olga only have the latter code point
|
765
|
+
return s
|
766
|
+
end
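# Usage sketch (added for illustration):
#   Tinycus::Cleanup.standardize_greek_punctuation("ἀλλ’ ἄγε")  # => expected "ἀλλ᾽ ἄγε" (apostrophe replaced by koronis)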
|
767
|
+
|
768
|
+
end
|
769
|
+
|
770
|
+
class Tinycus::Util
|
771
|
+
def Util.is_valid_utf8(s)
|
772
|
+
return s.clone.force_encoding("UTF-8").valid_encoding?
|
773
|
+
end
|
774
|
+
|
775
|
+
def Util.explain_how_strings_differ(x,y)
|
776
|
+
result = []
|
777
|
+
result.push("comparing '#{x}' to '#{y}'")
|
778
|
+
if x==y then
|
779
|
+
result.push(" strings are equal")
|
780
|
+
else
|
781
|
+
if x.length!=y.length then
|
782
|
+
result.push(" strings differ in length, #{x.length} and #{y.length}")
|
783
|
+
else
|
784
|
+
0.upto(x.length-1) { |i|
|
785
|
+
if x[i]!=y[i] then
|
786
|
+
result.push(" strings differ at position #{i}, #{x[i]}!=#{y[i]}, codes are #{x[i].ord} and #{y[i].ord}")
|
787
|
+
end
|
788
|
+
}
|
789
|
+
end
|
790
|
+
end
|
791
|
+
return result.map { |line| line+"\n" }.join('')
|
792
|
+
end
|
793
|
+
|
794
|
+
def Util.explain_chars_in_string(s)
|
795
|
+
result = []
|
796
|
+
result.push("explaining the characters in the string '#{s}':")
|
797
|
+
0.upto(s.length-1) { |i|
|
798
|
+
result.push(" [#{i}] = '#{s[i]}', code = #{s[i].ord}")
|
799
|
+
}
|
800
|
+
return result.map { |line| line+"\n" }.join('')
|
801
|
+
end
|
802
|
+
|
803
|
+
def Util.longest_common_initial_substring(aa,bb,script)
|
804
|
+
if aa.length<bb.length then return longest_common_initial_substring(bb,aa,script) end
|
805
|
+
a = aa
|
806
|
+
b = bb
|
807
|
+
if script=='greek' then a=aa.gsub(/ς$/,'σ'); b=bb.gsub(/ς$/,'σ') end
|
808
|
+
# From here on, we're guaranteed that a is at least as long as b and neither ends in a final sigma (ς).
|
809
|
+
best = ''
|
810
|
+
0.upto(b.length-1) { |l|
|
811
|
+
if a[0..l]==b[0..l] then best=a[0..l] end
|
812
|
+
}
|
813
|
+
if script=='greek' then best=best.sub(/σ$/,'ς') end
|
814
|
+
return best
|
815
|
+
end
|
816
|
+
|
817
|
+
def Util.words(s)
|
818
|
+
# Splits a string into an array of words, eliminating whitespace and interword punctuation.
|
819
|
+
# Don't use this for making word-by-word running hashes; that's what split_string_at_whitespace() is for.
|
820
|
+
# Knows about apostrophe for English and two elision characters for Greek.
|
821
|
+
# For a better-engineered version of this, see genos.string_to_words().
|
822
|
+
return s.scan(/[[:alpha:]'’᾽]+/)
|
823
|
+
end
|
824
|
+
|
825
|
+
def Util.strip_whitespace(s)
|
826
|
+
# strips leading and trailing whitespace
|
827
|
+
return s.sub(/^\s+/,'').sub(/\s+$/,'')
|
828
|
+
end
|
829
|
+
|
830
|
+
def Util.split_string_at_whitespace(text)
|
831
|
+
# Returns an array like [['The',' '],['quick',' '],...]. Every element is guaranteed to be a two-element list.
|
832
|
+
# In the final pair, the whitespace will be a null string if the text doesn't end with whitespace.
|
833
|
+
# This is basically meant for simple, reproducible word-by-word hashing (WhereAt.auto_hash), not for
|
834
|
+
# human-readable text processing, so don't use it for other purposes or fiddle with it to make it work
|
835
|
+
# for that purpose. For human-readable extraction of words, without punctuation, see words() above.
|
836
|
+
a = text.split(/(\s+)/)
|
837
|
+
if a.length%2==1 then a.push('') end
|
838
|
+
b = []
|
839
|
+
0.upto(a.length/2-1) { |i| b.push([a[2*i],a[2*i+1]]) }
|
840
|
+
return b
|
841
|
+
end
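# Usage sketch (added for illustration):
#   Tinycus::Util.split_string_at_whitespace("The quick fox")
#   # => expected [["The"," "],["quick"," "],["fox",""]]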
|
842
|
+
|
843
|
+
def Util.split_string_into_paragraphs(text)
|
844
|
+
# Returns a list like ["This is a paragraph.","\n\n","Another paragraph.","\n \n\t\n",...].
|
845
|
+
# Guaranteed to have even length, so final element may be a null string.
|
846
|
+
# Like split_string_at_whitespace(), this is meant to be used for reproducible creation of hashes.
|
847
|
+
paras_and_delimiters = text.split(/(\s*(?:\n[ \t]*){2,}\s*)/) # even indices=paragraphs, odd=delimiters
|
848
|
+
if paras_and_delimiters.length%2==1 then paras_and_delimiters.push('') end # input doesn't end with a delimiter
|
849
|
+
return paras_and_delimiters
|
850
|
+
end
|
851
|
+
|
852
|
+
def Util.substr(x,i,len)
|
853
|
+
# Basically returns x[i..(i+len-1)], but doesn't do screwy stuff in cases like i=0, len=0.
|
854
|
+
result = ''
|
855
|
+
i.upto(i+len-1) { |m|
|
856
|
+
result = result+x[m]
|
857
|
+
}
|
858
|
+
return result
|
859
|
+
end
|
860
|
+
|
861
|
+
def Util.texify_quotes(s)
|
862
|
+
s = s.gsub(/((?<=[a-zA-Z]))'(?=[a-zA-Z])/,'__ENGLISH_INTERNAL_APOSTROPHE__')
|
863
|
+
# We don't want [[:alpha:]], because Greek doesn't use mid-word apostrophes, and we don't want to get confused by cases where elision
|
864
|
+
# was marked by an ASCII apostrophe.
|
865
|
+
# Handle nested quotes, working from the inside out.
|
866
|
+
1.upto(3) { |i| # handle up to three levels
|
867
|
+
[[%q('),'SINGLE'],[%q("),'DOUBLE']].each { |x|
|
868
|
+
char,kind = x
|
869
|
+
s = s.gsub(/(?<![[:alpha:]])#{char}([^'"]+)#{char}(?![[:alpha:]])/) {"__OPEN_#{kind}_QUOTES__#{$1}__CLOSE_#{kind}_QUOTES__"}
|
870
|
+
# ... negative lookbehind and negative lookahead help to ensure we don't get confused
|
871
|
+
}
|
872
|
+
}
|
873
|
+
[['__OPEN_SINGLE_QUOTES__',%q(`)], ['__CLOSE_SINGLE_QUOTES__',%q(')],
|
874
|
+
['__OPEN_DOUBLE_QUOTES__',%q(``)], ['__CLOSE_DOUBLE_QUOTES__',%q('')],
|
875
|
+
['__ENGLISH_INTERNAL_APOSTROPHE__',%q(')] ].each { |x|
|
876
|
+
marker,replace_with = x
|
877
|
+
s = s.gsub(/#{marker}/,replace_with)
|
878
|
+
}
|
879
|
+
return s
|
880
|
+
end
|
881
|
+
|
882
|
+
def Util.canonicalize_greek_word(w,n:false)
|
883
|
+
# Works on a single word, not an entire string.
|
884
|
+
# Is designed so that calling it on a Latin word is fast and harmless.
|
885
|
+
# The n argument has the same definition as in Tinycus::Tr.remove_accents_from_greek().
|
886
|
+
if !looks_greek(w) then return w end # is fast on Latin script
|
887
|
+
if n then w = w.unicode_normalize(:nfc) end
|
888
|
+
w = to_single_accent(w)
|
889
|
+
w = Tinycus::Cleanup.standardize_greek_punctuation(w)
|
890
|
+
return w
|
891
|
+
end
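# Usage sketch (added for illustration):
#   Tinycus::Util.canonicalize_greek_word("χεῖράς")  # => expected "χεῖρας" (the extra acute is dropped)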
|
892
|
+
|
893
|
+
def Util.looks_greek(w,depth:0)
|
894
|
+
# designed to be fast
|
895
|
+
if w=='' then return false end
|
896
|
+
if w=~/[a-zA-Z]/ then return false end
|
897
|
+
if w=~/[α-ωΑ-Ως]/ then return true end
|
898
|
+
if depth>=2 then return false end # happens if the string contains only punctuation, etc.
|
899
|
+
return looks_greek(Tinycus::Tr.remove_accents_from_greek(w),depth:depth+1) # slow fallback, almost never needed
|
900
|
+
end
|
901
|
+
|
902
|
+
def Util.mixes_scripts(s)
|
903
|
+
# not designed to be super fast or super accurate, just a quick check
|
904
|
+
if s=='' then return false end
|
905
|
+
if s=~/[a-zA-Z]/ then has_latin=true else has_latin=false end
|
906
|
+
if Tinycus::Tr.remove_accents_from_greek(s)=~/[α-ωΑ-Ως]/ then has_greek=true else has_greek=false end
|
907
|
+
return (has_latin && has_greek)
|
908
|
+
end
|
909
|
+
|
910
|
+
def Util.to_single_accent(w,grave_to_acute:true,n:false)
|
911
|
+
# In most cases, it's better to use canonicalize_greek_word() rather than this.
|
912
|
+
# If the word has both an acute and a grave, remove the grave. If it has only a grave, change it to an acute.
|
913
|
+
# This is used e.g. in LemmaUtil.make_inflected_form_flavored_like_lemma.
|
914
|
+
# Testing: ruby -e "require './lib/string_util'; print to_single_accent('χεῖράς')"
|
915
|
+
# The n argument has the same definition as in Tinycus::Tr.remove_accents_from_greek().
|
916
|
+
if Tinycus::Tr.remove_accents_from_greek(w,n:n)==w then return w end # for efficiency
|
917
|
+
if Tinycus::MiscGreek.has_circumflex(w) then return Tinycus::Tr.remove_acute_and_grave_from_greek(w) end
|
918
|
+
acc = []
|
919
|
+
w_no_acute_or_grave = Tinycus::Tr.remove_acute_and_grave_from_greek(w)
|
920
|
+
0.upto(w.chars.length-1) { |i|
|
921
|
+
if w_no_acute_or_grave[i]!=w[i] then acc.push(i) end
|
922
|
+
}
|
923
|
+
if acc.length>1 then
|
924
|
+
# Remove every accent but the first.
|
925
|
+
ww = w.dup
|
926
|
+
1.upto(acc.length-1) { |m|
|
927
|
+
i = acc[m]
|
928
|
+
ww[i] = Tinycus::Tr.remove_acute_and_grave_from_greek(ww[i])
|
929
|
+
}
|
930
|
+
return ww
|
931
|
+
else
|
932
|
+
if grave_to_acute then
|
933
|
+
return Tinycus::Tr.greek_grave_to_acute(w)
|
934
|
+
else
|
935
|
+
return w
|
936
|
+
end
|
937
|
+
end
|
938
|
+
end
|
939
|
+
|
940
|
+
def Util.remove_punctuation(s)
|
941
|
+
# also removes whitespace
|
942
|
+
return s.gsub(/[^[:alpha:]]/,'')
|
943
|
+
end
|
944
|
+
|
945
|
+
def Util.safe_normalize(s)
|
946
|
+
begin
|
947
|
+
return s.encode("UTF-8").unicode_normalize(:nfc)
|
948
|
+
rescue
|
949
|
+
# is probably 8-bit ascii/ISO-8859-1?
|
950
|
+
return s
|
951
|
+
end
|
952
|
+
end
|
953
|
+
|
954
|
+
|
955
|
+
def Util.lc_underbar(s)
|
956
|
+
return s.downcase.gsub(/ /,'_')
|
957
|
+
end
|
958
|
+
|
959
|
+
def Util.clean_up_greek(s,thorough:false,allow_latin:false,strip_punctuation:false)
|
960
|
+
# s is any string, can contain any script or mix of scripts, can be more than one word.
|
961
|
+
# Use the thorough option for external sources like raw Perseus xml files. This option is slow.
|
962
|
+
# The strip_punctuation option only strips punctuation that shouldn't be in a word, doesn't strip elision mark; is necessary because
|
963
|
+
# PROIEL Herodotus has a few errors where punct is included in word.
|
964
|
+
if !s.kind_of?(String) then return s end # convenience feature
|
965
|
+
if strip_punctuation then s=s.gsub(/[·,;«».]/,'') end
|
966
|
+
if thorough then
|
967
|
+
s = Tinycus::Cleanup.clean_up_grotty_greek(s,allow_latin:allow_latin) # standardizes punctuation by default
|
968
|
+
else
|
969
|
+
s = Tinycus::Cleanup.standardize_greek_punctuation(s) # Standardize elision character and middle dot/ano teleia.
|
970
|
+
end
|
971
|
+
return s
|
972
|
+
end
|
973
|
+
|
974
|
+
def Util.contains_greek_elision(s)
|
975
|
+
# The following checks are arranged in order so as to give best performance.
|
976
|
+
if !(s=~/[᾽’ῤῥ]/) then return false end
|
977
|
+
if s=~/[᾽’]/ then return true end
|
978
|
+
if s.length>=2 && s=~/[ῤῥ]$/ then return true end
|
979
|
+
return false
|
980
|
+
# the above are koronis (8125=14bd hex) and apostrophe (8217=2019 hex)
|
981
|
+
# see http://www.opoudjis.net/unicode/gkdiacritics.html
|
982
|
+
# Perseus sometimes has 787=313 hex, which is combining comma above, the non-spacing version of koronis. This seems
|
983
|
+
# to me to be a mistake on their part.
|
984
|
+
# https://github.com/PerseusDL/treebank_data/issues/31
|
985
|
+
# One could also have 700=2bc hex, spacing smooth breathing, which seems like an error, or 39=27 hex, the ascii apostrophe.
|
986
|
+
end
|
987
|
+
|
988
|
+
def Util.escape_double_quotes(s)
|
989
|
+
return s.gsub(/"/,'\\"') # escape double quotes
|
990
|
+
end
|
991
|
+
|
992
|
+
def Util.reverse_if_rtl(s)
|
993
|
+
if s=='' then return s end
|
994
|
+
if char_is_rtl(s[0]) then return Util.reverse_string(s) else return s end
|
995
|
+
end
|
996
|
+
|
997
|
+
def Util.reverse_string(s)
|
998
|
+
r = '' # accumulate into a string (starting from 0 would raise a TypeError on the first c+r)
|
999
|
+
s.chars.each { |c| r = c+r }
|
1000
|
+
return r
|
1001
|
+
end
|
1002
|
+
|
1003
|
+
def Util.console(*x)
|
1004
|
+
$stderr.print *x
|
1005
|
+
end
|
1006
|
+
|
1007
|
+
end
|
1008
|
+
|
1009
|
+
# The following table is output by scripts/generate_beta_code_tables.rb in Ifthimos.
|
1010
|
+
@@beta_code_conversion_json =
|
1011
|
+
<<-'JSON'
|
1012
|
+
[{"ἄ":"a)/","ἄ̄":"a)/&","ἄ̆":"a)/'","ἄ̄̆":"a)/&","ὰ":"a\\","ὰ̄":"a\\'","ὰ̆":"a\\'","ὰ̄̆":"a\\'","ά":"a/","ά̄":"a/&","ά̆":"a/'","ά̄̆":"a/&","Ἀ":"*)a","Ἀ̄":"*ā","Ἀ̆":"*ă","Ἀ̄̆":"*ā̆","α":"a","ᾱ":"a&","ᾰ":"a'","ᾱ̆":"a&'","ᾶ":"a=","ᾶ̄":"ā","ᾶ̆":"ă","ᾶ̄̆":"ā̆","ἀ":"a)","ἀ̄":"a)&","ἀ̆":"a)'","ἀ̄̆":"a)&","ἁ":"a(","ἁ̄":"a(&","ἁ̆":"a('","ἁ̄̆":"a(&","Ἄ":"*)/a","Ἄ̄":"*ā","Ἄ̆":"*ă","Ἄ̄̆":"*ā̆","ἂ":"a)\\","ἂ̄":"a)\\'","ἂ̆":"a)\\'","ἂ̄̆":"a)\\'","ᾷ":"a=|","ᾷ̄":"ā","ᾷ̆":"ă","ᾷ̄̆":"ā̆","Α":"*a","Ᾱ":"*a&","Ᾰ":"*a'","Ᾱ̆":"*a&'","ἅ":"a(/","ἅ̄":"a(/&","ἅ̆":"a(/'","ἅ̄̆":"a(/&","ἆ":"a)=","ἆ̄":"ā","ἆ̆":"ă","ἆ̄̆":"ā̆","ᾳ":"a|","ᾱͅ":"a|&","ᾰͅ":"a|'","ᾱ̆ͅ":"a|&'","Ἅ":"*(/a","Ἅ̄":"*ā","Ἅ̆":"*ă","Ἅ̄̆":"*ā̆","Ἁ":"*(a","Ἁ̄":"*ā","Ἁ̆":"*ă","Ἁ̄̆":"*ā̆","Ἆ":"*)=a","Ἆ̄":"*ā","Ἆ̆":"*ă","Ἆ̄̆":"*ā̆","ἃ":"a(\\","ἃ̄":"a(\\'","ἃ̆":"a(\\'","ἃ̄̆":"a(\\'","Ἂ":"*)\\a","Ἂ̄":"*ā","Ἂ̆":"*ă","Ἂ̄̆":"*ā̆","ᾇ":"a(=|","ᾇ̄":"ā","ᾇ̆":"ă","ᾇ̄̆":"ā̆","β":"b","Β":"*b","γ":"g","Γ":"*g","δ":"d","Δ":"*d","ε":"e","έ":"e/","ἔ":"e)/","ὲ":"e\\","ἑ":"e(","ἐ":"e)","ἕ":"e(/","Ἕ":"*(/e","Ἐ":"*)e","Ε":"*e","Ἑ":"*(e","Ἔ":"*)/e","ἓ":"e(\\","ἒ":"e)\\","ζ":"z","Ζ":"*z","ῆ":"h=","η":"h","ἣ":"h(\\","ἡ":"h(","ή":"h/","ὴ":"h\\","ἠ":"h)","ἦ":"h)=","ῃ":"h|","ἥ":"h(/","ἢ":"h)\\","ἤ":"h)/","ῇ":"h=|","Ἥ":"*(/h","ᾔ":"h)/|","ἧ":"h(=","ᾗ":"h(=|","Ἠ":"*)h","ῄ":"h/|","ᾕ":"h(/|","Ἡ":"*(h","Ἤ":"*)/h","ῂ":"h\\|","ᾐ":"h)|","ᾖ":"h)=|","ᾑ":"h(|","Η":"*h","Ἦ":"*)=h","Ἣ":"*(\\h","Ἧ":"*(=h","θ":"q","Θ":"*q","ι":"i","ῑ":"i&","ῐ":"i'","ῑ̆":"i&'","ϊ":"i+","ϊ̄":"i+&","ϊ̆":"ĭ","ϊ̄̆":"ī̆&","ί":"i/","ί̄":"i/&","ί̆":"i/'","ί̄̆":"i/&","ῖ":"i=","ῖ̄":"ī","ῖ̆":"ĭ","ῖ̄̆":"ī̆","ἰ":"i)","ἰ̄":"i)&","ἰ̆":"i)'","ἰ̄̆":"i)&","ΐ":"i/+","ΐ̄":"i/+'","ΐ̆":"i/+'","ΐ̄̆":"i/+'","ὶ":"i\\","ὶ̄":"i\\'","ὶ̆":"i\\'","ὶ̄̆":"i\\'","ἱ":"i(","ἱ̄":"i(&","ἱ̆":"i('","ἱ̄̆":"i(&","ἴ":"i)/","ἴ̄":"i)/&","ἴ̆":"i)/'","ἴ̄̆":"i)/&","ἶ":"i)=","ἶ̄":"ī","ἶ̆":"ĭ","ἶ̄̆":"ī̆","ἳ":"i(\\","ἳ̄":"i(\\'","ἳ̆":"i(\\'","ἳ̄̆":"i(\\'","Ἴ":"*)/i","Ἴ̄":"*ī","Ἴ̆":"*ĭ","Ἴ̄̆":"*ī̆","ῒ":"i\\+","ῒ̄":"i\\+'","ῒ̆":"i\\+'","ῒ̄̆":"i\\+'","ἵ":"i(/","ἵ̄":"i(/&","ἵ̆":"i(/'","ἵ̄̆":"i(/&","ἷ":"i(=","ἷ̄":"ī","ἷ̆":"ĭ","ἷ̄̆":"ī̆","Ἰ":"*)i","Ἰ̄":"*ī","Ἰ̆":"*ĭ","Ἰ̄̆":"*ī̆","ἲ":"i)\\","ἲ̄":"i)\\'","ἲ̆":"i)\\'","ἲ̄̆":"i)\\'","Ἱ":"*(i","Ἱ̄":"*ī","Ἱ̆":"*ĭ","Ἱ̄̆":"*ī̆","Ἶ":"*)=i","Ἶ̄":"*ī","Ἶ̆":"*ĭ","Ἶ̄̆":"*ī̆","ῗ":"i=+","ῗ̄":"ī","ῗ̆":"ĭ","ῗ̄̆":"ī̆","Ι":"*i","Ῑ":"*i&","Ῐ":"*i'","Ῑ̆":"*i&'","Ἵ":"*(/i","Ἵ̄":"*ī","Ἵ̆":"*ĭ","Ἵ̄̆":"*ī̆","κ":"k","Κ":"*k","λ":"l","Λ":"*l","μ":"m","Μ":"*m","ν":"n","Ν":"*n","ξ":"c","Ξ":"*c","ο":"o","ὸ":"o\\","ό":"o/","ὃ":"o(\\","ὄ":"o)/","ὀ":"o)","Ὀ":"*)o","Ο":"*o","ὅ":"o(/","ὁ":"o(","Ὄ":"*)/o","Ὅ":"*(/o","Ὃ":"*(\\o","Ὁ":"*(o","Π":"*p","π":"p","ρ":"r","ῥ":"r(","Ῥ":"*(r","ῤ":"r)","Ρ":"*r","ς":"j","σ":"s","Σ":"*s","τ":"t","Τ":"*t","ὐ":"u)","ὐ̄":"u)&","ὐ̆":"u)'","ὐ̄̆":"u)&","υ":"u","ῡ":"u&","ῠ":"u'","ῡ̆":"u&'","ὺ":"u\\","ὺ̄":"u\\'","ὺ̆":"u\\'","ὺ̄̆":"u\\'","ῦ":"u=","ῦ̄":"ū","ῦ̆":"ŭ","ῦ̄̆":"ū̆","ύ":"u/","ύ̄":"u/&","ύ̆":"u/'","ύ̄̆":"u/&","ὗ":"u(=","ὗ̄":"ū","ὗ̆":"ŭ","ὗ̄̆":"ū̆","ὕ":"u(/","ὕ̄":"u(/&","ὕ̆":"u(/'","ὕ̄̆":"u(/&","ϋ":"u+","ϋ̄":"ū","ϋ̆":"ŭ","ϋ̄̆":"ū̆","ὑ":"u(","ὑ̄":"u(&","ὑ̆":"u('","ὑ̄̆":"u(&","ὖ":"u)=","ὖ̄":"ū","ὖ̆":"ŭ","ὖ̄̆":"ū̆","ΰ":"u/+","ΰ̄":"ū","ΰ̆":"ŭ","ΰ̄̆":"ū̆","ὔ":"u)/","ὔ̄":"u)/&","ὔ̆":"u)/'","ὔ̄̆":"u)/&","ῢ":"u\\+","ῢ̄":"ū","ῢ̆":"ŭ","ῢ̄̆":"ū̆","ὓ":"u(\\","ὓ̄":"u(\\'","ὓ̆":"u(\\'","ὓ̄̆":"u(\\'","Ὑ":"*(u","Ὑ̄":"*ū","Ὑ̆":"*ŭ","Ὑ̄̆":"*ū̆","Ὕ":"*(/u","Ὕ̄":"*ū","Ὕ̆":"*ŭ","Ὕ̄̆":"*ū̆","Υ":"*u","Ῡ":"*u&","Ῠ":"*u'","Ῡ̆":"*u&'","φ":"f","Φ":"*f","χ":"x","Χ":"*x","ψ":"y","Ψ":"*y","ω":"w","ώ":"w/","ῶ":"w=","ῳ":"w|","ῷ":"w=|","ὼ":"w\\","ὥ":"w(/",
"ὣ":"w(\\","ὤ":"w)/","ῴ":"w/|","ᾤ":"w)/|","ὠ":"w)","ὦ":"w)=","ᾧ":"w(=|","ὡ":"w(","ὧ":"w(=","ᾠ":"w)|","Ὠ":"*)w","ὢ":"w)\\","Ὤ":"*)/w","Ὦ":"*)=w","Ὧ":"*(=w","ᾦ":"w)=|","Ω":"*w","Ὥ":"*(/w","Ὣ":"*(\\w","Ὡ":"*(w","Ϝ":"*v","ϝ":"v"},{"a)/":"ἄ","a)/&":"ἄ̄̆","a)/'":"ἄ̆","a\\":"ὰ","a\\'":"ὰ̄̆","a/":"ά","a/&":"ά̄̆","a/'":"ά̆","*)a":"Ἀ","*ā":"Ἂ̄","*ă":"Ἂ̆","*ā̆":"Ἂ̄̆","a":"α","a&":"ᾱ","a'":"ᾰ","a&'":"ᾱ̆","a=":"ᾶ","ā":"ᾇ̄","ă":"ᾇ̆","ā̆":"ᾇ̄̆","a)":"ἀ","a)&":"ἀ̄̆","a)'":"ἀ̆","a(":"ἁ","a(&":"ἁ̄̆","a('":"ἁ̆","*)/a":"Ἄ","a)\\":"ἂ","a)\\'":"ἂ̄̆","a=|":"ᾷ","*a":"Α","*a&":"Ᾱ","*a'":"Ᾰ","*a&'":"Ᾱ̆","a(/":"ἅ","a(/&":"ἅ̄̆","a(/'":"ἅ̆","a)=":"ἆ","a|":"ᾳ","a|&":"ᾱͅ","a|'":"ᾰͅ","a|&'":"ᾱ̆ͅ","*(/a":"Ἅ","*(a":"Ἁ","*)=a":"Ἆ","a(\\":"ἃ","a(\\'":"ἃ̄̆","*)\\a":"Ἂ","a(=|":"ᾇ","b":"β","*b":"Β","g":"γ","*g":"Γ","d":"δ","*d":"Δ","e":"ε","e/":"έ","e)/":"ἔ","e\\":"ὲ","e(":"ἑ","e)":"ἐ","e(/":"ἕ","*(/e":"Ἕ","*)e":"Ἐ","*e":"Ε","*(e":"Ἑ","*)/e":"Ἔ","e(\\":"ἓ","e)\\":"ἒ","z":"ζ","*z":"Ζ","h=":"ῆ","h":"η","h(\\":"ἣ","h(":"ἡ","h/":"ή","h\\":"ὴ","h)":"ἠ","h)=":"ἦ","h|":"ῃ","h(/":"ἥ","h)\\":"ἢ","h)/":"ἤ","h=|":"ῇ","*(/h":"Ἥ","h)/|":"ᾔ","h(=":"ἧ","h(=|":"ᾗ","*)h":"Ἠ","h/|":"ῄ","h(/|":"ᾕ","*(h":"Ἡ","*)/h":"Ἤ","h\\|":"ῂ","h)|":"ᾐ","h)=|":"ᾖ","h(|":"ᾑ","*h":"Η","*)=h":"Ἦ","*(\\h":"Ἣ","*(=h":"Ἧ","q":"θ","*q":"Θ","i":"ι","i&":"ῑ","i'":"ῐ","i&'":"ῑ̆","i+":"ϊ","i+&":"ϊ̄","ĭ":"ῗ̆","ī̆&":"ϊ̄̆","i/":"ί","i/&":"ί̄̆","i/'":"ί̆","i=":"ῖ","ī":"ῗ̄","ī̆":"ῗ̄̆","i)":"ἰ","i)&":"ἰ̄̆","i)'":"ἰ̆","i/+":"ΐ","i/+'":"ΐ̄̆","i\\":"ὶ","i\\'":"ὶ̄̆","i(":"ἱ","i(&":"ἱ̄̆","i('":"ἱ̆","i)/":"ἴ","i)/&":"ἴ̄̆","i)/'":"ἴ̆","i)=":"ἶ","i(\\":"ἳ","i(\\'":"ἳ̄̆","*)/i":"Ἴ","*ī":"Ἵ̄","*ĭ":"Ἵ̆","*ī̆":"Ἵ̄̆","i\\+":"ῒ","i\\+'":"ῒ̄̆","i(/":"ἵ","i(/&":"ἵ̄̆","i(/'":"ἵ̆","i(=":"ἷ","*)i":"Ἰ","i)\\":"ἲ","i)\\'":"ἲ̄̆","*(i":"Ἱ","*)=i":"Ἶ","i=+":"ῗ","*i":"Ι","*i&":"Ῑ","*i'":"Ῐ","*i&'":"Ῑ̆","*(/i":"Ἵ","k":"κ","*k":"Κ","l":"λ","*l":"Λ","m":"μ","*m":"Μ","n":"ν","*n":"Ν","c":"ξ","*c":"Ξ","o":"ο","o\\":"ὸ","o/":"ό","o(\\":"ὃ","o)/":"ὄ","o)":"ὀ","*)o":"Ὀ","*o":"Ο","o(/":"ὅ","o(":"ὁ","*)/o":"Ὄ","*(/o":"Ὅ","*(\\o":"Ὃ","*(o":"Ὁ","*p":"Π","p":"π","r":"ρ","r(":"ῥ","*(r":"Ῥ","r)":"ῤ","*r":"Ρ","j":"ς","s":"σ","*s":"Σ","t":"τ","*t":"Τ","u)":"ὐ","u)&":"ὐ̄̆","u)'":"ὐ̆","u":"υ","u&":"ῡ","u'":"ῠ","u&'":"ῡ̆","u\\":"ὺ","u\\'":"ὺ̄̆","u=":"ῦ","ū":"ῢ̄","ŭ":"ῢ̆","ū̆":"ῢ̄̆","u/":"ύ","u/&":"ύ̄̆","u/'":"ύ̆","u(=":"ὗ","u(/":"ὕ","u(/&":"ὕ̄̆","u(/'":"ὕ̆","u+":"ϋ","u(":"ὑ","u(&":"ὑ̄̆","u('":"ὑ̆","u)=":"ὖ","u/+":"ΰ","u)/":"ὔ","u)/&":"ὔ̄̆","u)/'":"ὔ̆","u\\+":"ῢ","u(\\":"ὓ","u(\\'":"ὓ̄̆","*(u":"Ὑ","*ū":"Ὕ̄","*ŭ":"Ὕ̆","*ū̆":"Ὕ̄̆","*(/u":"Ὕ","*u":"Υ","*u&":"Ῡ","*u'":"Ῠ","*u&'":"Ῡ̆","f":"φ","*f":"Φ","x":"χ","*x":"Χ","y":"ψ","*y":"Ψ","w":"ω","w/":"ώ","w=":"ῶ","w|":"ῳ","w=|":"ῷ","w\\":"ὼ","w(/":"ὥ","w(\\":"ὣ","w)/":"ὤ","w/|":"ῴ","w)/|":"ᾤ","w)":"ὠ","w)=":"ὦ","w(=|":"ᾧ","w(":"ὡ","w(=":"ὧ","w)|":"ᾠ","*)w":"Ὠ","w)\\":"ὢ","*)/w":"Ὤ","*)=w":"Ὦ","*(=w":"Ὧ","w)=|":"ᾦ","*w":"Ω","*(/w":"Ὥ","*(\\w":"Ὣ","*(w":"Ὡ","*v":"Ϝ","v":"ϝ"}]
|
1013
|
+
JSON
|
1014
|
+
@@beta_code_conversion = nil
|
1015
|
+
|
1016
|
+
def Tinycus.beta_code_conversion_table
|
1017
|
+
if @@beta_code_conversion.nil? then @@beta_code_conversion=JSON.parse(@@beta_code_conversion_json) end
|
1018
|
+
return @@beta_code_conversion
|
1019
|
+
end
|
1020
|
+
|
1021
|
+
def Tinycus.greek_char_unicode_to_beta_code(u)
|
1022
|
+
b = Tinycus.beta_code_conversion_table()[0][u]
|
1023
|
+
if !b.nil? then return b else return u end # most failures will just be whitespace, punctuation, etc.
|
1024
|
+
end
|
1025
|
+
|
1026
|
+
def Tinycus.greek_char_beta_code_to_unicode(b)
|
1027
|
+
b = Tinycus.canonicalize_char_greek_beta_code(b)
|
1028
|
+
u = Tinycus.beta_code_conversion_table()[1][b]
|
1029
|
+
if !u.nil? then return u else return b end
|
1030
|
+
end
|
1031
|
+
|
1032
|
+
def Tinycus.canonicalize_char_greek_beta_code(b)
|
1033
|
+
# Breathing normally comes before the accent, but sometimes you see things in the wild where it's reversed.
|
1034
|
+
# I can't find any documentation for any preferred or canonical order. What circumstantial evidence I could find I put into the WP article.
|
1035
|
+
# Note that the order of |+ doesn't matter, because the same letter can't have both.
|
1036
|
+
")(/\\=|+&'".chars.each { |c|
|
1037
|
+
if b=~/(.*)#{Regexp::quote(c)}(.*)/ then b = $1+$2+c end
|
1038
|
+
}
|
1039
|
+
b.sub!(/\*([a-zA-Z]\d?)([\/\\=)(]+)/) { '*'+$2+$1 } # if it's capitalized, move tonal accents and breathing before the letter
|
1040
|
+
return b
|
1041
|
+
end
|
1042
|
+
|
1043
|
+
def Tinycus.greek_unicode_to_beta_code(u)
|
1044
|
+
u = Tinycus::Cleanup.clean_up_grotty_greek(u,allow_latin:true,clean_perseus:true,standardize_punctuation:false)
|
1045
|
+
# ... the conversion below will not work on unicode that isn't done cleanly and according to modern standards
|
1046
|
+
b = ''
|
1047
|
+
u.chars.each { |uc|
|
1048
|
+
b += Tinycus.greek_char_unicode_to_beta_code(uc)
|
1049
|
+
}
|
1050
|
+
b.tr!("·—’",":_'") # not implementing keraia
|
1051
|
+
return b
|
1052
|
+
end
|
1053
|
+
|
1054
|
+
def Tinycus.greek_beta_code_to_unicode(b)
|
1055
|
+
# This implementation will be kind of slow because it does regex replacements in place.
|
1056
|
+
b = b.clone
|
1057
|
+
b.gsub!(/[sS][1-3]/,'s') # collapse numbered sigmas; final sigma is handled positionally below, and an actual lunate sigma (which looks like c) is not supported
|
1058
|
+
# cons = "ϝβγδζθκλμνξπρσςτφχψh"
|
1059
|
+
b.gsub!(/(?<=[bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ])'/,"’") # after a consonant, ' is an apostrophe, not a breve
|
1060
|
+
b.gsub!(/\*?[sS](?![1-3a-zA-Z*])/,'j') # make non-final sigma into final sigma if it's at end of word
|
1061
|
+
b.tr!(":_","·—") # not implemeting keraia
|
1062
|
+
b.gsub!(/(\*[)(\\\/=]*[a-zA-Z][)(\/=\\+|&'\´]*)/) { Tinycus.greek_char_beta_code_to_unicode($1) } # uppercase, expect breathing and tonal before vowel
|
1063
|
+
b.gsub!(/([a-zA-Z][)(\/=\\+|&'\´]*)/) { Tinycus.greek_char_beta_code_to_unicode($1) } # lowercase, breathing after vowel
|
1064
|
+
b.gsub!(/ς’/,'σ’')
|
1065
|
+
return b
|
1066
|
+
end
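# Illustrative decoding (expected result; every beta-code token used here appears
# in the table above):
#   Tinycus.greek_beta_code_to_unicode("mh=nin a)/eide qea/")  # => "μῆνιν ἄειδε θεά"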
|
1067
|
+
|
1068
|
+
def Tinycus.disassemble_greek_char(c)
|
1069
|
+
# Returns [plain,d], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and d is
|
1070
|
+
# a hash with the following keys:
|
1071
|
+
# uppercase, diar, iota_subscript - boolean values
|
1072
|
+
# tonal - string value: none acute grave circumflex
|
1073
|
+
# breathing - string value: none smooth rough
|
1074
|
+
# Doesn't handle macrons and breves. I have a function IfMows.disassemble_char in Ifthimos that does that.
|
1075
|
+
x = Tinycus.disassemble_greek_char_binary(c)
|
1076
|
+
if x.nil? then return nil end
|
1077
|
+
plain,decor = x
|
1078
|
+
d = {}
|
1079
|
+
d['uppercase'] = (decor & 0b1)!=0
|
1080
|
+
d['diar'] = (decor & 0b10)!=0
|
1081
|
+
d['iota_subscript'] = (decor & 0b100)!=0
|
1082
|
+
accent = (decor & 0b11000)>>3
|
1083
|
+
if accent==0b00 then d['tonal']='none' end
|
1084
|
+
if accent==0b01 then d['tonal']='acute' end
|
1085
|
+
if accent==0b10 then d['tonal']='grave' end
|
1086
|
+
if accent==0b11 then d['tonal']='circumflex' end
|
1087
|
+
if !d.has_key?('tonal') then raise "wtf? #{accent}" end
|
1088
|
+
breathing = (decor & 0b1100000)>>5
|
1089
|
+
if breathing==0b00 then d['breathing']='none' end
|
1090
|
+
if breathing==0b01 then d['breathing']='smooth' end
|
1091
|
+
if breathing==0b10 then d['breathing']='rough' end
|
1092
|
+
return [plain,d]
|
1093
|
+
end
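# Illustrative disassembly (expected result, based on the hash below):
#   Tinycus.disassemble_greek_char('ᾷ')
#   # => ["α", {"uppercase"=>false, "diar"=>false, "iota_subscript"=>true,
#   #           "tonal"=>"circumflex", "breathing"=>"none"}]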
|
1094
|
+
|
1095
|
+
def Tinycus.assemble_greek_char(plain,d)
|
1096
|
+
# The inverse of Tinycus.disassemble_greek_char.
|
1097
|
+
# Doesn't handle macrons and breves. I have a function IfMows.assemble_char in Ifthimos that does that.
|
1098
|
+
b = 0
|
1099
|
+
b |= 0b1 if d['uppercase']
|
1100
|
+
b |= 0b10 if d['diar']
|
1101
|
+
b |= 0b100 if d['iota_subscript']
|
1102
|
+
b |= 0b1000 if d['tonal']=='acute'
|
1103
|
+
b |= 0b10000 if d['tonal']=='grave'
|
1104
|
+
b |= 0b11000 if d['tonal']=='circumflex'
|
1105
|
+
b |= 0b0100000 if d['breathing']=='smooth'
|
1106
|
+
b |= 0b1000000 if d['breathing']=='rough'
|
1107
|
+
x = plain+("%.2x" % b)
|
1108
|
+
return Tinycus.assemble_greek_char_hex(x)
|
1109
|
+
end
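# Illustrative reassembly (expected to invert the example above):
#   Tinycus.assemble_greek_char('α', {'uppercase'=>false, 'diar'=>false,
#     'iota_subscript'=>true, 'tonal'=>'circumflex', 'breathing'=>'none'})  # => 'ᾷ'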
|
1110
|
+
|
1111
|
+
def Tinycus.disassemble_greek_char_binary(c)
|
1112
|
+
# Returns [plain,b], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and b is
|
1113
|
+
# an integer containing a set of flags encoded in binary, as follows:
|
1114
|
+
# b |= 0b1 if d['uppercase']
|
1115
|
+
# b |= 0b10 if d['diar']
|
1116
|
+
# b |= 0b100 if d['iota_subscript']
|
1117
|
+
# b |= 0b1000 if d['tonal']=='acute'
|
1118
|
+
# b |= 0b10000 if d['tonal']=='grave'
|
1119
|
+
# b |= 0b11000 if d['tonal']=='circumflex'
|
1120
|
+
# b |= 0b0100000 if d['breathing']=='smooth'
|
1121
|
+
# b |= 0b1000000 if d['breathing']=='rough'
|
1122
|
+
return @@disassemble_greek_char_hash[c]
|
1123
|
+
end
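# Illustrative lookup (from the hash below): 'Ἅ' is uppercase alpha with acute
# accent and rough breathing:
#   Tinycus.disassemble_greek_char_binary('Ἅ')  # => ['α', 0b1001001]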
|
1124
|
+
|
1125
|
+
def Tinycus.assemble_greek_char_hex(x)
|
1126
|
+
# The inverse of the map in Tinycus.disassemble_greek_char_binary.
|
1127
|
+
# Accepts an input such as 'α08', where the second and third characters are the hex representation of
|
1128
|
+
# the set of flags described in the comments in the forward map.
|
1129
|
+
return @@assemble_greek_char_hash[x]
|
1130
|
+
end
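# Illustrative lookup: 0b1001001 is 0x49, so the inverse of the example above is
#   Tinycus.assemble_greek_char_hex('α49')  # => 'Ἅ'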
|
1131
|
+
|
1132
|
+
# The following are output by generating/assemble_disassemble.rb .
|
1133
|
+
@@disassemble_greek_char_hash = {
|
1134
|
+
'α'=>['α',0b0],
|
1135
|
+
'Α'=>['α',0b1],
|
1136
|
+
'ᾳ'=>['α',0b100],
|
1137
|
+
'ά'=>['α',0b1000],
|
1138
|
+
'ὰ'=>['α',0b10000],
|
1139
|
+
'ᾶ'=>['α',0b11000],
|
1140
|
+
'ᾷ'=>['α',0b11100],
|
1141
|
+
'ἀ'=>['α',0b100000],
|
1142
|
+
'Ἀ'=>['α',0b100001],
|
1143
|
+
'ἄ'=>['α',0b101000],
|
1144
|
+
'Ἄ'=>['α',0b101001],
|
1145
|
+
'ᾄ'=>['α',0b101100],
|
1146
|
+
'ἂ'=>['α',0b110000],
|
1147
|
+
'Ἂ'=>['α',0b110001],
|
1148
|
+
'ἆ'=>['α',0b111000],
|
1149
|
+
'Ἆ'=>['α',0b111001],
|
1150
|
+
'ἁ'=>['α',0b1000000],
|
1151
|
+
'Ἁ'=>['α',0b1000001],
|
1152
|
+
'ἅ'=>['α',0b1001000],
|
1153
|
+
'Ἅ'=>['α',0b1001001],
|
1154
|
+
'ᾅ'=>['α',0b1001100],
|
1155
|
+
'ἃ'=>['α',0b1010000],
|
1156
|
+
'ᾇ'=>['α',0b1011100],
|
1157
|
+
'β'=>['β',0b0],
|
1158
|
+
'Β'=>['β',0b1],
|
1159
|
+
'γ'=>['γ',0b0],
|
1160
|
+
'Γ'=>['γ',0b1],
|
1161
|
+
'δ'=>['δ',0b0],
|
1162
|
+
'Δ'=>['δ',0b1],
|
1163
|
+
'ε'=>['ε',0b0],
|
1164
|
+
'Ε'=>['ε',0b1],
|
1165
|
+
'έ'=>['ε',0b1000],
|
1166
|
+
'ὲ'=>['ε',0b10000],
|
1167
|
+
'ἐ'=>['ε',0b100000],
|
1168
|
+
'Ἐ'=>['ε',0b100001],
|
1169
|
+
'ἔ'=>['ε',0b101000],
|
1170
|
+
'Ἔ'=>['ε',0b101001],
|
1171
|
+
'ἒ'=>['ε',0b110000],
|
1172
|
+
'ἑ'=>['ε',0b1000000],
|
1173
|
+
'Ἑ'=>['ε',0b1000001],
|
1174
|
+
'ἕ'=>['ε',0b1001000],
|
1175
|
+
'Ἕ'=>['ε',0b1001001],
|
1176
|
+
'ἓ'=>['ε',0b1010000],
|
1177
|
+
'ζ'=>['ζ',0b0],
|
1178
|
+
'Ζ'=>['ζ',0b1],
|
1179
|
+
'η'=>['η',0b0],
|
1180
|
+
'Η'=>['η',0b1],
|
1181
|
+
'ῃ'=>['η',0b100],
|
1182
|
+
'ή'=>['η',0b1000],
|
1183
|
+
'ῄ'=>['η',0b1100],
|
1184
|
+
'ὴ'=>['η',0b10000],
|
1185
|
+
'ῂ'=>['η',0b10100],
|
1186
|
+
'ῆ'=>['η',0b11000],
|
1187
|
+
'ῇ'=>['η',0b11100],
|
1188
|
+
'ἠ'=>['η',0b100000],
|
1189
|
+
'Ἠ'=>['η',0b100001],
|
1190
|
+
'ᾐ'=>['η',0b100100],
|
1191
|
+
'ἤ'=>['η',0b101000],
|
1192
|
+
'Ἤ'=>['η',0b101001],
|
1193
|
+
'ᾔ'=>['η',0b101100],
|
1194
|
+
'ἢ'=>['η',0b110000],
|
1195
|
+
'ἦ'=>['η',0b111000],
|
1196
|
+
'Ἦ'=>['η',0b111001],
|
1197
|
+
'ᾖ'=>['η',0b111100],
|
1198
|
+
'ἡ'=>['η',0b1000000],
|
1199
|
+
'Ἡ'=>['η',0b1000001],
|
1200
|
+
'ᾑ'=>['η',0b1000100],
|
1201
|
+
'ἥ'=>['η',0b1001000],
|
1202
|
+
'Ἥ'=>['η',0b1001001],
|
1203
|
+
'ᾕ'=>['η',0b1001100],
|
1204
|
+
'ἣ'=>['η',0b1010000],
|
1205
|
+
'Ἣ'=>['η',0b1010001],
|
1206
|
+
'ἧ'=>['η',0b1011000],
|
1207
|
+
'Ἧ'=>['η',0b1011001],
|
1208
|
+
'ᾗ'=>['η',0b1011100],
|
1209
|
+
'θ'=>['θ',0b0],
|
1210
|
+
'Θ'=>['θ',0b1],
|
1211
|
+
'ι'=>['ι',0b0],
|
1212
|
+
'Ι'=>['ι',0b1],
|
1213
|
+
'ϊ'=>['ι',0b10],
|
1214
|
+
'ί'=>['ι',0b1000],
|
1215
|
+
'ΐ'=>['ι',0b1010],
|
1216
|
+
'ὶ'=>['ι',0b10000],
|
1217
|
+
'ῒ'=>['ι',0b10010],
|
1218
|
+
'ῖ'=>['ι',0b11000],
|
1219
|
+
'ῗ'=>['ι',0b11010],
|
1220
|
+
'ἰ'=>['ι',0b100000],
|
1221
|
+
'Ἰ'=>['ι',0b100001],
|
1222
|
+
'ἴ'=>['ι',0b101000],
|
1223
|
+
'Ἴ'=>['ι',0b101001],
|
1224
|
+
'ἲ'=>['ι',0b110000],
|
1225
|
+
'ἶ'=>['ι',0b111000],
|
1226
|
+
'Ἶ'=>['ι',0b111001],
|
1227
|
+
'ἱ'=>['ι',0b1000000],
|
1228
|
+
'Ἱ'=>['ι',0b1000001],
|
1229
|
+
'ἵ'=>['ι',0b1001000],
|
1230
|
+
'Ἵ'=>['ι',0b1001001],
|
1231
|
+
'ἳ'=>['ι',0b1010000],
|
1232
|
+
'ἷ'=>['ι',0b1011000],
|
1233
|
+
'κ'=>['κ',0b0],
|
1234
|
+
'Κ'=>['κ',0b1],
|
1235
|
+
'λ'=>['λ',0b0],
|
1236
|
+
'Λ'=>['λ',0b1],
|
1237
|
+
'μ'=>['μ',0b0],
|
1238
|
+
'Μ'=>['μ',0b1],
|
1239
|
+
'ν'=>['ν',0b0],
|
1240
|
+
'Ν'=>['ν',0b1],
|
1241
|
+
'ξ'=>['ξ',0b0],
|
1242
|
+
'Ξ'=>['ξ',0b1],
|
1243
|
+
'ο'=>['ο',0b0],
|
1244
|
+
'Ο'=>['ο',0b1],
|
1245
|
+
'ό'=>['ο',0b1000],
|
1246
|
+
'ὸ'=>['ο',0b10000],
|
1247
|
+
'ὀ'=>['ο',0b100000],
|
1248
|
+
'Ὀ'=>['ο',0b100001],
|
1249
|
+
'ὄ'=>['ο',0b101000],
|
1250
|
+
'Ὄ'=>['ο',0b101001],
|
1251
|
+
'ὁ'=>['ο',0b1000000],
|
1252
|
+
'Ὁ'=>['ο',0b1000001],
|
1253
|
+
'ὅ'=>['ο',0b1001000],
|
1254
|
+
'Ὅ'=>['ο',0b1001001],
|
1255
|
+
'ὃ'=>['ο',0b1010000],
|
1256
|
+
'Ὃ'=>['ο',0b1010001],
|
1257
|
+
'π'=>['π',0b0],
|
1258
|
+
'Π'=>['π',0b1],
|
1259
|
+
'ρ'=>['ρ',0b0],
|
1260
|
+
'Ρ'=>['ρ',0b1],
|
1261
|
+
'ῤ'=>['ρ',0b100000],
|
1262
|
+
'ῥ'=>['ρ',0b1000000],
|
1263
|
+
'Ῥ'=>['ρ',0b1000001],
|
1264
|
+
'ς'=>['ς',0b0],
|
1265
|
+
'σ'=>['σ',0b0],
|
1266
|
+
'Σ'=>['σ',0b1],
|
1267
|
+
'τ'=>['τ',0b0],
|
1268
|
+
'Τ'=>['τ',0b1],
|
1269
|
+
'υ'=>['υ',0b0],
|
1270
|
+
'Υ'=>['υ',0b1],
|
1271
|
+
'ϋ'=>['υ',0b10],
|
1272
|
+
'ύ'=>['υ',0b1000],
|
1273
|
+
'ΰ'=>['υ',0b1010],
|
1274
|
+
'ὺ'=>['υ',0b10000],
|
1275
|
+
'ῢ'=>['υ',0b10010],
|
1276
|
+
'ῦ'=>['υ',0b11000],
|
1277
|
+
'ὐ'=>['υ',0b100000],
|
1278
|
+
'ὔ'=>['υ',0b101000],
|
1279
|
+
'ὖ'=>['υ',0b111000],
|
1280
|
+
'ὑ'=>['υ',0b1000000],
|
1281
|
+
'Ὑ'=>['υ',0b1000001],
|
1282
|
+
'ὕ'=>['υ',0b1001000],
|
1283
|
+
'Ὕ'=>['υ',0b1001001],
|
1284
|
+
'ὓ'=>['υ',0b1010000],
|
1285
|
+
'ὗ'=>['υ',0b1011000],
|
1286
|
+
'φ'=>['φ',0b0],
|
1287
|
+
'Φ'=>['φ',0b1],
|
1288
|
+
'χ'=>['χ',0b0],
|
1289
|
+
'Χ'=>['χ',0b1],
|
1290
|
+
'ψ'=>['ψ',0b0],
|
1291
|
+
'Ψ'=>['ψ',0b1],
|
1292
|
+
'ω'=>['ω',0b0],
|
1293
|
+
'Ω'=>['ω',0b1],
|
1294
|
+
'ῳ'=>['ω',0b100],
|
1295
|
+
'ώ'=>['ω',0b1000],
|
1296
|
+
'ῴ'=>['ω',0b1100],
|
1297
|
+
'ὼ'=>['ω',0b10000],
|
1298
|
+
'ῶ'=>['ω',0b11000],
|
1299
|
+
'ῷ'=>['ω',0b11100],
|
1300
|
+
'ὠ'=>['ω',0b100000],
|
1301
|
+
'Ὠ'=>['ω',0b100001],
|
1302
|
+
'ᾠ'=>['ω',0b100100],
|
1303
|
+
'ὤ'=>['ω',0b101000],
|
1304
|
+
'Ὤ'=>['ω',0b101001],
|
1305
|
+
'ᾤ'=>['ω',0b101100],
|
1306
|
+
'ὢ'=>['ω',0b110000],
|
1307
|
+
'ὦ'=>['ω',0b111000],
|
1308
|
+
'Ὦ'=>['ω',0b111001],
|
1309
|
+
'ᾦ'=>['ω',0b111100],
|
1310
|
+
'ὡ'=>['ω',0b1000000],
|
1311
|
+
'Ὡ'=>['ω',0b1000001],
|
1312
|
+
'ὥ'=>['ω',0b1001000],
|
1313
|
+
'Ὥ'=>['ω',0b1001001],
|
1314
|
+
'ὣ'=>['ω',0b1010000],
|
1315
|
+
'Ὣ'=>['ω',0b1010001],
|
1316
|
+
'ὧ'=>['ω',0b1011000],
|
1317
|
+
'Ὧ'=>['ω',0b1011001],
|
1318
|
+
'ᾧ'=>['ω',0b1011100],
|
1319
|
+
'ϝ'=>['ϝ',0b0],
|
1320
|
+
'Ϝ'=>['ϝ',0b1]
|
1321
|
+
}
|
1322
|
+
@@assemble_greek_char_hash = {
|
1323
|
+
|
1324
|
+
'α00'=>'α',
|
1325
|
+
'α01'=>'Α',
|
1326
|
+
'α04'=>'ᾳ',
|
1327
|
+
'α08'=>'ά',
|
1328
|
+
'α10'=>'ὰ',
|
1329
|
+
'α18'=>'ᾶ',
|
1330
|
+
'α1c'=>'ᾷ',
|
1331
|
+
'α20'=>'ἀ',
|
1332
|
+
'α21'=>'Ἀ',
|
1333
|
+
'α28'=>'ἄ',
|
1334
|
+
'α29'=>'Ἄ',
|
1335
|
+
'α2c'=>'ᾄ',
|
1336
|
+
'α30'=>'ἂ',
|
1337
|
+
'α31'=>'Ἂ',
|
1338
|
+
'α38'=>'ἆ',
|
1339
|
+
'α39'=>'Ἆ',
|
1340
|
+
'α40'=>'ἁ',
|
1341
|
+
'α41'=>'Ἁ',
|
1342
|
+
'α48'=>'ἅ',
|
1343
|
+
'α49'=>'Ἅ',
|
1344
|
+
'α4c'=>'ᾅ',
|
1345
|
+
'α50'=>'ἃ',
|
1346
|
+
'α5c'=>'ᾇ',
|
1347
|
+
'β00'=>'β',
|
1348
|
+
'β01'=>'Β',
|
1349
|
+
'γ00'=>'γ',
|
1350
|
+
'γ01'=>'Γ',
|
1351
|
+
'δ00'=>'δ',
|
1352
|
+
'δ01'=>'Δ',
|
1353
|
+
'ε00'=>'ε',
|
1354
|
+
'ε01'=>'Ε',
|
1355
|
+
'ε08'=>'έ',
|
1356
|
+
'ε10'=>'ὲ',
|
1357
|
+
'ε20'=>'ἐ',
|
1358
|
+
'ε21'=>'Ἐ',
|
1359
|
+
'ε28'=>'ἔ',
|
1360
|
+
'ε29'=>'Ἔ',
|
1361
|
+
'ε30'=>'ἒ',
|
1362
|
+
'ε40'=>'ἑ',
|
1363
|
+
'ε41'=>'Ἑ',
|
1364
|
+
'ε48'=>'ἕ',
|
1365
|
+
'ε49'=>'Ἕ',
|
1366
|
+
'ε50'=>'ἓ',
|
1367
|
+
'ζ00'=>'ζ',
|
1368
|
+
'ζ01'=>'Ζ',
|
1369
|
+
'η00'=>'η',
|
1370
|
+
'η01'=>'Η',
|
1371
|
+
'η04'=>'ῃ',
|
1372
|
+
'η08'=>'ή',
|
1373
|
+
'η0c'=>'ῄ',
|
1374
|
+
'η10'=>'ὴ',
|
1375
|
+
'η14'=>'ῂ',
|
1376
|
+
'η18'=>'ῆ',
|
1377
|
+
'η1c'=>'ῇ',
|
1378
|
+
'η20'=>'ἠ',
|
1379
|
+
'η21'=>'Ἠ',
|
1380
|
+
'η24'=>'ᾐ',
|
1381
|
+
'η28'=>'ἤ',
|
1382
|
+
'η29'=>'Ἤ',
|
1383
|
+
'η2c'=>'ᾔ',
|
1384
|
+
'η30'=>'ἢ',
|
1385
|
+
'η38'=>'ἦ',
|
1386
|
+
'η39'=>'Ἦ',
|
1387
|
+
'η3c'=>'ᾖ',
|
1388
|
+
'η40'=>'ἡ',
|
1389
|
+
'η41'=>'Ἡ',
|
1390
|
+
'η44'=>'ᾑ',
|
1391
|
+
'η48'=>'ἥ',
|
1392
|
+
'η49'=>'Ἥ',
|
1393
|
+
'η4c'=>'ᾕ',
|
1394
|
+
'η50'=>'ἣ',
|
1395
|
+
'η51'=>'Ἣ',
|
1396
|
+
'η58'=>'ἧ',
|
1397
|
+
'η59'=>'Ἧ',
|
1398
|
+
'η5c'=>'ᾗ',
|
1399
|
+
'θ00'=>'θ',
|
1400
|
+
'θ01'=>'Θ',
|
1401
|
+
'ι00'=>'ι',
|
1402
|
+
'ι01'=>'Ι',
|
1403
|
+
'ι02'=>'ϊ',
|
1404
|
+
'ι08'=>'ί',
|
1405
|
+
'ι0a'=>'ΐ',
|
1406
|
+
'ι10'=>'ὶ',
|
1407
|
+
'ι12'=>'ῒ',
|
1408
|
+
'ι18'=>'ῖ',
|
1409
|
+
'ι1a'=>'ῗ',
|
1410
|
+
'ι20'=>'ἰ',
|
1411
|
+
'ι21'=>'Ἰ',
|
1412
|
+
'ι28'=>'ἴ',
|
1413
|
+
'ι29'=>'Ἴ',
|
1414
|
+
'ι30'=>'ἲ',
|
1415
|
+
'ι38'=>'ἶ',
|
1416
|
+
'ι39'=>'Ἶ',
|
1417
|
+
'ι40'=>'ἱ',
|
1418
|
+
'ι41'=>'Ἱ',
|
1419
|
+
'ι48'=>'ἵ',
|
1420
|
+
'ι49'=>'Ἵ',
|
1421
|
+
'ι50'=>'ἳ',
|
1422
|
+
'ι58'=>'ἷ',
|
1423
|
+
'κ00'=>'κ',
|
1424
|
+
'κ01'=>'Κ',
|
1425
|
+
'λ00'=>'λ',
|
1426
|
+
'λ01'=>'Λ',
|
1427
|
+
'μ00'=>'μ',
|
1428
|
+
'μ01'=>'Μ',
|
1429
|
+
'ν00'=>'ν',
|
1430
|
+
'ν01'=>'Ν',
|
1431
|
+
'ξ00'=>'ξ',
|
1432
|
+
'ξ01'=>'Ξ',
|
1433
|
+
'ο00'=>'ο',
|
1434
|
+
'ο01'=>'Ο',
|
1435
|
+
'ο08'=>'ό',
|
1436
|
+
'ο10'=>'ὸ',
|
1437
|
+
'ο20'=>'ὀ',
|
1438
|
+
'ο21'=>'Ὀ',
|
1439
|
+
'ο28'=>'ὄ',
|
1440
|
+
'ο29'=>'Ὄ',
|
1441
|
+
'ο40'=>'ὁ',
|
1442
|
+
'ο41'=>'Ὁ',
|
1443
|
+
'ο48'=>'ὅ',
|
1444
|
+
'ο49'=>'Ὅ',
|
1445
|
+
'ο50'=>'ὃ',
|
1446
|
+
'ο51'=>'Ὃ',
|
1447
|
+
'π00'=>'π',
|
1448
|
+
'π01'=>'Π',
|
1449
|
+
'ρ00'=>'ρ',
|
1450
|
+
'ρ01'=>'Ρ',
|
1451
|
+
'ρ20'=>'ῤ',
|
1452
|
+
'ρ40'=>'ῥ',
|
1453
|
+
'ρ41'=>'Ῥ',
|
1454
|
+
'ς00'=>'ς',
|
1455
|
+
'σ00'=>'σ',
|
1456
|
+
'σ01'=>'Σ',
|
1457
|
+
'τ00'=>'τ',
|
1458
|
+
'τ01'=>'Τ',
|
1459
|
+
'υ00'=>'υ',
|
1460
|
+
'υ01'=>'Υ',
|
1461
|
+
'υ02'=>'ϋ',
|
1462
|
+
'υ08'=>'ύ',
|
1463
|
+
'υ0a'=>'ΰ',
|
1464
|
+
'υ10'=>'ὺ',
|
1465
|
+
'υ12'=>'ῢ',
|
1466
|
+
'υ18'=>'ῦ',
|
1467
|
+
'υ20'=>'ὐ',
|
1468
|
+
'υ28'=>'ὔ',
|
1469
|
+
'υ38'=>'ὖ',
|
1470
|
+
'υ40'=>'ὑ',
|
1471
|
+
'υ41'=>'Ὑ',
|
1472
|
+
'υ48'=>'ὕ',
|
1473
|
+
'υ49'=>'Ὕ',
|
1474
|
+
'υ50'=>'ὓ',
|
1475
|
+
'υ58'=>'ὗ',
|
1476
|
+
'φ00'=>'φ',
|
1477
|
+
'φ01'=>'Φ',
|
1478
|
+
'χ00'=>'χ',
|
1479
|
+
'χ01'=>'Χ',
|
1480
|
+
'ψ00'=>'ψ',
|
1481
|
+
'ψ01'=>'Ψ',
|
1482
|
+
'ω00'=>'ω',
|
1483
|
+
'ω01'=>'Ω',
|
1484
|
+
'ω04'=>'ῳ',
|
1485
|
+
'ω08'=>'ώ',
|
1486
|
+
'ω0c'=>'ῴ',
|
1487
|
+
'ω10'=>'ὼ',
|
1488
|
+
'ω18'=>'ῶ',
|
1489
|
+
'ω1c'=>'ῷ',
|
1490
|
+
'ω20'=>'ὠ',
|
1491
|
+
'ω21'=>'Ὠ',
|
1492
|
+
'ω24'=>'ᾠ',
|
1493
|
+
'ω28'=>'ὤ',
|
1494
|
+
'ω29'=>'Ὤ',
|
1495
|
+
'ω2c'=>'ᾤ',
|
1496
|
+
'ω30'=>'ὢ',
|
1497
|
+
'ω38'=>'ὦ',
|
1498
|
+
'ω39'=>'Ὦ',
|
1499
|
+
'ω3c'=>'ᾦ',
|
1500
|
+
'ω40'=>'ὡ',
|
1501
|
+
'ω41'=>'Ὡ',
|
1502
|
+
'ω48'=>'ὥ',
|
1503
|
+
'ω49'=>'Ὥ',
|
1504
|
+
'ω50'=>'ὣ',
|
1505
|
+
'ω51'=>'Ὣ',
|
1506
|
+
'ω58'=>'ὧ',
|
1507
|
+
'ω59'=>'Ὧ',
|
1508
|
+
'ω5c'=>'ᾧ',
|
1509
|
+
'ϝ00'=>'ϝ',
|
1510
|
+
'ϝ01'=>'Ϝ'
|
1511
|
+
}
|
1512
|
+
|
1513
|
+
end # module Tinycus
|
1514
|
+
|