tinycus 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/tinycus.rb +87 -80
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz:
- data.tar.gz:
+ metadata.gz: 83d0a3e5d3f764901829858fc5a1b9160577e46f5e277504291c21225c2a38a0
+ data.tar.gz: da3dece01b93b157e1e36d7e07b27ead0f55dab621101af4444e716672028323
  SHA512:
- metadata.gz:
- data.tar.gz:
+ metadata.gz: 16fb025b30abf1d6650aa2a76108117a88e30e5d5b524fc8aae83aca1531b62a2b4baab4da9167b673636cae413b8225fc236772185c9c1063bc4a31bc1321c3
+ data.tar.gz: e0cc0963f112a25e782875ecb179e035a0dabb1a6d20b2e3b59d0b2c5584bfb0bbe1d9b41c275dac631b7c520822a1e34b1c1340184a64f27bda52d403c3dd4b
data/tinycus.rb
CHANGED
@@ -4,45 +4,51 @@ require 'json'

  module Tinycus

-
+ =begin rdoc
+ The four "alpha_" functions work on Greek and English, also most Latin characters; see comments in Tr.get_greek_collation_tr.
+ =end

+ # Sort a list of strings in alphabetical order.
  def Tinycus.alpha_sort(l,n:false)
  return Tinycus.sort(l,Tinycus.alpha_collation,n:false)
  end

+ # Tests two strings for equality in alphabetical order, returns a boolean.
  def Tinycus.alpha_equal(a,b,n:true)
  return (Tinycus.alpha_compare(a,b,n:n)==0)
  end

+ # An alphabetical <=> function.
  def Tinycus.alpha_compare(a,b,n:true)
  # return (Tinycus::Tr.remove_accents_from_greek(a,n:n).downcase <=> Tinycus::Tr.remove_accents_from_greek(b,n:n).downcase)
  collation_tr = Tinycus.alpha_collation
  return collation_tr.apply(a,n:n) <=> collation_tr.apply(b,n:n)
  end

- def Tinycus.alpha_collation
+ def Tinycus.alpha_collation() # :nodoc:
  return Tinycus::Tr.get_greek_collation_tr
  end

+ # synonym of Tinycus.contains_vowel, for readability when using it on a single character
  def Tinycus.is_vowel(c)
- # synonym of Tinycus.contains_vowel, for readability when using it on a single character
  return Tinycus.contains_vowel(c)
  end

+ # Works for Greek and Latin; considers y to be a vowel; doesn't handle stuff like Welsh w.
  def Tinycus.contains_vowel(s)
- # works for Greek and Latin; considers y to be a vowel; doesn't handle stuff like Welsh w
  if Tinycus::Tr.remove_accents_from_greek(s).downcase.match?(/[αειουηωaeiouyæ]/) then return true else return false end
  end

  # ---

+ # Like alpha_sort, but more general, using any Tr object to define a collation order.
  def Tinycus.sort(l,collation_tr,n:false)
  p = l.clone # This does work on a list of strings: ruby -e "a=['p','q']; b=a.clone; b[1]='x'; print a"
  p.sort_by! { |x| collation_tr.apply(x,n:n) } # ruby's sort_by! only applies the block once to each element, to form an index
  return p
  end

- def Tinycus.run_tests
+ def Tinycus.run_tests # :nodoc:
  # removing accents:
  tests = [
  ['',''],
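To make the newly documented alpha_ helpers concrete, here is a brief usage sketch in Ruby. It assumes the gem is loaded with require 'tinycus'; the vowel results follow from the regex shown above, and the ordering results are my assumption about the Greek collation:

    require 'tinycus'   # assumed entry point for the gem

    Tinycus.is_vowel('α')            # => true
    Tinycus.contains_vowel('κτλ')    # => false, no character matches the vowel regex above

    # Accent-insensitive comparison and sorting (ordering assumed: α before β before ω):
    Tinycus.alpha_compare('ἄλφα','βῆτα')             # => -1
    Tinycus.alpha_sort(['ὥρα','βίος','ἄνθρωπος'])    # => ["ἄνθρωπος","βίος","ὥρα"]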
@@ -125,23 +131,13 @@ module Tinycus
  end
  }

+ Tinycus::Tr.run_tests()
+ Tinycus::MiscGreek.run_tests()

  end

- # fixme:
- # Determine byte order and make sure we convert to native (i.e., BE rather than LE if we're on a big-endian machine).
- @@bloater = Encoding::Converter.new('UTF-8','UTF-32LE')
- @@shrinker = Encoding::Converter.new('UTF-32LE','UTF-8')
-
- def Tinycus.bloat(s) # private method
- return @@bloater.convert(s)
- end
- def Tinycus.shrink(s) # private method
- return @@shrinker.convert(s)
- end
-
  class Tinycus::MiscGreek
- def MiscGreek.run_tests
+ def MiscGreek.run_tests # :nodoc:
  print "testing MiscGreek.add_second_accent...\n"
  [['θεμείλια','θεμείλιά'],
  ['πόλεμονδε','πόλεμόνδε'],
@@ -155,9 +151,9 @@ module Tinycus
  }
  end

+ # A rough approximation, for cases where we don't need perfect precision and either don't have Ifthimos's syllabification module
+ # or don't want the performance hit.
  def MiscGreek.estimate_syll_count(x)
- # A rough approximation, for cases where we don't need perfect precision and either don't have Ransom's greek/syllab.rb
- # or don't want the performance hit.
  x = x.downcase
  x = x.gsub(/[ϊ]/,'e') # prevent it from being misinterpreted as a diphthong after the diaresis is stripped below
  x = Tr.remove_accents_from_greek(x)
@@ -181,10 +177,10 @@ module Tinycus
  return Tr.remove_tonal_accents_from_greek(x)!=x
  end

-
-
- w =
-
+ # Modifies a word as would be appropriate if it was followed by an enclitic. E.g., if the input is θεμείλια, returns θεμείλιά.
+ def MiscGreek.add_second_accent(w)
+ w = w.clone # shallow copy, works on a string; I'm not clear in why this is necessary, but it is; modification to w is visible in
+ # output of make test_misc_greek
  if w=~/(.*)δε$/ then
  stem = $1
  nsyll = MiscGreek.estimate_syll_count(stem)
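The enclitic helper documented above can be exercised as follows; the two input/output pairs are taken from the MiscGreek.run_tests table shown earlier in this diff, and no value is asserted for the rough syllable count:

    Tinycus::MiscGreek.add_second_accent('θεμείλια')    # => 'θεμείλιά'
    Tinycus::MiscGreek.add_second_accent('πόλεμονδε')   # => 'πόλεμόνδε'

    # Rough syllable estimate used internally; exact return value not documented here.
    Tinycus::MiscGreek.estimate_syll_count('θεμείλια')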
@@ -202,14 +198,14 @@ module Tinycus
  return w
  end

+ # For a given word, try to predict every possible form it could take in a text, including
+ # both possible capitalizations, acute/grave, and multiple accents.
+ # The word w should already have been converted into a canonical dictionary form (typically a single acute accent).
+ # This is not 100% perfect, mainly because the rules for multiple accents are complicated and Tinycus doesn't include a full
+ # syllabification algorithm.
+ # I tested this as a round-trip on all multiply accented words occurring in Homer. The following three words were the only
+ # ones where it failed: κάλλίον, σταφύλῇ, ὕπὸ.
  def MiscGreek.all_cases_and_accents(w)
- # For a given word, try to predict every possible form it could take in a text, including
- # both possible capitalizations, acute/grave, and multiple accents.
- # The word w should already have been converted into a canonical dictionary form (typically a single acute accent).
- # This is not 100% perfect, mainly because the rules for multiple accents are complicated and Tinycus doesn't include a full
- # syllabification algorithm.
- # I tested this as a round-trip on all multiply accented words occurring in Homer. The following three words were the only
- # ones where it failed: κάλλίον, σταφύλῇ, ὕπὸ.
  forms = [w.downcase]
  forms = forms+forms.map { |x| Tr.greek_acute_to_grave(x) }
  forms = forms+forms.map { |x| MiscGreek.add_second_accent(x) }
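A sketch of how all_cases_and_accents might be used; the listed members of the result are my reading of the comment above (capitalization, acute/grave, second accents), not verified output:

    forms = Tinycus::MiscGreek.all_cases_and_accents('καλός')
    # Presumably includes variants such as καλός, καλὸς, Καλός, Καλὸς,
    # plus second-accent forms where the accent rules call for them.
    forms.include?('καλὸς')   # expected to be true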
@@ -235,10 +231,11 @@ module Tinycus
  @@greek_acute_to_grave = nil
  @@prep_greek_to_collation_form = nil

+ # Initialize a data structure that represents an action equivalent to String#tr(a,b), but faster.
+ # Including redundant characters or unchanged characters is harmless and is fixed in this constructor; it does not cause
+ # any performance hit when the object is actually used. The initializer takes linear time and memory in the size of
+ # the inputs.
  def initialize(a,b)
- # Initialize a data structure that represents an action equivalent to string.tr(a,b), but faster.
- # Including redudant characters is harmless and is fixed in this constructor; it does not cause
- # any performance hit when the object is actually used.
  if a.length!=b.length then raise "lengths unequal, #{a.length} and #{b.length}" end
  @l = a.length
  @orig_tables = [a.clone,b.clone] # stash them away for testing purposes
@@ -246,7 +243,7 @@ module Tinycus
  @h = {}
  0.upto(@l-1) { |i|
  p,q = a[i],b[i]
- @h[p] = q
+ if p!=q then @h[p] = q end
  }
  @h.freeze

@@ -254,9 +251,13 @@ module Tinycus

  attr_reader :l,:a,:b

+ # Takes a predefined Tr object and uses it to perform the equivalent of String#tr. Takes O(1) time in the length of
+ # the translation list.
+ # This function tends to be a bottleneck for performance in real-world applications. I tried several algorithms.
+ # See notes in comments at top of scripts/benchmark.rb.
+ # Something like Gnu gperf is theoretically superior in certain ways (e.g., avoiding the theoretical possibility of
+ # a hash collision).
  def apply(s,n:false)
- # This function tends to be a bottleneck for performance in real-world applications. I tried several algorithms.
- # See notes in comments at top of scripts/benchmark.rb.

  if n then s = s.unicode_normalize(:nfc) end # 30% performance hit, not necessary if input has already been normalized

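Putting the Tr constructor and apply together, a minimal sketch; the translation tables here are made up for illustration and are not part of the gem:

    # Reusable equivalent of 'bag'.tr('abg','αβγ'), built once and applied many times.
    tr = Tinycus::Tr.new('abg','αβγ')
    tr.apply('bag')           # => 'βαγ'
    tr.apply('bag', n:true)   # normalize the input to NFC first (~30% slower, per the comment)
    tr.self_test(nil)         # raises if the object is inconsistent, otherwise returns silently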
@@ -271,9 +272,10 @@ module Tinycus

  end

-
-
-
+ # Tests a Tr object and pokes and prods it to see if it seems OK.
+ # Raises an exception if it fails. Otherwise just returns silently.
+ # If not nil, then the alphabet argument should give a list of characters that are allowed to exist in the output.
+ def self_test(alphabet) # :nodoc:
  a,b = @orig_tables
  if self.apply(a)!=b then raise "error in self_test, applying me to original a does not give original b" end
  if self.apply(b)!=b then raise "error in self_test, applying me to original b does not give original b" end
@@ -442,6 +444,7 @@ module Tinycus
  return @@prep_remove_accents_from_greek.apply(s)
  end

+ # Slow.
  def Tr.remove_macrons_and_breves(s)
  # This can't be implemented using my fast method, because most of these are composed characters.
  if !(s.kind_of?(String)) then return s end
@@ -500,12 +503,11 @@ a.each { |c|
  =end
  end

+ # Changes a macronized string to one that looks like this: ἕννυ_μι.
+ # We don't handle grave and circumflex accents, but those don't occur in dictionary headers with macrons.
+ # For an IfMows object in Ifthimos, this can be done using stringify(macronization:'underbar').
  def Tr.macronized_to_underbar_style(s)
- # Changes a macronized string to one that looks like this: ἕννυ_μι.
  # The lists in the regexes are generated by the commented-out scripts below, and are not actually totally comprehensive.
- # We don't handle grave and circumflex accents, but those don't occur in dictionary headers with macrons.
- # For an IfMows object in Ifthimos, this can be done using stringify(macronization:'underbar').
- # ---
  x = s.clone
  x = x.gsub(/(ϊ̄)/) { "#{Tinycus::Tr.remove_macrons_and_breves($1)}_" } # iota with diaresis and macron, occurs in ἀϊκή
  # First handle letters that have both a macron and a breve, treating them as if they weren't macronized at all:
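For the macron conversion documented above, a one-line sketch; the macronized input is my own illustration, chosen to match the ἕννυ_μι example in the comment:

    Tinycus::Tr.macronized_to_underbar_style('ἕννῡμι')   # expected: 'ἕννυ_μι'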
@@ -542,7 +544,7 @@ print a.join('|'),"--\n"
  =end
  end

- def Tr.get_greek_collation_tr
+ def Tr.get_greek_collation_tr # :nodoc:
  if @@prep_greek_to_collation_form.nil? then
  @@prep_greek_to_collation_form = Tr.collation_form('el')
  end
@@ -557,10 +559,10 @@ print a.join('|'),"--\n"
  return @@prep_greek_to_collation_form.apply(s)
  end

+ # Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method.
+ # The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
+ # locale will also remove most accents and macrons from Latin characters, but will miss some cases like Czech, and will not handle Cyrillic.
  def Tr.remove_accents(locale)
-
-
-
  t = {
  "el"=>[
  "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸῤᾆᾄᾂᾁᾇᾅᾃᾍᾡ",
@@ -573,11 +575,11 @@ print a.join('|'),"--\n"
  return result
  end

+ # Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method. Gives a form that
+ # can be alphabetized properly.
+ # The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
+ # locale will also produce correct results for most Latin-script words, will miss some cases like Czech, and will not handle Cyrillic.
  def Tr.collation_form(locale)
-
-
-
-
  t = {
  "el"=>[
  "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸῤᾆᾄᾂᾁᾇᾅᾃΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩςᾍ",
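A sketch of the two factory methods documented above; the concrete results in the comments are assumptions based on each method's stated purpose:

    strip = Tinycus::Tr.remove_accents('el')
    strip.apply('ἄνθρωπος')    # expected to give the unaccented 'ανθρωπος'

    coll = Tinycus::Tr.collation_form('el')
    coll.apply('Ἔρως') <=> coll.apply('ζωή')   # => -1 if ε collates before ζ, as expected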
@@ -590,15 +592,14 @@ print a.join('|'),"--\n"
  return result
  end

+ # E.g., Tr.add_breathing_to_character('α','rough') gives 'ἁ'.
  def Tr.add_breathing_to_character(c,what)
  plain,d = Tinycus.disassemble_greek_char(c)
  d['breathing'] = what
  return Tinycus.assemble_greek_char(plain,d)
  end

-
-
- def Tr.run_tests
+ def Tr.run_tests # :nodoc:
  # to execute this, do a "make test_tr"
  ['el'].each { |locale|
  tr = Tinycus::Tr.remove_accents(locale)
@@ -613,10 +614,10 @@ print a.join('|'),"--\n"

  class Tinycus::Script

+ # Script can be 'latin', 'greek', or 'hebrew'.
+ # C can be both, lowercase, or uppercase.
+ # For scripts that don't have case, c is ignored.
  def Script.alphabet(script,c:'both')
-
-
-
  t = {
  'latin'=>{'has_case'=>true},
  'greek'=>{'has_case'=>true},
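Usage sketch for Script.alphabet; the return values suggested in the comments are assumptions inferred from the helper shown in the next hunk:

    Tinycus::Script.alphabet('greek', c:'lowercase')  # presumably 'αβγδεζηθικλμνξοπρστυφχψω', possibly plus ς
    Tinycus::Script.alphabet('latin')                 # presumably the lowercase and uppercase Latin letters together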
@@ -633,7 +634,7 @@ print a.join('|'),"--\n"
  die("illegal value of c=#{c}, must be both, lowercase, or uppercase")
  end

- def Script.alphabet_helper(script,include_lc_only_chars)
+ def Script.alphabet_helper(script,include_lc_only_chars) # :nodoc:
  if script=='latin' then return 'abcdefghijklmnopqrstuvwxyz' end
  if script=='greek' then
  result = 'αβγδεζηθικλμνξοπρστυφχψω'
@@ -651,8 +652,9 @@ print a.join('|'),"--\n"

  class Tinycus::Cleanup

+ # Designed for external data sources that can have all kinds of nasty crap in them. Slow, thorough, silent, and brutal.
+ # Attempts to eliminate the traces of incomplete beta code conversion that are found in some Project Perseus XML files.
  def Cleanup.clean_up_grotty_greek(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
- # Designed for external data sources that can have all kinds of nasty crap in them. Slow, thorough, silent, and brutal.
  a = s.split(/(\s+)/) # returns a string in which even indices are words, odd indices are whitespace
  b = []
  0.upto(a.length-1) { |i|
@@ -676,8 +678,8 @@ print a.join('|'),"--\n"
  return s
  end

+ # Like clean_up_grotty_greek, but works on a single word.
  def Cleanup.clean_up_grotty_greek_one_word(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
- # This works on a single word.
  s = s.unicode_normalize(:nfc)
  s = Cleanup.clean_up_greek_combining_characters(s,allow_latin:allow_latin)
  # In Perseus's Polybius, they have bracketed text sometimes. In their system, this should probably be a separate punctuation token.
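A sketch of the cleanup entry point documented above; the input string is made up, reusing the stray-tilde example (ἀργει~ος) mentioned later in this diff:

    dirty = 'ἀργει~ος κτλ .'
    Tinycus::Cleanup.clean_up_grotty_greek(dirty)
    # With the default options this normalizes to NFC, repairs beta-code residue where it can,
    # and standardizes punctuation; the exact output string is not asserted here.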
@@ -705,7 +707,7 @@ print a.join('|'),"--\n"
  return s
  end

- def Cleanup.clean_up_greek_combining_characters(s,allow_latin:false)
+ def Cleanup.clean_up_greek_combining_characters(s,allow_latin:false) # :nodoc:
  combining_comma_above = [787].pack('U')
  combining_acute_accent = [769].pack('U')
  greek_koronis = [8125].pack('U')
@@ -735,8 +737,7 @@ print a.join('|'),"--\n"
  return s
  end

- def Cleanup.clean_up_greek_beta_code(s)
- # This was for when I mistakenly used old beta code version of project perseus.
+ def Cleanup.clean_up_greek_beta_code(s) # :nodoc:
  # Even with perseus 2.1, some stuff seems to come through that looks like beta code, e.g., ἀργει~ος.
  # https://github.com/PerseusDL/treebank_data/issues/30
  s = s.sub(/\((.)/) { $1.tr("αειουηω","ἁἑἱὁὑἡὡ") }
@@ -751,8 +752,8 @@ print a.join('|'),"--\n"
  return s
  end

+ # Works on any string, doesn't have to be a single word. Standardize elision character and middle dot/ano teleia.
  def Cleanup.standardize_greek_punctuation(s)
- # Works on any string, doesn't have to be a single word. Standardize elision character and middle dot/ano teleia.
  # Perseus and Monro/Allen write ρ with breathing mark instead of ρ᾽ when there's elision:
  s = s.gsub(/(?<=[[:alpha:]])[ῤῥ](?![[:alpha:]])/,'ρ᾽')
  # ... Note that we do need to reinsert the breathing mark, or else we lose the info needed to do accurate lemmatization. Cf. Spelling module.
@@ -1013,26 +1014,31 @@ end
  JSON
  @@beta_code_conversion = nil

- def Tinycus.beta_code_conversion_table
- if @@beta_code_conversion.nil? then
+ def Tinycus.beta_code_conversion_table # :nodoc:
+ if @@beta_code_conversion.nil? then
+ @@beta_code_conversion=JSON.parse(@@beta_code_conversion_json)
+ @@beta_code_conversion.freeze
+ end
  return @@beta_code_conversion
  end

+ # Converts a unicode character to beta code. The input must be utf8/nfc.
  def Tinycus.greek_char_unicode_to_beta_code(u)
  b = Tinycus.beta_code_conversion_table()[0][u]
  if !b.nil? then return b else return u end # most failures will just be whitespace, punctuation, etc.
  end

+ # Converts a character from beta code to unicode.
  def Tinycus.greek_char_beta_code_to_unicode(b)
  b = Tinycus.canonicalize_char_greek_beta_code(b)
  u = Tinycus.beta_code_conversion_table()[1][b]
  if !u.nil? then return u else return b end
  end

+ # Breathing normally comes after accent, but sometimes you see things in the wild where it's reversed.
+ # I can't find any documentation for any preferred or canonical order. What circumstantial evidence I could find I put into the WP article.
+ # Note that the order of |+ doesn't matter, because the same letter can't have both.
  def Tinycus.canonicalize_char_greek_beta_code(b)
- # Breathing normally comes after accent, but sometimes you see things in the wild where it's reversed.
- # I can't find any documentation for any preferred or canonical order. What circumstantial evidence I could find I put into the WP article.
- # Note that the order of |+ doesn't matter, because the same letter can't have both.
  ")(/\\=|+&'".chars.each { |c|
  if b=~/(.*)#{Regexp::quote(c)}(.*)/ then b = $1+$2+c end
  }
@@ -1040,6 +1046,7 @@ JSON
  return b
  end

+ # Converts a string from utf8/nfc to beta code.
  def Tinycus.greek_unicode_to_beta_code(u)
  u = Tinycus::Cleanup.clean_up_grotty_greek(u,allow_latin:true,clean_perseus:true,standardize_punctuation:false)
  # ... the conversion below will not work on unicode that isn't done cleanly and according to modern standards
@@ -1051,6 +1058,7 @@ JSON
  return b
  end

+ # Converts a string from beta code to unicode.
  def Tinycus.greek_beta_code_to_unicode(b)
  # This implementation will be kind of slow because it does regex replacements in place.
  b = b.clone
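Round-trip sketch for the string-level beta-code converters documented above; the beta-code spelling in the comment is my assumption about the conversion table, not taken from the gem:

    b = Tinycus.greek_unicode_to_beta_code('μῆνιν')   # something like "mh=nin"
    Tinycus.greek_beta_code_to_unicode(b)             # expected to round-trip back to 'μῆνιν'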
@@ -1065,13 +1073,13 @@ JSON
  return b
  end

+ # Returns [plain,d], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and d is
+ # a hash with the following keys:
+ # uppercase, diar, iota_subscript - boolean values
+ # tonal - string value: none acute grave circumflex
+ # breathing - string value: none smooth rough
+ # Doesn't handle macrons and breves. I have a function IfMows.disassemble_char in Ifthimos that does that.
  def Tinycus.disassemble_greek_char(c)
- # Returns [plain,d], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and d is
- # a hash with the following keys:
- # uppercase, diar, iota_subscript - boolean values
- # tonal - string value: none acute grave circumflex
- # breathing - string value: none smooth rough
- # Doesn't handle macrons and breves. I have a function IfMows.disassemble_char in Ifthimos that does that.
  x = Tinycus.disassemble_greek_char_binary(c)
  if x.nil? then return nil end
  plain,decor = x
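The disassemble/assemble pair documented above can be used like this; the hash values in the comments follow from the documented keys, and the last line repeats the example quoted in the Tr.add_breathing_to_character comment earlier in this diff:

    plain, d = Tinycus.disassemble_greek_char('ἄ')
    # plain => 'α'; d['breathing'] => 'smooth'; d['tonal'] => 'acute'; d['uppercase'] => false
    Tinycus.assemble_greek_char(plain, d)                  # expected to give back 'ἄ'

    Tinycus::Tr.add_breathing_to_character('α','rough')    # => 'ἁ'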
@@ -1092,9 +1100,8 @@ JSON
  return [plain,d]
  end

+ # The inverse of Tinycus.disassemble_greek_char.
  def Tinycus.assemble_greek_char(plain,d)
- # The inverse of Tinycus.disassemble_greek_char.
- # Doesn't handle macrons and breves. I have a function IfMows.assemble_char in Ifthimos that does that.
  b = 0
  b |= 0b1 if d['uppercase']
  b |= 0b10 if d['diar']
@@ -1108,7 +1115,7 @@ JSON
  return Tinycus.assemble_greek_char_hex(x)
  end

- def Tinycus.disassemble_greek_char_binary(c)
+ def Tinycus.disassemble_greek_char_binary(c) # :nodoc:
  # Returns [plain,b], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and b is
  # an integer containing a set of flags encoded in binary, as follows:
  # b |= 0b1 if d['uppercase']
@@ -1122,7 +1129,7 @@ JSON
  return @@disassemble_greek_char_hash[c]
  end

- def Tinycus.assemble_greek_char_hex(x)
+ def Tinycus.assemble_greek_char_hex(x) # :nodoc:
  # The inverse of the map in Tinycus.disassemble_greek_char_binary.
  # Accepts an input such as 'α08', where the second and third characters are the hex representation of of
  # the set of flags described in the comments in the forward map.
metadata
CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: tinycus
  version: !ruby/object:Gem::Version
- version: 1.0.6
+ version: 1.0.7
  platform: ruby
  authors:
  - Benjamin Crowell
@@ -31,6 +31,8 @@ metadata:
  source_code_uri: https://bitbucket.org/ben-crowell/tinycus
  post_install_message:
  rdoc_options:
+ - "--exclude"
+ - "(bad_combining|generating)"
  - "--main"
  - README.md
  require_paths: