tinycus 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/tinycus.rb +87 -80
  3. metadata +12 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 90849e6c1df95e50e8f6ba6818d78a8bf6352a82fc9048313a11acd7d1624bf2
4
- data.tar.gz: 5b962ac97c0d25777cf527b3a61d0c0e7ebb12c2321b690eca724907bd692de7
3
+ metadata.gz: 83d0a3e5d3f764901829858fc5a1b9160577e46f5e277504291c21225c2a38a0
4
+ data.tar.gz: da3dece01b93b157e1e36d7e07b27ead0f55dab621101af4444e716672028323
5
5
  SHA512:
6
- metadata.gz: 6af58dc8e3928c473e96416b20fee0893aba4b9abf3838f9689e5b95326eb74c80986a6fa09d47c3ca8240b00d2c17609d2b1adbdc74e86ac98055889ca9aff7
7
- data.tar.gz: a5e5a4cad6fbfae7fe6588864cc3607eec471b0c0db6e8adfc3657c44826b63696cac92c410423ef62fe96f3e618e0a476e8b89d764404115a68fc03dc7a8336
6
+ metadata.gz: 16fb025b30abf1d6650aa2a76108117a88e30e5d5b524fc8aae83aca1531b62a2b4baab4da9167b673636cae413b8225fc236772185c9c1063bc4a31bc1321c3
7
+ data.tar.gz: e0cc0963f112a25e782875ecb179e035a0dabb1a6d20b2e3b59d0b2c5584bfb0bbe1d9b41c275dac631b7c520822a1e34b1c1340184a64f27bda52d403c3dd4b
data/tinycus.rb CHANGED
@@ -4,45 +4,51 @@ require 'json'
4
4
 
5
5
  module Tinycus
6
6
 
7
- # The four "alpha_" functions work on Greek and English, also most Latin characters; see comments in Tr.get_greek_collation_tr.
7
+ =begin rdoc
8
+ The four "alpha_" functions work on Greek and English, also most Latin characters; see comments in Tr.get_greek_collation_tr.
9
+ =end
8
10
 
11
+ # Sort a list of strings in alphabetical order.
9
12
  def Tinycus.alpha_sort(l,n:false)
10
13
  return Tinycus.sort(l,Tinycus.alpha_collation,n:false)
11
14
  end
12
15
 
16
+ # Tests two strings for equality in alphabetical order, returns a boolean.
13
17
  def Tinycus.alpha_equal(a,b,n:true)
14
18
  return (Tinycus.alpha_compare(a,b,n:n)==0)
15
19
  end
16
20
 
21
+ # An alphabetical <=> function.
17
22
  def Tinycus.alpha_compare(a,b,n:true)
18
23
  # return (Tinycus::Tr.remove_accents_from_greek(a,n:n).downcase <=> Tinycus::Tr.remove_accents_from_greek(b,n:n).downcase)
19
24
  collation_tr = Tinycus.alpha_collation
20
25
  return collation_tr.apply(a,n:n) <=> collation_tr.apply(b,n:n)
21
26
  end
22
27
 
23
- def Tinycus.alpha_collation
28
+ def Tinycus.alpha_collation() # :nodoc:
24
29
  return Tinycus::Tr.get_greek_collation_tr
25
30
  end
26
31
 
32
+ # synonym of Tinycus.contains_vowel, for readability when using it on a single character
27
33
  def Tinycus.is_vowel(c)
28
- # synonym of Tinycus.contains_vowel, for readability when using it on a single character
29
34
  return Tinycus.contains_vowel(c)
30
35
  end
31
36
 
37
+ # Works for Greek and Latin; considers y to be a vowel; doesn't handle stuff like Welsh w.
32
38
  def Tinycus.contains_vowel(s)
33
- # works for Greek and Latin; considers y to be a vowel; doesn't handle stuff like Welsh w
34
39
  if Tinycus::Tr.remove_accents_from_greek(s).downcase.match?(/[αειουηωaeiouyæ]/) then return true else return false end
35
40
  end
36
41
 
37
42
  # ---
38
43
 
44
+ # Like alpha_sort, but more general, using any Tr object to define a collation order.
39
45
  def Tinycus.sort(l,collation_tr,n:false)
40
46
  p = l.clone # This does work on a list of strings: ruby -e "a=['p','q']; b=a.clone; b[1]='x'; print a"
41
47
  p.sort_by! { |x| collation_tr.apply(x,n:n) } # ruby's sort_by! only applies the block once to each element, to form an index
42
48
  return p
43
49
  end
44
50
 
45
- def Tinycus.run_tests
51
+ def Tinycus.run_tests # :nodoc:
46
52
  # removing accents:
47
53
  tests = [
48
54
  ['',''],
@@ -125,23 +131,13 @@ module Tinycus
125
131
  end
126
132
  }
127
133
 
134
+ Tinycus::Tr.run_tests()
135
+ Tinycus::MiscGreek.run_tests()
128
136
 
129
137
  end
130
138
 
131
- # fixme:
132
- # Determine byte order and make sure we convert to native (i.e., BE rather than LE if we're on a big-endian machine).
133
- @@bloater = Encoding::Converter.new('UTF-8','UTF-32LE')
134
- @@shrinker = Encoding::Converter.new('UTF-32LE','UTF-8')
135
-
136
- def Tinycus.bloat(s) # private method
137
- return @@bloater.convert(s)
138
- end
139
- def Tinycus.shrink(s) # private method
140
- return @@shrinker.convert(s)
141
- end
142
-
143
139
  class Tinycus::MiscGreek
144
- def MiscGreek.run_tests
140
+ def MiscGreek.run_tests # :nodoc:
145
141
  print "testing MiscGreek.add_second_accent...\n"
146
142
  [['θεμείλια','θεμείλιά'],
147
143
  ['πόλεμονδε','πόλεμόνδε'],
@@ -155,9 +151,9 @@ module Tinycus
155
151
  }
156
152
  end
157
153
 
154
+ # A rough approximation, for cases where we don't need perfect precision and either don't have Ifthimos's syllabification module
155
+ # or don't want the performance hit.
158
156
  def MiscGreek.estimate_syll_count(x)
159
- # A rough approximation, for cases where we don't need perfect precision and either don't have Ransom's greek/syllab.rb
160
- # or don't want the performance hit.
161
157
  x = x.downcase
162
158
  x = x.gsub(/[ϊ]/,'e') # prevent it from being misinterpreted as a diphthong after the diaeresis is stripped below
163
159
  x = Tr.remove_accents_from_greek(x)
@@ -181,10 +177,10 @@ module Tinycus
181
177
  return Tr.remove_tonal_accents_from_greek(x)!=x
182
178
  end
183
179
 
184
- def MiscGreek.add_second_accent(w_orig)
185
- # e.g., if w is θεμείλια, returns θεμείλιά
186
- w = w_orig.clone # shallow copy, works on a string; I'm not clear in why this is necessary, but it is; modification to w_orig is visible in
187
- # output of make test_misc_greek
180
+ # Modifies a word as would be appropriate if it was followed by an enclitic. E.g., if the input is θεμείλια, returns θεμείλιά.
181
+ def MiscGreek.add_second_accent(w)
182
+ w = w.clone # shallow copy, works on a string; I'm not clear on why this is necessary, but it is; modification to w is visible in
183
+ # output of make test_misc_greek
188
184
  if w=~/(.*)δε$/ then
189
185
  stem = $1
190
186
  nsyll = MiscGreek.estimate_syll_count(stem)
@@ -202,14 +198,14 @@ module Tinycus
202
198
  return w
203
199
  end
204
200
 
201
+ # For a given word, try to predict every possible form it could take in a text, including
202
+ # both possible capitalizations, acute/grave, and multiple accents.
203
+ # The word w should already have been converted into a canonical dictionary form (typically a single acute accent).
204
+ # This is not 100% perfect, mainly because the rules for multiple accents are complicated and Tinycus doesn't include a full
205
+ # syllabification algorithm.
206
+ # I tested this as a round-trip on all multiply accented words occurring in Homer. The following three words were the only
207
+ # ones where it failed: κάλλίον, σταφύλῇ, ὕπὸ.
205
208
  def MiscGreek.all_cases_and_accents(w)
206
- # For a given word, try to predict every possible form it could take in a text, including
207
- # both possible capitalizations, acute/grave, and multiple accents.
208
- # The word w should already have been converted into a canonical dictionary form (typically a single acute accent).
209
- # This is not 100% perfect, mainly because the rules for multiple accents are complicated and Tinycus doesn't include a full
210
- # syllabification algorithm.
211
- # I tested this as a round-trip on all multiply accented words occurring in Homer. The following three words were the only
212
- # ones where it failed: κάλλίον, σταφύλῇ, ὕπὸ.
213
209
  forms = [w.downcase]
214
210
  forms = forms+forms.map { |x| Tr.greek_acute_to_grave(x) }
215
211
  forms = forms+forms.map { |x| MiscGreek.add_second_accent(x) }
@@ -235,10 +231,11 @@ module Tinycus
235
231
  @@greek_acute_to_grave = nil
236
232
  @@prep_greek_to_collation_form = nil
237
233
 
234
+ # Initialize a data structure that represents an action equivalent to String#tr(a,b), but faster.
235
+ # Including redundant characters or unchanged characters is harmless and is fixed in this constructor; it does not cause
236
+ # any performance hit when the object is actually used. The initializer takes linear time and memory in the size of
237
+ # the inputs.
238
238
  def initialize(a,b)
239
- # Initialize a data structure that represents an action equivalent to string.tr(a,b), but faster.
240
- # Including redudant characters is harmless and is fixed in this constructor; it does not cause
241
- # any performance hit when the object is actually used.
242
239
  if a.length!=b.length then raise "lengths unequal, #{a.length} and #{b.length}" end
243
240
  @l = a.length
244
241
  @orig_tables = [a.clone,b.clone] # stash them away for testing purposes
@@ -246,7 +243,7 @@ module Tinycus
246
243
  @h = {}
247
244
  0.upto(@l-1) { |i|
248
245
  p,q = a[i],b[i]
249
- @h[p] = q
246
+ if p!=q then @h[p] = q end
250
247
  }
251
248
  @h.freeze
252
249
 
@@ -254,9 +251,13 @@ module Tinycus
254
251
 
255
252
  attr_reader :l,:a,:b
256
253
 
254
+ # Takes a predefined Tr object and uses it to perform the equivalent of String#tr. Takes O(1) time in the length of
255
+ # the translation list.
256
+ # This function tends to be a bottleneck for performance in real-world applications. I tried several algorithms.
257
+ # See notes in comments at top of scripts/benchmark.rb.
258
+ # Something like GNU gperf is theoretically superior in certain ways (e.g., avoiding the theoretical possibility of
259
+ # a hash collision).
257
260
  def apply(s,n:false)
258
- # This function tends to be a bottleneck for performance in real-world applications. I tried several algorithms.
259
- # See notes in comments at top of scripts/benchmark.rb.
260
261
 
261
262
  if n then s = s.unicode_normalize(:nfc) end # 30% performance hit, not necessary if input has already been normalized
262
263
 
@@ -271,9 +272,10 @@ module Tinycus
271
272
 
272
273
  end
273
274
 
274
- def self_test(alphabet)
275
- # Raises an exception if it fails. Otherwise just returns silently.
276
- # If not nil, then the alphabet parameter gives a list of characters that are allowed to exist in the output.
275
+ # Tests a Tr object and pokes and prods it to see if it seems OK.
276
+ # Raises an exception if it fails. Otherwise just returns silently.
277
+ # If not nil, then the alphabet argument should give a list of characters that are allowed to exist in the output.
278
+ def self_test(alphabet) # :nodoc:
277
279
  a,b = @orig_tables
278
280
  if self.apply(a)!=b then raise "error in self_test, applying me to original a does not give original b" end
279
281
  if self.apply(b)!=b then raise "error in self_test, applying me to original b does not give original b" end
@@ -442,6 +444,7 @@ module Tinycus
442
444
  return @@prep_remove_accents_from_greek.apply(s)
443
445
  end
444
446
 
447
+ # Slow.
445
448
  def Tr.remove_macrons_and_breves(s)
446
449
  # This can't be implemented using my fast method, because most of these are composed characters.
447
450
  if !(s.kind_of?(String)) then return s end
@@ -500,12 +503,11 @@ a.each { |c|
500
503
  =end
501
504
  end
502
505
 
506
+ # Changes a macronized string to one that looks like this: ἕννυ_μι.
507
+ # We don't handle grave and circumflex accents, but those don't occur in dictionary headers with macrons.
508
+ # For an IfMows object in Ifthimos, this can be done using stringify(macronization:'underbar').
503
509
  def Tr.macronized_to_underbar_style(s)
504
- # Changes a macronized string to one that looks like this: ἕννυ_μι.
505
510
  # The lists in the regexes are generated by the commented-out scripts below, and are not actually totally comprehensive.
506
- # We don't handle grave and circumflex accents, but those don't occur in dictionary headers with macrons.
507
- # For an IfMows object in Ifthimos, this can be done using stringify(macronization:'underbar').
508
- # ---
509
511
  x = s.clone
510
512
  x = x.gsub(/(ϊ̄)/) { "#{Tinycus::Tr.remove_macrons_and_breves($1)}_" } # iota with diaeresis and macron, occurs in ἀϊκή
511
513
  # First handle letters that have both a macron and a breve, treating them as if they weren't macronized at all:
@@ -542,7 +544,7 @@ print a.join('|'),"--\n"
542
544
  =end
543
545
  end
544
546
 
545
- def Tr.get_greek_collation_tr
547
+ def Tr.get_greek_collation_tr # :nodoc:
546
548
  if @@prep_greek_to_collation_form.nil? then
547
549
  @@prep_greek_to_collation_form = Tr.collation_form('el')
548
550
  end
@@ -557,10 +559,10 @@ print a.join('|'),"--\n"
557
559
  return @@prep_greek_to_collation_form.apply(s)
558
560
  end
559
561
 
562
+ # Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method.
563
+ # The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
564
+ # locale will also remove most accents and macrons from Latin characters, but will miss some cases like Czech, and will not handle Cyrillic.
560
565
  def Tr.remove_accents(locale)
561
- # Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method.
562
- # The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
563
- # locale will also remove most accents and macrons from Latin characters, but will miss some cases like Czech, and will not handle Cyrillic.
564
566
  t = {
565
567
  "el"=>[
566
568
  "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸῤᾆᾄᾂᾁᾇᾅᾃᾍᾡ",
@@ -573,11 +575,11 @@ print a.join('|'),"--\n"
573
575
  return result
574
576
  end
575
577
 
578
+ # Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method. Gives a form that
579
+ # can be alphabetized properly.
580
+ # The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
581
+ # locale will also produce correct results for most Latin-script words, will miss some cases like Czech, and will not handle Cyrillic.
576
582
  def Tr.collation_form(locale)
577
- # Returns a Tinycus::Tr object which can then be used to act on strings using the apply() method. Gives a form that
578
- # can be alphabetized properly.
579
- # The 'el' locale is a standard thing that software like ICU uses for polytonic Greek. The object constructed with this
580
- # locale will also produce correct results for most Latin-script words, will miss some cases like Czech, and will not handle Cyrillic.
581
583
  t = {
582
584
  "el"=>[
583
585
  "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿΆΈΊΌΐάέήίΰϊϋόύώỏἀἁἂἃἄἅἆἈἉἊἌἍἎἐἑἒἓἔἕἘἙἜἝἠἡἢἣἤἥἦἧἨἩἫἬἭἮἯἰἱἲἳἴἵἶἷἸἹἼἽἾὀὁὂὃὄὅὈὉὊὋὌὍὐὑὓὔὕὖὗὙὝὠὡὢὣὤὥὦὧὨὩὫὬὭὮὯὰὲὴὶὸὺὼᾐᾑᾓᾔᾕᾖᾗᾠᾤᾦᾧᾰᾱᾳᾴᾶᾷᾸᾹῂῃῄῆῇῐῑῒῖῗῘῙῠῡῢῥῦῨῩῬῳῴῶῷῸῤᾆᾄᾂᾁᾇᾅᾃΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩςᾍ",
@@ -590,15 +592,14 @@ print a.join('|'),"--\n"
590
592
  return result
591
593
  end
592
594
 
595
+ # E.g., Tr.add_breathing_to_character('α','rough') gives 'ἁ'.
593
596
  def Tr.add_breathing_to_character(c,what)
594
597
  plain,d = Tinycus.disassemble_greek_char(c)
595
598
  d['breathing'] = what
596
599
  return Tinycus.assemble_greek_char(plain,d)
597
600
  end
598
601
 
599
-
600
-
601
- def Tr.run_tests
602
+ def Tr.run_tests # :nodoc:
602
603
  # to execute this, do a "make test_tr"
603
604
  ['el'].each { |locale|
604
605
  tr = Tinycus::Tr.remove_accents(locale)
@@ -613,10 +614,10 @@ print a.join('|'),"--\n"
613
614
 
614
615
  class Tinycus::Script
615
616
 
617
+ # Script can be 'latin', 'greek', or 'hebrew'.
618
+ # C can be both, lowercase, or uppercase.
619
+ # For scripts that don't have case, c is ignored.
616
620
  def Script.alphabet(script,c:'both')
617
- # Script can be 'latin', 'greek', or 'hebrew'.
618
- # C can be both, lowercase, or uppercase.
619
- # For scripts that don't have case, c is ignored.
620
621
  t = {
621
622
  'latin'=>{'has_case'=>true},
622
623
  'greek'=>{'has_case'=>true},
@@ -633,7 +634,7 @@ print a.join('|'),"--\n"
633
634
  die("illegal value of c=#{c}, must be both, lowercase, or uppercase")
634
635
  end
635
636
 
636
- def Script.alphabet_helper(script,include_lc_only_chars)
637
+ def Script.alphabet_helper(script,include_lc_only_chars) # :nodoc:
637
638
  if script=='latin' then return 'abcdefghijklmnopqrstuvwxyz' end
638
639
  if script=='greek' then
639
640
  result = 'αβγδεζηθικλμνξοπρστυφχψω'
@@ -651,8 +652,9 @@ print a.join('|'),"--\n"
651
652
 
652
653
  class Tinycus::Cleanup
653
654
 
655
+ # Designed for external data sources that can have all kinds of nasty crap in them. Slow, thorough, silent, and brutal.
656
+ # Attempts to eliminate the traces of incomplete beta code conversion that are found in some Project Perseus XML files.
654
657
  def Cleanup.clean_up_grotty_greek(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
655
- # Designed for external data sources that can have all kinds of nasty crap in them. Slow, thorough, silent, and brutal.
656
658
  a = s.split(/(\s+)/) # returns a string in which even indices are words, odd indices are whitespace
657
659
  b = []
658
660
  0.upto(a.length-1) { |i|
@@ -676,8 +678,8 @@ print a.join('|'),"--\n"
676
678
  return s
677
679
  end
678
680
 
681
+ # Like clean_up_grotty_greek, but works on a single word.
679
682
  def Cleanup.clean_up_grotty_greek_one_word(s,allow_latin:false,clean_perseus:true,standardize_punctuation:true)
680
- # This works on a single word.
681
683
  s = s.unicode_normalize(:nfc)
682
684
  s = Cleanup.clean_up_greek_combining_characters(s,allow_latin:allow_latin)
683
685
  # In Perseus's Polybius, they have bracketed text sometimes. In their system, this should probably be a separate punctuation token.
@@ -705,7 +707,7 @@ print a.join('|'),"--\n"
705
707
  return s
706
708
  end
707
709
 
708
- def Cleanup.clean_up_greek_combining_characters(s,allow_latin:false)
710
+ def Cleanup.clean_up_greek_combining_characters(s,allow_latin:false) # :nodoc:
709
711
  combining_comma_above = [787].pack('U')
710
712
  combining_acute_accent = [769].pack('U')
711
713
  greek_koronis = [8125].pack('U')
@@ -735,8 +737,7 @@ print a.join('|'),"--\n"
735
737
  return s
736
738
  end
737
739
 
738
- def Cleanup.clean_up_greek_beta_code(s)
739
- # This was for when I mistakenly used old beta code version of project perseus.
740
+ def Cleanup.clean_up_greek_beta_code(s) # :nodoc:
740
741
  # Even with perseus 2.1, some stuff seems to come through that looks like beta code, e.g., ἀργει~ος.
741
742
  # https://github.com/PerseusDL/treebank_data/issues/30
742
743
  s = s.sub(/\((.)/) { $1.tr("αειουηω","ἁἑἱὁὑἡὡ") }
@@ -751,8 +752,8 @@ print a.join('|'),"--\n"
751
752
  return s
752
753
  end
753
754
 
755
+ # Works on any string, doesn't have to be a single word. Standardize elision character and middle dot/ano teleia.
754
756
  def Cleanup.standardize_greek_punctuation(s)
755
- # Works on any string, doesn't have to be a single word. Standardize elision character and middle dot/ano teleia.
756
757
  # Perseus and Monro/Allen write ρ with breathing mark instead of ρ᾽ when there's elision:
757
758
  s = s.gsub(/(?<=[[:alpha:]])[ῤῥ](?![[:alpha:]])/,'ρ᾽')
758
759
  # ... Note that we do need to reinsert the breathing mark, or else we lose the info needed to do accurate lemmatization. Cf. Spelling module.
@@ -1013,26 +1014,31 @@ end
1013
1014
  JSON
1014
1015
  @@beta_code_conversion = nil
1015
1016
 
1016
- def Tinycus.beta_code_conversion_table
1017
- if @@beta_code_conversion.nil? then @@beta_code_conversion=JSON.parse(@@beta_code_conversion_json) end
1017
+ def Tinycus.beta_code_conversion_table # :nodoc:
1018
+ if @@beta_code_conversion.nil? then
1019
+ @@beta_code_conversion=JSON.parse(@@beta_code_conversion_json)
1020
+ @@beta_code_conversion.freeze
1021
+ end
1018
1022
  return @@beta_code_conversion
1019
1023
  end
1020
1024
 
1025
+ # Converts a unicode character to beta code. The input must be utf8/nfc.
1021
1026
  def Tinycus.greek_char_unicode_to_beta_code(u)
1022
1027
  b = Tinycus.beta_code_conversion_table()[0][u]
1023
1028
  if !b.nil? then return b else return u end # most failures will just be whitespace, punctuation, etc.
1024
1029
  end
1025
1030
 
1031
+ # Converts a character from beta code to unicode.
1026
1032
  def Tinycus.greek_char_beta_code_to_unicode(b)
1027
1033
  b = Tinycus.canonicalize_char_greek_beta_code(b)
1028
1034
  u = Tinycus.beta_code_conversion_table()[1][b]
1029
1035
  if !u.nil? then return u else return b end
1030
1036
  end
1031
1037
 
1038
+ # Breathing normally comes after accent, but sometimes you see things in the wild where it's reversed.
1039
+ # I can't find any documentation for any preferred or canonical order. What circumstantial evidence I could find I put into the WP article.
1040
+ # Note that the order of |+ doesn't matter, because the same letter can't have both.
1032
1041
  def Tinycus.canonicalize_char_greek_beta_code(b)
1033
- # Breathing normally comes after accent, but sometimes you see things in the wild where it's reversed.
1034
- # I can't find any documentation for any preferred or canonical order. What circumstantial evidence I could find I put into the WP article.
1035
- # Note that the order of |+ doesn't matter, because the same letter can't have both.
1036
1042
  ")(/\\=|+&'".chars.each { |c|
1037
1043
  if b=~/(.*)#{Regexp::quote(c)}(.*)/ then b = $1+$2+c end
1038
1044
  }
@@ -1040,6 +1046,7 @@ JSON
1040
1046
  return b
1041
1047
  end
1042
1048
 
1049
+ # Converts a string from utf8/nfc to beta code.
1043
1050
  def Tinycus.greek_unicode_to_beta_code(u)
1044
1051
  u = Tinycus::Cleanup.clean_up_grotty_greek(u,allow_latin:true,clean_perseus:true,standardize_punctuation:false)
1045
1052
  # ... the conversion below will not work on unicode that isn't done cleanly and according to modern standards
@@ -1051,6 +1058,7 @@ JSON
1051
1058
  return b
1052
1059
  end
1053
1060
 
1061
+ # Converts a string from beta code to unicode.
1054
1062
  def Tinycus.greek_beta_code_to_unicode(b)
1055
1063
  # This implementation will be kind of slow because it does regex replacements in place.
1056
1064
  b = b.clone
@@ -1065,13 +1073,13 @@ JSON
1065
1073
  return b
1066
1074
  end
1067
1075
 
1076
+ # Returns [plain,d], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and d is
1077
+ # a hash with the following keys:
1078
+ # uppercase, diar, iota_subscript - boolean values
1079
+ # tonal - string value: none acute grave circumflex
1080
+ # breathing - string value: none smooth rough
1081
+ # Doesn't handle macrons and breves. I have a function IfMows.disassemble_char in Ifthimos that does that.
1068
1082
  def Tinycus.disassemble_greek_char(c)
1069
- # Returns [plain,d], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and d is
1070
- # a hash with the following keys:
1071
- # uppercase, diar, iota_subscript - boolean values
1072
- # tonal - string value: none acute grave circumflex
1073
- # breathing - string value: none smooth rough
1074
- # Doesn't handle macrons and breves. I have a function IfMows.disassemble_char in Ifthimos that does that.
1075
1083
  x = Tinycus.disassemble_greek_char_binary(c)
1076
1084
  if x.nil? then return nil end
1077
1085
  plain,decor = x
@@ -1092,9 +1100,8 @@ JSON
1092
1100
  return [plain,d]
1093
1101
  end
1094
1102
 
1103
+ # The inverse of Tinycus.disassemble_greek_char.
1095
1104
  def Tinycus.assemble_greek_char(plain,d)
1096
- # The inverse of Tinycus.disassemble_greek_char.
1097
- # Doesn't handle macrons and breves. I have a function IfMows.assemble_char in Ifthimos that does that.
1098
1105
  b = 0
1099
1106
  b |= 0b1 if d['uppercase']
1100
1107
  b |= 0b10 if d['diar']
@@ -1108,7 +1115,7 @@ JSON
1108
1115
  return Tinycus.assemble_greek_char_hex(x)
1109
1116
  end
1110
1117
 
1111
- def Tinycus.disassemble_greek_char_binary(c)
1118
+ def Tinycus.disassemble_greek_char_binary(c) # :nodoc:
1112
1119
  # Returns [plain,b], where plain is a lowercase, unaccented Greek letter (α-ω, plus ς), and b is
1113
1120
  # an integer containing a set of flags encoded in binary, as follows:
1114
1121
  # b |= 0b1 if d['uppercase']
@@ -1122,7 +1129,7 @@ JSON
1122
1129
  return @@disassemble_greek_char_hash[c]
1123
1130
  end
1124
1131
 
1125
- def Tinycus.assemble_greek_char_hex(x)
1132
+ def Tinycus.assemble_greek_char_hex(x) # :nodoc:
1126
1133
  # The inverse of the map in Tinycus.disassemble_greek_char_binary.
1127
1134
  # Accepts an input such as 'α08', where the second and third characters are the hex representation of
1128
1135
  # the set of flags described in the comments in the forward map.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tinycus
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
4
+ version: 1.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Benjamin Crowell
@@ -16,7 +16,8 @@ description: "This is a ruby library to do some string functions efficiently tha
16
16
  email:
17
17
  executables: []
18
18
  extensions: []
19
- extra_rdoc_files: []
19
+ extra_rdoc_files:
20
+ - README.md
20
21
  files:
21
22
  - LICENSE
22
23
  - README.md
@@ -24,9 +25,16 @@ files:
24
25
  homepage: https://bitbucket.org/ben-crowell/tinycus
25
26
  licenses:
26
27
  - GPL-3.0-only
27
- metadata: {}
28
+ metadata:
29
+ contact_uri: http://lightandmatter.com/area4author.html
30
+ homepage_uri: https://bitbucket.org/ben-crowell/tinycus
31
+ source_code_uri: https://bitbucket.org/ben-crowell/tinycus
28
32
  post_install_message:
29
- rdoc_options: []
33
+ rdoc_options:
34
+ - "--exclude"
35
+ - "(bad_combining|generating)"
36
+ - "--main"
37
+ - README.md
30
38
  require_paths:
31
39
  - lib
32
40
  required_ruby_version: !ruby/object:Gem::Requirement