hebrew 0.1.8 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
 - data/lib/hebrew.rb +69 -4
 - metadata +3 -4
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 2 
     | 
    
         
            +
            SHA256:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 661d451b7a8c3a59707599e58b68071cb3920bb0ce91d6289971fda0aededbb2
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: fbbae45974ca57a07617095ab5fd8f7dde66919179c0fc503b71f1ef2154f4bd
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: ea15e475ab1237ae160a48b9951f35233208c940ed4ea402c364d47a3f960c174cd7c9091d38ae7a381a882f7b21b618a071bc500849ae86d26eb6b0594bec0f
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: b1083099d1f2b488beaa5916057ace041efc7daf9600fceb6927a3aa8c69de760f642c0d31cba89575645d6807a4d166ada787b4f0b6a30a6e33c47719bb587c
         
     | 
    
        data/lib/hebrew.rb
    CHANGED
    
    | 
         @@ -5,9 +5,7 @@ 
     | 
|
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            # codepoints for CP1255 nikkud
         
     | 
| 
       7 
7 
     | 
    
         
             
            NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 209, 210]
         
     | 
| 
       8 
     | 
    
         
            -
            #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly.  Is there a neater way to specify CP1255 literal?
         
     | 
| 
       9 
8 
     | 
    
         
             
            NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05bf, 0x05c1, 0x05c2]
         
     | 
| 
       10 
     | 
    
         
            -
            #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
         
     | 
| 
       11 
9 
     | 
    
         
             
            # TODO: Mac encoding
         
     | 
| 
       12 
10 
     | 
    
         | 
| 
       13 
11 
     | 
    
         
             
            FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
         
     | 
| 
         @@ -15,9 +13,39 @@ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('w 
     | 
|
| 
       15 
13 
     | 
    
         
             
            FINALS_UTF8 = ["\u05da", "\u05dd", "\u05df", "\u05e3", "\u05e5"]
         
     | 
| 
       16 
14 
     | 
    
         
             
            HEB_UTF8_START = 1424
         
     | 
| 
       17 
15 
     | 
    
         
             
            HEB_UTF8_END = 1535
         
     | 
| 
      
 16 
     | 
    
         
            +
            HEB_UTF8_XIRIK = 1460
         
     | 
| 
      
 17 
     | 
    
         
            +
            HEB_UTF8_XOLAM = 1465
         
     | 
| 
      
 18 
     | 
    
         
            +
            HEB_UTF8_QUBBUTS = 1467
         
     | 
| 
      
 19 
     | 
    
         
            +
            HEB_UTF8_SHURUK = 1468
         
     | 
| 
       18 
20 
     | 
    
         | 
| 
       19 
21 
     | 
    
         
             
            # extend String class
         
     | 
| 
       20 
22 
     | 
    
         
             
            class String
         
     | 
| 
      
 23 
     | 
    
         
            +
              def strip_hebrew
         
     | 
| 
      
 24 
     | 
    
         
            +
                case self.encoding
         
     | 
| 
      
 25 
     | 
    
         
            +
                when Encoding::UTF_8
         
     | 
| 
      
 26 
     | 
    
         
            +
                  strip_hebrew_utf8
         
     | 
| 
      
 27 
     | 
    
         
            +
                when Encoding::WINDOWS_1255 || Encoding::CP1255
         
     | 
| 
      
 28 
     | 
    
         
            +
                  strip_hebrew_cp1255
         
     | 
| 
      
 29 
     | 
    
         
            +
                end
         
     | 
| 
      
 30 
     | 
    
         
            +
              end
         
     | 
| 
      
 31 
     | 
    
         
            +
              def strip_hebrew_utf8
         
     | 
| 
      
 32 
     | 
    
         
            +
                target = ''
         
     | 
| 
      
 33 
     | 
    
         
            +
                self.each_codepoint {|cp|
         
     | 
| 
      
 34 
     | 
    
         
            +
                  unless self.class.is_codepoint_nikkud_utf8(cp) or self.is_hebrew_codepoint_utf8(cp)
         
     | 
| 
      
 35 
     | 
    
         
            +
                    target << cp.chr(Encoding::UTF_8)
         
     | 
| 
      
 36 
     | 
    
         
            +
                  end
         
     | 
| 
      
 37 
     | 
    
         
            +
                }
         
     | 
| 
      
 38 
     | 
    
         
            +
                return target
         
     | 
| 
      
 39 
     | 
    
         
            +
              end
         
     | 
| 
      
 40 
     | 
    
         
            +
              def strip_hebrew_cp1255
         
     | 
| 
      
 41 
     | 
    
         
            +
                target = ''.force_encoding('windows-1255')
         
     | 
| 
      
 42 
     | 
    
         
            +
                self.each_codepoint {|cp|
         
     | 
| 
      
 43 
     | 
    
         
            +
                  unless self.class.is_codepoint_nikkud_cp1255(cp) or self.is_hebrew_codepoint_cp1255(cp)
         
     | 
| 
      
 44 
     | 
    
         
            +
                    target << cp.chr(Encoding::CP1255) # is there a neater way?
         
     | 
| 
      
 45 
     | 
    
         
            +
                  end
         
     | 
| 
      
 46 
     | 
    
         
            +
                }
         
     | 
| 
      
 47 
     | 
    
         
            +
                return target
         
     | 
| 
      
 48 
     | 
    
         
            +
              end
         
     | 
| 
       21 
49 
     | 
    
         
             
              # this will return the string, stripped of any Hebrew nikkud characters
         
     | 
| 
       22 
50 
     | 
    
         
             
              def strip_nikkud
         
     | 
| 
       23 
51 
     | 
    
         
             
                case self.encoding
         
     | 
| 
         @@ -31,7 +59,7 @@ class String 
     | 
|
| 
       31 
59 
     | 
    
         
             
                target = ''.force_encoding('windows-1255')
         
     | 
| 
       32 
60 
     | 
    
         
             
                self.each_codepoint {|cp|
         
     | 
| 
       33 
61 
     | 
    
         
             
                  unless self.class.is_codepoint_nikkud_cp1255(cp)
         
     | 
| 
       34 
     | 
    
         
            -
                    target  
     | 
| 
      
 62 
     | 
    
         
            +
                    target << cp.chr(Encoding::CP1255) # is there a neater way?
         
     | 
| 
       35 
63 
     | 
    
         
             
                  end
         
     | 
| 
       36 
64 
     | 
    
         
             
                }
         
     | 
| 
       37 
65 
     | 
    
         
             
                return target
         
     | 
| 
         @@ -40,7 +68,7 @@ class String 
     | 
|
| 
       40 
68 
     | 
    
         
             
                target = ''
         
     | 
| 
       41 
69 
     | 
    
         
             
                self.each_codepoint {|cp|
         
     | 
| 
       42 
70 
     | 
    
         
             
                  unless self.class.is_codepoint_nikkud_utf8(cp)
         
     | 
| 
       43 
     | 
    
         
            -
                    target  
     | 
| 
      
 71 
     | 
    
         
            +
                    target << cp.chr(Encoding::UTF_8)
         
     | 
| 
       44 
72 
     | 
    
         
             
                  end
         
     | 
| 
       45 
73 
     | 
    
         
             
                }
         
     | 
| 
       46 
74 
     | 
    
         
             
                return target
         
     | 
| 
         @@ -58,6 +86,43 @@ class String 
     | 
|
| 
       58 
86 
     | 
    
         
             
                  return false
         
     | 
| 
       59 
87 
     | 
    
         
             
                end
         
     | 
| 
       60 
88 
     | 
    
         
             
              end
         
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
      
 90 
     | 
    
         
            +
              def falsehood
         
     | 
| 
      
 91 
     | 
    
         
            +
                false
         
     | 
| 
      
 92 
     | 
    
         
            +
              end
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
              # this will add matres lectionis (yods and vavs as vowels) after diacritics that denote those vowels. The result won't always be morphologically correct Hebrew, but is useful for generating mostly-likely variants users may search for, when typing inputs (almost no Hebrew users know how to produce diacritics on the keyboard).
         
     | 
| 
      
 95 
     | 
    
         
            +
              def naive_full_nikkud
         
     | 
| 
      
 96 
     | 
    
         
            +
                ret = ''
         
     | 
| 
      
 97 
     | 
    
         
            +
                prev_char = nil
         
     | 
| 
      
 98 
     | 
    
         
            +
                case self.encoding
         
     | 
| 
      
 99 
     | 
    
         
            +
                when Encoding::UTF_8
         
     | 
| 
      
 100 
     | 
    
         
            +
                  self.each_char do |c|
         
     | 
| 
      
 101 
     | 
    
         
            +
                    ret += c
         
     | 
| 
      
 102 
     | 
    
         
            +
                    ret += 'י' if c.codepoints[0] == HEB_UTF8_XIRIK
         
     | 
| 
      
 103 
     | 
    
         
            +
                    ret += 'ו' if c.codepoints[0] == HEB_UTF8_QUBBUTS
         
     | 
| 
      
 104 
     | 
    
         
            +
                    ret += 'ו' if c.codepoints[0] == HEB_UTF8_XOLAM && prev_char != 'ו'
         
     | 
| 
      
 105 
     | 
    
         
            +
                    prev_char = c
         
     | 
| 
      
 106 
     | 
    
         
            +
                  end
         
     | 
| 
      
 107 
     | 
    
         
            +
                  return ret.gsub('יי','ִי') # get rid of extraneous yods possibly added because we weren't looking ahead
         
     | 
| 
      
 108 
     | 
    
         
            +
                else
         
     | 
| 
      
 109 
     | 
    
         
            +
                  return nil # not implemented for other encodings for now.
         
     | 
| 
      
 110 
     | 
    
         
            +
                end
         
     | 
| 
      
 111 
     | 
    
         
            +
              end
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
              def any_nikkud?
         
     | 
| 
      
 114 
     | 
    
         
            +
                func = case self.encoding
         
     | 
| 
      
 115 
     | 
    
         
            +
                  when Encoding::UTF_8
         
     | 
| 
      
 116 
     | 
    
         
            +
                    :is_codepoint_nikkud_utf8
         
     | 
| 
      
 117 
     | 
    
         
            +
                  when Encoding::WINDOWS_1255 || Encoding::CP1255
         
     | 
| 
      
 118 
     | 
    
         
            +
                    :is_codepoint_nikkud_cp1255
         
     | 
| 
      
 119 
     | 
    
         
            +
                  else
         
     | 
| 
      
 120 
     | 
    
         
            +
                    :falsehood
         
     | 
| 
      
 121 
     | 
    
         
            +
                  end
         
     | 
| 
      
 122 
     | 
    
         
            +
                self.each_codepoint{|cp| return true if String.send(func, cp)}
         
     | 
| 
      
 123 
     | 
    
         
            +
                return false
         
     | 
| 
      
 124 
     | 
    
         
            +
              end
         
     | 
| 
      
 125 
     | 
    
         
            +
             
     | 
| 
       61 
126 
     | 
    
         
             
              def is_hebrew_codepoint_cp1255(cp)
         
     | 
| 
       62 
127 
     | 
    
         
             
                return ((cp > 191 && cp < 202) or [203, 204, 209, 210].include?(cp))
         
     | 
| 
       63 
128 
     | 
    
         
             
              end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: hebrew
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.2.3
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Asaf Bartov
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2020-10-20 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       13 
13 
     | 
    
         
             
            description: Some useful code to identify, transcode, and manipulate Hebrew text
         
     | 
| 
       14 
14 
     | 
    
         
             
            email: asaf.bartov@gmail.com
         
     | 
| 
         @@ -36,8 +36,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       36 
36 
     | 
    
         
             
                - !ruby/object:Gem::Version
         
     | 
| 
       37 
37 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       38 
38 
     | 
    
         
             
            requirements: []
         
     | 
| 
       39 
     | 
    
         
            -
             
     | 
| 
       40 
     | 
    
         
            -
            rubygems_version: 2.4.6
         
     | 
| 
      
 39 
     | 
    
         
            +
            rubygems_version: 3.1.2
         
     | 
| 
       41 
40 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       42 
41 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       43 
42 
     | 
    
         
             
            summary: Hebrew string manipulation
         
     |