hebrew 0.1.9 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +5 -5
  2. data/lib/hebrew.rb +67 -2
  3. metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5a8c1321cb8e5040931921ab180b86295427e64f
4
- data.tar.gz: fe567fbdb5384b967dfff2a0b787a1b403433a7a
2
+ SHA256:
3
+ metadata.gz: f51ccf60da40434c682c097f6986b2ce581351d84b51990afa5d8fe7aa43a8ee
4
+ data.tar.gz: cd79e5dac8aa79c605b6b8e43fa9c710f3d66be233f18af381e22f6a17b1940b
5
5
  SHA512:
6
- metadata.gz: 33ec8188e66991dca122aef6ecbb69a34ab566b79ede6fabce62f6de6975f97eeed2be3b4c176b74f4c1b03da3ef44283bb01017b67c2d3e25c86fb2c5619734
7
- data.tar.gz: 674d5e7ca61a186d7713d245459e4b19b856b0fdc2fc56bd42d557a7e0d0d035ba87b0e194a8f0c5a27a91ffe751d7c247b2b04dda8293b79cf6351cdbb948e3
6
+ metadata.gz: 043bd286ef937aaa988bb0434684b16b349b35f743896892a51d6fc0ccb58697e885be356061f0a2c87a7f27e617c7762b82342cd524aacd2c20876838260d2a
7
+ data.tar.gz: dfe91f87abcf3bc0605e603cd1323809d3016c495a3f9b694fb7e149f3612e9985e3bdf3d6255a4e64850e897507ac97e9bcbbbc67c2fda31cd73e3dd08e5db4
@@ -5,9 +5,7 @@
5
5
 
6
6
  # codepoints for CP1255 nikkud
7
7
  NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 209, 210]
8
- #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
8
  NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05bf, 0x05c1, 0x05c2]
10
- #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
11
9
  # TODO: Mac encoding
12
10
 
13
11
  FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
@@ -15,9 +13,39 @@ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('w
15
13
  FINALS_UTF8 = ["\u05da", "\u05dd", "\u05df", "\u05e3", "\u05e5"]
16
14
  HEB_UTF8_START = 1424
17
15
  HEB_UTF8_END = 1535
16
+ HEB_UTF8_XIRIK = 1460
17
+ HEB_UTF8_XOLAM = 1465
18
+ HEB_UTF8_QUBBUTS = 1467
19
+ HEB_UTF8_SHURUK = 1468
18
20
 
19
21
  # extend String class
20
22
  class String
23
# Returns a copy of the string with Hebrew stripped out, delegating to the
# helper that matches the receiver's encoding (strip_hebrew_utf8 or
# strip_hebrew_cp1255).
#
# Only UTF-8 and Windows-1255 (alias CP1255) are handled. For any other
# encoding no branch matches and the method returns nil.
#
# @return [String, nil] the stripped copy, or nil for unsupported encodings
def strip_hebrew
  case encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # Alternatives in a `when` clause are comma-separated; writing `A || B`
    # would be evaluated first and leave only `A` to be matched.
    strip_hebrew_cp1255
  end
end
31
# Builds a new UTF-8 string from the receiver's codepoints, leaving out every
# codepoint that either the class-level is_codepoint_nikkud_utf8 predicate or
# the instance-level is_hebrew_codepoint_utf8 predicate accepts.
#
# @return [String] a new string; the receiver is not modified
def strip_hebrew_utf8
  # Start from an unfrozen copy so appending also works when string literals
  # are frozen.
  kept = ''.dup
  each_codepoint do |codepoint|
    next if self.class.is_codepoint_nikkud_utf8(codepoint) || is_hebrew_codepoint_utf8(codepoint)

    kept << codepoint.chr(Encoding::UTF_8)
  end
  kept
end
40
# Builds a new Windows-1255 string from the receiver's codepoints, leaving out
# every codepoint that either the class-level is_codepoint_nikkud_cp1255
# predicate or the instance-level is_hebrew_codepoint_cp1255 predicate accepts.
#
# @return [String] a new Windows-1255 encoded string; the receiver is not modified
def strip_hebrew_cp1255
  # Duplicate before force_encoding so a frozen literal is never mutated.
  kept = ''.dup.force_encoding('windows-1255')
  each_codepoint do |codepoint|
    next if self.class.is_codepoint_nikkud_cp1255(codepoint) || is_hebrew_codepoint_cp1255(codepoint)

    kept << codepoint.chr(Encoding::CP1255)
  end
  kept
end
21
49
  # this will return the string, stripped of any Hebrew nikkud characters
22
50
  def strip_nikkud
23
51
  case self.encoding
@@ -58,6 +86,43 @@ class String
58
86
  return false
59
87
  end
60
88
  end
89
+
90
# Constant-false predicate used as a fallback when no real codepoint
# predicate applies.
#
# It ignores any arguments and is defined both on instances and on the
# enclosing class/object, so it can stand in wherever a one-argument predicate
# is dispatched dynamically (for example `String.send(name, codepoint)`).
#
# @return [false] always
def falsehood(*_ignored)
  false
end

# Class-level twin of #falsehood; see the note above.
#
# @return [false] always
def self.falsehood(*_ignored)
  false
end
93
+
94
# Adds matres lectionis (yods and vavs acting as vowels) after the diacritics
# that denote those vowels. The result won't always be morphologically correct
# Hebrew, but it is useful for generating the most likely variants users may
# search for when typing input (almost no Hebrew users know how to produce
# diacritics on the keyboard).
#
# Rules applied to each character of a UTF-8 string:
# * after HEB_UTF8_XIRIK, append a yod;
# * after HEB_UTF8_QUBBUTS, append a vav;
# * after HEB_UTF8_XOLAM, append a vav unless the preceding character is
#   already a vav.
#
# @return [String, nil] the expanded copy, or nil when the string is not UTF-8
def naive_full_nikkud
  return nil unless encoding == Encoding::UTF_8 # not implemented for other encodings for now

  expanded = ''.dup # unfrozen buffer; appended in place instead of reallocating with +=
  previous = nil
  each_char do |char|
    expanded << char
    case char.ord
    when HEB_UTF8_XIRIK
      expanded << 'י'
    when HEB_UTF8_QUBBUTS
      expanded << 'ו'
    when HEB_UTF8_XOLAM
      expanded << 'ו' unless previous == 'ו'
    end
    previous = char
  end
  # Get rid of extraneous yods possibly added because the loop does not look ahead.
  expanded.gsub("\u05b4יי", "\u05b4י")
end
112
+
113
# Reports whether the string contains at least one nikkud codepoint, as judged
# by the class-level predicate for the receiver's encoding
# (String.is_codepoint_nikkud_utf8 or String.is_codepoint_nikkud_cp1255).
#
# Strings in any other encoding have no applicable predicate and are reported
# as containing no nikkud.
#
# @return [Boolean]
def any_nikkud?
  predicate = case encoding
              when Encoding::UTF_8
                :is_codepoint_nikkud_utf8
              when Encoding::WINDOWS_1255, Encoding::CP1255
                :is_codepoint_nikkud_cp1255
              end
  # No predicate for this encoding: answer directly rather than dispatching
  # to a fallback method on the class.
  return false unless predicate

  each_codepoint.any? { |codepoint| String.send(predicate, codepoint) }
end
125
+
61
126
  def is_hebrew_codepoint_cp1255(cp)
62
127
  return ((cp > 191 && cp < 202) or [203, 204, 209, 210].include?(cp))
63
128
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-21 00:00:00.000000000 Z
11
+ date: 2020-10-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -36,8 +36,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
36
36
  - !ruby/object:Gem::Version
37
37
  version: '0'
38
38
  requirements: []
39
- rubyforge_project:
40
- rubygems_version: 2.4.6
39
+ rubygems_version: 3.1.2
41
40
  signing_key:
42
41
  specification_version: 4
43
42
  summary: Hebrew string manipulation