hebrew 0.1.8 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +5 -5
  2. data/lib/hebrew.rb +69 -4
  3. metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: dccbbe3d1173be26a4a0bf0527669c20c054aa07
4
- data.tar.gz: d03cd42416c578729d69a5d0f4200f82950d0f8f
2
+ SHA256:
3
+ metadata.gz: 661d451b7a8c3a59707599e58b68071cb3920bb0ce91d6289971fda0aededbb2
4
+ data.tar.gz: fbbae45974ca57a07617095ab5fd8f7dde66919179c0fc503b71f1ef2154f4bd
5
5
  SHA512:
6
- metadata.gz: 2c3027be5c73cd44ccade52be6d9f0ac796814e13c5d1a037d0a9e96436e1e0e28a10c85881f375b3e628a3049926ff9f07e5d9168d872ed5686bc428729b84c
7
- data.tar.gz: 38ef10f3f1c65988f9b18f8403c49bf15c27bed7cbbd1b6cc88f5aeef87e0482b47eeea0d2f010ee391f0054ef4d6501a558be100a31c29fae3818d4ce2d3b07
6
+ metadata.gz: ea15e475ab1237ae160a48b9951f35233208c940ed4ea402c364d47a3f960c174cd7c9091d38ae7a381a882f7b21b618a071bc500849ae86d26eb6b0594bec0f
7
+ data.tar.gz: b1083099d1f2b488beaa5916057ace041efc7daf9600fceb6927a3aa8c69de760f642c0d31cba89575645d6807a4d166ada787b4f0b6a30a6e33c47719bb587c
@@ -5,9 +5,7 @@
5
5
 
6
6
  # codepoints for CP1255 nikkud
7
7
  NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 209, 210]
8
- #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
8
  NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05bf, 0x05c1, 0x05c2]
10
- #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
11
9
  # TODO: Mac encoding
12
10
 
13
11
  FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
@@ -15,9 +13,39 @@ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('w
15
13
  FINALS_UTF8 = ["\u05da", "\u05dd", "\u05df", "\u05e3", "\u05e5"]
16
14
  HEB_UTF8_START = 1424
17
15
  HEB_UTF8_END = 1535
16
# Decimal code points of individual Hebrew points. XIRIK, XOLAM and QUBBUTS are
# compared against character code points in String#naive_full_nikkud.
+ HEB_UTF8_XIRIK = 1460 # 0x05B4
17
+ HEB_UTF8_XOLAM = 1465 # 0x05B9
18
+ HEB_UTF8_QUBBUTS = 1467 # 0x05BB
19
+ HEB_UTF8_SHURUK = 1468 # 0x05BC
18
20
 
19
21
  # extend String class
20
22
  class String
23
# Returns a copy of the string without the codepoints that the
# encoding-specific helper treats as Hebrew (letters and nikkud).
#
# @return [String, nil] the stripped copy for UTF-8 and Windows-1255 (CP1255)
#   strings; nil for any other encoding, which is not handled here.
def strip_hebrew
  case encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  # `when` takes a comma-separated list of candidates; `A || B` is evaluated
  # first and collapses to A, so B would never take part in the match.
  when Encoding::WINDOWS_1255, Encoding::CP1255
    strip_hebrew_cp1255
  end
end
31
# Builds a new UTF-8 string from the receiver's codepoints, leaving out every
# codepoint accepted by the class-level is_codepoint_nikkud_utf8 or by the
# instance-level is_hebrew_codepoint_utf8.
def strip_hebrew_utf8
  each_codepoint.each_with_object(+'') do |codepoint, kept|
    next if self.class.is_codepoint_nikkud_utf8(codepoint)
    next if is_hebrew_codepoint_utf8(codepoint)

    kept << codepoint.chr(Encoding::UTF_8)
  end
end
40
# Builds a new windows-1255 string from the receiver's codepoints, leaving out
# every codepoint accepted by the class-level is_codepoint_nikkud_cp1255 or by
# the instance-level is_hebrew_codepoint_cp1255.
def strip_hebrew_cp1255
  kept = String.new(encoding: 'windows-1255')
  each_codepoint do |codepoint|
    flagged = self.class.is_codepoint_nikkud_cp1255(codepoint) || is_hebrew_codepoint_cp1255(codepoint)
    kept << codepoint.chr(Encoding::CP1255) unless flagged
  end
  kept
end
21
49
  # this will return the string, stripped of any Hebrew nikkud characters
22
50
  def strip_nikkud
23
51
  case self.encoding
@@ -31,7 +59,7 @@ class String
31
59
  target = ''.force_encoding('windows-1255')
32
60
  self.each_codepoint {|cp|
33
61
  unless self.class.is_codepoint_nikkud_cp1255(cp)
34
- target += cp.chr(Encoding::CP1255) # is there a neater way?
62
+ target << cp.chr(Encoding::CP1255) # is there a neater way?
35
63
  end
36
64
  }
37
65
  return target
@@ -40,7 +68,7 @@ class String
40
68
  target = ''
41
69
  self.each_codepoint {|cp|
42
70
  unless self.class.is_codepoint_nikkud_utf8(cp)
43
- target += cp.chr(Encoding::UTF_8)
71
+ target << cp.chr(Encoding::UTF_8)
44
72
  end
45
73
  }
46
74
  return target
@@ -58,6 +86,43 @@ class String
58
86
  return false
59
87
  end
60
88
  end
89
+
90
# Always answers false. Any arguments are accepted and ignored, so the method
# can be called with the same argument list as a one-argument codepoint
# predicate as well as with none.
#
# @return [false]
def falsehood(*_ignored)
  false
end
93
+
94
+ # this will add matres lectionis (yods and vavs as vowels) after diacritics that denote those vowels. The result won't always be morphologically correct Hebrew, but is useful for generating the most likely variants users may search for when typing input (almost no Hebrew users know how to produce diacritics on the keyboard).
95
# Adds matres lectionis after vowel points: a yod after every xirik, a vav
# after every qubbuts, and a vav after every xolam whose preceding character is
# not already a vav. The result is a naive approximation, not guaranteed to be
# morphologically correct Hebrew.
#
# @return [String, nil] the expanded copy for UTF-8 strings; nil for any other
#   encoding (not implemented for other encodings for now)
def naive_full_nikkud
  return nil unless encoding == Encoding::UTF_8

  # Append in place: `+=` would allocate a fresh string for every character.
  ret = +''
  prev_char = nil
  each_char do |c|
    point = c.ord # each_char yields one character, so this is its only codepoint
    ret << c
    case point
    when HEB_UTF8_XIRIK
      ret << 'י'
    when HEB_UTF8_QUBBUTS
      ret << 'ו'
    when HEB_UTF8_XOLAM
      ret << 'ו' unless prev_char == 'ו'
    end
    prev_char = c
  end
  # Get rid of extraneous yods possibly added because the loop does not look ahead.
  # NOTE(review): this also rewrites double yods that were already in the input,
  # and the replacement appears to carry a second xirik - confirm this is intended.
  ret.gsub('יי','ִי')
end
112
+
113
# Tells whether the string contains at least one nikkud codepoint, as judged
# by the String class-level predicate that matches the string's encoding.
#
# @return [Boolean] true when any codepoint satisfies the predicate; false
#   otherwise, including for encodings other than UTF-8 and Windows-1255
def any_nikkud?
  predicate = case encoding
              when Encoding::UTF_8
                :is_codepoint_nikkud_utf8
              # comma list rather than `A || B`, which collapses to A before matching
              when Encoding::WINDOWS_1255, Encoding::CP1255
                :is_codepoint_nikkud_cp1255
              end
  # No predicate exists for this encoding. Dispatching :falsehood through
  # String.send would raise NoMethodError for non-empty strings, because
  # falsehood is defined as an instance method rather than on the String class.
  return false unless predicate

  each_codepoint.any? { |cp| String.send(predicate, cp) }
end
125
+
61
126
  def is_hebrew_codepoint_cp1255(cp)
62
127
  return ((cp > 191 && cp < 202) or [203, 204, 209, 210].include?(cp))
63
128
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-04 00:00:00.000000000 Z
11
+ date: 2020-10-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -36,8 +36,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
36
36
  - !ruby/object:Gem::Version
37
37
  version: '0'
38
38
  requirements: []
39
- rubyforge_project:
40
- rubygems_version: 2.4.6
39
+ rubygems_version: 3.1.2
41
40
  signing_key:
42
41
  specification_version: 4
43
42
  summary: Hebrew string manipulation