hebrew 0.2.0 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +5 -5
  2. data/lib/hebrew.rb +49 -2
  3. metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 3d90717862d817405afdf88fbdc31eefbcc90370
4
- data.tar.gz: 43f84e5a282a698eee70581b68ca96e5c4dc368d
2
+ SHA256:
3
+ metadata.gz: 6ef2e827cd290e619c308f84ee2c10c500e06643cd38e83ea0e2131dca7d712b
4
+ data.tar.gz: 3905432bb0cdbcbaacb04933c0c4a584b425bae481b2130558cdcdb06bc0ddbd
5
5
  SHA512:
6
- metadata.gz: 4c13beabf918c30be2deae92c2cf5e5949c44e8e2a0f7689caa28b2851626d239e835510cdc7c4141aaa5ff22e875f90b5199cb7670f003ed690a642aaf3f8d7
7
- data.tar.gz: 7f9fd7db90327bad58d00f8fdaae4f03f26fe0cef099ab02115585581f79fb708d5326c01436028afd9beb69d50f7d92d04bb13248d056b666135e4038bc036d
6
+ metadata.gz: 240ac1b3e6dc8d2891f8299e4871b313757038389cbca94dd5ad5693160bdcbfbb027ef4b96b303e413b17c6306dc4a320b40a0ad18efe39b691272b4897e6e7
7
+ data.tar.gz: 0df9ecc542cf86904f6951083a70fa26c799d7dd8923ca6c87d204fdaabee468f8918f9b9689074dd4e0e2fe6cc50541a657239c10b44e20e66a5de41432669d
@@ -5,9 +5,7 @@
5
5
 
6
6
  # codepoints for CP1255 nikkud
7
7
  NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 209, 210]
8
- #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
8
  NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05bf, 0x05c1, 0x05c2]
10
- #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
11
9
  # TODO: Mac encoding
12
10
 
13
11
  FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
@@ -15,9 +13,39 @@ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('w
15
13
  FINALS_UTF8 = ["\u05da", "\u05dd", "\u05df", "\u05e3", "\u05e5"]
16
14
  HEB_UTF8_START = 1424
17
15
  HEB_UTF8_END = 1535
16
+ HEB_UTF8_XIRIK = 1460
17
+ HEB_UTF8_XOLAM = 1465
18
+ HEB_UTF8_QUBBUTS = 1467
19
+ HEB_UTF8_SHURUK = 1468
18
20
 
19
21
  # extend String class
20
22
  class String
23
def strip_hebrew
  # Dispatches on the receiver's encoding: UTF-8 strings go to
  # strip_hebrew_utf8, Windows-1255 (CP1255) strings go to strip_hebrew_cp1255.
  # Returns whatever the selected helper returns, or nil when the encoding is
  # neither of those (no branch matches).
  case encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  # A `when` clause takes a comma-separated list; `A || B` evaluates to A alone,
  # so the second encoding name was never actually compared.
  when Encoding::WINDOWS_1255, Encoding::CP1255
    strip_hebrew_cp1255
  end
end
31
def strip_hebrew_utf8
  # Walks the receiver's codepoints and collects, into a fresh string, every
  # codepoint for which neither String.is_codepoint_nikkud_utf8 nor
  # #is_hebrew_codepoint_utf8 answers true. Returns the collected string.
  each_codepoint.with_object('') do |codepoint, kept|
    next if self.class.is_codepoint_nikkud_utf8(codepoint)
    next if is_hebrew_codepoint_utf8(codepoint)

    kept << codepoint.chr(Encoding::UTF_8)
  end
end
40
def strip_hebrew_cp1255
  # Walks the receiver's codepoints and collects, into a fresh string tagged as
  # windows-1255, every codepoint for which neither
  # String.is_codepoint_nikkud_cp1255 nor #is_hebrew_codepoint_cp1255 answers
  # true. Returns the collected string.
  each_codepoint.with_object(''.force_encoding('windows-1255')) do |codepoint, kept|
    next if self.class.is_codepoint_nikkud_cp1255(codepoint)
    next if is_hebrew_codepoint_cp1255(codepoint)

    kept << codepoint.chr(Encoding::CP1255)
  end
end
21
49
  # this will return the string, stripped of any Hebrew nikkud characters
22
50
  def strip_nikkud
23
51
  case self.encoding
@@ -63,6 +91,25 @@ class String
63
91
  false
64
92
  end
65
93
 
94
+ # This will add matres lectionis (yods and vavs used as vowels) after the diacritics that denote those vowels. The result won't always be morphologically correct Hebrew, but it is useful for generating the most likely variants users may search for when typing input (almost no Hebrew users know how to produce diacritics on the keyboard).
95
def naive_full_nikkud
  # Returns a new UTF-8 string in which a yod (U+05D9) follows every
  # HEB_UTF8_XIRIK, a vav (U+05D5) follows every HEB_UTF8_QUBBUTS, and a vav
  # follows every HEB_UTF8_XOLAM that is not itself directly preceded by a vav.
  # Returns nil for any encoding other than UTF-8 (not implemented for now).
  return nil unless encoding == Encoding::UTF_8

  yod = "\u05d9"
  vav = "\u05d5"
  # Append in place: `+=` would allocate a whole new string on every step.
  ret = String.new(encoding: Encoding::UTF_8)
  prev_char = nil
  each_char do |c|
    ret << c
    case c.ord
    when HEB_UTF8_XIRIK
      ret << yod
    when HEB_UTF8_QUBBUTS
      ret << vav
    when HEB_UTF8_XOLAM
      ret << vav unless prev_char == vav
    end
    prev_char = c
  end
  # Get rid of extraneous yods possibly added because the loop does not look
  # ahead: xirik+yod+yod -> xirik+yod, and xirik+yod+U+05BC+yod -> xirik+U+05BC+yod.
  ret.gsub("\u05b4\u05d9\u05d9", "\u05b4\u05d9")
     .gsub("\u05b4\u05d9\u05bc\u05d9", "\u05b4\u05bc\u05d9")
end
112
+
66
113
  def any_nikkud?
67
114
  func = case self.encoding
68
115
  when Encoding::UTF_8
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-01-15 00:00:00.000000000 Z
11
+ date: 2020-10-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -36,8 +36,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
36
36
  - !ruby/object:Gem::Version
37
37
  version: '0'
38
38
  requirements: []
39
- rubyforge_project:
40
- rubygems_version: 2.6.14
39
+ rubygems_version: 3.1.2
41
40
  signing_key:
42
41
  specification_version: 4
43
42
  summary: Hebrew string manipulation