hebrew 0.1.7 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +5 -5
  2. data/lib/hebrew.rb +70 -5
  3. metadata +5 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: e744e020d212f7b11acbb82e9e4d99944fc3d06c
4
- data.tar.gz: 3f0fb1762473e8b52a4fbf7b33b8251e3a632a08
2
+ SHA256:
3
+ metadata.gz: 2cc64efe3ce4523383c76a9265c8e0d015f847c1aaefacb903470ec9c295f342
4
+ data.tar.gz: e0f065db2c27e3dd0b47669abc629eb228e4f37b118db54afa128203d832e8e5
5
5
  SHA512:
6
- metadata.gz: eeb8e700eaa11b89dbfd77107cb09d15a56de4c5eab1320301fb1445d411ae2e6bc527195f6c83cb24e9189ed09c36a039d458cb19d665d7700952e4d6868151
7
- data.tar.gz: f8ded3902c477c207425e64daf813f7f403b9d8652dba662508b6f179e1421ca4a8e3573f12fbd52babcb35e57389c67ef8d0406b3da104122c8129723cc015f
6
+ metadata.gz: f5143397a2bed96cccc2aead3dee8aac7f7f28637819e1979b61cf37e1bb96712b29e6a7f1f218c6281b4bdda4e67435ce2a5f8d0195c7982bc088adfcdb9fb4
7
+ data.tar.gz: 8ec1ba901eb082f53f1d8caa1049a5e48302418c48867b11e67e945ef50ef42112e735114fae64df1e0adf0a66296de188ab59cca419779a8d650afc8026aab9
@@ -5,9 +5,7 @@
5
5
 
6
6
  # codepoints for CP1255 nikkud
7
7
  NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 209, 210]
8
- #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
8
  NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05bf, 0x05c1, 0x05c2]
10
- #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
11
9
  # TODO: Mac encoding
12
10
 
13
11
  FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
@@ -15,9 +13,39 @@ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('w
15
13
  FINALS_UTF8 = ["\u05da", "\u05dd", "\u05df", "\u05e3", "\u05e5"]
16
14
  HEB_UTF8_START = 1424
17
15
  HEB_UTF8_END = 1535
16
+ HEB_UTF8_XIRIK = 1460
17
+ HEB_UTF8_XOLAM = 1465
18
+ HEB_UTF8_QUBBUTS = 1467
19
+ HEB_UTF8_SHURUK = 1468
18
20
 
19
21
  # extend String class
20
22
  class String
23
+ def strip_hebrew
24
+ case self.encoding
25
+ when Encoding::UTF_8
26
+ strip_hebrew_utf8
27
+ when Encoding::WINDOWS_1255 || Encoding::CP1255
28
+ strip_hebrew_cp1255
29
+ end
30
+ end
31
+ def strip_hebrew_utf8
32
+ target = ''
33
+ self.each_codepoint {|cp|
34
+ unless self.class.is_codepoint_nikkud_utf8(cp) or self.is_hebrew_codepoint_utf8(cp)
35
+ target << cp.chr(Encoding::UTF_8)
36
+ end
37
+ }
38
+ return target
39
+ end
40
+ def strip_hebrew_cp1255
41
+ target = ''.force_encoding('windows-1255')
42
+ self.each_codepoint {|cp|
43
+ unless self.class.is_codepoint_nikkud_cp1255(cp) or self.is_hebrew_codepoint_cp1255(cp)
44
+ target << cp.chr(Encoding::CP1255) # is there a neater way?
45
+ end
46
+ }
47
+ return target
48
+ end
21
49
  # this will return the string, stripped of any Hebrew nikkud characters
22
50
  def strip_nikkud
23
51
  case self.encoding
@@ -31,7 +59,7 @@ class String
31
59
  target = ''.force_encoding('windows-1255')
32
60
  self.each_codepoint {|cp|
33
61
  unless self.class.is_codepoint_nikkud_cp1255(cp)
34
- target += cp.chr(Encoding::CP1255) # is there a neater way?
62
+ target << cp.chr(Encoding::CP1255) # is there a neater way?
35
63
  end
36
64
  }
37
65
  return target
@@ -40,7 +68,7 @@ class String
40
68
  target = ''
41
69
  self.each_codepoint {|cp|
42
70
  unless self.class.is_codepoint_nikkud_utf8(cp)
43
- target += cp.chr(Encoding::UTF_8)
71
+ target << cp.chr(Encoding::UTF_8)
44
72
  end
45
73
  }
46
74
  return target
@@ -58,6 +86,43 @@ class String
58
86
  return false
59
87
  end
60
88
  end
89
+
90
+ def falsehood
91
+ false
92
+ end
93
+
94
+ # this will add matres lectionis (yods and vavs as vowels) after diacritics that denote those vowels. The result won't always be morphologically correct Hebrew, but is useful for generating mostly-likely variants users may search for, when typing inputs (almost no Hebrew users know how to produce diacritics on the keyboard).
95
+ def naive_full_nikkud
96
+ ret = ''
97
+ prev_char = nil
98
+ case self.encoding
99
+ when Encoding::UTF_8
100
+ self.each_char do |c|
101
+ ret += c
102
+ ret += 'י' if c.codepoints[0] == HEB_UTF8_XIRIK
103
+ ret += 'ו' if c.codepoints[0] == HEB_UTF8_QUBBUTS
104
+ ret += 'ו' if [HEB_UTF8_XOLAM, HEB_UTF8_SHURUK].include?(c.codepoints[0]) && prev_char != 'ו'
105
+ prev_char = c
106
+ end
107
+ return ret.gsub('יי','ִי') # get rid of extraneous yods possibly added because we weren't looking ahead
108
+ else
109
+ return nil # not implemented for other encodings for now.
110
+ end
111
+ end
112
+
113
+ def any_nikkud?
114
+ func = case self.encoding
115
+ when Encoding::UTF_8
116
+ :is_codepoint_nikkud_utf8
117
+ when Encoding::WINDOWS_1255 || Encoding::CP1255
118
+ :is_codepoint_nikkud_cp1255
119
+ else
120
+ :falsehood
121
+ end
122
+ self.each_codepoint{|cp| return true if String.send(func, cp)}
123
+ return false
124
+ end
125
+
61
126
  def is_hebrew_codepoint_cp1255(cp)
62
127
  return ((cp > 191 && cp < 202) or [203, 204, 209, 210].include?(cp))
63
128
  end
@@ -77,7 +142,7 @@ class String
77
142
  #NIKKUD_CP1255.include?(cp) # cleaner, but much slower
78
143
  end
79
144
  def self.is_codepoint_nikkud_utf8(cp)
80
- return ((cp > 0x05af && cp < 0x05ba) or [0x05bb, 0x05bc, 0x05c1, 0x05c2].include?(cp))
145
+ return ((cp > 0x05af && cp < 0x05bd) or [0x05c1, 0x05c2].include?(cp))
81
146
  #NIKKUD_UTF8.include?(cp) # cleaner, but much slower
82
147
  end
83
148
  # this will return true if the first parameter is a nikkud character in the encoding of the second parameter
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-03 00:00:00.000000000 Z
11
+ date: 2020-10-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -27,17 +27,16 @@ require_paths:
27
27
  - lib
28
28
  required_ruby_version: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: '0'
33
33
  required_rubygems_version: !ruby/object:Gem::Requirement
34
34
  requirements:
35
- - - '>='
35
+ - - ">="
36
36
  - !ruby/object:Gem::Version
37
37
  version: '0'
38
38
  requirements: []
39
- rubyforge_project:
40
- rubygems_version: 2.2.1
39
+ rubygems_version: 3.1.2
41
40
  signing_key:
42
41
  specification_version: 4
43
42
  summary: Hebrew string manipulation