hebrew 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/hebrew.rb +21 -9
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cdc23d69ac6c99a5089730a3980986821a750c9b
4
- data.tar.gz: b436b60e4093251483a256677308733488108d95
3
+ metadata.gz: db796d5a2e993a3e80c0533e59be71ddea35ff77
4
+ data.tar.gz: 907c43cec960cd4080f81084d555e627174e71cc
5
5
  SHA512:
6
- metadata.gz: 4bebdf8bb10c1101b93811a36d2b21ac12c76068040e02073e35bd5570b084c802c8cc4ecf0bbf23bc5663bb00c8a3ecfe67b9db9d1b9adc2ca124534c88e5b8
7
- data.tar.gz: c66e39755a562a5ac674deba0d678c5e115a74fbb16e12d2fdb299deeab608433168a5460d62f5e8e9401510e3f1735e97adf3bb7c89a7dc382e7717547e9b39
6
+ metadata.gz: c3d0e503b1ad747277dd6aa9ed98fcdf1ea32838bfd396f00657fb834f6715454d81a8bd23f66c0196085f62b763e6f2504c1bf0765f1f685958db92974e1851
7
+ data.tar.gz: 3c5221280f64cc1338f10ecf32fe73f3ce2bed78963d67c575179eb286eb64d7eed1e72c7bb84da2d89f75b0f55be8486f06cbd5d45346017e0e7d93f53ca595
data/lib/hebrew.rb CHANGED
@@ -4,17 +4,18 @@
4
4
  #
5
5
 
6
6
  # codepoints for CP1255 nikkud
7
- NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 209, 210]
7
+ NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 209, 210]
8
8
  #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
- NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05c1, 0x05c2]
9
+ NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05bd, 0x05bf, 0x05c1, 0x05c2]
10
10
  #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
11
11
  # TODO: Mac encoding
12
12
 
13
13
  FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
14
14
 
15
- FINALS_UTF8 = []
15
+ FINALS_UTF8 = ["\u05da", "\u05dd", "\u05df", "\u05e3", "\u05e5"]
16
16
  HEB_UTF8_START = 1424
17
17
  HEB_UTF8_END = 1535
18
+
18
19
  # extend String class
19
20
  class String
20
21
  # this will return the string, stripped of any Hebrew nikkud characters
@@ -44,6 +45,7 @@ class String
44
45
  }
45
46
  return target
46
47
  end
48
+ # this will return true if the string contains any Hebrew character (short circuit)
47
49
  def any_hebrew?
48
50
  case self.encoding
49
51
  when Encoding::UTF_8
@@ -56,6 +58,10 @@ class String
56
58
  return false
57
59
  end
58
60
  end
61
+ def is_hebrew_codepoint_cp1255(cp)
62
+ if (cp > 191 && cp < 202) or [203, 204, 209, 210].include?(cp)
63
+
64
+ end
59
65
  def is_hebrew_codepoint_utf8(cp)
60
66
  if cp >= HEB_UTF8_START && cp <= HEB_UTF8_END
61
67
  return true
@@ -63,27 +69,33 @@ class String
63
69
  return false
64
70
  end
65
71
  end
72
+
66
73
  # TODO: add strip_nikkud!
74
+
75
+ # this will return true if the parameter is a nikkud character
67
76
  def is_nikkud(c)
68
77
  self.class.is_nikkud_by_encoding(c, self.encoding) # delegate to class method based on instance encoding
69
78
  end
79
+
70
80
  def self.is_codepoint_nikkud_cp1255(cp)
71
- NIKKUD_CP1255.include?(cp)
81
+ return (cp > 191 && cp < 205) or [209, 210].include?(cp)
82
+ #NIKKUD_CP1255.include?(cp) # cleaner, but much slower
72
83
  end
73
84
  def self.is_codepoint_nikkud_utf8(cp)
74
- NIKKUD_UTF8.include?(cp)
85
+ return (cp > 0x05af && cp < 0x05ba) or [0x05bb, 0x05bc, 0x05c1, 0x05c2].include?(cp)
86
+ #NIKKUD_UTF8.include?(cp) # cleaner, but much slower
75
87
  end
88
+ # this will return true if the first parameter is a nikkud character in the encoding of the second parameter
76
89
  def self.is_nikkud_by_encoding(c, encoding)
77
90
  case encoding
78
91
  when Encoding::UTF_8
79
- # DBG: puts "utf8 - #{c} - #{c.codepoints.first}"
80
- NIKKUD_UTF8.include?(c)
92
+ self.is_codepoint_nikkud_utf8(c.codepoints.first)
81
93
  when Encoding::WINDOWS_1255 || Encoding::CP1255
82
- # DBG: puts "cp1255 - #{c} - #{c.codepoints.first}"
83
- NIKKUD_CP1255.include?(c)
94
+ self.is_codepoint_nikkud_cp1255(c.codepoints.first)
84
95
  # TODO: add Mac encoding?
85
96
  end
86
97
  end
98
+ # this will return true if the first parameter is a final letter in the encoding of the second parameter
87
99
  def self.is_final_by_encoding(c, encoding)
88
100
  case encoding
89
101
  when Encoding::UTF_8
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-18 00:00:00.000000000 Z
11
+ date: 2014-04-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -17,7 +17,7 @@ extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
19
  - lib/hebrew.rb
20
- homepage: http://rubygems.org/gems/hebrew
20
+ homepage: https://github.com/abartov/hebrew
21
21
  licenses:
22
22
  - MIT
23
23
  metadata: {}