hebrew 0.0.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +6 -14
  2. data/lib/hebrew.rb +43 -6
  3. metadata +5 -5
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- ZGQ0NGU1ODllODliMzU4NmNhOTI4ZGJkMmU4ZjBlNjg2ZTY0NTE4YQ==
5
- data.tar.gz: !binary |-
6
- OWJiNjU1MTMyYTRmZjZmZWE5MDY1NWVmMWNlOTJkYTM1MTVkNjQ2Yg==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MzcxYjMzODk1MWZlM2VkMWE4OGZjYjczNzQ4YmIyY2IzMzc2NzI3MDBjNGI1
10
- OWFhZDFhYzdiZWExMTczNzAwNjZhYWJkMDM5MGM0YWQ4YTYxZGVlYTliM2Rh
11
- ZGY1ZTdmMjY5NjNjZDA3MTMyMzY4MzU5YTZhNDEzNzNhMjJjNWY=
12
- data.tar.gz: !binary |-
13
- OGYwODM0MTk0NmQ0YjQxMjEzMTYzMWQ4MzUyYTQ2YjY1YWEwOTMxMjQzZDNl
14
- MWZmOWIxNjk3NzFhYTI1YjA4ZTc1NDhlMjM0MzQ0NzBjMTE4YjRlNjRjOTRm
15
- ZTZhMDc0YjQ4NDYzYzhmMmE2NGNhYjQxOTk4NDgzMTQxYTgwOTE=
2
+ SHA1:
3
+ metadata.gz: 41268cfa1a659bd6997ec10f67fbc3afd7d15f19
4
+ data.tar.gz: f2d888c38878d69b06e7e9a057a58ffe175cf6af
5
+ SHA512:
6
+ metadata.gz: 686135b818c4d6867fe4a005ce09862705ed9b9eac5ed309e5bdbcae8bc1cd04764f8cd8f49feb1db6eccd4002abb4bde65b72f0db15cec9d53a9a91dc4a22b9
7
+ data.tar.gz: 1241dc30e1b38a16cc2371d5d0e5c43441022b700a1c1569508e220fb69c6ef8d7d05af3bc965593ca19bce078c4159773532cfef32089bf8ba7820763321aab
data/lib/hebrew.rb CHANGED
@@ -3,19 +3,42 @@
3
3
  # @author Asaf Bartov <asaf.bartov@gmail.com>
4
4
  #
5
5
 
6
-
7
- NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
8
- NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
6
+ # codepoints for CP1255 nikkud
7
+ NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 209, 210]
8
+ #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
+ NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05c1, 0x05c2]
10
+ #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
9
11
  # TODO: Mac encoding
10
12
 
13
+ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
14
+
15
+ FINALS_UTF8 = []
16
+
11
17
  # extend String class
12
18
  class String
13
19
  # this will return the string, stripped of any Hebrew nikkud characters
14
20
  def strip_nikkud
21
+ case self.encoding
22
+ when Encoding::UTF_8
23
+ strip_nikkud_utf8
24
+ when Encoding::WINDOWS_1255 || Encoding::CP1255
25
+ strip_nikkud_cp1255
26
+ end
27
+ end
28
+ def strip_nikkud_cp1255
29
+ target = ''.force_encoding('windows-1255')
30
+ self.each_codepoint {|cp|
31
+ unless self.class.is_codepoint_nikkud_cp1255(cp)
32
+ target += cp.chr(Encoding::CP1255) # is there a neater way?
33
+ end
34
+ }
35
+ return target
36
+ end
37
+ def strip_nikkud_utf8
15
38
  target = ''
16
- self.each_char {|c|
17
- unless is_nikkud(c)
18
- target += c
39
+ self.each_codepoint {|cp|
40
+ unless self.class.is_codepoint_nikkud_utf8(cp)
41
+ target += cp.chr(Encoding::UTF_8)
19
42
  end
20
43
  }
21
44
  return target
@@ -24,6 +47,12 @@ class String
24
47
  def is_nikkud(c)
25
48
  self.class.is_nikkud_by_encoding(c, self.encoding) # delegate to class method based on instance encoding
26
49
  end
50
+ def self.is_codepoint_nikkud_cp1255(cp)
51
+ NIKKUD_CP1255.include?(cp)
52
+ end
53
+ def self.is_codepoint_nikkud_utf8(cp)
54
+ NIKKUD_UTF8.include?(cp)
55
+ end
27
56
  def self.is_nikkud_by_encoding(c, encoding)
28
57
  case encoding
29
58
  when Encoding::UTF_8
@@ -35,4 +64,12 @@ class String
35
64
  # TODO: add Mac encoding?
36
65
  end
37
66
  end
67
+ def self.is_final_by_encoding(c, encoding)
68
+ case encoding
69
+ when Encoding::UTF_8
70
+ FIANLS_UTF8.include?(c)
71
+ when Encoding::WINDOWS_1255 || Encoding::CP1255
72
+ FINALS_CP1255.include?(c)
73
+ end
74
+ end
38
75
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-10 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -27,17 +27,17 @@ require_paths:
27
27
  - lib
28
28
  required_ruby_version: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - ! '>='
30
+ - - '>='
31
31
  - !ruby/object:Gem::Version
32
32
  version: '0'
33
33
  required_rubygems_version: !ruby/object:Gem::Requirement
34
34
  requirements:
35
- - - ! '>='
35
+ - - '>='
36
36
  - !ruby/object:Gem::Version
37
37
  version: '0'
38
38
  requirements: []
39
39
  rubyforge_project:
40
- rubygems_version: 2.0.6
40
+ rubygems_version: 2.2.1
41
41
  signing_key:
42
42
  specification_version: 4
43
43
  summary: Hebrew string manipulation