hebrew 0.0.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +6 -14
  2. data/lib/hebrew.rb +43 -6
  3. metadata +5 -5
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- ZGQ0NGU1ODllODliMzU4NmNhOTI4ZGJkMmU4ZjBlNjg2ZTY0NTE4YQ==
5
- data.tar.gz: !binary |-
6
- OWJiNjU1MTMyYTRmZjZmZWE5MDY1NWVmMWNlOTJkYTM1MTVkNjQ2Yg==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MzcxYjMzODk1MWZlM2VkMWE4OGZjYjczNzQ4YmIyY2IzMzc2NzI3MDBjNGI1
10
- OWFhZDFhYzdiZWExMTczNzAwNjZhYWJkMDM5MGM0YWQ4YTYxZGVlYTliM2Rh
11
- ZGY1ZTdmMjY5NjNjZDA3MTMyMzY4MzU5YTZhNDEzNzNhMjJjNWY=
12
- data.tar.gz: !binary |-
13
- OGYwODM0MTk0NmQ0YjQxMjEzMTYzMWQ4MzUyYTQ2YjY1YWEwOTMxMjQzZDNl
14
- MWZmOWIxNjk3NzFhYTI1YjA4ZTc1NDhlMjM0MzQ0NzBjMTE4YjRlNjRjOTRm
15
- ZTZhMDc0YjQ4NDYzYzhmMmE2NGNhYjQxOTk4NDgzMTQxYTgwOTE=
2
+ SHA1:
3
+ metadata.gz: 41268cfa1a659bd6997ec10f67fbc3afd7d15f19
4
+ data.tar.gz: f2d888c38878d69b06e7e9a057a58ffe175cf6af
5
+ SHA512:
6
+ metadata.gz: 686135b818c4d6867fe4a005ce09862705ed9b9eac5ed309e5bdbcae8bc1cd04764f8cd8f49feb1db6eccd4002abb4bde65b72f0db15cec9d53a9a91dc4a22b9
7
+ data.tar.gz: 1241dc30e1b38a16cc2371d5d0e5c43441022b700a1c1569508e220fb69c6ef8d7d05af3bc965593ca19bce078c4159773532cfef32089bf8ba7820763321aab
data/lib/hebrew.rb CHANGED
@@ -3,19 +3,42 @@
3
3
  # @author Asaf Bartov <asaf.bartov@gmail.com>
4
4
  #
5
5
 
6
-
7
- NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
8
- NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
6
+ # codepoints for CP1255 nikkud
7
+ NIKKUD_CP1255 = [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 209, 210]
8
+ #NIKKUD_CP1255 = ["\xc0".force_encoding('windows-1255'), "\xc1".force_encoding('windows-1255'), "\xc2".force_encoding('windows-1255'), "\xc3".force_encoding('windows-1255'), "\xc4".force_encoding('windows-1255'), "\xc5".force_encoding('windows-1255'), "\xc6".force_encoding('windows-1255'), "\xc7".force_encoding('windows-1255'), "\xc8".force_encoding('windows-1255'), "\xc9".force_encoding('windows-1255'), "\xcb".force_encoding('windows-1255'), "\xcc".force_encoding('windows-1255'), "\xd1".force_encoding('windows-1255'), "\xd2".force_encoding('windows-1255')] # wow, this is fugly. Is there a neater way to specify CP1255 literal?
9
+ NIKKUD_UTF8 = [0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc, 0x05c1, 0x05c2]
10
+ #NIKKUD_UTF8 = ["\u05b0", "\u05b1", "\u05b2", "\u05b3", "\u05b4", "\u05b5", "\u05b6", "\u05b7", "\u05b8", "\u05b9", "\u05bb", "\u05bc", "\u05c1", "\u05c2"]
9
11
  # TODO: Mac encoding
10
12
 
13
+ FIANLS_CP1255 = ["\xea".force_encoding('windows-1255'), "\xed".force_encoding('windows-1255'), "\xef".force_encoding('windows-1255'), "\xf3".force_encoding('windows-1255'), "\xf5".force_encoding('windows-1255')]
14
+
15
+ FINALS_UTF8 = []
16
+
11
17
  # extend String class
12
18
  class String
13
19
  # this will return the string, stripped of any Hebrew nikkud characters
14
20
  def strip_nikkud
21
+ case self.encoding
22
+ when Encoding::UTF_8
23
+ strip_nikkud_utf8
24
+ when Encoding::WINDOWS_1255 || Encoding::CP1255
25
+ strip_nikkud_cp1255
26
+ end
27
+ end
28
+ def strip_nikkud_cp1255
29
+ target = ''.force_encoding('windows-1255')
30
+ self.each_codepoint {|cp|
31
+ unless self.class.is_codepoint_nikkud_cp1255(cp)
32
+ target += cp.chr(Encoding::CP1255) # is there a neater way?
33
+ end
34
+ }
35
+ return target
36
+ end
37
+ def strip_nikkud_utf8
15
38
  target = ''
16
- self.each_char {|c|
17
- unless is_nikkud(c)
18
- target += c
39
+ self.each_codepoint {|cp|
40
+ unless self.class.is_codepoint_nikkud_utf8(cp)
41
+ target += cp.chr(Encoding::UTF_8)
19
42
  end
20
43
  }
21
44
  return target
@@ -24,6 +47,12 @@ class String
24
47
  def is_nikkud(c)
25
48
  self.class.is_nikkud_by_encoding(c, self.encoding) # delegate to class method based on instance encoding
26
49
  end
50
+ def self.is_codepoint_nikkud_cp1255(cp)
51
+ NIKKUD_CP1255.include?(cp)
52
+ end
53
+ def self.is_codepoint_nikkud_utf8(cp)
54
+ NIKKUD_UTF8.include?(cp)
55
+ end
27
56
  def self.is_nikkud_by_encoding(c, encoding)
28
57
  case encoding
29
58
  when Encoding::UTF_8
@@ -35,4 +64,12 @@ class String
35
64
  # TODO: add Mac encoding?
36
65
  end
37
66
  end
67
+ def self.is_final_by_encoding(c, encoding)
68
+ case encoding
69
+ when Encoding::UTF_8
70
+ FIANLS_UTF8.include?(c)
71
+ when Encoding::WINDOWS_1255 || Encoding::CP1255
72
+ FINALS_CP1255.include?(c)
73
+ end
74
+ end
38
75
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hebrew
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Bartov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-10 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Some useful code to identify, transcode, and manipulate Hebrew text
14
14
  email: asaf.bartov@gmail.com
@@ -27,17 +27,17 @@ require_paths:
27
27
  - lib
28
28
  required_ruby_version: !ruby/object:Gem::Requirement
29
29
  requirements:
30
- - - ! '>='
30
+ - - '>='
31
31
  - !ruby/object:Gem::Version
32
32
  version: '0'
33
33
  required_rubygems_version: !ruby/object:Gem::Requirement
34
34
  requirements:
35
- - - ! '>='
35
+ - - '>='
36
36
  - !ruby/object:Gem::Version
37
37
  version: '0'
38
38
  requirements: []
39
39
  rubyforge_project:
40
- rubygems_version: 2.0.6
40
+ rubygems_version: 2.2.1
41
41
  signing_key:
42
42
  specification_version: 4
43
43
  summary: Hebrew string manipulation