babosa 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +5 -5
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/Changelog.md +20 -0
  5. data/README.md +83 -121
  6. data/Rakefile +9 -8
  7. data/lib/babosa.rb +2 -4
  8. data/lib/babosa/identifier.rb +104 -129
  9. data/lib/babosa/transliterator/base.rb +57 -54
  10. data/lib/babosa/transliterator/bulgarian.rb +3 -2
  11. data/lib/babosa/transliterator/cyrillic.rb +5 -5
  12. data/lib/babosa/transliterator/danish.rb +3 -3
  13. data/lib/babosa/transliterator/german.rb +3 -2
  14. data/lib/babosa/transliterator/greek.rb +4 -3
  15. data/lib/babosa/transliterator/hindi.rb +138 -0
  16. data/lib/babosa/transliterator/latin.rb +5 -5
  17. data/lib/babosa/transliterator/macedonian.rb +3 -2
  18. data/lib/babosa/transliterator/norwegian.rb +3 -3
  19. data/lib/babosa/transliterator/romanian.rb +3 -2
  20. data/lib/babosa/transliterator/russian.rb +3 -2
  21. data/lib/babosa/transliterator/serbian.rb +29 -27
  22. data/lib/babosa/transliterator/spanish.rb +2 -2
  23. data/lib/babosa/transliterator/swedish.rb +3 -3
  24. data/lib/babosa/transliterator/turkish.rb +8 -0
  25. data/lib/babosa/transliterator/ukrainian.rb +5 -4
  26. data/lib/babosa/transliterator/vietnamese.rb +4 -3
  27. data/lib/babosa/version.rb +3 -1
  28. data/spec/{babosa_spec.rb → identifier_spec.rb} +18 -15
  29. data/spec/spec_helper.rb +15 -6
  30. data/spec/transliterators/base_spec.rb +5 -6
  31. data/spec/transliterators/bulgarian_spec.rb +4 -5
  32. data/spec/transliterators/danish_spec.rb +5 -6
  33. data/spec/transliterators/german_spec.rb +4 -5
  34. data/spec/transliterators/greek_spec.rb +7 -7
  35. data/spec/transliterators/hindi_spec.rb +17 -0
  36. data/spec/transliterators/latin_spec.rb +3 -4
  37. data/spec/transliterators/macedonian_spec.rb +3 -4
  38. data/spec/transliterators/norwegian_spec.rb +4 -4
  39. data/spec/transliterators/polish_spec.rb +3 -5
  40. data/spec/transliterators/romanian_spec.rb +5 -6
  41. data/spec/transliterators/russian_spec.rb +3 -4
  42. data/spec/transliterators/serbian_spec.rb +6 -7
  43. data/spec/transliterators/spanish_spec.rb +4 -5
  44. data/spec/transliterators/swedish_spec.rb +7 -7
  45. data/spec/transliterators/turkish_spec.rb +24 -0
  46. data/spec/transliterators/ukrainian_spec.rb +74 -75
  47. data/spec/transliterators/vietnamese_spec.rb +10 -10
  48. metadata +50 -41
  49. metadata.gz.sig +2 -0
  50. data/lib/babosa/utf8/active_support_proxy.rb +0 -26
  51. data/lib/babosa/utf8/dumb_proxy.rb +0 -49
  52. data/lib/babosa/utf8/java_proxy.rb +0 -22
  53. data/lib/babosa/utf8/mappings.rb +0 -193
  54. data/lib/babosa/utf8/proxy.rb +0 -125
  55. data/lib/babosa/utf8/unicode_proxy.rb +0 -23
  56. data/spec/utf8_proxy_spec.rb +0 -52
@@ -1,29 +1,11 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
- require 'singleton'
3
+ require "singleton"
4
4
 
5
5
  module Babosa
6
-
7
6
  module Transliterator
8
-
9
- autoload :Bulgarian, "babosa/transliterator/bulgarian"
10
- autoload :Cyrillic, "babosa/transliterator/cyrillic"
11
- autoload :Danish, "babosa/transliterator/danish"
12
- autoload :German, "babosa/transliterator/german"
13
- autoload :Latin, "babosa/transliterator/latin"
14
- autoload :Macedonian, "babosa/transliterator/macedonian"
15
- autoload :Norwegian, "babosa/transliterator/norwegian"
16
- autoload :Romanian, "babosa/transliterator/romanian"
17
- autoload :Russian, "babosa/transliterator/russian"
18
- autoload :Serbian, "babosa/transliterator/serbian"
19
- autoload :Spanish, "babosa/transliterator/spanish"
20
- autoload :Swedish, "babosa/transliterator/swedish"
21
- autoload :Ukrainian, "babosa/transliterator/ukrainian"
22
- autoload :Greek, "babosa/transliterator/greek"
23
- autoload :Vietnamese, "babosa/transliterator/vietnamese"
24
-
25
7
  def self.get(symbol)
26
- class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
8
+ class_name = symbol.to_s.split("_").map { |a| a.gsub(/\b('?[a-z])/) { Regexp.last_match(1).upcase } }.join
27
9
  const_get(class_name)
28
10
  end
29
11
 
@@ -45,36 +27,39 @@ module Babosa
45
27
  "”" => '"',
46
28
  "„" => '"',
47
29
  "‟" => '"',
48
- '' => "'",
49
- '' => ",",
50
- '' => ".",
51
- '' => "!",
52
- '' => '?',
53
- '' => ',',
54
- '' => '(',
55
- '' => ')',
56
- '' => '[',
57
- '' => ']',
58
- '' => ';',
59
- '' => ':',
60
- '' => '<',
61
- '' => '>',
62
- # various kinds of space characters
63
- "\xc2\xa0" => " ",
64
- "\xe2\x80\x80" => " ",
65
- "\xe2\x80\x81" => " ",
66
- "\xe2\x80\x82" => " ",
67
- "\xe2\x80\x83" => " ",
68
- "\xe2\x80\x84" => " ",
69
- "\xe2\x80\x85" => " ",
70
- "\xe2\x80\x86" => " ",
71
- "\xe2\x80\x87" => " ",
72
- "\xe2\x80\x88" => " ",
73
- "\xe2\x80\x89" => " ",
74
- "\xe2\x80\x8a" => " ",
75
- "\xe2\x81\x9f" => " ",
76
- "\xe3\x80\x80" => " ",
77
- }.freeze
30
+ "" => "'",
31
+ "" => ",",
32
+ "" => ".",
33
+ "" => "!",
34
+ "" => "?",
35
+ "" => ",",
36
+ "" => "(",
37
+ "" => ")",
38
+ "" => "[",
39
+ "" => "]",
40
+ "" => ";",
41
+ "" => ":",
42
+ "" => "<",
43
+ "" => ">"
44
+ }.merge(
45
+ {
46
+ # various kinds of space characters
47
+ "\xc2\xa0" => " ",
48
+ "\xe2\x80\x80" => " ",
49
+ "\xe2\x80\x81" => " ",
50
+ "\xe2\x80\x82" => " ",
51
+ "\xe2\x80\x83" => " ",
52
+ "\xe2\x80\x84" => " ",
53
+ "\xe2\x80\x85" => " ",
54
+ "\xe2\x80\x86" => " ",
55
+ "\xe2\x80\x87" => " ",
56
+ "\xe2\x80\x88" => " ",
57
+ "\xe2\x80\x89" => " ",
58
+ "\xe2\x80\x8a" => " ",
59
+ "\xe2\x81\x9f" => " ",
60
+ "\xe3\x80\x80" => " "
61
+ }
62
+ ).freeze
78
63
 
79
64
  attr_reader :approximations
80
65
 
@@ -85,8 +70,8 @@ module Babosa
85
70
  @approximations = {}
86
71
  end
87
72
  self.class.const_get(:APPROXIMATIONS).inject(@approximations) do |memo, object|
88
- index = object[0].unpack("U").shift
89
- value = object[1].unpack("C*")
73
+ index = object[0].codepoints.shift
74
+ value = object[1].codepoints
90
75
  memo[index] = value.length == 1 ? value[0] : value
91
76
  memo
92
77
  end
@@ -101,8 +86,26 @@ module Babosa
101
86
 
102
87
  # Transliterates a string.
103
88
  def transliterate(string)
104
- string.unpack("U*").map {|char| self[char] || char}.flatten.pack("U*")
89
+ string.codepoints.map { |char| self[char] || char }.flatten.pack("U*")
105
90
  end
106
91
  end
107
92
  end
108
93
  end
94
+
95
+ require "babosa/transliterator/cyrillic"
96
+ require "babosa/transliterator/latin"
97
+ require "babosa/transliterator/bulgarian"
98
+ require "babosa/transliterator/danish"
99
+ require "babosa/transliterator/german"
100
+ require "babosa/transliterator/hindi"
101
+ require "babosa/transliterator/macedonian"
102
+ require "babosa/transliterator/norwegian"
103
+ require "babosa/transliterator/romanian"
104
+ require "babosa/transliterator/russian"
105
+ require "babosa/transliterator/serbian"
106
+ require "babosa/transliterator/spanish"
107
+ require "babosa/transliterator/swedish"
108
+ require "babosa/transliterator/ukrainian"
109
+ require "babosa/transliterator/greek"
110
+ require "babosa/transliterator/vietnamese"
111
+ require "babosa/transliterator/turkish"
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Bulgarian < Cyrillic
@@ -21,7 +22,7 @@ module Babosa
21
22
  "ь" => "i",
22
23
  "ю" => "iu",
23
24
  "я" => "ia"
24
- }
25
+ }.freeze
25
26
  end
26
27
  end
27
28
  end
@@ -1,7 +1,7 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
-
5
5
  # Approximations are based on GOST 7.79, System B:
6
6
  # http://en.wikipedia.org/wiki/ISO_9#GOST_7.79
7
7
  class Cyrillic < Base
@@ -97,11 +97,11 @@ module Babosa
97
97
  "Ѵ" => "Yh",
98
98
  "ѵ" => "yh",
99
99
  "Ґ" => "G",
100
- "ґ" => "g",
101
- }
100
+ "ґ" => "g"
101
+ }.freeze
102
102
 
103
103
  def transliterate(string)
104
- super.gsub(/(c)z([ieyj])/) { "#{$1}#{$2}" }
104
+ super.gsub(/(c)z([ieyj])/) { "#{Regexp.last_match(1)}#{Regexp.last_match(2)}" }
105
105
  end
106
106
  end
107
107
  end
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Danish < Latin
@@ -8,8 +9,7 @@ module Babosa
8
9
  "å" => "aa",
9
10
  "Ø" => "Oe",
10
11
  "Å" => "Aa"
11
- }
12
+ }.freeze
12
13
  end
13
14
  end
14
15
  end
15
-
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class German < Latin
@@ -9,7 +10,7 @@ module Babosa
9
10
  "Ä" => "Ae",
10
11
  "Ö" => "Oe",
11
12
  "Ü" => "Ue"
12
- }
13
+ }.freeze
13
14
  end
14
15
  end
15
16
  end
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Greek < Base
@@ -71,7 +72,7 @@ module Babosa
71
72
  "Ώ" => "O",
72
73
  "ω" => "o",
73
74
  "ώ" => "o"
74
- }
75
+ }.freeze
75
76
  end
76
77
  end
77
- end
78
+ end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Babosa
4
+ module Transliterator
5
+ class Hindi < Base
6
+ APPROXIMATIONS = {
7
+ "ऀ" => "n",
8
+ "ँ" => "n",
9
+ "ं" => "n",
10
+ "ः" => "h",
11
+ "ऄ" => "a",
12
+ "अ" => "a",
13
+ "आ" => "aa",
14
+ "इ" => "i",
15
+ "ई" => "ii",
16
+ "उ" => "u",
17
+ "ऊ" => "uu",
18
+ "ऋ" => "ri",
19
+ "ऌ" => "lri",
20
+ "ऍ" => "e",
21
+ "ऎ" => "e",
22
+ "ए" => "e",
23
+ "ऐ" => "ei",
24
+ "ऑ" => "o",
25
+ "ऒ" => "o",
26
+ "ओ" => "o",
27
+ "औ" => "ou",
28
+ "क" => "k",
29
+ "ख" => "kh",
30
+ "ग" => "g",
31
+ "घ" => "gh",
32
+ "ङ" => "d",
33
+ "च" => "ch",
34
+ "छ" => "chh",
35
+ "ज" => "j",
36
+ "झ" => "jh",
37
+ "ञ" => "ny",
38
+ "ट" => "tt",
39
+ "ठ" => "tth",
40
+ "ड" => "dd",
41
+ "ढ" => "ddh",
42
+ "ण" => "nn",
43
+ "त" => "t",
44
+ "थ" => "th",
45
+ "द" => "d",
46
+ "ध" => "dh",
47
+ "न" => "n",
48
+ "ऩ" => "nnn",
49
+ "प" => "p",
50
+ "फ" => "ph",
51
+ "ब" => "b",
52
+ "भ" => "bh",
53
+ "म" => "m",
54
+ "य" => "y",
55
+ "र" => "r",
56
+ "ऱ" => "rr",
57
+ "ल" => "l",
58
+ "ळ" => "ll",
59
+ "ऴ" => "ll",
60
+ "व" => "v",
61
+ "श" => "sh",
62
+ "ष" => "ss",
63
+ "स" => "s",
64
+ "ह" => "h",
65
+ "ऺ" => "oe",
66
+ "ऻ" => "ooe",
67
+ "़" => "",
68
+ "ऽ" => "-",
69
+ "ा" => "aa",
70
+ "ि" => "i",
71
+ "ी" => "ii",
72
+ "ु" => "u",
73
+ "ू" => "uu",
74
+ "ृ" => "r",
75
+ "ॄ" => "rr",
76
+ "ॅ" => "e",
77
+ "ॆ" => "e",
78
+ "े" => "e",
79
+ "ै" => "ai",
80
+ "ॉ" => "o",
81
+ "ॊ" => "o",
82
+ "ो" => "o",
83
+ "ौ" => "au",
84
+ "्" => "",
85
+ "ॎ" => "e",
86
+ "ॏ" => "aw",
87
+ "ॐ" => "om",
88
+ "॑" => "",
89
+ "॒" => "_",
90
+ "॓" => "",
91
+ "॔" => "",
92
+ "ॕ" => "ee",
93
+ "ॖ" => "ue",
94
+ "ॗ" => "uue",
95
+ "क़" => "q",
96
+ "ख़" => "khh",
97
+ "ग़" => "ghh",
98
+ "ज़" => "za",
99
+ "ड़" => "dddh",
100
+ "ढ़" => "rh",
101
+ "फ़" => "f",
102
+ "य़" => "yy",
103
+ "ॠ" => "rri",
104
+ "ॡ" => "lr",
105
+ "ॢ" => "l",
106
+ "ॣ" => "l",
107
+ "।" => ".",
108
+ "॥" => "..",
109
+ "०" => "0",
110
+ "१" => "1",
111
+ "२" => "2",
112
+ "३" => "3",
113
+ "४" => "4",
114
+ "५" => "5",
115
+ "६" => "6",
116
+ "७" => "7",
117
+ "८" => "8",
118
+ "९" => "9",
119
+ "॰" => ".",
120
+ "ॱ" => ".",
121
+ "ॲ" => "a",
122
+ "ॳ" => "oe",
123
+ "ॴ" => "ooe",
124
+ "ॵ" => "aw",
125
+ "ॶ" => "ue",
126
+ "ॷ" => "uue",
127
+ "ॸ" => "dd",
128
+ "ॹ" => "zh",
129
+ "ॺ" => "y",
130
+ "ॻ" => "gg",
131
+ "ॼ" => "jj",
132
+ "ॽ" => "?",
133
+ "ॾ" => "ddd",
134
+ "ॿ" => "bb"
135
+ }.freeze
136
+ end
137
+ end
138
+ end
@@ -1,8 +1,8 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Latin < Base
5
-
6
6
  APPROXIMATIONS = {
7
7
  "À" => "A",
8
8
  "Á" => "A",
@@ -35,14 +35,14 @@ module Babosa
35
35
  "Ý" => "Y",
36
36
  "Þ" => "Th",
37
37
  "ß" => "ss",
38
- "à" => "a" ,
38
+ "à" => "a",
39
39
  "á" => "a",
40
40
  "â" => "a",
41
41
  "ã" => "a",
42
42
  "ä" => "a",
43
43
  "å" => "a",
44
44
  "æ" => "ae",
45
- "ç" => "c" ,
45
+ "ç" => "c",
46
46
  "è" => "e",
47
47
  "é" => "e",
48
48
  "ê" => "e",
@@ -193,7 +193,7 @@ module Babosa
193
193
  "ž" => "z",
194
194
  "ź" => "z",
195
195
  "ż" => "z"
196
- }
196
+ }.freeze
197
197
  end
198
198
  end
199
199
  end
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Macedonian < Cyrillic
@@ -23,7 +24,7 @@ module Babosa
23
24
  "ѕ" => "z",
24
25
  "ј" => "j",
25
26
  "х" => "h"
26
- }
27
+ }.freeze
27
28
  end
28
29
  end
29
30
  end
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Norwegian < Latin
@@ -7,8 +8,7 @@ module Babosa
7
8
  "å" => "aa",
8
9
  "Ø" => "Oe",
9
10
  "Å" => "Aa"
10
- }
11
+ }.freeze
11
12
  end
12
13
  end
13
14
  end
14
-
@@ -1,4 +1,5 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module Babosa
3
4
  module Transliterator
4
5
  class Romanian < Latin
@@ -7,7 +8,7 @@ module Babosa
7
8
  "ț" => "t",
8
9
  "Ș" => "S",
9
10
  "Ț" => "T"
10
- }
11
+ }.freeze
11
12
  end
12
13
  end
13
14
  end