kebab 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gemtest +0 -0
- data/Changelog.md +99 -0
- data/MIT-LICENSE +19 -0
- data/README.md +26 -0
- data/Rakefile +34 -0
- data/lib/kebab.rb +18 -0
- data/lib/kebab/identifier.rb +294 -0
- data/lib/kebab/transliterator/base.rb +110 -0
- data/lib/kebab/transliterator/bulgarian.rb +27 -0
- data/lib/kebab/transliterator/cyrillic.rb +108 -0
- data/lib/kebab/transliterator/danish.rb +15 -0
- data/lib/kebab/transliterator/german.rb +15 -0
- data/lib/kebab/transliterator/greek.rb +77 -0
- data/lib/kebab/transliterator/hindi.rb +137 -0
- data/lib/kebab/transliterator/latin.rb +199 -0
- data/lib/kebab/transliterator/macedonian.rb +29 -0
- data/lib/kebab/transliterator/norwegian.rb +14 -0
- data/lib/kebab/transliterator/romanian.rb +13 -0
- data/lib/kebab/transliterator/russian.rb +22 -0
- data/lib/kebab/transliterator/serbian.rb +34 -0
- data/lib/kebab/transliterator/spanish.rb +9 -0
- data/lib/kebab/transliterator/swedish.rb +16 -0
- data/lib/kebab/transliterator/turkish.rb +8 -0
- data/lib/kebab/transliterator/ukrainian.rb +30 -0
- data/lib/kebab/transliterator/vietnamese.rb +143 -0
- data/lib/kebab/utf8/active_support_proxy.rb +26 -0
- data/lib/kebab/utf8/dumb_proxy.rb +49 -0
- data/lib/kebab/utf8/java_proxy.rb +22 -0
- data/lib/kebab/utf8/mappings.rb +193 -0
- data/lib/kebab/utf8/proxy.rb +125 -0
- data/lib/kebab/utf8/unicode_proxy.rb +23 -0
- data/lib/kebab/version.rb +5 -0
- data/spec/kebab_spec.rb +155 -0
- data/spec/spec_helper.rb +45 -0
- data/spec/transliterators/base_spec.rb +16 -0
- data/spec/transliterators/bulgarian_spec.rb +20 -0
- data/spec/transliterators/danish_spec.rb +17 -0
- data/spec/transliterators/german_spec.rb +17 -0
- data/spec/transliterators/greek_spec.rb +17 -0
- data/spec/transliterators/hindi_spec.rb +17 -0
- data/spec/transliterators/latin_spec.rb +9 -0
- data/spec/transliterators/macedonian_spec.rb +9 -0
- data/spec/transliterators/norwegian_spec.rb +18 -0
- data/spec/transliterators/polish_spec.rb +14 -0
- data/spec/transliterators/romanian_spec.rb +19 -0
- data/spec/transliterators/russian_spec.rb +9 -0
- data/spec/transliterators/serbian_spec.rb +25 -0
- data/spec/transliterators/spanish_spec.rb +13 -0
- data/spec/transliterators/swedish_spec.rb +18 -0
- data/spec/transliterators/turkish_spec.rb +24 -0
- data/spec/transliterators/ukrainian_spec.rb +88 -0
- data/spec/transliterators/vietnamese_spec.rb +18 -0
- data/spec/utf8_proxy_spec.rb +53 -0
- metadata +167 -0
@@ -0,0 +1,110 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
|
5
|
+
module Kebab
|
6
|
+
|
7
|
+
module Transliterator
|
8
|
+
|
9
|
+
# Register every bundled transliterator for lazy loading. Each constant is
# resolved from "kebab/transliterator/<lowercased name>" the first time it
# is referenced.
%w[
  Bulgarian Cyrillic Danish German Hindi Latin Macedonian Norwegian Romanian
  Russian Serbian Spanish Swedish Ukrainian Greek Vietnamese Turkish
].each do |name|
  autoload name.to_sym, "kebab/transliterator/#{name.downcase}"
end
|
26
|
+
|
27
|
+
# Resolves a transliterator constant from its underscored name, e.g.
# :serbian_cyrillic is looked up as SerbianCyrillic.
#
# symbol - a Symbol or String; it is split on "_" and the first letter of
#          every word boundary inside each part is upcased before joining.
#
# Returns whatever const_get finds under that name in this namespace.
def self.get(symbol)
  words = symbol.to_s.split("_")
  class_name = words.map { |word| word.gsub(/\b('?[a-z])/) { |initial| initial.upcase } }.join
  const_get(class_name)
end
|
31
|
+
|
32
|
+
# Root of the transliterator hierarchy. Every transliterator is a Singleton
# whose lookup table starts as a copy of its superclass's table and is then
# overlaid with the class's own APPROXIMATIONS constant.
class Base
  include Singleton

  # Punctuation and whitespace approximations shared by all transliterators.
  # Keys are single characters (only the first code point of a key is used);
  # values are the replacement strings.
  APPROXIMATIONS = {
    "×" => "x",
    "÷" => "/",
    "‐" => "-",
    "‑" => "-",
    "‒" => "-",
    "–" => "-",
    "—" => "-",
    "―" => "-",
    "‘" => "'",
    "‛" => "'",
    "“" => '"',
    "”" => '"',
    "„" => '"',
    "‟" => '"',
    '’' => "'",
    # CJK punctuation. The fullwidth forms are written as byte escapes so
    # that they cannot be mistaken for (or flattened into) their ASCII
    # look-alikes, which would turn these entries into no-op mappings.
    "\xef\xbc\x8c" => ",", # U+FF0C fullwidth comma
    '。' => ".",
    "\xef\xbc\x81" => "!", # U+FF01 fullwidth exclamation mark
    "\xef\xbc\x9f" => '?', # U+FF1F fullwidth question mark
    '、' => ',',
    "\xef\xbc\x88" => '(', # U+FF08 fullwidth left parenthesis
    "\xef\xbc\x89" => ')', # U+FF09 fullwidth right parenthesis
    '【' => '[',
    '】' => ']',
    "\xef\xbc\x9b" => ';', # U+FF1B fullwidth semicolon
    "\xef\xbc\x9a" => ':', # U+FF1A fullwidth colon
    '《' => '<',
    '》' => '>',
    # various kinds of space characters
    "\xc2\xa0"     => " ",
    "\xe2\x80\x80" => " ",
    "\xe2\x80\x81" => " ",
    "\xe2\x80\x82" => " ",
    "\xe2\x80\x83" => " ",
    "\xe2\x80\x84" => " ",
    "\xe2\x80\x85" => " ",
    "\xe2\x80\x86" => " ",
    "\xe2\x80\x87" => " ",
    "\xe2\x80\x88" => " ",
    "\xe2\x80\x89" => " ",
    "\xe2\x80\x8a" => " ",
    "\xe2\x81\x9f" => " ",
    "\xe3\x80\x80" => " ",
  }.freeze

  # Frozen Hash mapping a source code point (Integer) to its replacement:
  # a single Integer code point, or an Array of code points when the
  # replacement is empty or longer than one character.
  attr_reader :approximations

  # Builds the lookup table: inherited entries first, then this class's own
  # APPROXIMATIONS, so that a subclass overrides its ancestors.
  def initialize
    inherited = self.class < Base ? self.class.superclass.instance.approximations : {}
    @approximations = inherited.dup

    self.class.const_get(:APPROXIMATIONS).each do |char, replacement|
      # Decode the replacement as code points ("U*"), not bytes ("C*"):
      # #transliterate re-encodes with pack("U*"), so a byte-wise decode
      # would corrupt any non-ASCII replacement. ASCII values are unaffected.
      codepoints = replacement.unpack("U*")
      @approximations[char.unpack("U").first] =
        codepoints.length == 1 ? codepoints.first : codepoints
    end

    @approximations.freeze
  end

  # Looks up the approximation for a single code point.
  #
  # codepoint - Integer Unicode code point.
  #
  # Returns an Integer, an Array of Integers, or nil when the code point has
  # no approximation.
  def [](codepoint)
    @approximations[codepoint]
  end

  # Transliterates a string.
  #
  # string - UTF-8 String to convert.
  #
  # Returns a new String in which every character that has an approximation
  # is replaced by it; all other characters are passed through unchanged.
  def transliterate(string)
    string.unpack("U*").map { |codepoint| self[codepoint] || codepoint }.flatten.pack("U*")
  end
end
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Kebab
|
3
|
+
module Transliterator
|
4
|
+
# Bulgarian transliterator. Only the letters whose Bulgarian rendering
# differs from the generic Cyrillic table are listed here; everything else
# is inherited from Cyrillic.
class Bulgarian < Cyrillic
  # Frozen, like Base::APPROXIMATIONS, so the table cannot be mutated at
  # runtime.
  APPROXIMATIONS = {
    "Ж" => "J",
    "Й" => "I",
    "Х" => "H",
    "Ц" => "C",
    "Щ" => "Sht",
    "Ъ" => "U",
    "Ь" => "I",
    "Ю" => "Iu",
    "Я" => "Ia",
    "ж" => "j",
    "й" => "i",
    "х" => "h",
    "ц" => "c",
    "щ" => "sht",
    "ъ" => "u",
    "ь" => "i",
    "ю" => "iu",
    "я" => "ia"
  }.freeze
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Kebab
|
3
|
+
module Transliterator
|
4
|
+
|
5
|
+
# Generic Cyrillic transliterator.
#
# Approximations are based on GOST 7.79, System B:
# http://en.wikipedia.org/wiki/ISO_9#GOST_7.79
class Cyrillic < Base
  # Frozen, like Base::APPROXIMATIONS, so the table cannot be mutated at
  # runtime. Hard and soft signs map to the empty string, i.e. are dropped.
  APPROXIMATIONS = {
    "Ё" => "Yo",
    "Ѓ" => "G",
    "Є" => "Ye",
    "Ї" => "Yi",
    "Љ" => "L",
    "Њ" => "N",
    "Ќ" => "K",
    "Ў" => "U",
    "Џ" => "Dh",
    "А" => "A",
    "Б" => "B",
    "В" => "V",
    "Г" => "G",
    "Д" => "D",
    "Е" => "E",
    "Ж" => "Zh",
    "З" => "Z",
    "И" => "I",
    "Й" => "J",
    "К" => "K",
    "Л" => "L",
    "М" => "M",
    "Н" => "N",
    "О" => "O",
    "П" => "P",
    "Р" => "R",
    "С" => "S",
    "Т" => "T",
    "У" => "U",
    "Ф" => "F",
    "Х" => "X",
    "Ц" => "Cz",
    "Ч" => "Ch",
    "Ш" => "Sh",
    "Щ" => "Shh",
    "Ъ" => "",
    "Ы" => "Y",
    "Ь" => "",
    "Э" => "E",
    "Ю" => "Yu",
    "Я" => "Ya",
    "а" => "a",
    "б" => "b",
    "в" => "v",
    "г" => "g",
    "д" => "d",
    "е" => "e",
    "ж" => "zh",
    "з" => "z",
    "и" => "i",
    "й" => "j",
    "к" => "k",
    "л" => "l",
    "м" => "m",
    "н" => "n",
    "о" => "o",
    "п" => "p",
    "р" => "r",
    "с" => "s",
    "т" => "t",
    "у" => "u",
    "ф" => "f",
    "х" => "x",
    "ц" => "cz",
    "ч" => "ch",
    "ш" => "sh",
    "щ" => "shh",
    "ъ" => "",
    "ы" => "y",
    "ь" => "",
    "э" => "e",
    "ю" => "yu",
    "я" => "ya",
    "ё" => "yo",
    "ѓ" => "g",
    "є" => "ye",
    "ї" => "yi",
    "љ" => "l",
    "њ" => "n",
    "ќ" => "k",
    "ў" => "u",
    "џ" => "dh",
    "Ѣ" => "Ye",
    "ѣ" => "ye",
    "Ѫ" => "O",
    "ѫ" => "o",
    "Ѳ" => "Fh",
    "ѳ" => "fh",
    "Ѵ" => "Yh",
    "ѵ" => "yh",
    "Ґ" => "G",
    "ґ" => "g",
  }.freeze

  # Transliterates a string, then applies the GOST System B rule that "cz"
  # is shortened to "c" when followed by i, e, y or j.
  #
  # The rule is matched case-insensitively so that a capital "Ц" (which the
  # table renders as "Cz") is shortened exactly like a lowercase "ц"; the
  # case of the retained "c"/"C" is preserved.
  #
  # NOTE(review): the rule runs on the transliterated text, so a Latin "cz"
  # that was already present in the input is shortened as well.
  def transliterate(string)
    super.gsub(/(c)z(?=[ieyj])/i, '\1')
  end
end
|
107
|
+
end
|
108
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Kebab
|
3
|
+
module Transliterator
|
4
|
+
# Greek transliterator: maps Greek letters, including the accented vowels
# listed below, to ASCII approximations.
class Greek < Base
  # Frozen, like Base::APPROXIMATIONS, so the table cannot be mutated at
  # runtime. Every replacement value is plain ASCII.
  APPROXIMATIONS = {
    "Α" => "A",
    "Ά" => "A",
    "α" => "a",
    "ά" => "a",
    "Β" => "V",
    "β" => "v",
    "Γ" => "G",
    "γ" => "g",
    "Δ" => "D",
    "δ" => "d",
    "Ε" => "E",
    "Έ" => "E",
    "ε" => "e",
    "έ" => "e",
    "Ζ" => "Z",
    "ζ" => "z",
    "Η" => "I",
    "Ή" => "I", # capital eta with tonos: upper-case result, like "Η"
    "η" => "i",
    "ή" => "i",
    "Θ" => "TH",
    "θ" => "th",
    "Ι" => "I",
    "Ί" => "I", # value is the Latin letter I, not a Greek capital iota
    "\xc3\x8e" => "I", # U+00CE (Latin I with circumflex), kept from the original table
    "ι" => "i",
    "ί" => "i",
    "ϊ" => "i",
    "ΐ" => "i",
    "Κ" => "K",
    "κ" => "k",
    "Λ" => "L",
    "λ" => "l",
    "Μ" => "M",
    "μ" => "m",
    "Ν" => "N",
    "ν" => "n",
    "Ξ" => "KS",
    "ξ" => "ks",
    "Ο" => "O",
    "Ό" => "O",
    "ο" => "o",
    "ό" => "o",
    "Π" => "P",
    "π" => "p",
    "Ρ" => "R",
    "ρ" => "r",
    "Σ" => "S",
    "σ" => "s",
    "ς" => "s",
    "Τ" => "T",
    "τ" => "t",
    "Υ" => "Y",
    "Ύ" => "Y",
    "υ" => "y",
    "ύ" => "y",
    "ϋ" => "y",
    "ΰ" => "y",
    "Φ" => "F",
    "φ" => "f",
    "Χ" => "X",
    "χ" => "x",
    "Ψ" => "PS",
    "ψ" => "ps",
    "Ω" => "O",
    "Ώ" => "O",
    "ω" => "o",
    "ώ" => "o"
  }.freeze
end
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Kebab
|
3
|
+
module Transliterator
|
4
|
+
# Hindi transliterator: ASCII approximations for the Devanagari block,
# U+0900 through U+097F, listed in code point order.
class Hindi < Base
  # Frozen, like Base::APPROXIMATIONS, so the table cannot be mutated at
  # runtime.
  #
  # Only the first code point of each key is used when the lookup table is
  # built, so every key must be exactly one code point. Letters that have a
  # canonical decomposition (base consonant + nukta) are therefore written
  # as UTF-8 byte escapes: if such a literal were ever stored in decomposed
  # form, its entry would silently overwrite the base consonant's mapping.
  APPROXIMATIONS = {
    "ऀ" => "n",
    "ँ" => "n",
    "ं" => "n",
    "ः" => "h",
    "ऄ" => "a",
    "अ" => "a",
    "आ" => "aa",
    "इ" => "i",
    "ई" => "ii",
    "उ" => "u",
    "ऊ" => "uu",
    "ऋ" => "ri",
    "ऌ" => "lri",
    "ऍ" => "e",
    "ऎ" => "e",
    "ए" => "e",
    "ऐ" => "ei",
    "ऑ" => "o",
    "ऒ" => "o",
    "ओ" => "o",
    "औ" => "ou",
    "क" => "k",
    "ख" => "kh",
    "ग" => "g",
    "घ" => "gh",
    "ङ" => "d",
    "च" => "ch",
    "छ" => "chh",
    "ज" => "j",
    "झ" => "jh",
    "ञ" => "ny",
    "ट" => "tt",
    "ठ" => "tth",
    "ड" => "dd",
    "ढ" => "ddh",
    "ण" => "nn",
    "त" => "t",
    "थ" => "th",
    "द" => "d",
    "ध" => "dh",
    "न" => "n",
    "\xe0\xa4\xa9" => "nnn", # U+0929 NNNA
    "प" => "p",
    "फ" => "ph",
    "ब" => "b",
    "भ" => "bh",
    "म" => "m",
    "य" => "y",
    "र" => "r",
    "\xe0\xa4\xb1" => "rr", # U+0931 RRA
    "ल" => "l",
    "ळ" => "ll",
    "\xe0\xa4\xb4" => "ll", # U+0934 LLLA
    "व" => "v",
    "श" => "sh",
    "ष" => "ss",
    "स" => "s",
    "ह" => "h",
    "ऺ" => "oe",
    "ऻ" => "ooe",
    "़" => "",
    "ऽ" => "-",
    "ा" => "aa",
    "ि" => "i",
    "ी" => "ii",
    "ु" => "u",
    "ू" => "uu",
    "ृ" => "r",
    "ॄ" => "rr",
    "ॅ" => "e",
    "ॆ" => "e",
    "े" => "e",
    "ै" => "ai",
    "ॉ" => "o",
    "ॊ" => "o",
    "ो" => "o",
    "ौ" => "au",
    "्" => "",
    "ॎ" => "e",
    "ॏ" => "aw",
    "ॐ" => "om",
    "॑" => "",
    "॒" => "_",
    "॓" => "",
    "॔" => "",
    "ॕ" => "ee",
    "ॖ" => "ue",
    "ॗ" => "uue",
    "\xe0\xa5\x98" => "q",    # U+0958 QA
    "\xe0\xa5\x99" => "khh",  # U+0959 KHHA
    "\xe0\xa5\x9a" => "ghh",  # U+095A GHHA
    "\xe0\xa5\x9b" => "za",   # U+095B ZA
    "\xe0\xa5\x9c" => "dddh", # U+095C DDDHA
    "\xe0\xa5\x9d" => "rh",   # U+095D RHA
    "\xe0\xa5\x9e" => "f",    # U+095E FA
    "\xe0\xa5\x9f" => "yy",   # U+095F YYA
    "ॠ" => "rri",
    "ॡ" => "lr",
    "ॢ" => "l",
    "ॣ" => "l",
    "।" => ".",
    "॥" => "..",
    "०" => "0",
    "१" => "1",
    "२" => "2",
    "३" => "3",
    "४" => "4",
    "५" => "5",
    "६" => "6",
    "७" => "7",
    "८" => "8",
    "९" => "9",
    "॰" => ".",
    "ॱ" => ".",
    "ॲ" => "a",
    "ॳ" => "oe",
    "ॴ" => "ooe",
    "ॵ" => "aw",
    "ॶ" => "ue",
    "ॷ" => "uue",
    "ॸ" => "dd",
    "ॹ" => "zh",
    "ॺ" => "y",
    "ॻ" => "gg",
    "ॼ" => "jj",
    "ॽ" => "?",
    "ॾ" => "ddd",
    "ॿ" => "bb"
  }.freeze
end
|
136
|
+
end
|
137
|
+
end
|