kebab 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +7 -0
  2. data/.gemtest +0 -0
  3. data/Changelog.md +99 -0
  4. data/MIT-LICENSE +19 -0
  5. data/README.md +26 -0
  6. data/Rakefile +34 -0
  7. data/lib/kebab.rb +18 -0
  8. data/lib/kebab/identifier.rb +294 -0
  9. data/lib/kebab/transliterator/base.rb +110 -0
  10. data/lib/kebab/transliterator/bulgarian.rb +27 -0
  11. data/lib/kebab/transliterator/cyrillic.rb +108 -0
  12. data/lib/kebab/transliterator/danish.rb +15 -0
  13. data/lib/kebab/transliterator/german.rb +15 -0
  14. data/lib/kebab/transliterator/greek.rb +77 -0
  15. data/lib/kebab/transliterator/hindi.rb +137 -0
  16. data/lib/kebab/transliterator/latin.rb +199 -0
  17. data/lib/kebab/transliterator/macedonian.rb +29 -0
  18. data/lib/kebab/transliterator/norwegian.rb +14 -0
  19. data/lib/kebab/transliterator/romanian.rb +13 -0
  20. data/lib/kebab/transliterator/russian.rb +22 -0
  21. data/lib/kebab/transliterator/serbian.rb +34 -0
  22. data/lib/kebab/transliterator/spanish.rb +9 -0
  23. data/lib/kebab/transliterator/swedish.rb +16 -0
  24. data/lib/kebab/transliterator/turkish.rb +8 -0
  25. data/lib/kebab/transliterator/ukrainian.rb +30 -0
  26. data/lib/kebab/transliterator/vietnamese.rb +143 -0
  27. data/lib/kebab/utf8/active_support_proxy.rb +26 -0
  28. data/lib/kebab/utf8/dumb_proxy.rb +49 -0
  29. data/lib/kebab/utf8/java_proxy.rb +22 -0
  30. data/lib/kebab/utf8/mappings.rb +193 -0
  31. data/lib/kebab/utf8/proxy.rb +125 -0
  32. data/lib/kebab/utf8/unicode_proxy.rb +23 -0
  33. data/lib/kebab/version.rb +5 -0
  34. data/spec/kebab_spec.rb +155 -0
  35. data/spec/spec_helper.rb +45 -0
  36. data/spec/transliterators/base_spec.rb +16 -0
  37. data/spec/transliterators/bulgarian_spec.rb +20 -0
  38. data/spec/transliterators/danish_spec.rb +17 -0
  39. data/spec/transliterators/german_spec.rb +17 -0
  40. data/spec/transliterators/greek_spec.rb +17 -0
  41. data/spec/transliterators/hindi_spec.rb +17 -0
  42. data/spec/transliterators/latin_spec.rb +9 -0
  43. data/spec/transliterators/macedonian_spec.rb +9 -0
  44. data/spec/transliterators/norwegian_spec.rb +18 -0
  45. data/spec/transliterators/polish_spec.rb +14 -0
  46. data/spec/transliterators/romanian_spec.rb +19 -0
  47. data/spec/transliterators/russian_spec.rb +9 -0
  48. data/spec/transliterators/serbian_spec.rb +25 -0
  49. data/spec/transliterators/spanish_spec.rb +13 -0
  50. data/spec/transliterators/swedish_spec.rb +18 -0
  51. data/spec/transliterators/turkish_spec.rb +24 -0
  52. data/spec/transliterators/ukrainian_spec.rb +88 -0
  53. data/spec/transliterators/vietnamese_spec.rb +18 -0
  54. data/spec/utf8_proxy_spec.rb +53 -0
  55. metadata +167 -0
@@ -0,0 +1,110 @@
1
+ # encoding: utf-8
2
+
3
+ require 'singleton'
4
+
5
+ module Kebab
6
+
7
+ module Transliterator
8
+
9
+ autoload :Bulgarian, "kebab/transliterator/bulgarian"
10
+ autoload :Cyrillic, "kebab/transliterator/cyrillic"
11
+ autoload :Danish, "kebab/transliterator/danish"
12
+ autoload :German, "kebab/transliterator/german"
13
+ autoload :Hindi, "kebab/transliterator/hindi"
14
+ autoload :Latin, "kebab/transliterator/latin"
15
+ autoload :Macedonian, "kebab/transliterator/macedonian"
16
+ autoload :Norwegian, "kebab/transliterator/norwegian"
17
+ autoload :Romanian, "kebab/transliterator/romanian"
18
+ autoload :Russian, "kebab/transliterator/russian"
19
+ autoload :Serbian, "kebab/transliterator/serbian"
20
+ autoload :Spanish, "kebab/transliterator/spanish"
21
+ autoload :Swedish, "kebab/transliterator/swedish"
22
+ autoload :Ukrainian, "kebab/transliterator/ukrainian"
23
+ autoload :Greek, "kebab/transliterator/greek"
24
+ autoload :Vietnamese, "kebab/transliterator/vietnamese"
25
+ autoload :Turkish, "kebab/transliterator/turkish"
26
+
27
# Resolves a transliterator class from its underscored name.
#
# Each "_"-separated part of +symbol+ has its leading lowercase letter
# upcased and the parts are joined, e.g. :old_norse => "OldNorse".
#
# The lookup is limited to constants registered directly on the receiver
# (including autoloaded ones). Without the +false+ argument, const_get on a
# module also searches Object, so a name such as :string would silently
# return ::String instead of raising.
#
# symbol - Symbol or String naming the transliterator.
#
# Returns the matching constant. Raises NameError when no such constant exists.
def self.get(symbol)
  class_name = symbol.to_s.split("_").map { |part| part.gsub(/\b('?[a-z])/, &:upcase) }.join
  const_get(class_name, false)
end
31
+
32
# Root transliterator. Defines the punctuation, dash, quote and space
# approximations, and the codepoint-by-codepoint replacement logic that
# subclasses inherit. Each class is a Singleton; use +.instance+.
class Base
  include Singleton

  # Source character => replacement text. Replacement text is unpacked
  # byte-by-byte in #initialize, so it is expected to be plain ASCII.
  APPROXIMATIONS = {
    "×" => "x",
    "÷" => "/",
    "‐" => "-",
    "‑" => "-",
    "‒" => "-",
    "–" => "-",
    "—" => "-",
    "―" => "-",
    "‘" => "'",
    "‛" => "'",
    "“" => '"',
    "”" => '"',
    "„" => '"',
    "‟" => '"',
    '’' => "'",
    ',' => ",",
    '。' => ".",
    '!' => "!",
    '?' => '?',
    '、' => ',',
    '(' => '(',
    ')' => ')',
    '【' => '[',
    '】' => ']',
    ';' => ';',
    ':' => ':',
    '《' => '<',
    '》' => '>',
    # various kinds of space characters
    "\xc2\xa0"     => " ",
    "\xe2\x80\x80" => " ",
    "\xe2\x80\x81" => " ",
    "\xe2\x80\x82" => " ",
    "\xe2\x80\x83" => " ",
    "\xe2\x80\x84" => " ",
    "\xe2\x80\x85" => " ",
    "\xe2\x80\x86" => " ",
    "\xe2\x80\x87" => " ",
    "\xe2\x80\x88" => " ",
    "\xe2\x80\x89" => " ",
    "\xe2\x80\x8a" => " ",
    "\xe2\x81\x9f" => " ",
    "\xe3\x80\x80" => " ",
  }.freeze

  # Frozen Hash keyed by Unicode codepoint (Integer). Each value is either a
  # single byte value (Integer) or an Array of byte values.
  attr_reader :approximations

  # Builds the lookup table: starts from a copy of the superclass instance's
  # table (or an empty Hash for Base itself), then layers this class's
  # APPROXIMATIONS on top so that subclasses override their parents.
  def initialize
    @approximations =
      if self.class < Base
        self.class.superclass.instance.approximations.dup
      else
        {}
      end

    self.class.const_get(:APPROXIMATIONS).each do |source, replacement|
      # Only the first codepoint of the key is used as the lookup index.
      codepoint = source.unpack("U").shift
      bytes     = replacement.unpack("C*")
      @approximations[codepoint] = bytes.length == 1 ? bytes[0] : bytes
    end

    @approximations.freeze
  end

  # Accepts a single UTF-8 codepoint and returns the ASCII character code
  # (or Array of codes) used as the transliteration value, or nil when the
  # codepoint has no approximation.
  def [](codepoint)
    @approximations[codepoint]
  end

  # Transliterates a string: every codepoint with an approximation is
  # replaced by it, all other codepoints pass through unchanged.
  def transliterate(string)
    codepoints = string.unpack("U*")
    replaced   = codepoints.map { |codepoint| self[codepoint] || codepoint }
    # Multi-character replacements are Arrays; empty replacements vanish here.
    replaced.flatten.pack("U*")
  end
end
109
+ end
110
+ end
@@ -0,0 +1,27 @@
1
+ # encoding: utf-8
2
+ module Kebab
3
+ module Transliterator
4
# Bulgarian transliterator. Overrides a handful of the approximations
# inherited from Cyrillic; every other character falls through to the
# parent tables.
class Bulgarian < Cyrillic
  # Source character => ASCII replacement. Frozen, like Base::APPROXIMATIONS,
  # so the table cannot be mutated after load.
  APPROXIMATIONS = {
    "Ж" => "J",
    "Й" => "I",
    "Х" => "H",
    "Ц" => "C",
    "Щ" => "Sht",
    "Ъ" => "U",
    "Ь" => "I",
    "Ю" => "Iu",
    "Я" => "Ia",
    "ж" => "j",
    "й" => "i",
    "х" => "h",
    "ц" => "c",
    "щ" => "sht",
    "ъ" => "u",
    "ь" => "i",
    "ю" => "iu",
    "я" => "ia"
  }.freeze
end
26
+ end
27
+ end
@@ -0,0 +1,108 @@
1
+ # encoding: utf-8
2
+ module Kebab
3
+ module Transliterator
4
+
5
# Generic Cyrillic transliterator.
#
# Approximations are based on GOST 7.79, System B:
# http://en.wikipedia.org/wiki/ISO_9#GOST_7.79
class Cyrillic < Base
  # Source character => ASCII replacement. Frozen, like Base::APPROXIMATIONS,
  # so the table cannot be mutated after load. Hard and soft signs map to the
  # empty string and are therefore dropped from the output.
  APPROXIMATIONS = {
    "Ё" => "Yo",
    "Ѓ" => "G",
    "Є" => "Ye",
    "Ї" => "Yi",
    "Љ" => "L",
    "Њ" => "N",
    "Ќ" => "K",
    "Ў" => "U",
    "Џ" => "Dh",
    "А" => "A",
    "Б" => "B",
    "В" => "V",
    "Г" => "G",
    "Д" => "D",
    "Е" => "E",
    "Ж" => "Zh",
    "З" => "Z",
    "И" => "I",
    "Й" => "J",
    "К" => "K",
    "Л" => "L",
    "М" => "M",
    "Н" => "N",
    "О" => "O",
    "П" => "P",
    "Р" => "R",
    "С" => "S",
    "Т" => "T",
    "У" => "U",
    "Ф" => "F",
    "Х" => "X",
    "Ц" => "Cz",
    "Ч" => "Ch",
    "Ш" => "Sh",
    "Щ" => "Shh",
    "Ъ" => "",
    "Ы" => "Y",
    "Ь" => "",
    "Э" => "E",
    "Ю" => "Yu",
    "Я" => "Ya",
    "а" => "a",
    "б" => "b",
    "в" => "v",
    "г" => "g",
    "д" => "d",
    "е" => "e",
    "ж" => "zh",
    "з" => "z",
    "и" => "i",
    "й" => "j",
    "к" => "k",
    "л" => "l",
    "м" => "m",
    "н" => "n",
    "о" => "o",
    "п" => "p",
    "р" => "r",
    "с" => "s",
    "т" => "t",
    "у" => "u",
    "ф" => "f",
    "х" => "x",
    "ц" => "cz",
    "ч" => "ch",
    "ш" => "sh",
    "щ" => "shh",
    "ъ" => "",
    "ы" => "y",
    "ь" => "",
    "э" => "e",
    "ю" => "yu",
    "я" => "ya",
    "ё" => "yo",
    "ѓ" => "g",
    "є" => "ye",
    "ї" => "yi",
    "љ" => "l",
    "њ" => "n",
    "ќ" => "k",
    "ў" => "u",
    "џ" => "dh",
    "Ѣ" => "Ye",
    "ѣ" => "ye",
    "Ѫ" => "O",
    "ѫ" => "o",
    "Ѳ" => "Fh",
    "ѳ" => "fh",
    "Ѵ" => "Yh",
    "ѵ" => "yh",
    "Ґ" => "G",
    "ґ" => "g",
  }.freeze

  # Transliterates +string+ with the inherited implementation, then shortens
  # "cz" to "c" when it is directly followed by i, e, y or j.
  #
  # The match is case-insensitive so that capitalised and upper-case words
  # ("Цирк" => "Cirk", "ЦЕНА" => "CENA") get the same treatment as lower-case
  # ones ("цирк" => "cirk"); the case of the surrounding letters is preserved.
  #
  # NOTE(review): the substitution runs over the whole transliterated string,
  # so a "cz" that was already present as Latin text in the input is
  # shortened too.
  def transliterate(string)
    super.gsub(/(c)z([ieyj])/i, '\1\2')
  end
end
107
+ end
108
+ end
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+ module Kebab
3
+ module Transliterator
4
# Danish transliterator. Overrides the approximations inherited from Latin
# for the Danish-specific letters listed below.
class Danish < Latin
  # Source character => ASCII replacement. Frozen, like Base::APPROXIMATIONS,
  # so the table cannot be mutated after load.
  APPROXIMATIONS = {
    "æ" => "ae",
    "ø" => "oe",
    "å" => "aa",
    "Ø" => "Oe",
    "Å" => "Aa"
  }.freeze
end
13
+ end
14
+ end
15
+
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+ module Kebab
3
+ module Transliterator
4
# German transliterator. Overrides the approximations inherited from Latin
# so that umlauts expand to the base vowel followed by "e".
class German < Latin
  # Source character => ASCII replacement. Frozen, like Base::APPROXIMATIONS,
  # so the table cannot be mutated after load.
  APPROXIMATIONS = {
    "ä" => "ae",
    "ö" => "oe",
    "ü" => "ue",
    "Ä" => "Ae",
    "Ö" => "Oe",
    "Ü" => "Ue"
  }.freeze
end
14
+ end
15
+ end
@@ -0,0 +1,77 @@
1
+ # encoding: utf-8
2
+ module Kebab
3
+ module Transliterator
4
# Greek transliterator: maps Greek letters (with and without tonos/dialytika)
# to ASCII approximations.
class Greek < Base
  # Source character => ASCII replacement.
  #
  # Every value must be plain ASCII: Base#initialize unpacks values byte by
  # byte, so a non-ASCII look-alike (e.g. a Greek capital iota instead of a
  # Latin "I") would be emitted as garbage. Upper-case keys map to upper-case
  # values. Frozen, like Base::APPROXIMATIONS.
  APPROXIMATIONS = {
    "Α" => "A",
    "Ά" => "A",
    "α" => "a",
    "ά" => "a",
    "Β" => "V",
    "β" => "v",
    "Γ" => "G",
    "γ" => "g",
    "Δ" => "D",
    "δ" => "d",
    "Ε" => "E",
    "Έ" => "E",
    "ε" => "e",
    "έ" => "e",
    "Ζ" => "Z",
    "ζ" => "z",
    "Η" => "I",
    "Ή" => "I", # capital eta with tonos: upper-case like "Η"
    "η" => "i",
    "ή" => "i",
    "Θ" => "TH",
    "θ" => "th",
    "Ι" => "I",
    "Ί" => "I", # value is the ASCII letter I (U+0049)
    # Latin capital I with circumflex (U+00CE), kept from the original table.
    # NOTE(review): presumably a workaround for mis-decoded iota bytes — confirm.
    "Î" => "I",
    "ι" => "i",
    "ί" => "i",
    "ϊ" => "i",
    "ΐ" => "i",
    "Κ" => "K",
    "κ" => "k",
    "Λ" => "L",
    "λ" => "l",
    "Μ" => "M",
    "μ" => "m",
    "Ν" => "N",
    "ν" => "n",
    "Ξ" => "KS",
    "ξ" => "ks",
    "Ο" => "O",
    "Ό" => "O",
    "ο" => "o",
    "ό" => "o",
    "Π" => "P",
    "π" => "p",
    "Ρ" => "R",
    "ρ" => "r",
    "Σ" => "S",
    "σ" => "s",
    "ς" => "s",
    "Τ" => "T",
    "τ" => "t",
    "Υ" => "Y",
    "Ύ" => "Y",
    "υ" => "y",
    "ύ" => "y",
    "ϋ" => "y",
    "ΰ" => "y",
    "Φ" => "F",
    "φ" => "f",
    "Χ" => "X",
    "χ" => "x",
    "Ψ" => "PS",
    "ψ" => "ps",
    "Ω" => "O",
    "Ώ" => "O",
    "ω" => "o",
    "ώ" => "o"
  }.freeze
end
76
+ end
77
+ end
@@ -0,0 +1,137 @@
1
+ # encoding: utf-8
2
+ module Kebab
3
+ module Transliterator
4
# Hindi transliterator: maps the Devanagari block (U+0900..U+097F, in
# codepoint order) to ASCII approximations.
class Hindi < Base
  # Source character => ASCII replacement. Frozen, like Base::APPROXIMATIONS.
  #
  # Base#initialize uses only the FIRST codepoint of each key. Letters that
  # have a canonical decomposition (consonant + nukta) are therefore written
  # as \u escapes: if a tool ever normalised this file and split such a key
  # into two codepoints, the entry would silently overwrite the mapping of
  # the plain consonant instead.
  APPROXIMATIONS = {
    "ऀ" => "n",
    "ँ" => "n",
    "ं" => "n",
    "ः" => "h",
    "ऄ" => "a",
    "अ" => "a",
    "आ" => "aa",
    "इ" => "i",
    "ई" => "ii",
    "उ" => "u",
    "ऊ" => "uu",
    "ऋ" => "ri",
    "ऌ" => "lri",
    "ऍ" => "e",
    "ऎ" => "e",
    "ए" => "e",
    "ऐ" => "ei",
    "ऑ" => "o",
    "ऒ" => "o",
    "ओ" => "o",
    "औ" => "ou",
    "क" => "k",
    "ख" => "kh",
    "ग" => "g",
    "घ" => "gh",
    "ङ" => "d",
    "च" => "ch",
    "छ" => "chh",
    "ज" => "j",
    "झ" => "jh",
    "ञ" => "ny",
    "ट" => "tt",
    "ठ" => "tth",
    "ड" => "dd",
    "ढ" => "ddh",
    "ण" => "nn",
    "त" => "t",
    "थ" => "th",
    "द" => "d",
    "ध" => "dh",
    "न" => "n",
    "\u0929" => "nnn", # NNNA (na + nukta)
    "प" => "p",
    "फ" => "ph",
    "ब" => "b",
    "भ" => "bh",
    "म" => "m",
    "य" => "y",
    "र" => "r",
    "\u0931" => "rr", # RRA (ra + nukta)
    "ल" => "l",
    "ळ" => "ll",
    "\u0934" => "ll", # LLLA (lla + nukta)
    "व" => "v",
    "श" => "sh",
    "ष" => "ss",
    "स" => "s",
    "ह" => "h",
    "ऺ" => "oe",
    "ऻ" => "ooe",
    "़" => "",
    "ऽ" => "-",
    "ा" => "aa",
    "ि" => "i",
    "ी" => "ii",
    "ु" => "u",
    "ू" => "uu",
    "ृ" => "r",
    "ॄ" => "rr",
    "ॅ" => "e",
    "ॆ" => "e",
    "े" => "e",
    "ै" => "ai",
    "ॉ" => "o",
    "ॊ" => "o",
    "ो" => "o",
    "ौ" => "au",
    "्" => "",
    "ॎ" => "e",
    "ॏ" => "aw",
    "ॐ" => "om",
    "॑" => "",
    "॒" => "_",
    "॓" => "",
    "॔" => "",
    "ॕ" => "ee",
    "ॖ" => "ue",
    "ॗ" => "uue",
    "\u0958" => "q",    # QA (ka + nukta)
    "\u0959" => "khh",  # KHHA (kha + nukta)
    "\u095a" => "ghh",  # GHHA (ga + nukta)
    "\u095b" => "za",   # ZA (ja + nukta)
    "\u095c" => "dddh", # DDDHA (dda + nukta)
    "\u095d" => "rh",   # RHA (ddha + nukta)
    "\u095e" => "f",    # FA (pha + nukta)
    "\u095f" => "yy",   # YYA (ya + nukta)
    "ॠ" => "rri",
    "ॡ" => "lr",
    "ॢ" => "l",
    "ॣ" => "l",
    "।" => ".",
    "॥" => "..",
    "०" => "0",
    "१" => "1",
    "२" => "2",
    "३" => "3",
    "४" => "4",
    "५" => "5",
    "६" => "6",
    "७" => "7",
    "८" => "8",
    "९" => "9",
    "॰" => ".",
    "ॱ" => ".",
    "ॲ" => "a",
    "ॳ" => "oe",
    "ॴ" => "ooe",
    "ॵ" => "aw",
    "ॶ" => "ue",
    "ॷ" => "uue",
    "ॸ" => "dd",
    "ॹ" => "zh",
    "ॺ" => "y",
    "ॻ" => "gg",
    "ॼ" => "jj",
    "ॽ" => "?",
    "ॾ" => "ddd",
    "ॿ" => "bb"
  }.freeze
end
136
+ end
137
+ end