babosa 1.0.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/Changelog.md +12 -0
- data/README.md +81 -119
- data/Rakefile +9 -8
- data/lib/babosa.rb +2 -4
- data/lib/babosa/identifier.rb +104 -129
- data/lib/babosa/transliterator/base.rb +57 -56
- data/lib/babosa/transliterator/bulgarian.rb +3 -2
- data/lib/babosa/transliterator/cyrillic.rb +5 -5
- data/lib/babosa/transliterator/danish.rb +3 -3
- data/lib/babosa/transliterator/german.rb +3 -2
- data/lib/babosa/transliterator/greek.rb +4 -3
- data/lib/babosa/transliterator/hindi.rb +3 -2
- data/lib/babosa/transliterator/latin.rb +5 -5
- data/lib/babosa/transliterator/macedonian.rb +3 -2
- data/lib/babosa/transliterator/norwegian.rb +3 -3
- data/lib/babosa/transliterator/romanian.rb +3 -2
- data/lib/babosa/transliterator/russian.rb +3 -2
- data/lib/babosa/transliterator/serbian.rb +29 -27
- data/lib/babosa/transliterator/spanish.rb +2 -2
- data/lib/babosa/transliterator/swedish.rb +3 -3
- data/lib/babosa/transliterator/turkish.rb +8 -8
- data/lib/babosa/transliterator/ukrainian.rb +5 -4
- data/lib/babosa/transliterator/vietnamese.rb +4 -3
- data/lib/babosa/version.rb +3 -1
- data/spec/{babosa_spec.rb → identifier_spec.rb} +13 -14
- data/spec/spec_helper.rb +6 -6
- data/spec/transliterators/base_spec.rb +5 -6
- data/spec/transliterators/bulgarian_spec.rb +4 -5
- data/spec/transliterators/danish_spec.rb +5 -6
- data/spec/transliterators/german_spec.rb +4 -5
- data/spec/transliterators/greek_spec.rb +7 -7
- data/spec/transliterators/hindi_spec.rb +7 -7
- data/spec/transliterators/latin_spec.rb +3 -4
- data/spec/transliterators/macedonian_spec.rb +3 -4
- data/spec/transliterators/norwegian_spec.rb +4 -4
- data/spec/transliterators/polish_spec.rb +3 -5
- data/spec/transliterators/romanian_spec.rb +5 -6
- data/spec/transliterators/russian_spec.rb +3 -4
- data/spec/transliterators/serbian_spec.rb +6 -7
- data/spec/transliterators/spanish_spec.rb +4 -5
- data/spec/transliterators/swedish_spec.rb +7 -7
- data/spec/transliterators/turkish_spec.rb +24 -24
- data/spec/transliterators/ukrainian_spec.rb +74 -75
- data/spec/transliterators/vietnamese_spec.rb +10 -10
- metadata +44 -38
- metadata.gz.sig +2 -0
- data/lib/babosa/utf8/active_support_proxy.rb +0 -38
- data/lib/babosa/utf8/dumb_proxy.rb +0 -49
- data/lib/babosa/utf8/java_proxy.rb +0 -22
- data/lib/babosa/utf8/mappings.rb +0 -193
- data/lib/babosa/utf8/proxy.rb +0 -125
- data/lib/babosa/utf8/unicode_proxy.rb +0 -23
- data/spec/utf8_proxy_spec.rb +0 -52
@@ -1,31 +1,11 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
3
|
+
require "singleton"
|
4
4
|
|
5
5
|
module Babosa
|
6
|
-
|
7
6
|
module Transliterator
|
8
|
-
|
9
|
-
autoload :Bulgarian, "babosa/transliterator/bulgarian"
|
10
|
-
autoload :Cyrillic, "babosa/transliterator/cyrillic"
|
11
|
-
autoload :Danish, "babosa/transliterator/danish"
|
12
|
-
autoload :German, "babosa/transliterator/german"
|
13
|
-
autoload :Hindi, "babosa/transliterator/hindi"
|
14
|
-
autoload :Latin, "babosa/transliterator/latin"
|
15
|
-
autoload :Macedonian, "babosa/transliterator/macedonian"
|
16
|
-
autoload :Norwegian, "babosa/transliterator/norwegian"
|
17
|
-
autoload :Romanian, "babosa/transliterator/romanian"
|
18
|
-
autoload :Russian, "babosa/transliterator/russian"
|
19
|
-
autoload :Serbian, "babosa/transliterator/serbian"
|
20
|
-
autoload :Spanish, "babosa/transliterator/spanish"
|
21
|
-
autoload :Swedish, "babosa/transliterator/swedish"
|
22
|
-
autoload :Ukrainian, "babosa/transliterator/ukrainian"
|
23
|
-
autoload :Greek, "babosa/transliterator/greek"
|
24
|
-
autoload :Vietnamese, "babosa/transliterator/vietnamese"
|
25
|
-
autoload :Turkish, "babosa/transliterator/turkish"
|
26
|
-
|
27
7
|
def self.get(symbol)
|
28
|
-
class_name = symbol.to_s.split("_").map {|a| a.gsub(/\b('?[a-z])/) { $1.upcase }}.join
|
8
|
+
class_name = symbol.to_s.split("_").map { |a| a.gsub(/\b('?[a-z])/) { Regexp.last_match(1).upcase } }.join
|
29
9
|
const_get(class_name)
|
30
10
|
end
|
31
11
|
|
@@ -47,36 +27,39 @@ module Babosa
|
|
47
27
|
"”" => '"',
|
48
28
|
"„" => '"',
|
49
29
|
"‟" => '"',
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
30
|
+
"’" => "'",
|
31
|
+
"," => ",",
|
32
|
+
"。" => ".",
|
33
|
+
"!" => "!",
|
34
|
+
"?" => "?",
|
35
|
+
"、" => ",",
|
36
|
+
"(" => "(",
|
37
|
+
")" => ")",
|
38
|
+
"【" => "[",
|
39
|
+
"】" => "]",
|
40
|
+
";" => ";",
|
41
|
+
":" => ":",
|
42
|
+
"《" => "<",
|
43
|
+
"》" => ">"
|
44
|
+
}.merge(
|
45
|
+
{
|
46
|
+
# various kinds of space characters
|
47
|
+
"\xc2\xa0" => " ",
|
48
|
+
"\xe2\x80\x80" => " ",
|
49
|
+
"\xe2\x80\x81" => " ",
|
50
|
+
"\xe2\x80\x82" => " ",
|
51
|
+
"\xe2\x80\x83" => " ",
|
52
|
+
"\xe2\x80\x84" => " ",
|
53
|
+
"\xe2\x80\x85" => " ",
|
54
|
+
"\xe2\x80\x86" => " ",
|
55
|
+
"\xe2\x80\x87" => " ",
|
56
|
+
"\xe2\x80\x88" => " ",
|
57
|
+
"\xe2\x80\x89" => " ",
|
58
|
+
"\xe2\x80\x8a" => " ",
|
59
|
+
"\xe2\x81\x9f" => " ",
|
60
|
+
"\xe3\x80\x80" => " "
|
61
|
+
}
|
62
|
+
).freeze
|
80
63
|
|
81
64
|
attr_reader :approximations
|
82
65
|
|
@@ -87,8 +70,8 @@ module Babosa
|
|
87
70
|
@approximations = {}
|
88
71
|
end
|
89
72
|
self.class.const_get(:APPROXIMATIONS).inject(@approximations) do |memo, object|
|
90
|
-
index = object[0].
|
91
|
-
value = object[1].
|
73
|
+
index = object[0].codepoints.shift
|
74
|
+
value = object[1].codepoints
|
92
75
|
memo[index] = value.length == 1 ? value[0] : value
|
93
76
|
memo
|
94
77
|
end
|
@@ -103,8 +86,26 @@ module Babosa
|
|
103
86
|
|
104
87
|
# Transliterates a string.
|
105
88
|
def transliterate(string)
|
106
|
-
string.
|
89
|
+
string.codepoints.map { |char| self[char] || char }.flatten.pack("U*")
|
107
90
|
end
|
108
91
|
end
|
109
92
|
end
|
110
93
|
end
|
94
|
+
|
95
|
+
require "babosa/transliterator/cyrillic"
|
96
|
+
require "babosa/transliterator/latin"
|
97
|
+
require "babosa/transliterator/bulgarian"
|
98
|
+
require "babosa/transliterator/danish"
|
99
|
+
require "babosa/transliterator/german"
|
100
|
+
require "babosa/transliterator/hindi"
|
101
|
+
require "babosa/transliterator/macedonian"
|
102
|
+
require "babosa/transliterator/norwegian"
|
103
|
+
require "babosa/transliterator/romanian"
|
104
|
+
require "babosa/transliterator/russian"
|
105
|
+
require "babosa/transliterator/serbian"
|
106
|
+
require "babosa/transliterator/spanish"
|
107
|
+
require "babosa/transliterator/swedish"
|
108
|
+
require "babosa/transliterator/ukrainian"
|
109
|
+
require "babosa/transliterator/greek"
|
110
|
+
require "babosa/transliterator/vietnamese"
|
111
|
+
require "babosa/transliterator/turkish"
|
@@ -1,7 +1,7 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Babosa
|
3
4
|
module Transliterator
|
4
|
-
|
5
5
|
# Approximations are based on GOST 7.79, System B:
|
6
6
|
# http://en.wikipedia.org/wiki/ISO_9#GOST_7.79
|
7
7
|
class Cyrillic < Base
|
@@ -97,11 +97,11 @@ module Babosa
|
|
97
97
|
"Ѵ" => "Yh",
|
98
98
|
"ѵ" => "yh",
|
99
99
|
"Ґ" => "G",
|
100
|
-
"ґ" => "g"
|
101
|
-
}
|
100
|
+
"ґ" => "g"
|
101
|
+
}.freeze
|
102
102
|
|
103
103
|
def transliterate(string)
|
104
|
-
super.gsub(/(c)z([ieyj])/) { "#{$1}#{$2}" }
|
104
|
+
super.gsub(/(c)z([ieyj])/) { "#{Regexp.last_match(1)}#{Regexp.last_match(2)}" }
|
105
105
|
end
|
106
106
|
end
|
107
107
|
end
|
@@ -1,8 +1,8 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module Babosa
|
3
4
|
module Transliterator
|
4
5
|
class Latin < Base
|
5
|
-
|
6
6
|
APPROXIMATIONS = {
|
7
7
|
"À" => "A",
|
8
8
|
"Á" => "A",
|
@@ -35,14 +35,14 @@ module Babosa
|
|
35
35
|
"Ý" => "Y",
|
36
36
|
"Þ" => "Th",
|
37
37
|
"ß" => "ss",
|
38
|
-
"à" => "a"
|
38
|
+
"à" => "a",
|
39
39
|
"á" => "a",
|
40
40
|
"â" => "a",
|
41
41
|
"ã" => "a",
|
42
42
|
"ä" => "a",
|
43
43
|
"å" => "a",
|
44
44
|
"æ" => "ae",
|
45
|
-
"ç" => "c"
|
45
|
+
"ç" => "c",
|
46
46
|
"è" => "e",
|
47
47
|
"é" => "e",
|
48
48
|
"ê" => "e",
|
@@ -193,7 +193,7 @@ module Babosa
|
|
193
193
|
"ž" => "z",
|
194
194
|
"ź" => "z",
|
195
195
|
"ż" => "z"
|
196
|
-
}
|
196
|
+
}.freeze
|
197
197
|
end
|
198
198
|
end
|
199
199
|
end
|
@@ -1,34 +1,36 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Babosa
|
4
4
|
module Transliterator
|
5
5
|
class Serbian < Latin
|
6
|
-
APPROXIMATIONS = Cyrillic.const_get(:APPROXIMATIONS).merge(
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
6
|
+
APPROXIMATIONS = Cyrillic.const_get(:APPROXIMATIONS).merge(
|
7
|
+
{
|
8
|
+
"Ð" => "Dj",
|
9
|
+
"Č" => "Ch",
|
10
|
+
"Š" => "Sh",
|
11
|
+
"č" => "ch",
|
12
|
+
"đ" => "dj",
|
13
|
+
"š" => "sh",
|
14
|
+
"Ћ" => "C",
|
15
|
+
"Ц" => "C",
|
16
|
+
"Ч" => "Ch",
|
17
|
+
"Ђ" => "Dj",
|
18
|
+
"Џ" => "Dz",
|
19
|
+
"Х" => "H",
|
20
|
+
"Ј" => "J",
|
21
|
+
"Љ" => "Lj",
|
22
|
+
"Њ" => "Nj",
|
23
|
+
"ц" => "c",
|
24
|
+
"ћ" => "c",
|
25
|
+
"ч" => "ch",
|
26
|
+
"ђ" => "dj",
|
27
|
+
"џ" => "dz",
|
28
|
+
"х" => "h",
|
29
|
+
"ј" => "j",
|
30
|
+
"љ" => "lj",
|
31
|
+
"њ" => "nj"
|
32
|
+
}
|
33
|
+
)
|
32
34
|
end
|
33
35
|
end
|
34
36
|
end
|