mojibake 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.rdoc +7 -0
- data/Manifest.txt +6 -2
- data/README.rdoc +4 -2
- data/Rakefile +15 -2
- data/bin/mojibake +33 -17
- data/config/table.json +270 -0
- data/config/table.txt +268 -0
- data/lib/mojibake.rb +34 -5
- data/lib/mojibake/base.rb +1 -1
- data/lib/mojibake/{mapper.rb → encoding.rb} +15 -29
- data/lib/mojibake/json.rb +81 -0
- data/test/test_encoding.rb +76 -0
- data/test/{test_mojibake.rb → test_mapper.rb} +1 -34
- metadata +69 -56
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
@@ -1,2 +1,9 @@
|
|
1
|
+
=== 1.1.0 (2011-10-31)
|
2
|
+
* Add table.json output support as a more convenient language
|
3
|
+
independent format. Include default version (as well as table.txt)
|
4
|
+
as part of gem.
|
5
|
+
* Add Ruby 1.8 support for Mapper.recover by using default included
|
6
|
+
table.json (avoids need for 1.9 encoding support.)
|
7
|
+
|
1
8
|
=== 1.0.0 (2011-6-21)
|
2
9
|
* Initial release.
|
data/Manifest.txt
CHANGED
@@ -3,8 +3,12 @@ Manifest.txt
|
|
3
3
|
README.rdoc
|
4
4
|
Rakefile
|
5
5
|
bin/mojibake
|
6
|
+
config/table.json
|
7
|
+
config/table.txt
|
6
8
|
lib/mojibake/base.rb
|
7
9
|
lib/mojibake.rb
|
8
|
-
lib/mojibake/
|
10
|
+
lib/mojibake/encoding.rb
|
11
|
+
lib/mojibake/json.rb
|
9
12
|
test/test.txt
|
10
|
-
test/
|
13
|
+
test/test_encoding.rb
|
14
|
+
test/test_mapper.rb
|
data/README.rdoc
CHANGED
@@ -15,8 +15,10 @@ Windows-1252 is in the wild, should also benefit.
|
|
15
15
|
|
16
16
|
== Dependencies
|
17
17
|
|
18
|
-
Requires the String Encoding support
|
19
|
-
|
18
|
+
Requires the String Encoding support in ruby 1.9 as provided by:
|
19
|
+
|
20
|
+
* ruby 1.9.2+ (tested 1.9.2p180, Linux)
|
21
|
+
* jruby 1.6.5+ (tested 1.6.5, Linux)
|
20
22
|
|
21
23
|
== Synopsis
|
22
24
|
|
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
$LOAD_PATH << './lib'
|
4
4
|
|
5
5
|
require 'rubygems'
|
6
|
-
gem 'rjack-tarpit', '~> 1.
|
6
|
+
gem 'rjack-tarpit', '~> 1.4'
|
7
7
|
require 'rjack-tarpit'
|
8
8
|
|
9
9
|
require 'mojibake/base'
|
@@ -14,7 +14,8 @@ t.specify do |h|
|
|
14
14
|
h.developer( 'David Kellum', 'dek-oss@gravitext.com' )
|
15
15
|
|
16
16
|
h.testlib = :minitest
|
17
|
-
h.
|
17
|
+
h.extra_deps += [ [ 'json', '~> 1.6.1' ] ]
|
18
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ] ]
|
18
19
|
|
19
20
|
h.url = 'http://github.com/dekellum/mojibake'
|
20
21
|
end
|
@@ -33,3 +34,15 @@ task :tag => [ :check_history_version, :check_history_date ]
|
|
33
34
|
task :push => [ :check_history_version, :check_history_date ]
|
34
35
|
|
35
36
|
t.define_tasks
|
37
|
+
|
38
|
+
desc "(Re-)generate config output files (requires 1.9)"
|
39
|
+
task :generate_config do
|
40
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
41
|
+
require 'mojibake'
|
42
|
+
mapper = MojiBake::Mapper.new
|
43
|
+
open( "config/table.txt", 'w' ) { |fout| fout.puts( mapper.table ) }
|
44
|
+
open( "config/table.json", 'w' ) { |fout| fout.puts( mapper.json ) }
|
45
|
+
else
|
46
|
+
raise "Task generate_config requires Ruby 1.9 encoding support"
|
47
|
+
end
|
48
|
+
end
|
data/bin/mojibake
CHANGED
@@ -20,6 +20,8 @@
|
|
20
20
|
|
21
21
|
$LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
|
22
22
|
|
23
|
+
require 'rubygems'
|
24
|
+
|
23
25
|
require 'mojibake'
|
24
26
|
require 'optparse'
|
25
27
|
|
@@ -33,23 +35,36 @@ module Script
|
|
33
35
|
puts "mojibake: #{MojiBake::VERSION}"
|
34
36
|
exit 1
|
35
37
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
38
|
+
|
39
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
40
|
+
|
41
|
+
opts.on( "--no-windows-1252",
|
42
|
+
"Don't include miscodings from Windows-1252" ) do
|
43
|
+
mapper.map_windows_1252 = false
|
44
|
+
end
|
45
|
+
opts.on( "--no-iso-8859-1",
|
46
|
+
"Don't include miscodings from ISO-8859-1" ) do
|
47
|
+
mapper.map_iso_8859_1 = false
|
48
|
+
end
|
49
|
+
opts.on( "--no-permutations",
|
50
|
+
"Don't include ISO/Windows permutations" ) do
|
51
|
+
mapper.map_permutations = false
|
52
|
+
end
|
53
|
+
opts.on_tail( "-t", "--table",
|
54
|
+
"Write MojiBake Mapper table (UTF-8) and exit" ) do
|
55
|
+
puts mapper.table
|
56
|
+
exit 1
|
57
|
+
end
|
58
|
+
opts.on_tail( "-j", "--json",
|
59
|
+
"Write MojiBake Mapper json (UTF-8) and exit" ) do
|
60
|
+
require 'rubygems'
|
61
|
+
require 'mojibake/json'
|
62
|
+
puts mapper.json
|
63
|
+
exit 1
|
64
|
+
end
|
65
|
+
|
52
66
|
end
|
67
|
+
|
53
68
|
opts.on_tail( "-r", "--regex",
|
54
69
|
"Display MojiBake Mapper regex (UTF-8) and exit" ) do
|
55
70
|
puts mapper.regexp.inspect
|
@@ -65,7 +80,8 @@ module Script
|
|
65
80
|
|
66
81
|
input_file = ARGV.shift
|
67
82
|
if input_file
|
68
|
-
|
83
|
+
data = IO.read( input_file )
|
84
|
+
$stdout.write( mapper.recover( data ) )
|
69
85
|
end
|
70
86
|
|
71
87
|
end
|
data/config/table.json
ADDED
@@ -0,0 +1,270 @@
|
|
1
|
+
{
|
2
|
+
"mojibake": "1.1.0",
|
3
|
+
"url": "https://github.com/dekellum/mojibake",
|
4
|
+
"regexp": "Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])",
|
5
|
+
"moji": {
|
6
|
+
"Â\u0080": "\u0080",
|
7
|
+
"Â\u0081": "\u0081",
|
8
|
+
"Â\u0082": "\u0082",
|
9
|
+
"Â\u0083": "\u0083",
|
10
|
+
"Â\u0084": "\u0084",
|
11
|
+
"Â\u0085": "\u0085",
|
12
|
+
"Â\u0086": "\u0086",
|
13
|
+
"Â\u0087": "\u0087",
|
14
|
+
"Â\u0088": "\u0088",
|
15
|
+
"Â\u0089": "\u0089",
|
16
|
+
"Â\u008A": "\u008A",
|
17
|
+
"Â\u008B": "\u008B",
|
18
|
+
"Â\u008C": "\u008C",
|
19
|
+
"Â\u008D": "\u008D",
|
20
|
+
"Â\u008E": "\u008E",
|
21
|
+
"Â\u008F": "\u008F",
|
22
|
+
"Â\u0090": "\u0090",
|
23
|
+
"Â\u0091": "\u0091",
|
24
|
+
"Â\u0092": "\u0092",
|
25
|
+
"Â\u0093": "\u0093",
|
26
|
+
"Â\u0094": "\u0094",
|
27
|
+
"Â\u0095": "\u0095",
|
28
|
+
"Â\u0096": "\u0096",
|
29
|
+
"Â\u0097": "\u0097",
|
30
|
+
"Â\u0098": "\u0098",
|
31
|
+
"Â\u0099": "\u0099",
|
32
|
+
"Â\u009A": "\u009A",
|
33
|
+
"Â\u009B": "\u009B",
|
34
|
+
"Â\u009C": "\u009C",
|
35
|
+
"Â\u009D": "\u009D",
|
36
|
+
"Â\u009E": "\u009E",
|
37
|
+
"Â\u009F": "\u009F",
|
38
|
+
"Â\u00A0": "\u00A0",
|
39
|
+
"¡": "¡",
|
40
|
+
"¢": "¢",
|
41
|
+
"£": "£",
|
42
|
+
"¤": "¤",
|
43
|
+
"Â¥": "¥",
|
44
|
+
"¦": "¦",
|
45
|
+
"§": "§",
|
46
|
+
"¨": "¨",
|
47
|
+
"©": "©",
|
48
|
+
"ª": "ª",
|
49
|
+
"«": "«",
|
50
|
+
"¬": "¬",
|
51
|
+
"Â": "",
|
52
|
+
"®": "®",
|
53
|
+
"¯": "¯",
|
54
|
+
"°": "°",
|
55
|
+
"±": "±",
|
56
|
+
"²": "²",
|
57
|
+
"³": "³",
|
58
|
+
"´": "´",
|
59
|
+
"µ": "µ",
|
60
|
+
"¶": "¶",
|
61
|
+
"·": "·",
|
62
|
+
"¸": "¸",
|
63
|
+
"¹": "¹",
|
64
|
+
"º": "º",
|
65
|
+
"»": "»",
|
66
|
+
"¼": "¼",
|
67
|
+
"½": "½",
|
68
|
+
"¾": "¾",
|
69
|
+
"¿": "¿",
|
70
|
+
"Œ": "\u008C",
|
71
|
+
"œ": "\u009C",
|
72
|
+
"Š": "\u008A",
|
73
|
+
"š": "\u009A",
|
74
|
+
"Ÿ": "\u009F",
|
75
|
+
"ÂŽ": "\u008E",
|
76
|
+
"ž": "\u009E",
|
77
|
+
"ƒ": "\u0083",
|
78
|
+
"ˆ": "\u0088",
|
79
|
+
"˜": "\u0098",
|
80
|
+
"–": "\u0096",
|
81
|
+
"—": "\u0097",
|
82
|
+
"‘": "\u0091",
|
83
|
+
"Â’": "\u0092",
|
84
|
+
"‚": "\u0082",
|
85
|
+
"“": "\u0093",
|
86
|
+
"”": "\u0094",
|
87
|
+
"„": "\u0084",
|
88
|
+
"†": "\u0086",
|
89
|
+
"‡": "\u0087",
|
90
|
+
"•": "\u0095",
|
91
|
+
"Â…": "\u0085",
|
92
|
+
"‰": "\u0089",
|
93
|
+
"‹": "\u008B",
|
94
|
+
"›": "\u009B",
|
95
|
+
"€": "\u0080",
|
96
|
+
"™": "\u0099",
|
97
|
+
"Â\uFFFD": "\u0081",
|
98
|
+
"Ã\u0080": "À",
|
99
|
+
"Ã\u0081": "Á",
|
100
|
+
"Ã\u0082": "Â",
|
101
|
+
"Ã\u0083": "Ã",
|
102
|
+
"Ã\u0084": "Ä",
|
103
|
+
"Ã\u0085": "Å",
|
104
|
+
"Ã\u0086": "Æ",
|
105
|
+
"Ã\u0087": "Ç",
|
106
|
+
"Ã\u0088": "È",
|
107
|
+
"Ã\u0089": "É",
|
108
|
+
"Ã\u008A": "Ê",
|
109
|
+
"Ã\u008B": "Ë",
|
110
|
+
"Ã\u008C": "Ì",
|
111
|
+
"Ã\u008D": "Í",
|
112
|
+
"Ã\u008E": "Î",
|
113
|
+
"Ã\u008F": "Ï",
|
114
|
+
"Ã\u0090": "Ð",
|
115
|
+
"Ã\u0091": "Ñ",
|
116
|
+
"Ã\u0092": "Ò",
|
117
|
+
"Ã\u0093": "Ó",
|
118
|
+
"Ã\u0094": "Ô",
|
119
|
+
"Ã\u0095": "Õ",
|
120
|
+
"Ã\u0096": "Ö",
|
121
|
+
"Ã\u0097": "×",
|
122
|
+
"Ã\u0098": "Ø",
|
123
|
+
"Ã\u0099": "Ù",
|
124
|
+
"Ã\u009A": "Ú",
|
125
|
+
"Ã\u009B": "Û",
|
126
|
+
"Ã\u009C": "Ü",
|
127
|
+
"Ã\u009D": "Ý",
|
128
|
+
"Ã\u009E": "Þ",
|
129
|
+
"Ã\u009F": "ß",
|
130
|
+
"Ã\u00A0": "à",
|
131
|
+
"á": "á",
|
132
|
+
"â": "â",
|
133
|
+
"ã": "ã",
|
134
|
+
"ä": "ä",
|
135
|
+
"Ã¥": "å",
|
136
|
+
"æ": "æ",
|
137
|
+
"ç": "ç",
|
138
|
+
"è": "è",
|
139
|
+
"é": "é",
|
140
|
+
"ê": "ê",
|
141
|
+
"ë": "ë",
|
142
|
+
"ì": "ì",
|
143
|
+
"Ã": "í",
|
144
|
+
"î": "î",
|
145
|
+
"ï": "ï",
|
146
|
+
"ð": "ð",
|
147
|
+
"ñ": "ñ",
|
148
|
+
"ò": "ò",
|
149
|
+
"ó": "ó",
|
150
|
+
"ô": "ô",
|
151
|
+
"õ": "õ",
|
152
|
+
"ö": "ö",
|
153
|
+
"÷": "÷",
|
154
|
+
"ø": "ø",
|
155
|
+
"ù": "ù",
|
156
|
+
"ú": "ú",
|
157
|
+
"û": "û",
|
158
|
+
"ü": "ü",
|
159
|
+
"ý": "ý",
|
160
|
+
"þ": "þ",
|
161
|
+
"ÿ": "ÿ",
|
162
|
+
"ÃŒ": "Ì",
|
163
|
+
"Ãœ": "Ü",
|
164
|
+
"Ê": "Ê",
|
165
|
+
"Ú": "Ú",
|
166
|
+
"ß": "ß",
|
167
|
+
"ÃŽ": "Î",
|
168
|
+
"Þ": "Þ",
|
169
|
+
"Ã": "Ã",
|
170
|
+
"È": "È",
|
171
|
+
"Ø": "Ø",
|
172
|
+
"Ö": "Ö",
|
173
|
+
"×": "×",
|
174
|
+
"Ñ": "Ñ",
|
175
|
+
"Ã’": "Ò",
|
176
|
+
"Â": "Â",
|
177
|
+
"Ó": "Ó",
|
178
|
+
"Ô": "Ô",
|
179
|
+
"Ä": "Ä",
|
180
|
+
"Æ": "Æ",
|
181
|
+
"Ç": "Ç",
|
182
|
+
"Õ": "Õ",
|
183
|
+
"Ã…": "Å",
|
184
|
+
"É": "É",
|
185
|
+
"Ë": "Ë",
|
186
|
+
"Û": "Û",
|
187
|
+
"À": "À",
|
188
|
+
"Ù": "Ù",
|
189
|
+
"Ã\uFFFD": "Á",
|
190
|
+
"Å\u0092": "Œ",
|
191
|
+
"Å\u0093": "œ",
|
192
|
+
"Å\u00A0": "Š",
|
193
|
+
"Å¡": "š",
|
194
|
+
"Ÿ": "Ÿ",
|
195
|
+
"Ž": "Ž",
|
196
|
+
"ž": "ž",
|
197
|
+
"Å’": "Œ",
|
198
|
+
"Å“": "œ",
|
199
|
+
"Æ\u0092": "ƒ",
|
200
|
+
"Æ’": "ƒ",
|
201
|
+
"Ë\u0086": "ˆ",
|
202
|
+
"Ë\u009C": "˜",
|
203
|
+
"Ëœ": "˜",
|
204
|
+
"ˆ": "ˆ",
|
205
|
+
"â\u0080\u0080": "\u2000",
|
206
|
+
"â\u0080\u0081": "\u2001",
|
207
|
+
"â\u0080\u0082": "\u2002",
|
208
|
+
"â\u0080\u0083": "\u2003",
|
209
|
+
"â\u0080\u0084": "\u2004",
|
210
|
+
"â\u0080\u0085": "\u2005",
|
211
|
+
"â\u0080\u0086": "\u2006",
|
212
|
+
"â\u0080\u0087": "\u2007",
|
213
|
+
"â\u0080\u0088": "\u2008",
|
214
|
+
"â\u0080\u0089": "\u2009",
|
215
|
+
"â\u0080\u008A": "\u200A",
|
216
|
+
"â\u0080\u008B": "\u200B",
|
217
|
+
"â\u0080\u0093": "–",
|
218
|
+
"â\u0080\u0094": "—",
|
219
|
+
"â\u0080\u0098": "‘",
|
220
|
+
"â\u0080\u0099": "’",
|
221
|
+
"â\u0080\u009A": "‚",
|
222
|
+
"â\u0080\u009C": "“",
|
223
|
+
"â\u0080\u009D": "”",
|
224
|
+
"â\u0080\u009E": "„",
|
225
|
+
"â\u0080\u00A0": "†",
|
226
|
+
"â\u0080¡": "‡",
|
227
|
+
"â\u0080¢": "•",
|
228
|
+
"â\u0080¦": "…",
|
229
|
+
"â\u0080°": "‰",
|
230
|
+
"â\u0080¹": "‹",
|
231
|
+
"â\u0080º": "›",
|
232
|
+
"â\u0081\u00A0": "\u2060",
|
233
|
+
"â\u0082¬": "€",
|
234
|
+
"â\u0084¢": "™",
|
235
|
+
"€": "€",
|
236
|
+
"â„¢": "™",
|
237
|
+
"â€\u0081": "\u2001",
|
238
|
+
"â€\u009D": "”",
|
239
|
+
"â€\u00A0": "†",
|
240
|
+
"‡": "‡",
|
241
|
+
"•": "•",
|
242
|
+
"…": "…",
|
243
|
+
"‰": "‰",
|
244
|
+
"‹": "‹",
|
245
|
+
"›": "›",
|
246
|
+
"“": "“",
|
247
|
+
" ": "\u200A",
|
248
|
+
"‚": "‚",
|
249
|
+
"„": "„",
|
250
|
+
" ": "\u2003",
|
251
|
+
" ": "\u2008",
|
252
|
+
"‘": "‘",
|
253
|
+
" ": "\u2002",
|
254
|
+
"–": "–",
|
255
|
+
"—": "—",
|
256
|
+
" ": "\u2004",
|
257
|
+
" ": "\u2006",
|
258
|
+
" ": "\u2007",
|
259
|
+
" ": "\u2005",
|
260
|
+
" ": "\u2009",
|
261
|
+
"​": "\u200B",
|
262
|
+
" ": "\u2000",
|
263
|
+
"’": "’",
|
264
|
+
"â€\uFFFD": "”",
|
265
|
+
"â\uFFFD\u00A0": "\u2060",
|
266
|
+
"": "\uFEFF",
|
267
|
+
"�": "\uFFFD",
|
268
|
+
"￾": "\uFFFE"
|
269
|
+
}
|
270
|
+
}
|
data/config/table.txt
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
# -*- coding: utf-8 -*- mojibake: 1.1.0
|
2
|
+
/Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])/
|
3
|
+
|
4
|
+
Moji UNICODE Org CODE
|
5
|
+
+---- ---- ---- ---- ----- ---+
|
6
|
+
[Â] 00C2 0080 [] 0080
|
7
|
+
[Â] 00C2 0081 [] 0081
|
8
|
+
[Â] 00C2 0082 [] 0082
|
9
|
+
[Â] 00C2 0083 [] 0083
|
10
|
+
[Â] 00C2 0084 [] 0084
|
11
|
+
[Â
] 00C2 0085 [
] 0085
|
12
|
+
[Â] 00C2 0086 [] 0086
|
13
|
+
[Â] 00C2 0087 [] 0087
|
14
|
+
[Â] 00C2 0088 [] 0088
|
15
|
+
[Â] 00C2 0089 [] 0089
|
16
|
+
[Â] 00C2 008A [] 008A
|
17
|
+
[Â] 00C2 008B [] 008B
|
18
|
+
[Â] 00C2 008C [] 008C
|
19
|
+
[Â] 00C2 008D [] 008D
|
20
|
+
[Â] 00C2 008E [] 008E
|
21
|
+
[Â] 00C2 008F [] 008F
|
22
|
+
[Â] 00C2 0090 [] 0090
|
23
|
+
[Â] 00C2 0091 [] 0091
|
24
|
+
[Â] 00C2 0092 [] 0092
|
25
|
+
[Â] 00C2 0093 [] 0093
|
26
|
+
[Â] 00C2 0094 [] 0094
|
27
|
+
[Â] 00C2 0095 [] 0095
|
28
|
+
[Â] 00C2 0096 [] 0096
|
29
|
+
[Â] 00C2 0097 [] 0097
|
30
|
+
[Â] 00C2 0098 [] 0098
|
31
|
+
[Â] 00C2 0099 [] 0099
|
32
|
+
[Â] 00C2 009A [] 009A
|
33
|
+
[Â] 00C2 009B [] 009B
|
34
|
+
[Â] 00C2 009C [] 009C
|
35
|
+
[Â] 00C2 009D [] 009D
|
36
|
+
[Â] 00C2 009E [] 009E
|
37
|
+
[Â] 00C2 009F [] 009F
|
38
|
+
[Â ] 00C2 00A0 [ ] 00A0
|
39
|
+
[¡] 00C2 00A1 [¡] 00A1
|
40
|
+
[¢] 00C2 00A2 [¢] 00A2
|
41
|
+
[£] 00C2 00A3 [£] 00A3
|
42
|
+
[¤] 00C2 00A4 [¤] 00A4
|
43
|
+
[Â¥] 00C2 00A5 [¥] 00A5
|
44
|
+
[¦] 00C2 00A6 [¦] 00A6
|
45
|
+
[§] 00C2 00A7 [§] 00A7
|
46
|
+
[¨] 00C2 00A8 [¨] 00A8
|
47
|
+
[©] 00C2 00A9 [©] 00A9
|
48
|
+
[ª] 00C2 00AA [ª] 00AA
|
49
|
+
[«] 00C2 00AB [«] 00AB
|
50
|
+
[¬] 00C2 00AC [¬] 00AC
|
51
|
+
[Â] 00C2 00AD [] 00AD
|
52
|
+
[®] 00C2 00AE [®] 00AE
|
53
|
+
[¯] 00C2 00AF [¯] 00AF
|
54
|
+
[°] 00C2 00B0 [°] 00B0
|
55
|
+
[±] 00C2 00B1 [±] 00B1
|
56
|
+
[²] 00C2 00B2 [²] 00B2
|
57
|
+
[³] 00C2 00B3 [³] 00B3
|
58
|
+
[´] 00C2 00B4 [´] 00B4
|
59
|
+
[µ] 00C2 00B5 [µ] 00B5
|
60
|
+
[¶] 00C2 00B6 [¶] 00B6
|
61
|
+
[·] 00C2 00B7 [·] 00B7
|
62
|
+
[¸] 00C2 00B8 [¸] 00B8
|
63
|
+
[¹] 00C2 00B9 [¹] 00B9
|
64
|
+
[º] 00C2 00BA [º] 00BA
|
65
|
+
[»] 00C2 00BB [»] 00BB
|
66
|
+
[¼] 00C2 00BC [¼] 00BC
|
67
|
+
[½] 00C2 00BD [½] 00BD
|
68
|
+
[¾] 00C2 00BE [¾] 00BE
|
69
|
+
[¿] 00C2 00BF [¿] 00BF
|
70
|
+
[Œ] 00C2 0152 [] 008C
|
71
|
+
[œ] 00C2 0153 [] 009C
|
72
|
+
[Š] 00C2 0160 [] 008A
|
73
|
+
[š] 00C2 0161 [] 009A
|
74
|
+
[Ÿ] 00C2 0178 [] 009F
|
75
|
+
[ÂŽ] 00C2 017D [] 008E
|
76
|
+
[ž] 00C2 017E [] 009E
|
77
|
+
[ƒ] 00C2 0192 [] 0083
|
78
|
+
[ˆ] 00C2 02C6 [] 0088
|
79
|
+
[˜] 00C2 02DC [] 0098
|
80
|
+
[–] 00C2 2013 [] 0096
|
81
|
+
[—] 00C2 2014 [] 0097
|
82
|
+
[‘] 00C2 2018 [] 0091
|
83
|
+
[Â’] 00C2 2019 [] 0092
|
84
|
+
[‚] 00C2 201A [] 0082
|
85
|
+
[“] 00C2 201C [] 0093
|
86
|
+
[”] 00C2 201D [] 0094
|
87
|
+
[„] 00C2 201E [] 0084
|
88
|
+
[†] 00C2 2020 [] 0086
|
89
|
+
[‡] 00C2 2021 [] 0087
|
90
|
+
[•] 00C2 2022 [] 0095
|
91
|
+
[Â…] 00C2 2026 [
] 0085
|
92
|
+
[‰] 00C2 2030 [] 0089
|
93
|
+
[‹] 00C2 2039 [] 008B
|
94
|
+
[›] 00C2 203A [] 009B
|
95
|
+
[€] 00C2 20AC [] 0080
|
96
|
+
[™] 00C2 2122 [] 0099
|
97
|
+
[�] 00C2 FFFD [] 0081
|
98
|
+
[Ã] 00C3 0080 [À] 00C0
|
99
|
+
[Ã] 00C3 0081 [Á] 00C1
|
100
|
+
[Ã] 00C3 0082 [Â] 00C2
|
101
|
+
[Ã] 00C3 0083 [Ã] 00C3
|
102
|
+
[Ã] 00C3 0084 [Ä] 00C4
|
103
|
+
[Ã
] 00C3 0085 [Å] 00C5
|
104
|
+
[Ã] 00C3 0086 [Æ] 00C6
|
105
|
+
[Ã] 00C3 0087 [Ç] 00C7
|
106
|
+
[Ã] 00C3 0088 [È] 00C8
|
107
|
+
[Ã] 00C3 0089 [É] 00C9
|
108
|
+
[Ã] 00C3 008A [Ê] 00CA
|
109
|
+
[Ã] 00C3 008B [Ë] 00CB
|
110
|
+
[Ã] 00C3 008C [Ì] 00CC
|
111
|
+
[Ã] 00C3 008D [Í] 00CD
|
112
|
+
[Ã] 00C3 008E [Î] 00CE
|
113
|
+
[Ã] 00C3 008F [Ï] 00CF
|
114
|
+
[Ã] 00C3 0090 [Ð] 00D0
|
115
|
+
[Ã] 00C3 0091 [Ñ] 00D1
|
116
|
+
[Ã] 00C3 0092 [Ò] 00D2
|
117
|
+
[Ã] 00C3 0093 [Ó] 00D3
|
118
|
+
[Ã] 00C3 0094 [Ô] 00D4
|
119
|
+
[Ã] 00C3 0095 [Õ] 00D5
|
120
|
+
[Ã] 00C3 0096 [Ö] 00D6
|
121
|
+
[Ã] 00C3 0097 [×] 00D7
|
122
|
+
[Ã] 00C3 0098 [Ø] 00D8
|
123
|
+
[Ã] 00C3 0099 [Ù] 00D9
|
124
|
+
[Ã] 00C3 009A [Ú] 00DA
|
125
|
+
[Ã] 00C3 009B [Û] 00DB
|
126
|
+
[Ã] 00C3 009C [Ü] 00DC
|
127
|
+
[Ã] 00C3 009D [Ý] 00DD
|
128
|
+
[Ã] 00C3 009E [Þ] 00DE
|
129
|
+
[Ã] 00C3 009F [ß] 00DF
|
130
|
+
[Ã ] 00C3 00A0 [à] 00E0
|
131
|
+
[á] 00C3 00A1 [á] 00E1
|
132
|
+
[â] 00C3 00A2 [â] 00E2
|
133
|
+
[ã] 00C3 00A3 [ã] 00E3
|
134
|
+
[ä] 00C3 00A4 [ä] 00E4
|
135
|
+
[Ã¥] 00C3 00A5 [å] 00E5
|
136
|
+
[æ] 00C3 00A6 [æ] 00E6
|
137
|
+
[ç] 00C3 00A7 [ç] 00E7
|
138
|
+
[è] 00C3 00A8 [è] 00E8
|
139
|
+
[é] 00C3 00A9 [é] 00E9
|
140
|
+
[ê] 00C3 00AA [ê] 00EA
|
141
|
+
[ë] 00C3 00AB [ë] 00EB
|
142
|
+
[ì] 00C3 00AC [ì] 00EC
|
143
|
+
[Ã] 00C3 00AD [í] 00ED
|
144
|
+
[î] 00C3 00AE [î] 00EE
|
145
|
+
[ï] 00C3 00AF [ï] 00EF
|
146
|
+
[ð] 00C3 00B0 [ð] 00F0
|
147
|
+
[ñ] 00C3 00B1 [ñ] 00F1
|
148
|
+
[ò] 00C3 00B2 [ò] 00F2
|
149
|
+
[ó] 00C3 00B3 [ó] 00F3
|
150
|
+
[ô] 00C3 00B4 [ô] 00F4
|
151
|
+
[õ] 00C3 00B5 [õ] 00F5
|
152
|
+
[ö] 00C3 00B6 [ö] 00F6
|
153
|
+
[÷] 00C3 00B7 [÷] 00F7
|
154
|
+
[ø] 00C3 00B8 [ø] 00F8
|
155
|
+
[ù] 00C3 00B9 [ù] 00F9
|
156
|
+
[ú] 00C3 00BA [ú] 00FA
|
157
|
+
[û] 00C3 00BB [û] 00FB
|
158
|
+
[ü] 00C3 00BC [ü] 00FC
|
159
|
+
[ý] 00C3 00BD [ý] 00FD
|
160
|
+
[þ] 00C3 00BE [þ] 00FE
|
161
|
+
[ÿ] 00C3 00BF [ÿ] 00FF
|
162
|
+
[ÃŒ] 00C3 0152 [Ì] 00CC
|
163
|
+
[Ãœ] 00C3 0153 [Ü] 00DC
|
164
|
+
[Ê] 00C3 0160 [Ê] 00CA
|
165
|
+
[Ú] 00C3 0161 [Ú] 00DA
|
166
|
+
[ß] 00C3 0178 [ß] 00DF
|
167
|
+
[ÃŽ] 00C3 017D [Î] 00CE
|
168
|
+
[Þ] 00C3 017E [Þ] 00DE
|
169
|
+
[Ã] 00C3 0192 [Ã] 00C3
|
170
|
+
[È] 00C3 02C6 [È] 00C8
|
171
|
+
[Ø] 00C3 02DC [Ø] 00D8
|
172
|
+
[Ö] 00C3 2013 [Ö] 00D6
|
173
|
+
[×] 00C3 2014 [×] 00D7
|
174
|
+
[Ñ] 00C3 2018 [Ñ] 00D1
|
175
|
+
[Ã’] 00C3 2019 [Ò] 00D2
|
176
|
+
[Â] 00C3 201A [Â] 00C2
|
177
|
+
[Ó] 00C3 201C [Ó] 00D3
|
178
|
+
[Ô] 00C3 201D [Ô] 00D4
|
179
|
+
[Ä] 00C3 201E [Ä] 00C4
|
180
|
+
[Æ] 00C3 2020 [Æ] 00C6
|
181
|
+
[Ç] 00C3 2021 [Ç] 00C7
|
182
|
+
[Õ] 00C3 2022 [Õ] 00D5
|
183
|
+
[Ã…] 00C3 2026 [Å] 00C5
|
184
|
+
[É] 00C3 2030 [É] 00C9
|
185
|
+
[Ë] 00C3 2039 [Ë] 00CB
|
186
|
+
[Û] 00C3 203A [Û] 00DB
|
187
|
+
[À] 00C3 20AC [À] 00C0
|
188
|
+
[Ù] 00C3 2122 [Ù] 00D9
|
189
|
+
[Ã�] 00C3 FFFD [Á] 00C1
|
190
|
+
[Å] 00C5 0092 [Œ] 0152
|
191
|
+
[Å] 00C5 0093 [œ] 0153
|
192
|
+
[Å ] 00C5 00A0 [Š] 0160
|
193
|
+
[Å¡] 00C5 00A1 [š] 0161
|
194
|
+
[Ÿ] 00C5 00B8 [Ÿ] 0178
|
195
|
+
[Ž] 00C5 00BD [Ž] 017D
|
196
|
+
[ž] 00C5 00BE [ž] 017E
|
197
|
+
[Å’] 00C5 2019 [Œ] 0152
|
198
|
+
[Å“] 00C5 201C [œ] 0153
|
199
|
+
[Æ] 00C6 0092 [ƒ] 0192
|
200
|
+
[Æ’] 00C6 2019 [ƒ] 0192
|
201
|
+
[Ë] 00CB 0086 [ˆ] 02C6
|
202
|
+
[Ë] 00CB 009C [˜] 02DC
|
203
|
+
[Ëœ] 00CB 0153 [˜] 02DC
|
204
|
+
[ˆ] 00CB 2020 [ˆ] 02C6
|
205
|
+
[â] 00E2 0080 0080 [ ] 2000
|
206
|
+
[â] 00E2 0080 0081 [ ] 2001
|
207
|
+
[â] 00E2 0080 0082 [ ] 2002
|
208
|
+
[â] 00E2 0080 0083 [ ] 2003
|
209
|
+
[â] 00E2 0080 0084 [ ] 2004
|
210
|
+
[â
] 00E2 0080 0085 [ ] 2005
|
211
|
+
[â] 00E2 0080 0086 [ ] 2006
|
212
|
+
[â] 00E2 0080 0087 [ ] 2007
|
213
|
+
[â] 00E2 0080 0088 [ ] 2008
|
214
|
+
[â] 00E2 0080 0089 [ ] 2009
|
215
|
+
[â] 00E2 0080 008A [ ] 200A
|
216
|
+
[â] 00E2 0080 008B [] 200B
|
217
|
+
[â] 00E2 0080 0093 [–] 2013
|
218
|
+
[â] 00E2 0080 0094 [—] 2014
|
219
|
+
[â] 00E2 0080 0098 [‘] 2018
|
220
|
+
[â] 00E2 0080 0099 [’] 2019
|
221
|
+
[â] 00E2 0080 009A [‚] 201A
|
222
|
+
[â] 00E2 0080 009C [“] 201C
|
223
|
+
[â] 00E2 0080 009D [”] 201D
|
224
|
+
[â] 00E2 0080 009E [„] 201E
|
225
|
+
[â ] 00E2 0080 00A0 [†] 2020
|
226
|
+
[â¡] 00E2 0080 00A1 [‡] 2021
|
227
|
+
[â¢] 00E2 0080 00A2 [•] 2022
|
228
|
+
[â¦] 00E2 0080 00A6 […] 2026
|
229
|
+
[â°] 00E2 0080 00B0 [‰] 2030
|
230
|
+
[â¹] 00E2 0080 00B9 [‹] 2039
|
231
|
+
[âº] 00E2 0080 00BA [›] 203A
|
232
|
+
[â ] 00E2 0081 00A0 [] 2060
|
233
|
+
[â¬] 00E2 0082 00AC [€] 20AC
|
234
|
+
[â¢] 00E2 0084 00A2 [™] 2122
|
235
|
+
[€] 00E2 201A 00AC [€] 20AC
|
236
|
+
[â„¢] 00E2 201E 00A2 [™] 2122
|
237
|
+
[â€] 00E2 20AC 0081 [ ] 2001
|
238
|
+
[â€] 00E2 20AC 009D [”] 201D
|
239
|
+
[†] 00E2 20AC 00A0 [†] 2020
|
240
|
+
[‡] 00E2 20AC 00A1 [‡] 2021
|
241
|
+
[•] 00E2 20AC 00A2 [•] 2022
|
242
|
+
[…] 00E2 20AC 00A6 […] 2026
|
243
|
+
[‰] 00E2 20AC 00B0 [‰] 2030
|
244
|
+
[‹] 00E2 20AC 00B9 [‹] 2039
|
245
|
+
[›] 00E2 20AC 00BA [›] 203A
|
246
|
+
[“] 00E2 20AC 0153 [“] 201C
|
247
|
+
[ ] 00E2 20AC 0160 [ ] 200A
|
248
|
+
[‚] 00E2 20AC 0161 [‚] 201A
|
249
|
+
[„] 00E2 20AC 017E [„] 201E
|
250
|
+
[ ] 00E2 20AC 0192 [ ] 2003
|
251
|
+
[ ] 00E2 20AC 02C6 [ ] 2008
|
252
|
+
[‘] 00E2 20AC 02DC [‘] 2018
|
253
|
+
[ ] 00E2 20AC 201A [ ] 2002
|
254
|
+
[–] 00E2 20AC 201C [–] 2013
|
255
|
+
[—] 00E2 20AC 201D [—] 2014
|
256
|
+
[ ] 00E2 20AC 201E [ ] 2004
|
257
|
+
[ ] 00E2 20AC 2020 [ ] 2006
|
258
|
+
[ ] 00E2 20AC 2021 [ ] 2007
|
259
|
+
[ ] 00E2 20AC 2026 [ ] 2005
|
260
|
+
[ ] 00E2 20AC 2030 [ ] 2009
|
261
|
+
[​] 00E2 20AC 2039 [] 200B
|
262
|
+
[ ] 00E2 20AC 20AC [ ] 2000
|
263
|
+
[’] 00E2 20AC 2122 [’] 2019
|
264
|
+
[â€�] 00E2 20AC FFFD [”] 201D
|
265
|
+
[â� ] 00E2 FFFD 00A0 [] 2060
|
266
|
+
[] 00EF 00BB 00BF [] FEFF
|
267
|
+
[�] 00EF 00BF 00BD [�] FFFD
|
268
|
+
[￾] 00EF 00BF 00BE [] FFFE
|
data/lib/mojibake.rb
CHANGED
@@ -14,9 +14,38 @@
|
|
14
14
|
# permissions and limitations under the License.
|
15
15
|
#++
|
16
16
|
|
17
|
-
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
|
18
|
-
raise "Requires ruby ~> 1.9 for String.encode support"
|
19
|
-
end
|
20
|
-
|
21
17
|
require 'mojibake/base'
|
22
|
-
|
18
|
+
|
19
|
+
require 'mojibake/json'
|
20
|
+
|
21
|
+
module MojiBake
|
22
|
+
|
23
|
+
# Supports recovering Mojibake characters to the original text.
|
24
|
+
class Mapper
|
25
|
+
include JSONSupport
|
26
|
+
|
27
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
28
|
+
require 'mojibake/encoding'
|
29
|
+
include EncodingSupport
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize( opts = {} )
|
33
|
+
super()
|
34
|
+
opts.map { |k,v| send( k.to_s + '=', v ) }
|
35
|
+
end
|
36
|
+
|
37
|
+
# Recover original characters from input using regexp, recursively.
|
38
|
+
def recover( input, recursive = true )
|
39
|
+
output = input.gsub( regexp ) { |moji| hash[moji] }
|
40
|
+
|
41
|
+
# Only recurse if requested and substituted something (output
|
42
|
+
# shorter) in this run.
|
43
|
+
if recursive && ( output.length < input.length )
|
44
|
+
recover( output )
|
45
|
+
else
|
46
|
+
output
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
data/lib/mojibake/base.rb
CHANGED
@@ -16,9 +16,9 @@
|
|
16
16
|
|
17
17
|
module MojiBake
|
18
18
|
|
19
|
-
#
|
20
|
-
#
|
21
|
-
|
19
|
+
# Mixin for the actual (ruby 1.9 backed) encoding support to define
|
20
|
+
# the mojibake mapping table and regex.
|
21
|
+
module EncodingSupport
|
22
22
|
|
23
23
|
W252 = Encoding::WINDOWS_1252
|
24
24
|
ISO8 = Encoding::ISO_8859_1
|
@@ -30,20 +30,21 @@ module MojiBake
|
|
30
30
|
# RIGHT DOUBLE QUOTATION MARK. These are the most common problem
|
31
31
|
# chars in English and probably most latin languages.
|
32
32
|
HIGH_ORDER_CHARS =
|
33
|
-
( ( 0x80..0xFF )
|
33
|
+
( Array( 0x80..0xFF ) - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
|
34
34
|
map { |i| i.chr( W252 ).encode( UTF8 ) }.
|
35
35
|
sort
|
36
36
|
|
37
37
|
# Additional Unicode codepoints of mojibake potential, like alt
|
38
38
|
# whitespace, C1 control characters, and BOMs.
|
39
39
|
INTEREST_CODEPOINTS =
|
40
|
-
[
|
41
|
-
0x00A0,
|
42
|
-
|
43
|
-
0x2060,
|
44
|
-
0xfeff,
|
45
|
-
0xfffd,
|
46
|
-
0xfffe ].
|
40
|
+
[ 0x0080..0x009F, # ISO/Unicode C1 control codes.
|
41
|
+
0x00A0, # NO-BREAK SPACE
|
42
|
+
0x2000..0x200B, # EN QUAD ... ZERO WIDTH SPACE
|
43
|
+
0x2060, # WORD JOINER
|
44
|
+
0xfeff, # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
|
45
|
+
0xfffd, # REPLACEMENT CHARACTER
|
46
|
+
0xfffe ]. # UNASSIGNED, BAD BOM
|
47
|
+
map { |i| Array( i ) }.
|
47
48
|
flatten.
|
48
49
|
sort
|
49
50
|
|
@@ -63,12 +64,11 @@ module MojiBake
|
|
63
64
|
# (default: true). This covers ambiguities of C1 control codes.
|
64
65
|
attr_accessor :map_permutations
|
65
66
|
|
66
|
-
def initialize
|
67
|
+
def initialize
|
68
|
+
super
|
67
69
|
@map_windows_1252 = true
|
68
70
|
@map_iso_8859_1 = true
|
69
71
|
@map_permutations = true
|
70
|
-
|
71
|
-
options.map { |k,v| send( k.to_s + '=', v ) }
|
72
72
|
end
|
73
73
|
|
74
74
|
# Return Hash of mojibake UTF-8 2-3 character sequences to original
|
@@ -122,19 +122,6 @@ module MojiBake
|
|
122
122
|
@regexp ||= Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
|
123
123
|
end
|
124
124
|
|
125
|
-
# Recover original characters from input using regexp, recursively.
|
126
|
-
def recover( input, recursive = true )
|
127
|
-
output = input.gsub( regexp ) { |moji| hash[moji] }
|
128
|
-
|
129
|
-
# Only recurse if requested and substituted something (output
|
130
|
-
# shorter) in this run.
|
131
|
-
if recursive && ( output.length < input.length )
|
132
|
-
recover( output )
|
133
|
-
else
|
134
|
-
output
|
135
|
-
end
|
136
|
-
end
|
137
|
-
|
138
125
|
def char_tree( seqs )
|
139
126
|
seqs.inject( {} ) do |h,seq|
|
140
127
|
seq.chars.inject( h ) do |hs,c|
|
@@ -158,8 +145,7 @@ module MojiBake
|
|
158
145
|
o
|
159
146
|
end
|
160
147
|
if cs.find { |o| o =~ /[()|\[\]]/ }
|
161
|
-
cs.join( '|' )
|
162
|
-
#FIXME: Join looses encoding so force, jruby bug?
|
148
|
+
cs.join( '|' )
|
163
149
|
else
|
164
150
|
if cs.length > 1
|
165
151
|
'[' + cs.inject(:+) + ']'
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'mojibake/base'
|
18
|
+
require 'json'
|
19
|
+
|
20
|
+
module MojiBake
|
21
|
+
|
22
|
+
module JSONSupport
|
23
|
+
|
24
|
+
JSON_CONFIG = File.join( File.dirname( __FILE__ ),
|
25
|
+
'..', '..', 'config', 'table.json' )
|
26
|
+
|
27
|
+
def initialize
|
28
|
+
super
|
29
|
+
end
|
30
|
+
|
31
|
+
def config
|
32
|
+
@config ||= JSON.parse( IO.read( JSON_CONFIG ) )
|
33
|
+
end
|
34
|
+
|
35
|
+
def hash
|
36
|
+
@hash ||= config[ 'moji' ]
|
37
|
+
end
|
38
|
+
|
39
|
+
def regexp
|
40
|
+
# Note use of Unicode mode for ruby 1.8's
|
41
|
+
@regexp ||= Regexp.new( config[ 'regexp' ], 0, 'U' )
|
42
|
+
end
|
43
|
+
|
44
|
+
# table as self contained json-ready Hash
|
45
|
+
def hash_to_json_object
|
46
|
+
|
47
|
+
# Also use unicode escape for the interesting (effectively,
|
48
|
+
# non-printable) subset of moji mappings.
|
49
|
+
moji = hash.sort.map do |kv|
|
50
|
+
kv.map do |s|
|
51
|
+
s.codepoints.inject( '' ) do |r,i|
|
52
|
+
if MojiBake::Mapper::INTEREST_CODEPOINTS.include?( i )
|
53
|
+
r << sprintf( '\u%04X', i )
|
54
|
+
else
|
55
|
+
r << i.chr( Encoding::UTF_8 )
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
{ :mojibake => MojiBake::VERSION,
|
62
|
+
:url => "https://github.com/dekellum/mojibake",
|
63
|
+
:regexp => regexp.inspect[1...-1],
|
64
|
+
:moji => Hash[ moji ] }
|
65
|
+
end
|
66
|
+
|
67
|
+
# Pretty formatted JSON serialized String for json_object
|
68
|
+
def json
|
69
|
+
# Generate and replace what become double escaped '\\u' UNICODE
|
70
|
+
# escapes with single '\u' escapes. This is a hack but is
|
71
|
+
# reasonably safe given that 'u' isn't normally escaped. The
|
72
|
+
# alternative would be to hack JSON package or do the JSON
|
73
|
+
# formatting ourselves. Ideally JSON package would support
|
74
|
+
# serialization using unicode escapes for the non-printable,
|
75
|
+
# non-friendly chars. As of 1.6.1 it doesn't.
|
76
|
+
JSON.pretty_generate( hash_to_json_object ).gsub( /\\\\u/, '\u' )
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.args.pre = --1.9
|
4
|
+
#.hashdot.profile += jruby-shortlived
|
5
|
+
|
6
|
+
#--
|
7
|
+
# Copyright (c) 2011 David Kellum
|
8
|
+
#
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
10
|
+
# may not use this file except in compliance with the License. You
|
11
|
+
# may obtain a copy of the License at
|
12
|
+
#
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
+
#
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
18
|
+
# implied. See the License for the specific language governing
|
19
|
+
# permissions and limitations under the License.
|
20
|
+
#++
|
21
|
+
|
22
|
+
ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
|
23
|
+
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'minitest/unit'
|
27
|
+
require 'minitest/autorun'
|
28
|
+
|
29
|
+
require 'mojibake'
|
30
|
+
|
31
|
+
class TestEncoding < MiniTest::Unit::TestCase
|
32
|
+
include MojiBake
|
33
|
+
|
34
|
+
def setup
|
35
|
+
@mapper = Mapper.new
|
36
|
+
end
|
37
|
+
|
38
|
+
TEST_TREE = { "a" => { "b" => { "c" => {},
|
39
|
+
"d" => {} } },
|
40
|
+
"d" => { "b" => { "f" => {} } } }
|
41
|
+
|
42
|
+
# These only test with Ruby 1.9 support
|
43
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
44
|
+
|
45
|
+
def test_init_options
|
46
|
+
assert_equal( true, Mapper.new.map_iso_8859_1 )
|
47
|
+
m = Mapper.new( :map_iso_8859_1 => false )
|
48
|
+
assert_equal( false, m.map_iso_8859_1 )
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_char_tree
|
52
|
+
assert_equal( TEST_TREE,
|
53
|
+
@mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_tree_flaten
|
57
|
+
assert_equal( "ab[cd]|dbf",
|
58
|
+
@mapper.tree_flatten( TEST_TREE ) )
|
59
|
+
end
|
60
|
+
|
61
|
+
def test_regexp
|
62
|
+
re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
|
63
|
+
assert_match( re, "abc" )
|
64
|
+
assert_match( re, "abd" )
|
65
|
+
assert_match( re, "dbf" )
|
66
|
+
|
67
|
+
refute_match( re, "ab" )
|
68
|
+
refute_match( re, "abf" )
|
69
|
+
|
70
|
+
assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
|
71
|
+
assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
@@ -28,46 +28,13 @@ require 'minitest/autorun'
|
|
28
28
|
|
29
29
|
require 'mojibake'
|
30
30
|
|
31
|
-
class
|
31
|
+
class TestMapper < MiniTest::Unit::TestCase
|
32
32
|
include MojiBake
|
33
33
|
|
34
34
|
def setup
|
35
35
|
@mapper = Mapper.new
|
36
36
|
end
|
37
37
|
|
38
|
-
TEST_TREE = { "a" => { "b" => { "c" => {},
|
39
|
-
"d" => {} } },
|
40
|
-
"d" => { "b" => { "f" => {} } } }
|
41
|
-
|
42
|
-
def test_init_options
|
43
|
-
assert_equal( true, Mapper.new.map_iso_8859_1 )
|
44
|
-
m = Mapper.new( :map_iso_8859_1 => false )
|
45
|
-
assert_equal( false, m.map_iso_8859_1 )
|
46
|
-
end
|
47
|
-
|
48
|
-
def test_char_tree
|
49
|
-
assert_equal( TEST_TREE,
|
50
|
-
@mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
|
51
|
-
end
|
52
|
-
|
53
|
-
def test_tree_flaten
|
54
|
-
assert_equal( "ab[cd]|dbf",
|
55
|
-
@mapper.tree_flatten( TEST_TREE ) )
|
56
|
-
end
|
57
|
-
|
58
|
-
def test_regexp
|
59
|
-
re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
|
60
|
-
assert_match( re, "abc" )
|
61
|
-
assert_match( re, "abd" )
|
62
|
-
assert_match( re, "dbf" )
|
63
|
-
|
64
|
-
refute_match( re, "ab" )
|
65
|
-
refute_match( re, "abf" )
|
66
|
-
|
67
|
-
assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
|
68
|
-
assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
|
69
|
-
end
|
70
|
-
|
71
38
|
def test_nomatch_recover
|
72
39
|
assert_equal( '', @mapper.recover( '' ) )
|
73
40
|
assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
|
metadata
CHANGED
@@ -2,42 +2,49 @@
|
|
2
2
|
name: mojibake
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.
|
5
|
+
version: 1.1.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
|
-
|
8
|
+
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-10-31 00:00:00 Z
|
15
14
|
dependencies:
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: json
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.6.1
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: minitest
|
28
|
+
prerelease: false
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ~>
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: "2.3"
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id002
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rjack-tarpit
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.4.0
|
46
|
+
type: :development
|
47
|
+
version_requirements: *id003
|
41
48
|
description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
|
42
49
|
bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
|
43
50
|
module provides a mojibake sequence to original character mapping\n\
|
@@ -45,54 +52,60 @@ description: "Mojibake occurs in English most frequently due to misinterpreting
|
|
45
52
|
Testing has been with English but other Latin based languages, where\n\
|
46
53
|
Windows-1252 is in the wild, should also benefit."
|
47
54
|
email:
|
48
|
-
|
55
|
+
- dek-oss@gravitext.com
|
49
56
|
executables:
|
50
|
-
|
57
|
+
- mojibake
|
51
58
|
extensions: []
|
52
59
|
|
53
60
|
extra_rdoc_files:
|
54
|
-
|
55
|
-
|
56
|
-
|
61
|
+
- Manifest.txt
|
62
|
+
- config/table.txt
|
63
|
+
- History.rdoc
|
64
|
+
- README.rdoc
|
57
65
|
files:
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
- History.rdoc
|
67
|
+
- Manifest.txt
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- bin/mojibake
|
71
|
+
- config/table.json
|
72
|
+
- config/table.txt
|
73
|
+
- lib/mojibake/base.rb
|
74
|
+
- lib/mojibake.rb
|
75
|
+
- lib/mojibake/encoding.rb
|
76
|
+
- lib/mojibake/json.rb
|
77
|
+
- test/test.txt
|
78
|
+
- test/test_encoding.rb
|
79
|
+
- test/test_mapper.rb
|
80
|
+
- .gemtest
|
69
81
|
homepage: http://github.com/dekellum/mojibake
|
70
82
|
licenses: []
|
71
83
|
|
72
84
|
post_install_message:
|
73
85
|
rdoc_options:
|
74
|
-
|
75
|
-
|
86
|
+
- --main
|
87
|
+
- README.rdoc
|
76
88
|
require_paths:
|
77
|
-
|
89
|
+
- lib
|
78
90
|
required_ruby_version: !ruby/object:Gem::Requirement
|
79
91
|
none: false
|
80
92
|
requirements:
|
81
|
-
|
82
|
-
|
83
|
-
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
84
96
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
97
|
none: false
|
86
98
|
requirements:
|
87
|
-
|
88
|
-
|
89
|
-
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: "0"
|
90
102
|
requirements: []
|
91
103
|
|
92
104
|
rubyforge_project: mojibake
|
93
|
-
rubygems_version: 1.
|
105
|
+
rubygems_version: 1.8.11
|
94
106
|
signing_key:
|
95
107
|
specification_version: 3
|
96
108
|
summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
|
97
109
|
test_files:
|
98
|
-
|
110
|
+
- test/test_encoding.rb
|
111
|
+
- test/test_mapper.rb
|