mojibake 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +7 -0
- data/Manifest.txt +6 -2
- data/README.rdoc +4 -2
- data/Rakefile +15 -2
- data/bin/mojibake +33 -17
- data/config/table.json +270 -0
- data/config/table.txt +268 -0
- data/lib/mojibake.rb +34 -5
- data/lib/mojibake/base.rb +1 -1
- data/lib/mojibake/{mapper.rb → encoding.rb} +15 -29
- data/lib/mojibake/json.rb +81 -0
- data/test/test_encoding.rb +76 -0
- data/test/{test_mojibake.rb → test_mapper.rb} +1 -34
- metadata +69 -56
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
@@ -1,2 +1,9 @@
|
|
1
|
+
=== 1.1.0 (2011-6-31)
|
2
|
+
* Add table.json output support as a more convenient language
|
3
|
+
independent format. Include default version (as well as table.txt)
|
4
|
+
as part of gem.
|
5
|
+
* Add Ruby 1.8 support for Mapper.recover by using default included
|
6
|
+
table.json (avoids need for 1.9 encoding support.)
|
7
|
+
|
1
8
|
=== 1.0.0 (2011-6-21)
|
2
9
|
* Initial release.
|
data/Manifest.txt
CHANGED
@@ -3,8 +3,12 @@ Manifest.txt
|
|
3
3
|
README.rdoc
|
4
4
|
Rakefile
|
5
5
|
bin/mojibake
|
6
|
+
config/table.json
|
7
|
+
config/table.txt
|
6
8
|
lib/mojibake/base.rb
|
7
9
|
lib/mojibake.rb
|
8
|
-
lib/mojibake/
|
10
|
+
lib/mojibake/encoding.rb
|
11
|
+
lib/mojibake/json.rb
|
9
12
|
test/test.txt
|
10
|
-
test/
|
13
|
+
test/test_encoding.rb
|
14
|
+
test/test_mapper.rb
|
data/README.rdoc
CHANGED
@@ -15,8 +15,10 @@ Windows-1252 is in the wild, should also benefit.
|
|
15
15
|
|
16
16
|
== Dependencies
|
17
17
|
|
18
|
-
Requires the String Encoding support
|
19
|
-
|
18
|
+
Requires the String Encoding support in ruby 1.9 as provided by:
|
19
|
+
|
20
|
+
* ruby 1.9.2+ (tested 1.9.2p180, Linux)
|
21
|
+
* jruby 1.6.5+ (tested 1.6.5, Linux)
|
20
22
|
|
21
23
|
== Synopsis
|
22
24
|
|
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
$LOAD_PATH << './lib'
|
4
4
|
|
5
5
|
require 'rubygems'
|
6
|
-
gem 'rjack-tarpit', '~> 1.
|
6
|
+
gem 'rjack-tarpit', '~> 1.4'
|
7
7
|
require 'rjack-tarpit'
|
8
8
|
|
9
9
|
require 'mojibake/base'
|
@@ -14,7 +14,8 @@ t.specify do |h|
|
|
14
14
|
h.developer( 'David Kellum', 'dek-oss@gravitext.com' )
|
15
15
|
|
16
16
|
h.testlib = :minitest
|
17
|
-
h.
|
17
|
+
h.extra_deps += [ [ 'json', '~> 1.6.1' ] ]
|
18
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ] ]
|
18
19
|
|
19
20
|
h.url = 'http://github.com/dekellum/mojibake'
|
20
21
|
end
|
@@ -33,3 +34,15 @@ task :tag => [ :check_history_version, :check_history_date ]
|
|
33
34
|
task :push => [ :check_history_version, :check_history_date ]
|
34
35
|
|
35
36
|
t.define_tasks
|
37
|
+
|
38
|
+
desc "(Re-)generate config output files (requires 1.9)"
|
39
|
+
task :generate_config do
|
40
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
41
|
+
require 'mojibake'
|
42
|
+
mapper = MojiBake::Mapper.new
|
43
|
+
open( "config/table.txt", 'w' ) { |fout| fout.puts( mapper.table ) }
|
44
|
+
open( "config/table.json", 'w' ) { |fout| fout.puts( mapper.json ) }
|
45
|
+
else
|
46
|
+
raise "Task generate_config requires Ruby 1.9 encoding support"
|
47
|
+
end
|
48
|
+
end
|
data/bin/mojibake
CHANGED
@@ -20,6 +20,8 @@
|
|
20
20
|
|
21
21
|
$LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
|
22
22
|
|
23
|
+
require 'rubygems'
|
24
|
+
|
23
25
|
require 'mojibake'
|
24
26
|
require 'optparse'
|
25
27
|
|
@@ -33,23 +35,36 @@ module Script
|
|
33
35
|
puts "mojibake: #{MojiBake::VERSION}"
|
34
36
|
exit 1
|
35
37
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
38
|
+
|
39
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
40
|
+
|
41
|
+
opts.on( "--no-windows-1252",
|
42
|
+
"Don't include miscodings from Windows-1252" ) do
|
43
|
+
mapper.map_windows_1252 = false
|
44
|
+
end
|
45
|
+
opts.on( "--no-iso-8859-1",
|
46
|
+
"Don't include miscodings from ISO-8859-1" ) do
|
47
|
+
mapper.map_iso_8859_1 = false
|
48
|
+
end
|
49
|
+
opts.on( "--no-permutations",
|
50
|
+
"Don't include ISO/Windows permutations" ) do
|
51
|
+
mapper.map_permutations = false
|
52
|
+
end
|
53
|
+
opts.on_tail( "-t", "--table",
|
54
|
+
"Write MojiBake Mapper table (UTF-8) and exit" ) do
|
55
|
+
puts mapper.table
|
56
|
+
exit 1
|
57
|
+
end
|
58
|
+
opts.on_tail( "-j", "--json",
|
59
|
+
"Write MojiBake Mapper json (UTF-8) and exit" ) do
|
60
|
+
require 'rubygems'
|
61
|
+
require 'mojibake/json'
|
62
|
+
puts mapper.json
|
63
|
+
exit 1
|
64
|
+
end
|
65
|
+
|
52
66
|
end
|
67
|
+
|
53
68
|
opts.on_tail( "-r", "--regex",
|
54
69
|
"Display MojiBake Mapper regex (UTF-8) and exit" ) do
|
55
70
|
puts mapper.regexp.inspect
|
@@ -65,7 +80,8 @@ module Script
|
|
65
80
|
|
66
81
|
input_file = ARGV.shift
|
67
82
|
if input_file
|
68
|
-
|
83
|
+
data = IO.read( input_file )
|
84
|
+
$stdout.write( mapper.recover( data ) )
|
69
85
|
end
|
70
86
|
|
71
87
|
end
|
data/config/table.json
ADDED
@@ -0,0 +1,270 @@
|
|
1
|
+
{
|
2
|
+
"mojibake": "1.1.0",
|
3
|
+
"url": "https://github.com/dekellum/mojibake",
|
4
|
+
"regexp": "Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])",
|
5
|
+
"moji": {
|
6
|
+
"Â\u0080": "\u0080",
|
7
|
+
"Â\u0081": "\u0081",
|
8
|
+
"Â\u0082": "\u0082",
|
9
|
+
"Â\u0083": "\u0083",
|
10
|
+
"Â\u0084": "\u0084",
|
11
|
+
"Â\u0085": "\u0085",
|
12
|
+
"Â\u0086": "\u0086",
|
13
|
+
"Â\u0087": "\u0087",
|
14
|
+
"Â\u0088": "\u0088",
|
15
|
+
"Â\u0089": "\u0089",
|
16
|
+
"Â\u008A": "\u008A",
|
17
|
+
"Â\u008B": "\u008B",
|
18
|
+
"Â\u008C": "\u008C",
|
19
|
+
"Â\u008D": "\u008D",
|
20
|
+
"Â\u008E": "\u008E",
|
21
|
+
"Â\u008F": "\u008F",
|
22
|
+
"Â\u0090": "\u0090",
|
23
|
+
"Â\u0091": "\u0091",
|
24
|
+
"Â\u0092": "\u0092",
|
25
|
+
"Â\u0093": "\u0093",
|
26
|
+
"Â\u0094": "\u0094",
|
27
|
+
"Â\u0095": "\u0095",
|
28
|
+
"Â\u0096": "\u0096",
|
29
|
+
"Â\u0097": "\u0097",
|
30
|
+
"Â\u0098": "\u0098",
|
31
|
+
"Â\u0099": "\u0099",
|
32
|
+
"Â\u009A": "\u009A",
|
33
|
+
"Â\u009B": "\u009B",
|
34
|
+
"Â\u009C": "\u009C",
|
35
|
+
"Â\u009D": "\u009D",
|
36
|
+
"Â\u009E": "\u009E",
|
37
|
+
"Â\u009F": "\u009F",
|
38
|
+
"Â\u00A0": "\u00A0",
|
39
|
+
"¡": "¡",
|
40
|
+
"¢": "¢",
|
41
|
+
"£": "£",
|
42
|
+
"¤": "¤",
|
43
|
+
"Â¥": "¥",
|
44
|
+
"¦": "¦",
|
45
|
+
"§": "§",
|
46
|
+
"¨": "¨",
|
47
|
+
"©": "©",
|
48
|
+
"ª": "ª",
|
49
|
+
"«": "«",
|
50
|
+
"¬": "¬",
|
51
|
+
"Â": "",
|
52
|
+
"®": "®",
|
53
|
+
"¯": "¯",
|
54
|
+
"°": "°",
|
55
|
+
"±": "±",
|
56
|
+
"²": "²",
|
57
|
+
"³": "³",
|
58
|
+
"´": "´",
|
59
|
+
"µ": "µ",
|
60
|
+
"¶": "¶",
|
61
|
+
"·": "·",
|
62
|
+
"¸": "¸",
|
63
|
+
"¹": "¹",
|
64
|
+
"º": "º",
|
65
|
+
"»": "»",
|
66
|
+
"¼": "¼",
|
67
|
+
"½": "½",
|
68
|
+
"¾": "¾",
|
69
|
+
"¿": "¿",
|
70
|
+
"Œ": "\u008C",
|
71
|
+
"œ": "\u009C",
|
72
|
+
"Š": "\u008A",
|
73
|
+
"š": "\u009A",
|
74
|
+
"Ÿ": "\u009F",
|
75
|
+
"ÂŽ": "\u008E",
|
76
|
+
"ž": "\u009E",
|
77
|
+
"ƒ": "\u0083",
|
78
|
+
"ˆ": "\u0088",
|
79
|
+
"˜": "\u0098",
|
80
|
+
"–": "\u0096",
|
81
|
+
"—": "\u0097",
|
82
|
+
"‘": "\u0091",
|
83
|
+
"Â’": "\u0092",
|
84
|
+
"‚": "\u0082",
|
85
|
+
"“": "\u0093",
|
86
|
+
"”": "\u0094",
|
87
|
+
"„": "\u0084",
|
88
|
+
"†": "\u0086",
|
89
|
+
"‡": "\u0087",
|
90
|
+
"•": "\u0095",
|
91
|
+
"Â…": "\u0085",
|
92
|
+
"‰": "\u0089",
|
93
|
+
"‹": "\u008B",
|
94
|
+
"›": "\u009B",
|
95
|
+
"€": "\u0080",
|
96
|
+
"™": "\u0099",
|
97
|
+
"Â\uFFFD": "\u0081",
|
98
|
+
"Ã\u0080": "À",
|
99
|
+
"Ã\u0081": "Á",
|
100
|
+
"Ã\u0082": "Â",
|
101
|
+
"Ã\u0083": "Ã",
|
102
|
+
"Ã\u0084": "Ä",
|
103
|
+
"Ã\u0085": "Å",
|
104
|
+
"Ã\u0086": "Æ",
|
105
|
+
"Ã\u0087": "Ç",
|
106
|
+
"Ã\u0088": "È",
|
107
|
+
"Ã\u0089": "É",
|
108
|
+
"Ã\u008A": "Ê",
|
109
|
+
"Ã\u008B": "Ë",
|
110
|
+
"Ã\u008C": "Ì",
|
111
|
+
"Ã\u008D": "Í",
|
112
|
+
"Ã\u008E": "Î",
|
113
|
+
"Ã\u008F": "Ï",
|
114
|
+
"Ã\u0090": "Ð",
|
115
|
+
"Ã\u0091": "Ñ",
|
116
|
+
"Ã\u0092": "Ò",
|
117
|
+
"Ã\u0093": "Ó",
|
118
|
+
"Ã\u0094": "Ô",
|
119
|
+
"Ã\u0095": "Õ",
|
120
|
+
"Ã\u0096": "Ö",
|
121
|
+
"Ã\u0097": "×",
|
122
|
+
"Ã\u0098": "Ø",
|
123
|
+
"Ã\u0099": "Ù",
|
124
|
+
"Ã\u009A": "Ú",
|
125
|
+
"Ã\u009B": "Û",
|
126
|
+
"Ã\u009C": "Ü",
|
127
|
+
"Ã\u009D": "Ý",
|
128
|
+
"Ã\u009E": "Þ",
|
129
|
+
"Ã\u009F": "ß",
|
130
|
+
"Ã\u00A0": "à",
|
131
|
+
"á": "á",
|
132
|
+
"â": "â",
|
133
|
+
"ã": "ã",
|
134
|
+
"ä": "ä",
|
135
|
+
"Ã¥": "å",
|
136
|
+
"æ": "æ",
|
137
|
+
"ç": "ç",
|
138
|
+
"è": "è",
|
139
|
+
"é": "é",
|
140
|
+
"ê": "ê",
|
141
|
+
"ë": "ë",
|
142
|
+
"ì": "ì",
|
143
|
+
"Ã": "í",
|
144
|
+
"î": "î",
|
145
|
+
"ï": "ï",
|
146
|
+
"ð": "ð",
|
147
|
+
"ñ": "ñ",
|
148
|
+
"ò": "ò",
|
149
|
+
"ó": "ó",
|
150
|
+
"ô": "ô",
|
151
|
+
"õ": "õ",
|
152
|
+
"ö": "ö",
|
153
|
+
"÷": "÷",
|
154
|
+
"ø": "ø",
|
155
|
+
"ù": "ù",
|
156
|
+
"ú": "ú",
|
157
|
+
"û": "û",
|
158
|
+
"ü": "ü",
|
159
|
+
"ý": "ý",
|
160
|
+
"þ": "þ",
|
161
|
+
"ÿ": "ÿ",
|
162
|
+
"ÃŒ": "Ì",
|
163
|
+
"Ü": "Ü",
|
164
|
+
"Ê": "Ê",
|
165
|
+
"Ú": "Ú",
|
166
|
+
"ß": "ß",
|
167
|
+
"ÃŽ": "Î",
|
168
|
+
"Þ": "Þ",
|
169
|
+
"Ã": "Ã",
|
170
|
+
"È": "È",
|
171
|
+
"Ø": "Ø",
|
172
|
+
"Ö": "Ö",
|
173
|
+
"×": "×",
|
174
|
+
"Ñ": "Ñ",
|
175
|
+
"Ã’": "Ò",
|
176
|
+
"Â": "Â",
|
177
|
+
"Ó": "Ó",
|
178
|
+
"Ô": "Ô",
|
179
|
+
"Ä": "Ä",
|
180
|
+
"Æ": "Æ",
|
181
|
+
"Ç": "Ç",
|
182
|
+
"Õ": "Õ",
|
183
|
+
"Ã…": "Å",
|
184
|
+
"É": "É",
|
185
|
+
"Ë": "Ë",
|
186
|
+
"Û": "Û",
|
187
|
+
"À": "À",
|
188
|
+
"Ù": "Ù",
|
189
|
+
"Ã\uFFFD": "Á",
|
190
|
+
"Å\u0092": "Œ",
|
191
|
+
"Å\u0093": "œ",
|
192
|
+
"Å\u00A0": "Š",
|
193
|
+
"Å¡": "š",
|
194
|
+
"Ÿ": "Ÿ",
|
195
|
+
"Ž": "Ž",
|
196
|
+
"ž": "ž",
|
197
|
+
"Å’": "Œ",
|
198
|
+
"Å“": "œ",
|
199
|
+
"Æ\u0092": "ƒ",
|
200
|
+
"Æ’": "ƒ",
|
201
|
+
"Ë\u0086": "ˆ",
|
202
|
+
"Ë\u009C": "˜",
|
203
|
+
"Ëœ": "˜",
|
204
|
+
"ˆ": "ˆ",
|
205
|
+
"â\u0080\u0080": "\u2000",
|
206
|
+
"â\u0080\u0081": "\u2001",
|
207
|
+
"â\u0080\u0082": "\u2002",
|
208
|
+
"â\u0080\u0083": "\u2003",
|
209
|
+
"â\u0080\u0084": "\u2004",
|
210
|
+
"â\u0080\u0085": "\u2005",
|
211
|
+
"â\u0080\u0086": "\u2006",
|
212
|
+
"â\u0080\u0087": "\u2007",
|
213
|
+
"â\u0080\u0088": "\u2008",
|
214
|
+
"â\u0080\u0089": "\u2009",
|
215
|
+
"â\u0080\u008A": "\u200A",
|
216
|
+
"â\u0080\u008B": "\u200B",
|
217
|
+
"â\u0080\u0093": "–",
|
218
|
+
"â\u0080\u0094": "—",
|
219
|
+
"â\u0080\u0098": "‘",
|
220
|
+
"â\u0080\u0099": "’",
|
221
|
+
"â\u0080\u009A": "‚",
|
222
|
+
"â\u0080\u009C": "“",
|
223
|
+
"â\u0080\u009D": "”",
|
224
|
+
"â\u0080\u009E": "„",
|
225
|
+
"â\u0080\u00A0": "†",
|
226
|
+
"â\u0080¡": "‡",
|
227
|
+
"â\u0080¢": "•",
|
228
|
+
"â\u0080¦": "…",
|
229
|
+
"â\u0080°": "‰",
|
230
|
+
"â\u0080¹": "‹",
|
231
|
+
"â\u0080º": "›",
|
232
|
+
"â\u0081\u00A0": "\u2060",
|
233
|
+
"â\u0082¬": "€",
|
234
|
+
"â\u0084¢": "™",
|
235
|
+
"€": "€",
|
236
|
+
"â„¢": "™",
|
237
|
+
"â€\u0081": "\u2001",
|
238
|
+
"â€\u009D": "”",
|
239
|
+
"â€\u00A0": "†",
|
240
|
+
"‡": "‡",
|
241
|
+
"•": "•",
|
242
|
+
"…": "…",
|
243
|
+
"‰": "‰",
|
244
|
+
"‹": "‹",
|
245
|
+
"›": "›",
|
246
|
+
"“": "“",
|
247
|
+
" ": "\u200A",
|
248
|
+
"‚": "‚",
|
249
|
+
"„": "„",
|
250
|
+
" ": "\u2003",
|
251
|
+
" ": "\u2008",
|
252
|
+
"‘": "‘",
|
253
|
+
" ": "\u2002",
|
254
|
+
"–": "–",
|
255
|
+
"—": "—",
|
256
|
+
" ": "\u2004",
|
257
|
+
" ": "\u2006",
|
258
|
+
" ": "\u2007",
|
259
|
+
" ": "\u2005",
|
260
|
+
" ": "\u2009",
|
261
|
+
"​": "\u200B",
|
262
|
+
" ": "\u2000",
|
263
|
+
"’": "’",
|
264
|
+
"â€\uFFFD": "”",
|
265
|
+
"â\uFFFD\u00A0": "\u2060",
|
266
|
+
"": "\uFEFF",
|
267
|
+
"�": "\uFFFD",
|
268
|
+
"￾": "\uFFFE"
|
269
|
+
}
|
270
|
+
}
|
data/config/table.txt
ADDED
@@ -0,0 +1,268 @@
|
|
1
|
+
# -*- coding: utf-8 -*- mojibake: 1.1.0
|
2
|
+
/Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])/
|
3
|
+
|
4
|
+
Moji UNICODE Org CODE
|
5
|
+
+---- ---- ---- ---- ----- ---+
|
6
|
+
[Â] 00C2 0080 [] 0080
|
7
|
+
[Â] 00C2 0081 [] 0081
|
8
|
+
[Â] 00C2 0082 [] 0082
|
9
|
+
[Â] 00C2 0083 [] 0083
|
10
|
+
[Â] 00C2 0084 [] 0084
|
11
|
+
[Â
] 00C2 0085 [
] 0085
|
12
|
+
[Â] 00C2 0086 [] 0086
|
13
|
+
[Â] 00C2 0087 [] 0087
|
14
|
+
[Â] 00C2 0088 [] 0088
|
15
|
+
[Â] 00C2 0089 [] 0089
|
16
|
+
[Â] 00C2 008A [] 008A
|
17
|
+
[Â] 00C2 008B [] 008B
|
18
|
+
[Â] 00C2 008C [] 008C
|
19
|
+
[Â] 00C2 008D [] 008D
|
20
|
+
[Â] 00C2 008E [] 008E
|
21
|
+
[Â] 00C2 008F [] 008F
|
22
|
+
[Â] 00C2 0090 [] 0090
|
23
|
+
[Â] 00C2 0091 [] 0091
|
24
|
+
[Â] 00C2 0092 [] 0092
|
25
|
+
[Â] 00C2 0093 [] 0093
|
26
|
+
[Â] 00C2 0094 [] 0094
|
27
|
+
[Â] 00C2 0095 [] 0095
|
28
|
+
[Â] 00C2 0096 [] 0096
|
29
|
+
[Â] 00C2 0097 [] 0097
|
30
|
+
[Â] 00C2 0098 [] 0098
|
31
|
+
[Â] 00C2 0099 [] 0099
|
32
|
+
[Â] 00C2 009A [] 009A
|
33
|
+
[Â] 00C2 009B [] 009B
|
34
|
+
[Â] 00C2 009C [] 009C
|
35
|
+
[Â] 00C2 009D [] 009D
|
36
|
+
[Â] 00C2 009E [] 009E
|
37
|
+
[Â] 00C2 009F [] 009F
|
38
|
+
[Â ] 00C2 00A0 [ ] 00A0
|
39
|
+
[¡] 00C2 00A1 [¡] 00A1
|
40
|
+
[¢] 00C2 00A2 [¢] 00A2
|
41
|
+
[£] 00C2 00A3 [£] 00A3
|
42
|
+
[¤] 00C2 00A4 [¤] 00A4
|
43
|
+
[Â¥] 00C2 00A5 [¥] 00A5
|
44
|
+
[¦] 00C2 00A6 [¦] 00A6
|
45
|
+
[§] 00C2 00A7 [§] 00A7
|
46
|
+
[¨] 00C2 00A8 [¨] 00A8
|
47
|
+
[©] 00C2 00A9 [©] 00A9
|
48
|
+
[ª] 00C2 00AA [ª] 00AA
|
49
|
+
[«] 00C2 00AB [«] 00AB
|
50
|
+
[¬] 00C2 00AC [¬] 00AC
|
51
|
+
[Â] 00C2 00AD [] 00AD
|
52
|
+
[®] 00C2 00AE [®] 00AE
|
53
|
+
[¯] 00C2 00AF [¯] 00AF
|
54
|
+
[°] 00C2 00B0 [°] 00B0
|
55
|
+
[±] 00C2 00B1 [±] 00B1
|
56
|
+
[²] 00C2 00B2 [²] 00B2
|
57
|
+
[³] 00C2 00B3 [³] 00B3
|
58
|
+
[´] 00C2 00B4 [´] 00B4
|
59
|
+
[µ] 00C2 00B5 [µ] 00B5
|
60
|
+
[¶] 00C2 00B6 [¶] 00B6
|
61
|
+
[·] 00C2 00B7 [·] 00B7
|
62
|
+
[¸] 00C2 00B8 [¸] 00B8
|
63
|
+
[¹] 00C2 00B9 [¹] 00B9
|
64
|
+
[º] 00C2 00BA [º] 00BA
|
65
|
+
[»] 00C2 00BB [»] 00BB
|
66
|
+
[¼] 00C2 00BC [¼] 00BC
|
67
|
+
[½] 00C2 00BD [½] 00BD
|
68
|
+
[¾] 00C2 00BE [¾] 00BE
|
69
|
+
[¿] 00C2 00BF [¿] 00BF
|
70
|
+
[Œ] 00C2 0152 [] 008C
|
71
|
+
[œ] 00C2 0153 [] 009C
|
72
|
+
[Š] 00C2 0160 [] 008A
|
73
|
+
[š] 00C2 0161 [] 009A
|
74
|
+
[Ÿ] 00C2 0178 [] 009F
|
75
|
+
[ÂŽ] 00C2 017D [] 008E
|
76
|
+
[ž] 00C2 017E [] 009E
|
77
|
+
[ƒ] 00C2 0192 [] 0083
|
78
|
+
[ˆ] 00C2 02C6 [] 0088
|
79
|
+
[˜] 00C2 02DC [] 0098
|
80
|
+
[–] 00C2 2013 [] 0096
|
81
|
+
[—] 00C2 2014 [] 0097
|
82
|
+
[‘] 00C2 2018 [] 0091
|
83
|
+
[Â’] 00C2 2019 [] 0092
|
84
|
+
[‚] 00C2 201A [] 0082
|
85
|
+
[“] 00C2 201C [] 0093
|
86
|
+
[”] 00C2 201D [] 0094
|
87
|
+
[„] 00C2 201E [] 0084
|
88
|
+
[†] 00C2 2020 [] 0086
|
89
|
+
[‡] 00C2 2021 [] 0087
|
90
|
+
[•] 00C2 2022 [] 0095
|
91
|
+
[Â…] 00C2 2026 [
] 0085
|
92
|
+
[‰] 00C2 2030 [] 0089
|
93
|
+
[‹] 00C2 2039 [] 008B
|
94
|
+
[›] 00C2 203A [] 009B
|
95
|
+
[€] 00C2 20AC [] 0080
|
96
|
+
[™] 00C2 2122 [] 0099
|
97
|
+
[�] 00C2 FFFD [] 0081
|
98
|
+
[Ã] 00C3 0080 [À] 00C0
|
99
|
+
[Ã] 00C3 0081 [Á] 00C1
|
100
|
+
[Ã] 00C3 0082 [Â] 00C2
|
101
|
+
[Ã] 00C3 0083 [Ã] 00C3
|
102
|
+
[Ã] 00C3 0084 [Ä] 00C4
|
103
|
+
[Ã
] 00C3 0085 [Å] 00C5
|
104
|
+
[Ã] 00C3 0086 [Æ] 00C6
|
105
|
+
[Ã] 00C3 0087 [Ç] 00C7
|
106
|
+
[Ã] 00C3 0088 [È] 00C8
|
107
|
+
[Ã] 00C3 0089 [É] 00C9
|
108
|
+
[Ã] 00C3 008A [Ê] 00CA
|
109
|
+
[Ã] 00C3 008B [Ë] 00CB
|
110
|
+
[Ã] 00C3 008C [Ì] 00CC
|
111
|
+
[Ã] 00C3 008D [Í] 00CD
|
112
|
+
[Ã] 00C3 008E [Î] 00CE
|
113
|
+
[Ã] 00C3 008F [Ï] 00CF
|
114
|
+
[Ã] 00C3 0090 [Ð] 00D0
|
115
|
+
[Ã] 00C3 0091 [Ñ] 00D1
|
116
|
+
[Ã] 00C3 0092 [Ò] 00D2
|
117
|
+
[Ã] 00C3 0093 [Ó] 00D3
|
118
|
+
[Ã] 00C3 0094 [Ô] 00D4
|
119
|
+
[Ã] 00C3 0095 [Õ] 00D5
|
120
|
+
[Ã] 00C3 0096 [Ö] 00D6
|
121
|
+
[Ã] 00C3 0097 [×] 00D7
|
122
|
+
[Ã] 00C3 0098 [Ø] 00D8
|
123
|
+
[Ã] 00C3 0099 [Ù] 00D9
|
124
|
+
[Ã] 00C3 009A [Ú] 00DA
|
125
|
+
[Ã] 00C3 009B [Û] 00DB
|
126
|
+
[Ã] 00C3 009C [Ü] 00DC
|
127
|
+
[Ã] 00C3 009D [Ý] 00DD
|
128
|
+
[Ã] 00C3 009E [Þ] 00DE
|
129
|
+
[Ã] 00C3 009F [ß] 00DF
|
130
|
+
[Ã ] 00C3 00A0 [à] 00E0
|
131
|
+
[á] 00C3 00A1 [á] 00E1
|
132
|
+
[â] 00C3 00A2 [â] 00E2
|
133
|
+
[ã] 00C3 00A3 [ã] 00E3
|
134
|
+
[ä] 00C3 00A4 [ä] 00E4
|
135
|
+
[Ã¥] 00C3 00A5 [å] 00E5
|
136
|
+
[æ] 00C3 00A6 [æ] 00E6
|
137
|
+
[ç] 00C3 00A7 [ç] 00E7
|
138
|
+
[è] 00C3 00A8 [è] 00E8
|
139
|
+
[é] 00C3 00A9 [é] 00E9
|
140
|
+
[ê] 00C3 00AA [ê] 00EA
|
141
|
+
[ë] 00C3 00AB [ë] 00EB
|
142
|
+
[ì] 00C3 00AC [ì] 00EC
|
143
|
+
[Ã] 00C3 00AD [í] 00ED
|
144
|
+
[î] 00C3 00AE [î] 00EE
|
145
|
+
[ï] 00C3 00AF [ï] 00EF
|
146
|
+
[ð] 00C3 00B0 [ð] 00F0
|
147
|
+
[ñ] 00C3 00B1 [ñ] 00F1
|
148
|
+
[ò] 00C3 00B2 [ò] 00F2
|
149
|
+
[ó] 00C3 00B3 [ó] 00F3
|
150
|
+
[ô] 00C3 00B4 [ô] 00F4
|
151
|
+
[õ] 00C3 00B5 [õ] 00F5
|
152
|
+
[ö] 00C3 00B6 [ö] 00F6
|
153
|
+
[÷] 00C3 00B7 [÷] 00F7
|
154
|
+
[ø] 00C3 00B8 [ø] 00F8
|
155
|
+
[ù] 00C3 00B9 [ù] 00F9
|
156
|
+
[ú] 00C3 00BA [ú] 00FA
|
157
|
+
[û] 00C3 00BB [û] 00FB
|
158
|
+
[ü] 00C3 00BC [ü] 00FC
|
159
|
+
[ý] 00C3 00BD [ý] 00FD
|
160
|
+
[þ] 00C3 00BE [þ] 00FE
|
161
|
+
[ÿ] 00C3 00BF [ÿ] 00FF
|
162
|
+
[ÃŒ] 00C3 0152 [Ì] 00CC
|
163
|
+
[Ü] 00C3 0153 [Ü] 00DC
|
164
|
+
[Ê] 00C3 0160 [Ê] 00CA
|
165
|
+
[Ú] 00C3 0161 [Ú] 00DA
|
166
|
+
[ß] 00C3 0178 [ß] 00DF
|
167
|
+
[ÃŽ] 00C3 017D [Î] 00CE
|
168
|
+
[Þ] 00C3 017E [Þ] 00DE
|
169
|
+
[Ã] 00C3 0192 [Ã] 00C3
|
170
|
+
[È] 00C3 02C6 [È] 00C8
|
171
|
+
[Ø] 00C3 02DC [Ø] 00D8
|
172
|
+
[Ö] 00C3 2013 [Ö] 00D6
|
173
|
+
[×] 00C3 2014 [×] 00D7
|
174
|
+
[Ñ] 00C3 2018 [Ñ] 00D1
|
175
|
+
[Ã’] 00C3 2019 [Ò] 00D2
|
176
|
+
[Â] 00C3 201A [Â] 00C2
|
177
|
+
[Ó] 00C3 201C [Ó] 00D3
|
178
|
+
[Ô] 00C3 201D [Ô] 00D4
|
179
|
+
[Ä] 00C3 201E [Ä] 00C4
|
180
|
+
[Æ] 00C3 2020 [Æ] 00C6
|
181
|
+
[Ç] 00C3 2021 [Ç] 00C7
|
182
|
+
[Õ] 00C3 2022 [Õ] 00D5
|
183
|
+
[Ã…] 00C3 2026 [Å] 00C5
|
184
|
+
[É] 00C3 2030 [É] 00C9
|
185
|
+
[Ë] 00C3 2039 [Ë] 00CB
|
186
|
+
[Û] 00C3 203A [Û] 00DB
|
187
|
+
[À] 00C3 20AC [À] 00C0
|
188
|
+
[Ù] 00C3 2122 [Ù] 00D9
|
189
|
+
[Ã�] 00C3 FFFD [Á] 00C1
|
190
|
+
[Å] 00C5 0092 [Œ] 0152
|
191
|
+
[Å] 00C5 0093 [œ] 0153
|
192
|
+
[Å ] 00C5 00A0 [Š] 0160
|
193
|
+
[Å¡] 00C5 00A1 [š] 0161
|
194
|
+
[Ÿ] 00C5 00B8 [Ÿ] 0178
|
195
|
+
[Ž] 00C5 00BD [Ž] 017D
|
196
|
+
[ž] 00C5 00BE [ž] 017E
|
197
|
+
[Å’] 00C5 2019 [Œ] 0152
|
198
|
+
[Å“] 00C5 201C [œ] 0153
|
199
|
+
[Æ] 00C6 0092 [ƒ] 0192
|
200
|
+
[Æ’] 00C6 2019 [ƒ] 0192
|
201
|
+
[Ë] 00CB 0086 [ˆ] 02C6
|
202
|
+
[Ë] 00CB 009C [˜] 02DC
|
203
|
+
[Ëœ] 00CB 0153 [˜] 02DC
|
204
|
+
[ˆ] 00CB 2020 [ˆ] 02C6
|
205
|
+
[â] 00E2 0080 0080 [ ] 2000
|
206
|
+
[â] 00E2 0080 0081 [ ] 2001
|
207
|
+
[â] 00E2 0080 0082 [ ] 2002
|
208
|
+
[â] 00E2 0080 0083 [ ] 2003
|
209
|
+
[â] 00E2 0080 0084 [ ] 2004
|
210
|
+
[â
] 00E2 0080 0085 [ ] 2005
|
211
|
+
[â] 00E2 0080 0086 [ ] 2006
|
212
|
+
[â] 00E2 0080 0087 [ ] 2007
|
213
|
+
[â] 00E2 0080 0088 [ ] 2008
|
214
|
+
[â] 00E2 0080 0089 [ ] 2009
|
215
|
+
[â] 00E2 0080 008A [ ] 200A
|
216
|
+
[â] 00E2 0080 008B [] 200B
|
217
|
+
[â] 00E2 0080 0093 [–] 2013
|
218
|
+
[â] 00E2 0080 0094 [—] 2014
|
219
|
+
[â] 00E2 0080 0098 [‘] 2018
|
220
|
+
[â] 00E2 0080 0099 [’] 2019
|
221
|
+
[â] 00E2 0080 009A [‚] 201A
|
222
|
+
[â] 00E2 0080 009C [“] 201C
|
223
|
+
[â] 00E2 0080 009D [”] 201D
|
224
|
+
[â] 00E2 0080 009E [„] 201E
|
225
|
+
[â ] 00E2 0080 00A0 [†] 2020
|
226
|
+
[â¡] 00E2 0080 00A1 [‡] 2021
|
227
|
+
[â¢] 00E2 0080 00A2 [•] 2022
|
228
|
+
[â¦] 00E2 0080 00A6 […] 2026
|
229
|
+
[â°] 00E2 0080 00B0 [‰] 2030
|
230
|
+
[â¹] 00E2 0080 00B9 [‹] 2039
|
231
|
+
[âº] 00E2 0080 00BA [›] 203A
|
232
|
+
[â ] 00E2 0081 00A0 [] 2060
|
233
|
+
[â¬] 00E2 0082 00AC [€] 20AC
|
234
|
+
[â¢] 00E2 0084 00A2 [™] 2122
|
235
|
+
[€] 00E2 201A 00AC [€] 20AC
|
236
|
+
[â„¢] 00E2 201E 00A2 [™] 2122
|
237
|
+
[â€] 00E2 20AC 0081 [ ] 2001
|
238
|
+
[â€] 00E2 20AC 009D [”] 201D
|
239
|
+
[†] 00E2 20AC 00A0 [†] 2020
|
240
|
+
[‡] 00E2 20AC 00A1 [‡] 2021
|
241
|
+
[•] 00E2 20AC 00A2 [•] 2022
|
242
|
+
[…] 00E2 20AC 00A6 […] 2026
|
243
|
+
[‰] 00E2 20AC 00B0 [‰] 2030
|
244
|
+
[‹] 00E2 20AC 00B9 [‹] 2039
|
245
|
+
[›] 00E2 20AC 00BA [›] 203A
|
246
|
+
[“] 00E2 20AC 0153 [“] 201C
|
247
|
+
[ ] 00E2 20AC 0160 [ ] 200A
|
248
|
+
[‚] 00E2 20AC 0161 [‚] 201A
|
249
|
+
[„] 00E2 20AC 017E [„] 201E
|
250
|
+
[ ] 00E2 20AC 0192 [ ] 2003
|
251
|
+
[ ] 00E2 20AC 02C6 [ ] 2008
|
252
|
+
[‘] 00E2 20AC 02DC [‘] 2018
|
253
|
+
[ ] 00E2 20AC 201A [ ] 2002
|
254
|
+
[–] 00E2 20AC 201C [–] 2013
|
255
|
+
[—] 00E2 20AC 201D [—] 2014
|
256
|
+
[ ] 00E2 20AC 201E [ ] 2004
|
257
|
+
[ ] 00E2 20AC 2020 [ ] 2006
|
258
|
+
[ ] 00E2 20AC 2021 [ ] 2007
|
259
|
+
[ ] 00E2 20AC 2026 [ ] 2005
|
260
|
+
[ ] 00E2 20AC 2030 [ ] 2009
|
261
|
+
[​] 00E2 20AC 2039 [] 200B
|
262
|
+
[ ] 00E2 20AC 20AC [ ] 2000
|
263
|
+
[’] 00E2 20AC 2122 [’] 2019
|
264
|
+
[â€�] 00E2 20AC FFFD [”] 201D
|
265
|
+
[â� ] 00E2 FFFD 00A0 [] 2060
|
266
|
+
[] 00EF 00BB 00BF [] FEFF
|
267
|
+
[�] 00EF 00BF 00BD [�] FFFD
|
268
|
+
[￾] 00EF 00BF 00BE [] FFFE
|
data/lib/mojibake.rb
CHANGED
@@ -14,9 +14,38 @@
|
|
14
14
|
# permissions and limitations under the License.
|
15
15
|
#++
|
16
16
|
|
17
|
-
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
|
18
|
-
raise "Requires ruby ~> 1.9 for String.encode support"
|
19
|
-
end
|
20
|
-
|
21
17
|
require 'mojibake/base'
|
22
|
-
|
18
|
+
|
19
|
+
require 'mojibake/json'
|
20
|
+
|
21
|
+
module MojiBake
|
22
|
+
|
23
|
+
# Supports recovering Mojibake characters to the original text.
|
24
|
+
class Mapper
|
25
|
+
include JSONSupport
|
26
|
+
|
27
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
28
|
+
require 'mojibake/encoding'
|
29
|
+
include EncodingSupport
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize( opts = {} )
|
33
|
+
super()
|
34
|
+
opts.map { |k,v| send( k.to_s + '=', v ) }
|
35
|
+
end
|
36
|
+
|
37
|
+
# Recover original characters from input using regexp, recursively.
|
38
|
+
def recover( input, recursive = true )
|
39
|
+
output = input.gsub( regexp ) { |moji| hash[moji] }
|
40
|
+
|
41
|
+
# Only recurse if requested and substituted something (output
|
42
|
+
# shorter) in this run.
|
43
|
+
if recursive && ( output.length < input.length )
|
44
|
+
recover( output )
|
45
|
+
else
|
46
|
+
output
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
data/lib/mojibake/base.rb
CHANGED
@@ -16,9 +16,9 @@
|
|
16
16
|
|
17
17
|
module MojiBake
|
18
18
|
|
19
|
-
#
|
20
|
-
#
|
21
|
-
|
19
|
+
# Mixin for the actual (ruby 1.9 backed) encoding support to define
|
20
|
+
# the mojibake mapping table and regex.
|
21
|
+
module EncodingSupport
|
22
22
|
|
23
23
|
W252 = Encoding::WINDOWS_1252
|
24
24
|
ISO8 = Encoding::ISO_8859_1
|
@@ -30,20 +30,21 @@ module MojiBake
|
|
30
30
|
# RIGHT DOUBLE QUOTATION MARK. These are the most common problem
|
31
31
|
# chars in English and probably most latin languages.
|
32
32
|
HIGH_ORDER_CHARS =
|
33
|
-
( ( 0x80..0xFF )
|
33
|
+
( Array( 0x80..0xFF ) - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
|
34
34
|
map { |i| i.chr( W252 ).encode( UTF8 ) }.
|
35
35
|
sort
|
36
36
|
|
37
37
|
# Additional Unicode codepoints of mojibake potential, like alt
|
38
38
|
# whitespace, C1 control characters, and BOMs.
|
39
39
|
INTEREST_CODEPOINTS =
|
40
|
-
[
|
41
|
-
0x00A0,
|
42
|
-
|
43
|
-
0x2060,
|
44
|
-
0xfeff,
|
45
|
-
0xfffd,
|
46
|
-
0xfffe ].
|
40
|
+
[ 0x0080..0x009F, # ISO/Unicode C1 control codes.
|
41
|
+
0x00A0, # NO-BREAK SPACE
|
42
|
+
0x2000..0x200B, # EN QUAD ... ZERO WIDTH SPACE
|
43
|
+
0x2060, # WORD JOINER
|
44
|
+
0xfeff, # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
|
45
|
+
0xfffd, # REPLACEMENT CHARACTER
|
46
|
+
0xfffe ]. # UNASSIGNED, BAD BOM
|
47
|
+
map { |i| Array( i ) }.
|
47
48
|
flatten.
|
48
49
|
sort
|
49
50
|
|
@@ -63,12 +64,11 @@ module MojiBake
|
|
63
64
|
# (default: true). This covers ambiguities of C1 control codes.
|
64
65
|
attr_accessor :map_permutations
|
65
66
|
|
66
|
-
def initialize
|
67
|
+
def initialize
|
68
|
+
super
|
67
69
|
@map_windows_1252 = true
|
68
70
|
@map_iso_8859_1 = true
|
69
71
|
@map_permutations = true
|
70
|
-
|
71
|
-
options.map { |k,v| send( k.to_s + '=', v ) }
|
72
72
|
end
|
73
73
|
|
74
74
|
# Return Hash of mojibake UTF-8 2-3 character sequences to original
|
@@ -122,19 +122,6 @@ module MojiBake
|
|
122
122
|
@regexp ||= Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
|
123
123
|
end
|
124
124
|
|
125
|
-
# Recover original characters from input using regexp, recursively.
|
126
|
-
def recover( input, recursive = true )
|
127
|
-
output = input.gsub( regexp ) { |moji| hash[moji] }
|
128
|
-
|
129
|
-
# Only recurse if requested and substituted something (output
|
130
|
-
# shorter) in this run.
|
131
|
-
if recursive && ( output.length < input.length )
|
132
|
-
recover( output )
|
133
|
-
else
|
134
|
-
output
|
135
|
-
end
|
136
|
-
end
|
137
|
-
|
138
125
|
def char_tree( seqs )
|
139
126
|
seqs.inject( {} ) do |h,seq|
|
140
127
|
seq.chars.inject( h ) do |hs,c|
|
@@ -158,8 +145,7 @@ module MojiBake
|
|
158
145
|
o
|
159
146
|
end
|
160
147
|
if cs.find { |o| o =~ /[()|\[\]]/ }
|
161
|
-
cs.join( '|' )
|
162
|
-
#FIXME: Join looses encoding so force, jruby bug?
|
148
|
+
cs.join( '|' )
|
163
149
|
else
|
164
150
|
if cs.length > 1
|
165
151
|
'[' + cs.inject(:+) + ']'
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'mojibake/base'
|
18
|
+
require 'json'
|
19
|
+
|
20
|
+
module MojiBake
|
21
|
+
|
22
|
+
module JSONSupport
|
23
|
+
|
24
|
+
JSON_CONFIG = File.join( File.dirname( __FILE__ ),
|
25
|
+
'..', '..', 'config', 'table.json' )
|
26
|
+
|
27
|
+
def initialize
|
28
|
+
super
|
29
|
+
end
|
30
|
+
|
31
|
+
def config
|
32
|
+
@config ||= JSON.parse( IO.read( JSON_CONFIG ) )
|
33
|
+
end
|
34
|
+
|
35
|
+
def hash
|
36
|
+
@hash ||= config[ 'moji' ]
|
37
|
+
end
|
38
|
+
|
39
|
+
def regexp
|
40
|
+
# Note use of Unicode mode ('U') for ruby 1.8's benefit.
|
41
|
+
@regexp ||= Regexp.new( config[ 'regexp' ], 0, 'U' )
|
42
|
+
end
|
43
|
+
|
44
|
+
# table as self contained json-ready Hash
|
45
|
+
def hash_to_json_object
|
46
|
+
|
47
|
+
# Also use unicode escape for the interesting (effectively,
|
48
|
+
# non-printable) subset of moji mappings.
|
49
|
+
moji = hash.sort.map do |kv|
|
50
|
+
kv.map do |s|
|
51
|
+
s.codepoints.inject( '' ) do |r,i|
|
52
|
+
if MojiBake::Mapper::INTEREST_CODEPOINTS.include?( i )
|
53
|
+
r << sprintf( '\u%04X', i )
|
54
|
+
else
|
55
|
+
r << i.chr( Encoding::UTF_8 )
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
{ :mojibake => MojiBake::VERSION,
|
62
|
+
:url => "https://github.com/dekellum/mojibake",
|
63
|
+
:regexp => regexp.inspect[1...-1],
|
64
|
+
:moji => Hash[ moji ] }
|
65
|
+
end
|
66
|
+
|
67
|
+
# Pretty formatted JSON serialized String for hash_to_json_object
|
68
|
+
def json
|
69
|
+
# Generate and replace what become double escaped '\\u' UNICODE
|
70
|
+
# escapes with single '\u' escapes. This is a hack but is
|
71
|
+
# reasonably safe given that 'u' isn't normally escaped. The
|
72
|
+
# alternative would be to hack JSON package or do the JSON
|
73
|
+
# formatting ourselves. Ideally JSON package would support
|
74
|
+
# serialization using unicode escapes for the non-printable,
|
75
|
+
# non-friendly chars. As of 1.6.1 it doesn't.
|
76
|
+
JSON.pretty_generate( hash_to_json_object ).gsub( /\\\\u/, '\u' )
|
77
|
+
end
|
78
|
+
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.args.pre = --1.9
|
4
|
+
#.hashdot.profile += jruby-shortlived
|
5
|
+
|
6
|
+
#--
|
7
|
+
# Copyright (c) 2011 David Kellum
|
8
|
+
#
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
10
|
+
# may not use this file except in compliance with the License. You
|
11
|
+
# may obtain a copy of the License at
|
12
|
+
#
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
+
#
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
18
|
+
# implied. See the License for the specific language governing
|
19
|
+
# permissions and limitations under the License.
|
20
|
+
#++
|
21
|
+
|
22
|
+
ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
|
23
|
+
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'minitest/unit'
|
27
|
+
require 'minitest/autorun'
|
28
|
+
|
29
|
+
require 'mojibake'
|
30
|
+
|
31
|
+
class TestEncoding < MiniTest::Unit::TestCase
|
32
|
+
include MojiBake
|
33
|
+
|
34
|
+
def setup
|
35
|
+
@mapper = Mapper.new
|
36
|
+
end
|
37
|
+
|
38
|
+
TEST_TREE = { "a" => { "b" => { "c" => {},
|
39
|
+
"d" => {} } },
|
40
|
+
"d" => { "b" => { "f" => {} } } }
|
41
|
+
|
42
|
+
# These tests are only defined when Ruby 1.9 encoding support is available
|
43
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
|
44
|
+
|
45
|
+
def test_init_options
|
46
|
+
assert_equal( true, Mapper.new.map_iso_8859_1 )
|
47
|
+
m = Mapper.new( :map_iso_8859_1 => false )
|
48
|
+
assert_equal( false, m.map_iso_8859_1 )
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_char_tree
|
52
|
+
assert_equal( TEST_TREE,
|
53
|
+
@mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_tree_flaten
|
57
|
+
assert_equal( "ab[cd]|dbf",
|
58
|
+
@mapper.tree_flatten( TEST_TREE ) )
|
59
|
+
end
|
60
|
+
|
61
|
+
def test_regexp
|
62
|
+
re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
|
63
|
+
assert_match( re, "abc" )
|
64
|
+
assert_match( re, "abd" )
|
65
|
+
assert_match( re, "dbf" )
|
66
|
+
|
67
|
+
refute_match( re, "ab" )
|
68
|
+
refute_match( re, "abf" )
|
69
|
+
|
70
|
+
assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
|
71
|
+
assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
@@ -28,46 +28,13 @@ require 'minitest/autorun'
|
|
28
28
|
|
29
29
|
require 'mojibake'
|
30
30
|
|
31
|
-
class
|
31
|
+
class TestMapper < MiniTest::Unit::TestCase
|
32
32
|
include MojiBake
|
33
33
|
|
34
34
|
def setup
|
35
35
|
@mapper = Mapper.new
|
36
36
|
end
|
37
37
|
|
38
|
-
TEST_TREE = { "a" => { "b" => { "c" => {},
|
39
|
-
"d" => {} } },
|
40
|
-
"d" => { "b" => { "f" => {} } } }
|
41
|
-
|
42
|
-
def test_init_options
|
43
|
-
assert_equal( true, Mapper.new.map_iso_8859_1 )
|
44
|
-
m = Mapper.new( :map_iso_8859_1 => false )
|
45
|
-
assert_equal( false, m.map_iso_8859_1 )
|
46
|
-
end
|
47
|
-
|
48
|
-
def test_char_tree
|
49
|
-
assert_equal( TEST_TREE,
|
50
|
-
@mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
|
51
|
-
end
|
52
|
-
|
53
|
-
def test_tree_flaten
|
54
|
-
assert_equal( "ab[cd]|dbf",
|
55
|
-
@mapper.tree_flatten( TEST_TREE ) )
|
56
|
-
end
|
57
|
-
|
58
|
-
def test_regexp
|
59
|
-
re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
|
60
|
-
assert_match( re, "abc" )
|
61
|
-
assert_match( re, "abd" )
|
62
|
-
assert_match( re, "dbf" )
|
63
|
-
|
64
|
-
refute_match( re, "ab" )
|
65
|
-
refute_match( re, "abf" )
|
66
|
-
|
67
|
-
assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
|
68
|
-
assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
|
69
|
-
end
|
70
|
-
|
71
38
|
def test_nomatch_recover
|
72
39
|
assert_equal( '', @mapper.recover( '' ) )
|
73
40
|
assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
|
metadata
CHANGED
@@ -2,42 +2,49 @@
|
|
2
2
|
name: mojibake
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.
|
5
|
+
version: 1.1.0
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
|
-
|
8
|
+
- David Kellum
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-10-31 00:00:00 Z
|
15
14
|
dependencies:
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: json
|
17
|
+
prerelease: false
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.6.1
|
24
|
+
type: :runtime
|
25
|
+
version_requirements: *id001
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: minitest
|
28
|
+
prerelease: false
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
|
+
none: false
|
31
|
+
requirements:
|
32
|
+
- - ~>
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: "2.3"
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id002
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rjack-tarpit
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.4.0
|
46
|
+
type: :development
|
47
|
+
version_requirements: *id003
|
41
48
|
description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
|
42
49
|
bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
|
43
50
|
module provides a mojibake sequence to original character mapping\n\
|
@@ -45,54 +52,60 @@ description: "Mojibake occurs in English most frequently due to misinterpreting
|
|
45
52
|
Testing has been with English but other Latin based languages, where\n\
|
46
53
|
Windows-1252 is in the wild, should also benefit."
|
47
54
|
email:
|
48
|
-
|
55
|
+
- dek-oss@gravitext.com
|
49
56
|
executables:
|
50
|
-
|
57
|
+
- mojibake
|
51
58
|
extensions: []
|
52
59
|
|
53
60
|
extra_rdoc_files:
|
54
|
-
|
55
|
-
|
56
|
-
|
61
|
+
- Manifest.txt
|
62
|
+
- config/table.txt
|
63
|
+
- History.rdoc
|
64
|
+
- README.rdoc
|
57
65
|
files:
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
- History.rdoc
|
67
|
+
- Manifest.txt
|
68
|
+
- README.rdoc
|
69
|
+
- Rakefile
|
70
|
+
- bin/mojibake
|
71
|
+
- config/table.json
|
72
|
+
- config/table.txt
|
73
|
+
- lib/mojibake/base.rb
|
74
|
+
- lib/mojibake.rb
|
75
|
+
- lib/mojibake/encoding.rb
|
76
|
+
- lib/mojibake/json.rb
|
77
|
+
- test/test.txt
|
78
|
+
- test/test_encoding.rb
|
79
|
+
- test/test_mapper.rb
|
80
|
+
- .gemtest
|
69
81
|
homepage: http://github.com/dekellum/mojibake
|
70
82
|
licenses: []
|
71
83
|
|
72
84
|
post_install_message:
|
73
85
|
rdoc_options:
|
74
|
-
|
75
|
-
|
86
|
+
- --main
|
87
|
+
- README.rdoc
|
76
88
|
require_paths:
|
77
|
-
|
89
|
+
- lib
|
78
90
|
required_ruby_version: !ruby/object:Gem::Requirement
|
79
91
|
none: false
|
80
92
|
requirements:
|
81
|
-
|
82
|
-
|
83
|
-
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
84
96
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
97
|
none: false
|
86
98
|
requirements:
|
87
|
-
|
88
|
-
|
89
|
-
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: "0"
|
90
102
|
requirements: []
|
91
103
|
|
92
104
|
rubyforge_project: mojibake
|
93
|
-
rubygems_version: 1.
|
105
|
+
rubygems_version: 1.8.11
|
94
106
|
signing_key:
|
95
107
|
specification_version: 3
|
96
108
|
summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
|
97
109
|
test_files:
|
98
|
-
|
110
|
+
- test/test_encoding.rb
|
111
|
+
- test/test_mapper.rb
|