mojibake 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +10 -0
- data/README.rdoc +55 -0
- data/Rakefile +35 -0
- data/bin/mojibake +71 -0
- data/lib/mojibake.rb +22 -0
- data/lib/mojibake/base.rb +19 -0
- data/lib/mojibake/mapper.rb +187 -0
- data/test/test.txt +12 -0
- data/test/test_mojibake.rb +89 -0
- metadata +98 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
= MojiBake
|
3
|
+
|
4
|
+
* http://github.com/dekellum/mojibake
|
5
|
+
|
6
|
+
== Description
|
7
|
+
|
8
|
+
Mojibake occurs in English most frequently due to misinterpreting and
|
9
|
+
bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This
|
10
|
+
module provides a mojibake sequence to original character mapping
|
11
|
+
table, and utility to recover mojibake’d text.
|
12
|
+
|
13
|
+
Testing has been with English but other Latin based languages, where
|
14
|
+
Windows-1252 is in the wild, should also benefit.
|
15
|
+
|
16
|
+
== Dependencies
|
17
|
+
|
18
|
+
Requires the String Encoding support of ruby 1.9+ (tested 1.9.2p180
|
19
|
+
Linux) or jruby 1.6+ (tested 1.6.2, Linux).
|
20
|
+
|
21
|
+
== Synopsis
|
22
|
+
|
23
|
+
gem install mojibake
|
24
|
+
|
25
|
+
require 'mojibake'
|
26
|
+
mapper = MojiBake::Mapper.new
|
27
|
+
mapper.recover( 'â€œquotedâ€�' ) #=> '“quoted”'
|
28
|
+
|
29
|
+
Or via cli:
|
30
|
+
|
31
|
+
mojibake -h
|
32
|
+
|
33
|
+
List the mojibake mapping table (output in UTF-8):
|
34
|
+
|
35
|
+
mojibake -t
|
36
|
+
|
37
|
+
Recover from a text file:
|
38
|
+
|
39
|
+
mojibake input.txt
|
40
|
+
|
41
|
+
== License
|
42
|
+
|
43
|
+
Copyright (c) 2011 David Kellum
|
44
|
+
|
45
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
46
|
+
may not use this file except in compliance with the License. You
|
47
|
+
may obtain a copy of the License at:
|
48
|
+
|
49
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
50
|
+
|
51
|
+
Unless required by applicable law or agreed to in writing, software
|
52
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
53
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
54
|
+
implied. See the License for the specific language governing
|
55
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
$LOAD_PATH << './lib'
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
gem 'rjack-tarpit', '~> 1.3.2'
|
7
|
+
require 'rjack-tarpit'
|
8
|
+
|
9
|
+
require 'mojibake/base'
|
10
|
+
|
11
|
+
t = RJack::TarPit.new( 'mojibake', MojiBake::VERSION )
|
12
|
+
|
13
|
+
t.specify do |h|
|
14
|
+
h.developer( 'David Kellum', 'dek-oss@gravitext.com' )
|
15
|
+
|
16
|
+
h.testlib = :minitest
|
17
|
+
h.extra_dev_deps += [ [ 'minitest', '>= 2.1', '< 2.4' ] ]
|
18
|
+
|
19
|
+
h.url = 'http://github.com/dekellum/mojibake'
|
20
|
+
end
|
21
|
+
|
22
|
+
# Version/date consistency checks:
|
23
|
+
|
24
|
+
task :check_history_version do
|
25
|
+
t.test_line_match( 'History.rdoc', /^==/, / #{ t.version } / )
|
26
|
+
end
|
27
|
+
task :check_history_date do
|
28
|
+
t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
|
29
|
+
end
|
30
|
+
|
31
|
+
task :gem => [ :check_history_version ]
|
32
|
+
task :tag => [ :check_history_version, :check_history_date ]
|
33
|
+
task :push => [ :check_history_version, :check_history_date ]
|
34
|
+
|
35
|
+
t.define_tasks
|
data/bin/mojibake
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#.hashdot.args.pre = --1.9
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
$LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
|
22
|
+
|
23
|
+
require 'mojibake'
|
24
|
+
require 'optparse'
|
25
|
+
|
26
|
+
# Command line driver: parses options, then either prints the mapping
# table / regex, or recovers mojibake from InputFile and writes the
# result to STDOUT.
module Script

  mapper = MojiBake::Mapper.new

  # Informational output requested via -t / -r. It is produced only
  # after all options are parsed, so that --no-* flags take effect
  # regardless of their position on the command line.
  show = nil

  OptionParser.new do |opts|
    opts.banner = "Usage: mojibake [options] [InputFile]"
    opts.on( "-v", "--version", "Display version and exit" ) do
      puts "mojibake: #{MojiBake::VERSION}"
      exit 0 # A successful informational request is not an error.
    end
    opts.on( "--no-windows-1252",
             "Don't include miscodings from Windows-1252" ) do
      mapper.map_windows_1252 = false
    end
    opts.on( "--no-iso-8859-1",
             "Don't include miscodings from ISO-8859-1" ) do
      mapper.map_iso_8859_1 = false
    end
    opts.on( "--no-permutations",
             "Don't include ISO/Windows permutations" ) do
      mapper.map_permutations = false
    end
    opts.on_tail( "-t", "--table",
                  "Write MojiBake Mapper table (UTF-8)" ) do
      show = :table
    end
    opts.on_tail( "-r", "--regex",
                  "Display MojiBake Mapper regex (UTF-8) and exit" ) do
      show = :regex
    end
    opts.on_tail( "-h", "--help", "Show help and exit" ) do
      puts opts
      puts
      puts( "Recover InputFile and writes to STDOUT." )
      exit 0
    end
  end.parse!

  case show
  when :table
    puts mapper.table
    exit 0
  when :regex
    puts mapper.regexp.inspect
    exit 0
  end

  input_file = ARGV.shift
  if input_file
    # File.read, unlike IO.read, never interprets a leading "|" in the
    # name as a command to execute.
    text = File.read( input_file ).encode( 'UTF-8' )
    $stdout.write( mapper.recover( text ) )
  end

end
|
data/lib/mojibake.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
|
18
|
+
raise "Requires ruby ~> 1.9 for String.encode support"
|
19
|
+
end
|
20
|
+
|
21
|
+
require 'mojibake/base'
|
22
|
+
require 'mojibake/mapper'
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module MojiBake
  # Gem version string. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "1.0.0".freeze
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module MojiBake
|
18
|
+
|
19
|
+
# Creates a Map from mojibake sequences to recovered/original
|
20
|
+
# characters.
|
21
|
+
class Mapper

  W252 = Encoding::WINDOWS_1252
  ISO8 = Encoding::ISO_8859_1
  UTF8 = Encoding::UTF_8

  # The 8-bit high-order characters assigned in Windows-1252, as
  # UTF-8 strings (the five unassigned bytes are excluded). This is
  # actually a superset of the ISO-8859-1 high order set, including in
  # particular punctuation characters like EM DASH and RIGHT DOUBLE
  # QUOTATION MARK. These are the most common problem chars in
  # English and probably most Latin languages.
  HIGH_ORDER_CHARS =
    ( ( 0x80..0xFF ).to_a - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
    map { |i| i.chr( W252 ).encode( UTF8 ) }.
    sort.
    freeze

  # Additional Unicode codepoints of mojibake potential, like alt
  # whitespace, C1 control characters, and BOMs.
  INTEREST_CODEPOINTS = [
    ( 0x0080..0x009F ).to_a, # ISO/Unicode C1 control codes
    0x00A0,                  # NO-BREAK SPACE
    ( 0x2000..0x200B ).to_a, # EN QUAD ... ZERO WIDTH SPACE
    0x2060,                  # WORD JOINER
    0xFEFF,                  # ZERO WIDTH NO-BREAK SPACE, BYTE ORDER MARK (BOM)
    0xFFFD,                  # REPLACEMENT CHARACTER
    0xFFFE                   # Noncharacter; a byte-swapped (bad) BOM
  ].flatten.sort.freeze

  # INTEREST_CODEPOINTS as single character UTF-8 strings.
  INTEREST_CHARS = INTEREST_CODEPOINTS.map { |c| c.chr( UTF8 ) }.freeze

  # Mojibake candidate characters in reverse. The hash is filled in
  # this order with later entries overwriting earlier ones, so
  # HIGH_ORDER_CHARS and the lowest codepoints have highest precedence.
  CANDIDATE_CHARS = ( HIGH_ORDER_CHARS + INTEREST_CHARS ).reverse.freeze

  # Include Windows-1252 transcodes in map (default: true)
  attr_reader :map_windows_1252

  # Include ISO-8859-1 transcodes in map (default: true)
  attr_reader :map_iso_8859_1

  # Include permutations between ISO-8859-1 and Windows-1252
  # (default: true). This covers ambiguities of C1 control codes.
  attr_reader :map_permutations

  # Set map_windows_1252, discarding any previously built hash/regexp.
  def map_windows_1252=( flag )
    reset_cache
    @map_windows_1252 = flag
  end

  # Set map_iso_8859_1, discarding any previously built hash/regexp.
  def map_iso_8859_1=( flag )
    reset_cache
    @map_iso_8859_1 = flag
  end

  # Set map_permutations, discarding any previously built hash/regexp.
  def map_permutations=( flag )
    reset_cache
    @map_permutations = flag
  end

  # New Mapper with all mappings enabled. Each key of the options
  # Hash names an attribute to assign, for example:
  # :map_iso_8859_1 => false
  def initialize( options = {} )
    @map_windows_1252 = true
    @map_iso_8859_1   = true
    @map_permutations = true

    # Only public writers may be reached through options.
    options.each { |k,v| public_send( "#{k}=", v ) }
  end

  # Return Hash of mojibake UTF-8 2-3 character sequences to original
  # UTF-8 (recovered) characters. Built once and cached until one of
  # the map_* attributes is assigned.
  #
  # NOTE(review): this replaces Object#hash, so a Mapper should not be
  # used as a Hash key or Set member. The name is kept for
  # compatibility with existing callers.
  def hash
    @hash ||= CANDIDATE_CHARS.inject( {} ) do |h,c|

      # Mis-interpret as ISO-8859-1, and encode back to UTF-8
      moji_8 = c.encode( UTF8, ISO8 )
      h[moji_8] = c if @map_iso_8859_1

      # Mis-interpret as Windows-1252, and encode back to UTF-8. Bytes
      # unassigned in Windows-1252 become U+FFFD.
      moji_w = c.encode( UTF8, W252, :undef => :replace )
      h[moji_w] = c if @map_windows_1252

      if @map_permutations
        # Also add permutations of unassigned Windows-1252 chars to
        # the 8bit equivalent: each U+FFFD is swapped for the
        # ISO-8859-1 character at the same position.
        moji_w.each_codepoint.with_index do |cp,i|
          next unless cp == 0xFFFD
          moji_n = moji_w.dup
          moji_n[i] = moji_8[i]
          h[moji_n] = c
        end
      end

      h
    end
  end

  # Return pretty table formatting of hash (array of lines)
  def table
    lines = [ "# -*- coding: utf-8 -*- mojibake: #{MojiBake::VERSION}" ]
    lines << regexp.inspect
    lines << ""
    lines << "Moji\tUNICODE \tOrg\tCODE"
    lines << "+----\t---- ---- ----\t-----\t---+"
    rows = hash.sort.map do |moji,c|
      "[%s]\t%s\t[%s]\t%s" %
        [ moji, codepoints_hex( moji ), c, codepoints_hex( c ) ]
    end
    lines.concat( rows )
  end

  # A Regexp that will match any of the mojibake sequences, as
  # found in hash.keys. When every mapping is disabled the hash is
  # empty, and a pattern that never matches is returned instead.
  def regexp
    @regexp ||=
      Regexp.new( tree_flatten( char_tree( hash.keys ) ) || '(?!)' )
  end

  # Recover original characters from input using regexp. When
  # recursive (the default), repeat for as long as a pass substituted
  # something, which recovers text that was miscoded more than once.
  # Returns a new String; input is not modified.
  def recover( input, recursive = true )
    output = input.gsub( regexp ) { |moji| hash[moji] }

    # Every substitution replaces 2-3 characters with one, so a
    # shorter output means something was substituted in this run.
    if recursive && ( output.length < input.length )
      recover( output )
    else
      output
    end
  end

  # Build a nested Hash (prefix tree) of single characters from the
  # given Array of String sequences. A leaf is an empty Hash.
  def char_tree( seqs )
    seqs.inject( {} ) do |h,seq|
      seq.chars.inject( h ) do |hs,c|
        hs[c] ||= {}
      end
      h
    end
  end

  # Flatten a char_tree to regular expression source. Sibling leaves
  # are combined in a character class and other siblings become
  # alternatives. Returns nil for an empty tree.
  def tree_flatten( tree )
    cs = tree.sort.map do |k,v|
      o = regex_encode( k )
      next o if v.empty?

      c = tree_flatten( v )
      # A lone character class or a single child needs no grouping.
      o + ( ( c =~ /^\[.*\]$/ || v.length == 1 ) ? c : '(' + c + ')' )
    end

    if cs.any? { |o| o =~ /[()|\[\]]/ }
      # FIXME: Join loses encoding so force, jruby bug?
      cs.join( '|' ).force_encoding( "UTF-8" )
    elsif cs.length > 1
      '[' + cs.inject( :+ ) + ']'
    else
      cs.first
    end
  end

  # Unicode hex dump of codepoints
  def codepoints_hex( s )
    s.codepoints.map { |i| sprintf( "%04X", i ) }.join( ' ' )
  end

  # Regex source for the single character c: a \uXXXX escape for the
  # INTEREST_CODEPOINTS (controls, alternative whitespace), or
  # otherwise the Regexp escaped character itself.
  def regex_encode( c )
    # String#codepoints returns an Array (without #next) as of ruby
    # 2.0, so take the codepoint directly.
    i = c.ord
    if INTEREST_CODEPOINTS.include?( i )
      sprintf( '\u%04X', i )
    else
      Regexp.escape( c )
    end
  end

  private

  # Drop the memoized hash and regexp so that they are rebuilt with
  # the current map_* settings on next use.
  def reset_cache
    @hash = nil
    @regexp = nil
  end

end
|
187
|
+
end
|
data/test/test.txt
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
-*- coding: utf-8 -*-
|
2
|
+
Source: http://en.wikipedia.org/wiki/Mojibake
|
3
|
+
|
4
|
+
== English ==
|
5
|
+
|
6
|
+
Mojibake in English texts generally occurs in punctuation, such as em
|
7
|
+
dashes (—), en dashes (–), and curly quotes (“, ”), but rarely in
|
8
|
+
character text, since most encodings agree with ASCII on the encoding
|
9
|
+
of the English alphabet. For example, the pound sign "£" will appear
|
10
|
+
as "Â£" if it was encoded by the sender as UTF-8 but interpreted by
|
11
|
+
the recipient as CP1252 or ISO 8859-1. If iterated, this can lead to
|
12
|
+
"£", "£", etc.
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.args.pre = --1.9
|
4
|
+
#.hashdot.profile += jruby-shortlived
|
5
|
+
|
6
|
+
#--
|
7
|
+
# Copyright (c) 2011 David Kellum
|
8
|
+
#
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
10
|
+
# may not use this file except in compliance with the License. You
|
11
|
+
# may obtain a copy of the License at
|
12
|
+
#
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
+
#
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
18
|
+
# implied. See the License for the specific language governing
|
19
|
+
# permissions and limitations under the License.
|
20
|
+
#++
|
21
|
+
|
22
|
+
ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
|
23
|
+
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'minitest/unit'
|
27
|
+
require 'minitest/autorun'
|
28
|
+
|
29
|
+
require 'mojibake'
|
30
|
+
|
31
|
+
# Unit tests for MojiBake::Mapper: option handling, regex tree
# construction, and single/multi-level recovery.
#
# NOTE(review): the mojibake input literals are written as \u escapes.
# They were reconstructed by mis-decoding the expected output as
# Windows-1252 / ISO-8859-1 (once for the simple cases, twice for the
# recursive cases) -- confirm against the upstream test file.
class TestMojiBake < MiniTest::Unit::TestCase
  include MojiBake

  # Prefix tree for the sequences "abc", "abd" and "dbf".
  TEST_TREE = { "a" => { "b" => { "c" => {},
                                  "d" => {} } },
                "d" => { "b" => { "f" => {} } } }

  def setup
    @mapper = Mapper.new
  end

  def test_init_options
    assert_equal( true, Mapper.new.map_iso_8859_1 )
    m = Mapper.new( :map_iso_8859_1 => false )
    assert_equal( false, m.map_iso_8859_1 )
  end

  def test_char_tree
    assert_equal( TEST_TREE,
                  @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
  end

  def test_tree_flatten
    assert_equal( "ab[cd]|dbf",
                  @mapper.tree_flatten( TEST_TREE ) )
  end

  def test_regexp
    re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
    assert_match( re, "abc" )
    assert_match( re, "abd" )
    assert_match( re, "dbf" )

    refute_match( re, "ab" )
    refute_match( re, "abf" )

    assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
    assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
  end

  def test_nomatch_recover
    assert_equal( '', @mapper.recover( '' ) )
    assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
    # A lone LATIN CAPITAL LETTER A WITH CIRCUMFLEX is not a sequence.
    assert_equal( "\u00C2", @mapper.recover( "\u00C2" ) )
  end

  def test_simple_recover
    # DEGREE SIGN, miscoded once.
    assert_equal( "[\u00B0]", @mapper.recover( "[\u00C2\u00B0]" ) )

    # Curly quotes via Windows-1252; byte 0x9D is unassigned there and
    # appears as U+FFFD.
    assert_equal( "\u201Cquoted\u201D",
                  @mapper.recover( "\u00E2\u20AC\u0153quoted\u00E2\u20AC\uFFFD" ) )

    # Opening quote via ISO-8859-1 (C1 controls), closing quote as a
    # Windows-1252/ISO-8859-1 permutation.
    assert_equal( "\u201Cquoted\u201D",
                  @mapper.recover( "\u00E2\u0080\u009Cquoted\u00E2\u20AC\u009D" ) )
  end

  def test_recursive_recover
    # Each input was miscoded twice via Windows-1252.
    assert_equal( "\u00B0",
                  @mapper.recover( "\u00C3\u201A\u00C2\u00B0" ) )
    assert_equal( "AP \u2013 Greenlake",
                  @mapper.recover( "AP \u00C3\u00A2\u00E2\u201A\u00AC\u00E2\u20AC\u0153 Greenlake" ) )
    assert_equal( "you\u2019re",
                  @mapper.recover( "you\u00C3\u00A2\u00E2\u201A\u00AC\u00E2\u201E\u00A2re" ) )
  end

end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mojibake
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-21 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: minitest
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "2.1"
|
25
|
+
- - <
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: "2.4"
|
28
|
+
type: :development
|
29
|
+
version_requirements: *id001
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rjack-tarpit
|
32
|
+
prerelease: false
|
33
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ~>
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 1.3.2
|
39
|
+
type: :development
|
40
|
+
version_requirements: *id002
|
41
|
+
description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
|
42
|
+
bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
|
43
|
+
module provides a mojibake sequence to original character mapping\n\
|
44
|
+
table, and utility to recover mojibake\xE2\x80\x99d text.\n\n\
|
45
|
+
Testing has been with English but other Latin based languages, where\n\
|
46
|
+
Windows-1252 is in the wild, should also benefit."
|
47
|
+
email:
|
48
|
+
- dek-oss@gravitext.com
|
49
|
+
executables:
|
50
|
+
- mojibake
|
51
|
+
extensions: []
|
52
|
+
|
53
|
+
extra_rdoc_files:
|
54
|
+
- Manifest.txt
|
55
|
+
- History.rdoc
|
56
|
+
- README.rdoc
|
57
|
+
files:
|
58
|
+
- History.rdoc
|
59
|
+
- Manifest.txt
|
60
|
+
- README.rdoc
|
61
|
+
- Rakefile
|
62
|
+
- bin/mojibake
|
63
|
+
- lib/mojibake/base.rb
|
64
|
+
- lib/mojibake.rb
|
65
|
+
- lib/mojibake/mapper.rb
|
66
|
+
- test/test.txt
|
67
|
+
- test/test_mojibake.rb
|
68
|
+
has_rdoc: true
|
69
|
+
homepage: http://github.com/dekellum/mojibake
|
70
|
+
licenses: []
|
71
|
+
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options:
|
74
|
+
- --main
|
75
|
+
- README.rdoc
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
none: false
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: "0"
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
requirements: []
|
91
|
+
|
92
|
+
rubyforge_project: mojibake
|
93
|
+
rubygems_version: 1.5.1
|
94
|
+
signing_key:
|
95
|
+
specification_version: 3
|
96
|
+
summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
|
97
|
+
test_files:
|
98
|
+
- test/test_mojibake.rb
|