mojibake 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +10 -0
- data/README.rdoc +55 -0
- data/Rakefile +35 -0
- data/bin/mojibake +71 -0
- data/lib/mojibake.rb +22 -0
- data/lib/mojibake/base.rb +19 -0
- data/lib/mojibake/mapper.rb +187 -0
- data/test/test.txt +12 -0
- data/test/test_mojibake.rb +89 -0
- metadata +98 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
= MojiBake
|
3
|
+
|
4
|
+
* http://github.com/dekellum/mojibake
|
5
|
+
|
6
|
+
== Description
|
7
|
+
|
8
|
+
Mojibake occurs in English most frequently due to misinterpreting and
|
9
|
+
bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This
|
10
|
+
module provides a mojibake sequence to original character mapping
|
11
|
+
table, and a utility to recover mojibake’d text.
|
12
|
+
|
13
|
+
Testing has been done with English, but other Latin-based languages, where
|
14
|
+
Windows-1252 is in the wild, should also benefit.
|
15
|
+
|
16
|
+
== Dependencies
|
17
|
+
|
18
|
+
Requires the String Encoding support of Ruby 1.9+ (tested 1.9.2p180
|
19
|
+
Linux) or JRuby 1.6+ (tested 1.6.2, Linux).
|
20
|
+
|
21
|
+
== Synopsis
|
22
|
+
|
23
|
+
gem install mojibake
|
24
|
+
|
25
|
+
require 'mojibake'
|
26
|
+
mapper = MojiBake::Mapper.new
|
27
|
+
mapper.recover( '“quotedâ€�' ) #=> '“quoted”'
|
28
|
+
|
29
|
+
Or via cli:
|
30
|
+
|
31
|
+
mojibake -h
|
32
|
+
|
33
|
+
List the mojibake mapping table (output in UTF-8):
|
34
|
+
|
35
|
+
mojibake -t
|
36
|
+
|
37
|
+
Recover from a text file:
|
38
|
+
|
39
|
+
mojibake input.txt
|
40
|
+
|
41
|
+
== License
|
42
|
+
|
43
|
+
Copyright (c) 2011 David Kellum
|
44
|
+
|
45
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
46
|
+
may not use this file except in compliance with the License. You
|
47
|
+
may obtain a copy of the License at:
|
48
|
+
|
49
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
50
|
+
|
51
|
+
Unless required by applicable law or agreed to in writing, software
|
52
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
53
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
54
|
+
implied. See the License for the specific language governing
|
55
|
+
permissions and limitations under the License.
|
data/Rakefile
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
$LOAD_PATH << './lib'
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
gem 'rjack-tarpit', '~> 1.3.2'
|
7
|
+
require 'rjack-tarpit'
|
8
|
+
|
9
|
+
require 'mojibake/base'
|
10
|
+
|
11
|
+
t = RJack::TarPit.new( 'mojibake', MojiBake::VERSION )
|
12
|
+
|
13
|
+
t.specify do |h|
|
14
|
+
h.developer( 'David Kellum', 'dek-oss@gravitext.com' )
|
15
|
+
|
16
|
+
h.testlib = :minitest
|
17
|
+
h.extra_dev_deps += [ [ 'minitest', '>= 2.1', '< 2.4' ] ]
|
18
|
+
|
19
|
+
h.url = 'http://github.com/dekellum/mojibake'
|
20
|
+
end
|
21
|
+
|
22
|
+
# Version/date consistency checks:
|
23
|
+
|
24
|
+
task :check_history_version do
|
25
|
+
t.test_line_match( 'History.rdoc', /^==/, / #{ t.version } / )
|
26
|
+
end
|
27
|
+
task :check_history_date do
|
28
|
+
t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
|
29
|
+
end
|
30
|
+
|
31
|
+
task :gem => [ :check_history_version ]
|
32
|
+
task :tag => [ :check_history_version, :check_history_date ]
|
33
|
+
task :push => [ :check_history_version, :check_history_date ]
|
34
|
+
|
35
|
+
t.define_tasks
|
data/bin/mojibake
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#.hashdot.args.pre = --1.9
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
$LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
|
22
|
+
|
23
|
+
require 'mojibake'
|
24
|
+
require 'optparse'
|
25
|
+
|
26
|
+
# Command line driver: parses options into a MojiBake::Mapper, then
# either prints mapper details (version/table/regex/help) and exits,
# or recovers the named input file to STDOUT.
module Script

  mapper = MojiBake::Mapper.new

  OptionParser.new do |opts|
    opts.banner = "Usage: mojibake [options] [InputFile]"

    # Informational options succeed, so they exit with status 0.
    opts.on( "-v", "--version", "Display version and exit" ) do
      puts "mojibake: #{MojiBake::VERSION}"
      exit 0
    end

    # The --no-* switches only take effect for actions (-t, -r, file
    # recovery) that are handled after them on the command line.
    opts.on( "--no-windows-1252",
             "Don't include miscodings from Windows-1252" ) do
      mapper.map_windows_1252 = false
    end
    opts.on( "--no-iso-8859-1",
             "Don't include miscodings from ISO-8859-1" ) do
      mapper.map_iso_8859_1 = false
    end
    opts.on( "--no-permutations",
             "Don't include ISO/Windows permutations" ) do
      mapper.map_permutations = false
    end

    opts.on_tail( "-t", "--table",
                  "Write MojiBake Mapper table (UTF-8)" ) do
      puts mapper.table
      exit 0
    end
    opts.on_tail( "-r", "--regex",
                  "Display MojiBake Mapper regex (UTF-8) and exit" ) do
      puts mapper.regexp.inspect
      exit 0
    end
    opts.on_tail( "-h", "--help", "Show help and exit" ) do
      puts opts
      puts
      puts( "Recover InputFile and writes to STDOUT." )
      exit 0
    end
  end.parse!

  input_file = ARGV.shift
  if input_file
    # File.read (not IO.read) so that an argument starting with "|" is
    # always treated as a file name and never run as a command.
    text = File.read( input_file ).encode( 'UTF-8' )
    $stdout.write( mapper.recover( text ) )
  end

end
|
data/lib/mojibake.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
|
18
|
+
raise "Requires ruby ~> 1.9 for String.encode support"
|
19
|
+
end
|
20
|
+
|
21
|
+
require 'mojibake/base'
|
22
|
+
require 'mojibake/mapper'
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module MojiBake
  # Gem version string; frozen so that the shared constant cannot be
  # mutated in place by a caller.
  VERSION = "1.0.0".freeze
end
|
@@ -0,0 +1,187 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module MojiBake
|
18
|
+
|
19
|
+
# Creates a Map from mojibake sequences to recovered/original
|
20
|
+
# characters.
|
21
|
+
class Mapper

  W252 = Encoding::WINDOWS_1252
  ISO8 = Encoding::ISO_8859_1
  UTF8 = Encoding::UTF_8

  # The 8-bit high-order characters assigned in Windows-1252, as UTF8.
  # This is actually a superset of ISO-8859-1 high order set,
  # including in particular, punctuation characters like EM DASH and
  # RIGHT DOUBLE QUOTATION MARK. These are the most common problem
  # chars in English and probably most latin languages.
  # The five excluded bytes are unassigned in Windows-1252.
  HIGH_ORDER_CHARS =
    ( ( 0x80..0xFF ).to_a - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
    map { |i| i.chr( W252 ).encode( UTF8 ) }.
    sort.
    freeze

  # Additional Unicode codepoints of mojibake potential, like alt
  # whitespace, C1 control characters, and BOMs.
  INTEREST_CODEPOINTS =
    [ (0x0080..0x009F).to_a,  # ISO/Unicode C1 control codes.
      0x00A0,                 # NO-BREAK SPACE
      (0x2000..0x200B).to_a,  # EN QUAD ... ZERO WIDTH SPACE
      0x2060,                 # WORD JOINER
      0xfeff,                 # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
      0xfffd,                 # REPLACEMENT CHARACTER
      0xfffe ].               # NONCHARACTER, byte-swapped (bad) BOM
    flatten.
    sort.
    freeze

  INTEREST_CHARS = INTEREST_CODEPOINTS.map { |c| c.chr( UTF8 ) }.freeze

  # Mojibake candidate characters in reverse; HIGH_ORDER_CHARS and
  # lowest codepoints have highest precedence. (#hash walks this list
  # in order and later entries overwrite earlier ones on key collision.)
  CANDIDATE_CHARS = ( HIGH_ORDER_CHARS + INTEREST_CHARS ).reverse.freeze

  # Include Windows-1252 transcodes in map (default: true)
  attr_reader :map_windows_1252

  # Include ISO-8859-1 transcodes in map (default: true)
  attr_reader :map_iso_8859_1

  # Include permutations between ISO-8859-1 and Windows-1252
  # (default: true). This covers ambiguities of C1 control codes.
  attr_reader :map_permutations

  # Writers drop the memoized hash/regexp so that a flag changed after
  # first use is reflected in subsequent results.
  def map_windows_1252=( flag )
    @map_windows_1252 = flag
    reset_cache
  end

  def map_iso_8859_1=( flag )
    @map_iso_8859_1 = flag
    reset_cache
  end

  def map_permutations=( flag )
    @map_permutations = flag
    reset_cache
  end

  # Create a mapper with all mappings enabled, then apply options: a
  # Hash of attribute name (Symbol or String) to value, each assigned
  # through the matching public writer, e.g. :map_iso_8859_1 => false.
  def initialize( options = {} )
    @map_windows_1252 = true
    @map_iso_8859_1   = true
    @map_permutations = true

    options.each { |k, v| public_send( "#{k}=", v ) }
  end

  # Return Hash of mojibake UTF-8 2-3 character sequences to original
  # UTF-8 (recovered) characters. Memoized until a map_* flag changes.
  #
  # Note: this deliberately-named accessor replaces Object#hash, so
  # Mapper instances are not suitable as Hash keys.
  def hash
    @hash ||= CANDIDATE_CHARS.each_with_object( {} ) do |c, h|

      # Mis-interpret as ISO-8859-1, and encode back to UTF-8
      moji_8 = c.encode( UTF8, ISO8 )
      h[moji_8] = c if @map_iso_8859_1

      # Mis-interpret as Windows-1252, and encode back to UTF-8.
      # Bytes unassigned in Windows-1252 become U+FFFD.
      moji_w = c.encode( UTF8, W252, :undef => :replace )
      h[moji_w] = c if @map_windows_1252

      next unless @map_permutations

      # Also add permutations of unassigned Windows-1252 chars to
      # the 8bit equivalent: each U+FFFD position is swapped for the
      # ISO-8859-1 character at the same index.
      moji_w.each_codepoint.with_index do |cp, i|
        next unless cp == 0xFFFD
        moji_n = moji_w.dup
        moji_n[i] = moji_8[i]
        h[moji_n] = c
      end
    end
  end

  # Return pretty table formatting of hash (array of lines)
  def table
    header = [ "# -*- coding: utf-8 -*- mojibake: #{MojiBake::VERSION}",
               regexp.inspect,
               "",
               "Moji\tUNICODE \tOrg\tCODE",
               "+----\t---- ---- ----\t-----\t---+" ]
    rows = hash.sort.map do |moji, c|
      "[%s]\t%s\t[%s]\t%s" %
        [ moji, codepoints_hex( moji ), c, codepoints_hex( c ) ]
    end
    header + rows
  end

  # A Regexp that will match any of the mojibake sequences, as
  # found in hash.keys. Memoized until a map_* flag changes.
  def regexp
    @regexp ||= Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
  end

  # Recover original characters from input using regexp, recursively.
  # Returns a new String; input is not modified.
  def recover( input, recursive = true )
    output = input.gsub( regexp ) { |moji| hash[moji] }

    # Only recurse if requested and substituted something (output
    # shorter) in this run. Every substitution replaces 2-3 characters
    # with one, so an unchanged length means nothing matched.
    if recursive && ( output.length < input.length )
      recover( output )
    else
      output
    end
  end

  # Build a nested Hash (prefix tree) from an Array of sequences: each
  # character keys a Hash of the characters that may follow it, with
  # an empty Hash marking the end of a sequence.
  def char_tree( seqs )
    seqs.inject( {} ) do |h, seq|
      seq.chars.inject( h ) do |hs, c|
        hs[c] ||= {}
      end
      h
    end
  end

  # Flatten a char_tree into regular expression source text. Sibling
  # leaves become a character class ("[cd]"), other siblings become an
  # alternation ("ab|cd"), wrapped in parentheses when nested under a
  # parent that has more to match. Returns nil for an empty tree.
  def tree_flatten( tree )
    branches = tree.sort.map do |k, v|
      o = regex_encode( k )
      unless v.empty?
        sub = tree_flatten( v )
        # A single child chain or a character class needs no grouping;
        # only a true alternation has to be parenthesized.
        if v.length == 1 || leaves_only?( v )
          o << sub
        else
          o << '(' + sub + ')'
        end
      end
      o
    end

    # Decide by tree structure, not by scanning the generated text:
    # a character class is only valid when every branch is exactly one
    # (encoded) character, i.e. every child is a leaf.
    if leaves_only?( tree )
      if branches.length > 1
        '[' + branches.inject(:+) + ']'
      else
        branches.first
      end
    else
      branches.join( '|' ).force_encoding( "UTF-8" )
      #FIXME: Join loses encoding so force, jruby bug?
    end
  end

  # Unicode hex dump of codepoints
  def codepoints_hex( s )
    s.codepoints.map { |i| sprintf( "%04X", i ) }.join( ' ' )
  end

  # Encode a single character for use in regexp source: codepoints
  # listed in INTEREST_CODEPOINTS are written as \uXXXX escapes, all
  # others are passed through Regexp.escape.
  def regex_encode( c )
    i = c.ord # only one character; works for Enumerator or Array #codepoints
    if INTEREST_CODEPOINTS.include?( i )
      sprintf( '\u%04X', i )
    else
      Regexp.escape( c )
    end
  end

  private

  # True if no child of tree has children of its own.
  def leaves_only?( tree )
    tree.values.all? { |sub| sub.empty? }
  end

  # Forget memoized results derived from the map_* flags.
  def reset_cache
    @hash = nil
    @regexp = nil
  end

end
|
187
|
+
end
|
data/test/test.txt
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
-*- coding: utf-8 -*-
|
2
|
+
Source: http://en.wikipedia.org/wiki/Mojibake
|
3
|
+
|
4
|
+
== English ==
|
5
|
+
|
6
|
+
Mojibake in English texts generally occurs in punctuation, such as em
|
7
|
+
dashes (—), en dashes (–), and curly quotes (“, ”), but rarely in
|
8
|
+
character text, since most encodings agree with ASCII on the encoding
|
9
|
+
of the English alphabet. For example, the pound sign "£" will appear
|
10
|
+
as "£" if it was encoded by the sender as UTF-8 but interpreted by
|
11
|
+
the recipient as CP1252 or ISO 8859-1. If iterated, this can lead to
|
12
|
+
"£", "£", etc.
|
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.args.pre = --1.9
|
4
|
+
#.hashdot.profile += jruby-shortlived
|
5
|
+
|
6
|
+
#--
|
7
|
+
# Copyright (c) 2011 David Kellum
|
8
|
+
#
|
9
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
10
|
+
# may not use this file except in compliance with the License. You
|
11
|
+
# may obtain a copy of the License at
|
12
|
+
#
|
13
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14
|
+
#
|
15
|
+
# Unless required by applicable law or agreed to in writing, software
|
16
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
18
|
+
# implied. See the License for the specific language governing
|
19
|
+
# permissions and limitations under the License.
|
20
|
+
#++
|
21
|
+
|
22
|
+
ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
|
23
|
+
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'minitest/unit'
|
27
|
+
require 'minitest/autorun'
|
28
|
+
|
29
|
+
require 'mojibake'
|
30
|
+
|
31
|
+
class TestMojiBake < MiniTest::Unit::TestCase
|
32
|
+
include MojiBake
|
33
|
+
|
34
|
+
def setup
|
35
|
+
@mapper = Mapper.new
|
36
|
+
end
|
37
|
+
|
38
|
+
TEST_TREE = { "a" => { "b" => { "c" => {},
|
39
|
+
"d" => {} } },
|
40
|
+
"d" => { "b" => { "f" => {} } } }
|
41
|
+
|
42
|
+
def test_init_options
|
43
|
+
assert_equal( true, Mapper.new.map_iso_8859_1 )
|
44
|
+
m = Mapper.new( :map_iso_8859_1 => false )
|
45
|
+
assert_equal( false, m.map_iso_8859_1 )
|
46
|
+
end
|
47
|
+
|
48
|
+
def test_char_tree
|
49
|
+
assert_equal( TEST_TREE,
|
50
|
+
@mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_tree_flaten
|
54
|
+
assert_equal( "ab[cd]|dbf",
|
55
|
+
@mapper.tree_flatten( TEST_TREE ) )
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_regexp
|
59
|
+
re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
|
60
|
+
assert_match( re, "abc" )
|
61
|
+
assert_match( re, "abd" )
|
62
|
+
assert_match( re, "dbf" )
|
63
|
+
|
64
|
+
refute_match( re, "ab" )
|
65
|
+
refute_match( re, "abf" )
|
66
|
+
|
67
|
+
assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
|
68
|
+
assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_nomatch_recover
|
72
|
+
assert_equal( '', @mapper.recover( '' ) )
|
73
|
+
assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
|
74
|
+
assert_equal( 'Â', @mapper.recover( 'Â' ) )
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_simple_recover
|
78
|
+
assert_equal( '[°]', @mapper.recover( '[°]' ) )
|
79
|
+
assert_equal( '“quoted”', @mapper.recover( '“quotedâ€�' ) )
|
80
|
+
assert_equal( '“quoted”', @mapper.recover( 'âquotedâ€' ) )
|
81
|
+
end
|
82
|
+
|
83
|
+
def test_recursive_recover
|
84
|
+
assert_equal( '°', @mapper.recover( '°' ) )
|
85
|
+
assert_equal( 'AP – Greenlake', @mapper.recover( 'AP – Greenlake' ) )
|
86
|
+
assert_equal( 'you’re', @mapper.recover( 'you’re' ) )
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mojibake
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-21 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: minitest
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "2.1"
|
25
|
+
- - <
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: "2.4"
|
28
|
+
type: :development
|
29
|
+
version_requirements: *id001
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rjack-tarpit
|
32
|
+
prerelease: false
|
33
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ~>
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 1.3.2
|
39
|
+
type: :development
|
40
|
+
version_requirements: *id002
|
41
|
+
description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
|
42
|
+
bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
|
43
|
+
module provides a mojibake sequence to original character mapping\n\
|
44
|
+
table, and utility to recover mojibake\xE2\x80\x99d text.\n\n\
|
45
|
+
Testing has been with English but other Latin based languages, where\n\
|
46
|
+
Windows-1252 is in the wild, should also benefit."
|
47
|
+
email:
|
48
|
+
- dek-oss@gravitext.com
|
49
|
+
executables:
|
50
|
+
- mojibake
|
51
|
+
extensions: []
|
52
|
+
|
53
|
+
extra_rdoc_files:
|
54
|
+
- Manifest.txt
|
55
|
+
- History.rdoc
|
56
|
+
- README.rdoc
|
57
|
+
files:
|
58
|
+
- History.rdoc
|
59
|
+
- Manifest.txt
|
60
|
+
- README.rdoc
|
61
|
+
- Rakefile
|
62
|
+
- bin/mojibake
|
63
|
+
- lib/mojibake/base.rb
|
64
|
+
- lib/mojibake.rb
|
65
|
+
- lib/mojibake/mapper.rb
|
66
|
+
- test/test.txt
|
67
|
+
- test/test_mojibake.rb
|
68
|
+
has_rdoc: true
|
69
|
+
homepage: http://github.com/dekellum/mojibake
|
70
|
+
licenses: []
|
71
|
+
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options:
|
74
|
+
- --main
|
75
|
+
- README.rdoc
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
none: false
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: "0"
|
84
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
|
+
none: false
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
requirements: []
|
91
|
+
|
92
|
+
rubyforge_project: mojibake
|
93
|
+
rubygems_version: 1.5.1
|
94
|
+
signing_key:
|
95
|
+
specification_version: 3
|
96
|
+
summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
|
97
|
+
test_files:
|
98
|
+
- test/test_mojibake.rb
|