mojibake 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-6-21)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,10 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ bin/mojibake
6
+ lib/mojibake/base.rb
7
+ lib/mojibake.rb
8
+ lib/mojibake/mapper.rb
9
+ test/test.txt
10
+ test/test_mojibake.rb
data/README.rdoc ADDED
@@ -0,0 +1,55 @@
1
+ # -*- coding: utf-8 -*-
2
+ = MojiBake
3
+
4
+ * http://github.com/dekellum/mojibake
5
+
6
+ == Description
7
+
8
+ Mojibake occurs in English most frequently due to misinterpreting and
9
+ bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This
10
+ module provides a mojibake sequence to original character mapping
11
+ table, and utility to recover mojibake’d text.
12
+
13
+ Testing has been with English but other Latin based languages, where
14
+ Windows-1252 is in the wild, should also benefit.
15
+
16
+ == Dependencies
17
+
18
+ Requires the String Encoding support of ruby 1.9+ (tested 1.9.2p180
19
+ Linux) or jruby 1.6+ (tested 1.6.2, Linux).
20
+
21
+ == Synopsis
22
+
23
+ gem install mojibake
24
+
25
+ require 'mojibake'
26
+ mapper = MojiBake::Mapper.new
27
+ mapper.recover( '“quotedâ€�' ) #=> '“quoted”'
28
+
29
+ Or via cli:
30
+
31
+ mojibake -h
32
+
33
+ List the mojibake mapping table (output in UTF-8):
34
+
35
+ mojibake -t
36
+
37
+ Recover from a text file:
38
+
39
+ mojibake input.txt
40
+
41
+ == License
42
+
43
+ Copyright (c) 2011 David Kellum
44
+
45
+ Licensed under the Apache License, Version 2.0 (the "License"); you
46
+ may not use this file except in compliance with the License. You
47
+ may obtain a copy of the License at:
48
+
49
+ http://www.apache.org/licenses/LICENSE-2.0
50
+
51
+ Unless required by applicable law or agreed to in writing, software
52
+ distributed under the License is distributed on an "AS IS" BASIS,
53
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
54
+ implied. See the License for the specific language governing
55
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,35 @@
1
+ # -*- ruby -*-
2
+
3
$LOAD_PATH << './lib'

require 'rubygems'
gem 'rjack-tarpit', '~> 1.3.2'
require 'rjack-tarpit'

require 'mojibake/base'

# Gem build definition, driven by rjack-tarpit.
tarpit = RJack::TarPit.new( 'mojibake', MojiBake::VERSION )

tarpit.specify do |spec|
  spec.developer( 'David Kellum', 'dek-oss@gravitext.com' )

  spec.testlib = :minitest
  spec.extra_dev_deps += [ [ 'minitest', '>= 2.1', '< 2.4' ] ]

  spec.url = 'http://github.com/dekellum/mojibake'
end

# Version/date consistency checks against the History.rdoc heading lines.

task :check_history_version do
  tarpit.test_line_match( 'History.rdoc', /^==/, / #{ tarpit.version } / )
end

task :check_history_date do
  tarpit.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
end

# Building a gem only needs the version check; tagging and pushing
# additionally require a release date in the history heading.
release_checks = [ :check_history_version, :check_history_date ]

task :gem  => [ :check_history_version ]
task :tag  => release_checks
task :push => release_checks

tarpit.define_tasks
data/bin/mojibake ADDED
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ #.hashdot.args.pre = --1.9
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
$LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )

require 'mojibake'
require 'optparse'

# Command line front end for MojiBake::Mapper.
#
# Parses the options below, then (when an InputFile argument remains)
# reads that file, recovers mojibake sequences and writes the result
# to STDOUT. The informational options (-v, -t, -r, -h) print and exit
# with status 0, since they complete successfully.
module Script

  mapper = MojiBake::Mapper.new

  # Option handlers run as the options are parsed, so the --no-* flags
  # need to precede -t / -r on the command line to affect their output.
  OptionParser.new do |opts|
    opts.banner = "Usage: mojibake [options] [InputFile]"
    opts.on( "-v", "--version", "Display version and exit" ) do
      puts "mojibake: #{MojiBake::VERSION}"
      exit 0
    end
    opts.on( "--no-windows-1252",
             "Don't include miscodings from Windows-1252" ) do
      mapper.map_windows_1252 = false
    end
    opts.on( "--no-iso-8859-1",
             "Don't include miscodings from ISO-8859-1" ) do
      mapper.map_iso_8859_1 = false
    end
    opts.on( "--no-permutations",
             "Don't include ISO/Windows permutations" ) do
      mapper.map_permutations = false
    end
    opts.on_tail( "-t", "--table",
                  "Write MojiBake Mapper table (UTF-8)" ) do
      puts mapper.table
      exit 0
    end
    opts.on_tail( "-r", "--regex",
                  "Display MojiBake Mapper regex (UTF-8) and exit" ) do
      puts mapper.regexp.inspect
      exit 0
    end
    opts.on_tail( "-h", "--help", "Show help and exit" ) do
      puts opts
      puts
      puts( "Recover InputFile and writes to STDOUT." )
      exit 0
    end
  end.parse!

  input_file = ARGV.shift
  if input_file
    # File.read, not IO.read: IO.read treats a leading "|" in the name
    # as a command to run rather than a path to open.
    text = File.read( input_file ).encode( 'UTF-8' )
    $stdout.write( mapper.recover( text ) )
  end

end
data/lib/mojibake.rb ADDED
@@ -0,0 +1,22 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
# Fail fast at load time on interpreters older than 1.9: compare the
# numeric components of RUBY_VERSION against [1, 9] element by element.
running_version = RUBY_VERSION.split( '.' ).map { |part| part.to_i }
unless ( running_version <=> [ 1, 9 ] ) >= 0
  raise "Requires ruby ~> 1.9 for String.encode support"
end
20
+
21
+ require 'mojibake/base'
22
+ require 'mojibake/mapper'
@@ -0,0 +1,19 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
# Top-level namespace of the mojibake gem.
module MojiBake
  # Released gem version.
  VERSION = '1.0.0'
end
@@ -0,0 +1,187 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module MojiBake

  # Creates a Map from mojibake sequences to recovered/original
  # characters, and recovers text using that map.
  class Mapper

    W252 = Encoding::WINDOWS_1252
    ISO8 = Encoding::ISO_8859_1
    UTF8 = Encoding::UTF_8

    # The 8-bit high-order characters assigned in Windows-1252, as UTF8.
    # This is actually a superset of ISO-8859-1 high order set,
    # including in particular, punctuation characters like EM DASH and
    # RIGHT DOUBLE QUOTATION MARK. These are the most common problem
    # chars in English and probably most latin languages.
    # (0x81, 0x8D, 0x8F, 0x90 and 0x9D are excluded from the range.)
    HIGH_ORDER_CHARS =
      ( ( 0x80..0xFF ).to_a - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
      map { |i| i.chr( W252 ).encode( UTF8 ) }.
      sort

    # Additional Unicode codepoints of mojibake potential, like alt
    # whitespace, C1 control characters, and BOMs.
    INTEREST_CODEPOINTS =
      [ (0x0080..0x009F).to_a,  # ISO/Unicode C1 control codes.
        0x00A0,                 # NO-BREAK SPACE
        (0x2000..0x200B).to_a,  # EN QUAD ... ZERO WIDTH SPACE
        0x2060,                 # WORD JOINER
        0xfeff,                 # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
        0xfffd,                 # REPLACEMENT CHARACTER
        0xfffe ].               # UNASSIGNED, BAD BOM
      flatten.
      sort

    INTEREST_CHARS = INTEREST_CODEPOINTS.map { |c| c.chr( UTF8 ) }

    # Mojibake candidate characters in reverse. The table is built by
    # assigning in this order with later assignments overwriting
    # earlier ones, so HIGH_ORDER_CHARS and lowest codepoints have
    # highest precedence.
    CANDIDATE_CHARS = ( HIGH_ORDER_CHARS + INTEREST_CHARS ).reverse

    # Include Windows-1252 transcodes in map (default: true)
    attr_reader :map_windows_1252

    # Include ISO-8859-1 transcodes in map (default: true)
    attr_reader :map_iso_8859_1

    # Include permutations between ISO-8859-1 and Windows-1252
    # (default: true). This covers ambiguities of C1 control codes.
    attr_reader :map_permutations

    # options:: Hash of attribute name to value, each applied through
    #           the matching writer (e.g. :map_iso_8859_1 => false).
    def initialize( options = {} )
      @map_windows_1252 = true
      @map_iso_8859_1   = true
      @map_permutations = true

      options.each { |k,v| public_send( "#{k}=", v ) }
    end

    # The writers drop the memoized table and regexp, so that a flag
    # changed after first use is not silently ignored.

    def map_windows_1252=( flag )
      @map_windows_1252 = flag
      reset_cache
    end

    def map_iso_8859_1=( flag )
      @map_iso_8859_1 = flag
      reset_cache
    end

    def map_permutations=( flag )
      @map_permutations = flag
      reset_cache
    end

    # Return Hash of mojibake UTF-8 2-3 character sequences to original
    # UTF-8 (recovered) characters. The result is memoized.
    #
    # NOTE: this shadows Object#hash and returns a Hash, so a Mapper
    # should not itself be used as a Hash key.
    def hash
      @hash ||= CANDIDATE_CHARS.each_with_object( {} ) do |c,h|

        # Mis-interpret as ISO-8859-1, and encode back to UTF-8
        moji_8 = c.encode( UTF8, ISO8 )
        h[moji_8] = c if @map_iso_8859_1

        # Mis-interpret as Windows-1252, and encode back to UTF-8.
        # Bytes unassigned in Windows-1252 become U+FFFD.
        moji_w = c.encode( UTF8, W252, :undef => :replace )
        h[moji_w] = c if @map_windows_1252

        next unless @map_permutations

        # Also add permutations of unassigned Windows-1252 chars to
        # the 8bit equivalent: one extra key per U+FFFD position, with
        # that position taken from the ISO-8859-1 reading instead.
        moji_w.each_codepoint.each_with_index do |cp,i|
          next unless cp == 0xFFFD
          moji_n = moji_w.dup
          moji_n[i] = moji_8[i]
          h[moji_n] = c
        end
      end
    end

    # Return pretty table formatting of hash (array of lines)
    def table
      lines = [ "# -*- coding: utf-8 -*- mojibake: #{MojiBake::VERSION}" ]
      lines << regexp.inspect
      lines << ""
      lines << "Moji\tUNICODE \tOrg\tCODE"
      lines << "+----\t---- ---- ----\t-----\t---+"
      rows = hash.sort.map do |moji,c|
        "[%s]\t%s\t[%s]\t%s" %
          [ moji, codepoints_hex( moji ), c, codepoints_hex( c ) ]
      end
      lines + rows
    end

    # A Regexp that will match any of the mojibake sequences, as
    # found in hash.keys. When the table is empty (all mappings
    # disabled) a pattern that never matches is returned.
    def regexp
      @regexp ||= if hash.empty?
                    Regexp.union
                  else
                    Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
                  end
    end

    # Recover original characters from input using regexp.
    # input::     String to recover (not modified).
    # recursive:: When true (default), repeat for as long as a pass
    #             made the text shorter, i.e. substituted something.
    # Returns the recovered String.
    def recover( input, recursive = true )
      loop do
        output = input.gsub( regexp ) { |moji| hash[moji] }
        return output unless recursive && ( output.length < input.length )
        input = output
      end
    end

    # Build a nested Hash (character => sub-tree) from the given
    # sequences, sharing common prefixes. Leaves are empty Hashes.
    def char_tree( seqs )
      seqs.each_with_object( {} ) do |seq,tree|
        seq.chars.inject( tree ) { |node,c| node[c] ||= {} }
      end
    end

    # Flatten a char_tree to regular expression source: sibling
    # single characters become a [class], anything more complex is
    # joined as alternatives, grouped in parentheses where needed.
    def tree_flatten( tree )
      cs = tree.sort.map do |k,v|
        o = regex_encode( k )
        next o if v.empty?

        c = tree_flatten( v )
        if c =~ /^\[.*\]$/ || v.length == 1
          o + c
        else
          o + '(' + c + ')'
        end
      end

      if cs.find { |o| o =~ /[()|\[\]]/ }
        cs.join( '|' ).force_encoding( "UTF-8" )
        #FIXME: Join loses encoding so force, jruby bug?
      elsif cs.length > 1
        '[' + cs.inject(:+) + ']'
      else
        cs.first
      end
    end

    # Unicode hex dump of codepoints
    def codepoints_hex( s )
      s.codepoints.map { |i| sprintf( "%04X", i ) }.join( ' ' )
    end

    # Regexp source for the single character c: a \uXXXX escape for
    # INTEREST_CODEPOINTS, otherwise the escaped literal.
    def regex_encode( c )
      i = c.ord # only one character; works whether or not
                # String#codepoints returns an Array
      if INTEREST_CODEPOINTS.include?( i )
        sprintf( '\u%04X', i )
      else
        Regexp.escape( c )
      end
    end

    private

    # Forget the memoized table and regexp.
    def reset_cache
      @hash = nil
      @regexp = nil
    end

  end
end
data/test/test.txt ADDED
@@ -0,0 +1,12 @@
1
+ -*- coding: utf-8 -*-
2
+ Source: http://en.wikipedia.org/wiki/Mojibake
3
+
4
+ == English ==
5
+
6
+ Mojibake in English texts generally occurs in punctuation, such as em
7
+ dashes (—), en dashes (–), and curly quotes (“, ”), but rarely in
8
+ character text, since most encodings agree with ASCII on the encoding
9
+ of the English alphabet. For example, the pound sign "£" will appear
10
+ as "£" if it was encoded by the sender as UTF-8 but interpreted by
11
+ the recipient as CP1252 or ISO 8859-1. If iterated, this can lead to
12
+ "£", "£", etc.
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.args.pre = --1.9
4
+ #.hashdot.profile += jruby-shortlived
5
+
6
+ #--
7
+ # Copyright (c) 2011 David Kellum
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
10
+ # may not use this file except in compliance with the License. You
11
+ # may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
18
+ # implied. See the License for the specific language governing
19
+ # permissions and limitations under the License.
20
+ #++
21
+
22
# Put the sibling lib directory first on the load path, once only.
lib_dir = File.join( File.dirname( __FILE__ ), "..", "lib" )
unless $LOAD_PATH.include?( lib_dir )
  $LOAD_PATH.unshift( lib_dir )
end
24
+
25
+ require 'rubygems'
26
+ require 'minitest/unit'
27
+ require 'minitest/autorun'
28
+
29
+ require 'mojibake'
30
+
31
# Unit tests for MojiBake::Mapper: option handling, the character tree
# and regexp construction helpers, and recover().
# NOTE(review): the non-ASCII string literals below look as if they were
# re-encoded once more by whatever produced this listing; confirm them
# against the upstream test file before relying on the expected values.
+ class TestMojiBake < MiniTest::Unit::TestCase
32
+ include MojiBake
33
+
34
# A fresh Mapper with default options for every test.
+ def setup
35
+ @mapper = Mapper.new
36
+ end
37
+
38
# Expected char_tree for the sequences "abc", "abd" and "dbf".
+ TEST_TREE = { "a" => { "b" => { "c" => {},
39
+ "d" => {} } },
40
+ "d" => { "b" => { "f" => {} } } }
41
+
42
# Options given to Mapper.new are reflected by the attribute readers.
+ def test_init_options
43
+ assert_equal( true, Mapper.new.map_iso_8859_1 )
44
+ m = Mapper.new( :map_iso_8859_1 => false )
45
+ assert_equal( false, m.map_iso_8859_1 )
46
+ end
47
+
48
# Sequences sharing a prefix share tree nodes.
+ def test_char_tree
49
+ assert_equal( TEST_TREE,
50
+ @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
51
+ end
52
+
53
# Sibling leaves collapse to a character class; branches are joined
# as alternatives.
+ def test_tree_flaten
54
+ assert_equal( "ab[cd]|dbf",
55
+ @mapper.tree_flatten( TEST_TREE ) )
56
+ end
57
+
58
# The flattened source compiles to a Regexp matching exactly the
# original sequences.
+ def test_regexp
59
+ re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
60
+ assert_match( re, "abc" )
61
+ assert_match( re, "abd" )
62
+ assert_match( re, "dbf" )
63
+
64
+ refute_match( re, "ab" )
65
+ refute_match( re, "abf" )
66
+
67
+ assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
68
+ assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
69
+ end
70
+
71
# Input without mojibake sequences is returned unchanged.
+ def test_nomatch_recover
72
+ assert_equal( '', @mapper.recover( '' ) )
73
+ assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
74
+ assert_equal( 'Â', @mapper.recover( 'Â' ) )
75
+ end
76
+
77
# Single-level mojibake is recovered.
+ def test_simple_recover
78
+ assert_equal( '[°]', @mapper.recover( '[°]' ) )
79
+ assert_equal( '“quoted”', @mapper.recover( '“quotedâ€�' ) )
80
+ assert_equal( '“quoted”', @mapper.recover( '“quoted”' ) )
81
+ end
82
+
83
# Repeatedly mis-encoded text is recovered by the recursive passes.
+ def test_recursive_recover
84
+ assert_equal( '°', @mapper.recover( '°' ) )
85
+ assert_equal( 'AP – Greenlake', @mapper.recover( 'AP – Greenlake' ) )
86
+ assert_equal( 'you’re', @mapper.recover( 'you’re' ) )
87
+ end
88
+
89
+ end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mojibake
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: ruby
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-21 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: minitest
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "2.1"
25
+ - - <
26
+ - !ruby/object:Gem::Version
27
+ version: "2.4"
28
+ type: :development
29
+ version_requirements: *id001
30
+ - !ruby/object:Gem::Dependency
31
+ name: rjack-tarpit
32
+ prerelease: false
33
+ requirement: &id002 !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ~>
37
+ - !ruby/object:Gem::Version
38
+ version: 1.3.2
39
+ type: :development
40
+ version_requirements: *id002
41
+ description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
42
+ bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
43
+ module provides a mojibake sequence to original character mapping\n\
44
+ table, and utility to recover mojibake\xE2\x80\x99d text.\n\n\
45
+ Testing has been with English but other Latin based languages, where\n\
46
+ Windows-1252 is in the wild, should also benefit."
47
+ email:
48
+ - dek-oss@gravitext.com
49
+ executables:
50
+ - mojibake
51
+ extensions: []
52
+
53
+ extra_rdoc_files:
54
+ - Manifest.txt
55
+ - History.rdoc
56
+ - README.rdoc
57
+ files:
58
+ - History.rdoc
59
+ - Manifest.txt
60
+ - README.rdoc
61
+ - Rakefile
62
+ - bin/mojibake
63
+ - lib/mojibake/base.rb
64
+ - lib/mojibake.rb
65
+ - lib/mojibake/mapper.rb
66
+ - test/test.txt
67
+ - test/test_mojibake.rb
68
+ has_rdoc: true
69
+ homepage: http://github.com/dekellum/mojibake
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options:
74
+ - --main
75
+ - README.rdoc
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ requirements: []
91
+
92
+ rubyforge_project: mojibake
93
+ rubygems_version: 1.5.1
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
97
+ test_files:
98
+ - test/test_mojibake.rb