mojibake 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc ADDED
@@ -0,0 +1,2 @@
1
+ === 1.0.0 (2011-6-21)
2
+ * Initial release.
data/Manifest.txt ADDED
@@ -0,0 +1,10 @@
1
+ History.rdoc
2
+ Manifest.txt
3
+ README.rdoc
4
+ Rakefile
5
+ bin/mojibake
6
+ lib/mojibake/base.rb
7
+ lib/mojibake.rb
8
+ lib/mojibake/mapper.rb
9
+ test/test.txt
10
+ test/test_mojibake.rb
data/README.rdoc ADDED
@@ -0,0 +1,55 @@
1
+ # -*- coding: utf-8 -*-
2
+ = MojiBake
3
+
4
+ * http://github.com/dekellum/mojibake
5
+
6
+ == Description
7
+
8
+ Mojibake occurs in English most frequently due to misinterpreting and
9
+ bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This
10
+ module provides a mapping table from mojibake sequences to original
11
+ characters, and a utility to recover mojibake’d text.
12
+
13
+ Testing has been done with English, but other Latin-based languages, where
14
+ Windows-1252 is in the wild, should also benefit.
15
+
16
+ == Dependencies
17
+
18
+ Requires the String Encoding support of ruby 1.9+ (tested 1.9.2p180
19
+ Linux) or jruby 1.6+ (tested 1.6.2, Linux).
20
+
21
+ == Synopsis
22
+
23
+ gem install mojibake
24
+
25
+ require 'mojibake'
26
+ mapper = MojiBake::Mapper.new
27
+ mapper.recover( '“quotedâ€�' ) #=> '“quoted”'
28
+
29
+ Or via cli:
30
+
31
+ mojibake -h
32
+
33
+ List the mojibake mapping table (output in UTF-8):
34
+
35
+ mojibake -t
36
+
37
+ Recover from a text file:
38
+
39
+ mojibake input.txt
40
+
41
+ == License
42
+
43
+ Copyright (c) 2011 David Kellum
44
+
45
+ Licensed under the Apache License, Version 2.0 (the "License"); you
46
+ may not use this file except in compliance with the License. You
47
+ may obtain a copy of the License at:
48
+
49
+ http://www.apache.org/licenses/LICENSE-2.0
50
+
51
+ Unless required by applicable law or agreed to in writing, software
52
+ distributed under the License is distributed on an "AS IS" BASIS,
53
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
54
+ implied. See the License for the specific language governing
55
+ permissions and limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,35 @@
1
+ # -*- ruby -*-
2
+
3
+ $LOAD_PATH << './lib'
4
+
5
+ require 'rubygems'
6
+ gem 'rjack-tarpit', '~> 1.3.2'
7
+ require 'rjack-tarpit'
8
+
9
+ require 'mojibake/base'
10
+
11
# Build helper, set up with the gem name and the library's own version
# constant.
t = RJack::TarPit.new( 'mojibake', MojiBake::VERSION )

# Gem specification details: author, test library, development-only
# dependencies and project URL.
t.specify do |spec|
  spec.developer( 'David Kellum', 'dek-oss@gravitext.com' )

  spec.testlib = :minitest
  spec.extra_dev_deps += [ [ 'minitest', '>= 2.1', '< 2.4' ] ]

  spec.url = 'http://github.com/dekellum/mojibake'
end

# Version/date consistency checks against History.rdoc.
# NOTE(review): test_line_match comes from rjack-tarpit; presumably it
# checks that the first line matching the first pattern also matches
# the second -- confirm against that gem's documentation.
history = 'History.rdoc'

task( :check_history_version ) do
  t.test_line_match( history, /^==/, / #{ t.version } / )
end

task( :check_history_date ) do
  t.test_line_match( history, /^==/, /\([0-9\-]+\)$/ )
end

# Packaging only needs the version check; tagging and pushing also
# require a release date in the history heading.
[ [ :gem,  [ :check_history_version ] ],
  [ :tag,  [ :check_history_version, :check_history_date ] ],
  [ :push, [ :check_history_version, :check_history_date ] ] ].
  each { |name, prerequisites| task name => prerequisites }

t.define_tasks
data/bin/mojibake ADDED
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ #.hashdot.args.pre = --1.9
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ $LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
22
+
23
+ require 'mojibake'
24
+ require 'optparse'
25
+
26
# Command-line entry point: parses options, then recovers mojibake in
# an optional input file and writes the result to STDOUT.
#
# Option handlers run while OptionParser#parse! scans ARGV, and the
# -t/-r handlers print and exit immediately, so the --no-* switches only
# affect their output when given earlier on the command line.
module Script

  mapper = MojiBake::Mapper.new

  OptionParser.new do |opts|
    opts.banner = "Usage: mojibake [options] [InputFile]"
    opts.on( "-v", "--version", "Display version and exit" ) do
      puts "mojibake: #{MojiBake::VERSION}"
      exit 0 # informational request succeeded
    end
    opts.on( "--no-windows-1252",
             "Don't include miscodings from Windows-1252" ) do
      mapper.map_windows_1252 = false
    end
    opts.on( "--no-iso-8859-1",
             "Don't include miscodings from ISO-8859-1" ) do
      mapper.map_iso_8859_1 = false
    end
    opts.on( "--no-permutations",
             "Don't include ISO/Windows permutations" ) do
      mapper.map_permutations = false
    end
    opts.on_tail( "-t", "--table",
                  "Write MojiBake Mapper table (UTF-8)" ) do
      puts mapper.table
      exit 0
    end
    opts.on_tail( "-r", "--regex",
                  "Display MojiBake Mapper regex (UTF-8) and exit" ) do
      puts mapper.regexp.inspect
      exit 0
    end
    opts.on_tail( "-h", "--help", "Show help and exit" ) do
      puts opts
      puts
      puts( "Recover InputFile and writes to STDOUT." )
      exit 0
    end
  end.parse!

  # First remaining argument, if any, is the file to recover.
  input_file = ARGV.shift
  if input_file
    # File.read (not IO.read): a user-supplied name beginning with "|"
    # must be treated as a path, never as a command to run.
    text = File.read( input_file ).encode( 'UTF-8' )
    $stdout.write( mapper.recover( text ) )
  end

end
data/lib/mojibake.rb ADDED
@@ -0,0 +1,22 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
# Fail fast at load time on interpreters older than 1.9: the library
# relies on String#encode. The version is compared numerically,
# component by component, so "1.10" would sort after "1.9".
if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
  raise "Requires ruby >= 1.9 for String#encode support"
end

require 'mojibake/base'
require 'mojibake/mapper'
@@ -0,0 +1,19 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
# Top-level namespace of the mojibake gem.
module MojiBake
  # Release version of the gem. Frozen so the shared constant cannot be
  # modified in place by callers.
  VERSION = "1.0.0".freeze
end
@@ -0,0 +1,187 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module MojiBake

  # Creates a map from mojibake sequences to recovered/original
  # characters, a Regexp matching those sequences, and uses both to
  # repair text.
  #
  # A mojibake sequence here is what results when the UTF-8 bytes of a
  # character are mis-read as ISO-8859-1 or Windows-1252 (one character
  # per byte) and the result is encoded as UTF-8 again.
  class Mapper

    W252 = Encoding::WINDOWS_1252
    ISO8 = Encoding::ISO_8859_1
    UTF8 = Encoding::UTF_8

    # The 8-bit high-order characters assigned in Windows-1252, as UTF8.
    # This is actually a superset of ISO-8859-1 high order set,
    # including in particular, punctuation characters like EM DASH and
    # RIGHT DOUBLE QUOTATION MARK. These are the most common problem
    # chars in English and probably most latin languages.
    # The five excluded bytes are unassigned in Windows-1252.
    HIGH_ORDER_CHARS =
      ( ( 0x80..0xFF ).to_a - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
      map { |i| i.chr( W252 ).encode( UTF8 ) }.
      sort.
      freeze

    # Additional Unicode codepoints of mojibake potential, like alt
    # whitespace, C1 control characters, and BOMs.
    INTEREST_CODEPOINTS =
      [ (0x0080..0x009F).to_a,  # ISO/Unicode C1 control codes.
        0x00A0,                 # NO-BREAK SPACE
        (0x2000..0x200B).to_a,  # EN QUAD ... ZERO WIDTH SPACE
        0x2060,                 # WORD JOINER
        0xfeff,                 # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
        0xfffd,                 # REPLACEMENT CHARACTER
        0xfffe ].               # NONCHARACTER, byte-swapped (bad) BOM
      flatten.
      sort.
      freeze

    INTEREST_CHARS = INTEREST_CODEPOINTS.map { |c| c.chr( UTF8 ) }.freeze

    # Mojibake candidate characters in reverse; HIGH_ORDER_CHARS and
    # lowest codepoints have highest precedence. (#hash walks this list
    # in order and later entries overwrite earlier ones on a key clash.)
    CANDIDATE_CHARS = ( HIGH_ORDER_CHARS + INTEREST_CHARS ).reverse.freeze

    # Include Windows-1252 transcodes in map (default: true)
    attr_reader :map_windows_1252

    # Include ISO-8859-1 transcodes in map (default: true)
    attr_reader :map_iso_8859_1

    # Include permutations between ISO-8859-1 and Windows-1252
    # (default: true). This covers ambiguities of C1 control codes.
    attr_reader :map_permutations

    # Set map_windows_1252, discarding any previously built hash/regexp
    # so the new setting takes effect.
    def map_windows_1252=( flag )
      @map_windows_1252 = flag
      reset_cache
    end

    # Set map_iso_8859_1, discarding any previously built hash/regexp.
    def map_iso_8859_1=( flag )
      @map_iso_8859_1 = flag
      reset_cache
    end

    # Set map_permutations, discarding any previously built hash/regexp.
    def map_permutations=( flag )
      @map_permutations = flag
      reset_cache
    end

    # Create a new Mapper. Each key of options is applied through the
    # setter of the same name (e.g. :map_iso_8859_1 => false); a key
    # with no matching setter raises NoMethodError.
    def initialize( options = {} )
      @map_windows_1252 = true
      @map_iso_8859_1   = true
      @map_permutations = true

      options.each { |k,v| send( "#{k}=", v ) }
    end

    # Return Hash of mojibake UTF-8 2-3 character sequences to original
    # UTF-8 (recovered) characters. Built on first use and cached until
    # one of the map_* settings is changed.
    #
    # NOTE: this shadows Object#hash and returns a Hash rather than an
    # Integer, so Mapper instances are not suitable as Hash keys.
    def hash
      @hash ||= CANDIDATE_CHARS.each_with_object( {} ) do |c,h|

        # Mis-interpret as ISO-8859-1, and encode back to UTF-8
        moji_8 = c.encode( UTF8, ISO8 )
        h[moji_8] = c if @map_iso_8859_1

        # Mis-interpret as Windows-1252, and encode back to UTF-8.
        # Bytes unassigned in Windows-1252 become U+FFFD.
        moji_w = c.encode( UTF8, W252, :undef => :replace )
        h[moji_w] = c if @map_windows_1252

        next unless @map_permutations

        # Also add permutations of unassigned Windows-1252 chars to
        # the 8bit equivalent: for each U+FFFD position, a variant of
        # the Windows-1252 reading with that one character taken from
        # the ISO-8859-1 reading instead.
        moji_w.each_codepoint.with_index do |cp,i|
          next unless cp == 0xFFFD
          moji_n = moji_w.dup
          moji_n[i] = moji_8[i]
          h[moji_n] = c
        end
      end
    end

    # Return pretty table formatting of hash (array of lines)
    def table
      lines = [ "# -*- coding: utf-8 -*- mojibake: #{MojiBake::VERSION}" ]
      lines << regexp.inspect
      lines << ""
      lines << "Moji\tUNICODE \tOrg\tCODE"
      lines << "+----\t---- ---- ----\t-----\t---+"
      hash.sort.each do |moji,c|
        lines << ( "[%s]\t%s\t[%s]\t%s" %
                   [ moji, codepoints_hex( moji ), c, codepoints_hex( c ) ] )
      end
      lines
    end

    # A Regexp that will match any of the mojibake sequences, as
    # found in hash.keys. Cached alongside the hash.
    def regexp
      @regexp ||= Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
    end

    # Recover original characters from input using regexp. When
    # recursive is true (default), repeats on the result for as long as
    # a pass made the text shorter, which undoes repeated mis-encoding.
    # Returns a new String; input is not modified.
    def recover( input, recursive = true )
      output = input.gsub( regexp ) { |moji| hash[moji] }

      # Only recurse if requested and substituted something (output
      # shorter) in this run.
      if recursive && ( output.length < input.length )
        recover( output, recursive )
      else
        output
      end
    end

    # Build a nested Hash (character => subtree) from the given
    # sequences, so that sequences sharing a prefix share a branch.
    # Leaves are empty hashes.
    def char_tree( seqs )
      seqs.inject( {} ) do |h,seq|
        seq.chars.inject( h ) do |hs,c|
          hs[c] ||= {}
        end
        h
      end
    end

    # Convert a char_tree to regular expression source: alternation
    # between branches, with sibling leaf characters collapsed into a
    # single [...] character class.
    def tree_flatten( tree )
      cs = tree.sort.map do |k,v|
        head = regex_encode( k )
        next head if v.empty?

        tail = tree_flatten( v )
        # A lone character class or a single-branch subtree can follow
        # directly; an alternation needs grouping.
        if tail =~ /^\[.*\]$/ || v.length == 1
          head + tail
        else
          head + '(' + tail + ')'
        end
      end

      if cs.find { |o| o =~ /[()|\[\]]/ }
        cs.join( '|' ).force_encoding( "UTF-8" )
        #FIXME: Join loses encoding so force, jruby bug?
      elsif cs.length > 1
        '[' + cs.inject(:+) + ']'
      else
        cs.first
      end
    end

    # Unicode hex dump of codepoints
    def codepoints_hex( s )
      s.codepoints.map { |i| sprintf( "%04X", i ) }.join( ' ' )
    end

    # Regexp source for the single character c: a \uXXXX escape for the
    # INTEREST_CODEPOINTS (control, whitespace and BOM-like characters),
    # otherwise the character itself, escaped as needed.
    def regex_encode( c )
      # String#ord works whether String#codepoints returns an Enumerator
      # or an Array.
      i = c.ord #only one
      if INTEREST_CODEPOINTS.include?( i )
        sprintf( '\u%04X', i )
      else
        Regexp.escape( c )
      end
    end

    private

    # Drop the memoized hash and regexp; they are rebuilt on next use.
    def reset_cache
      @hash = nil
      @regexp = nil
    end

  end
end
data/test/test.txt ADDED
@@ -0,0 +1,12 @@
1
+ -*- coding: utf-8 -*-
2
+ Source: http://en.wikipedia.org/wiki/Mojibake
3
+
4
+ == English ==
5
+
6
+ Mojibake in English texts generally occurs in punctuation, such as em
7
+ dashes (—), en dashes (–), and curly quotes (“, ”), but rarely in
8
+ character text, since most encodings agree with ASCII on the encoding
9
+ of the English alphabet. For example, the pound sign "£" will appear
10
+ as "Â£" if it was encoded by the sender as UTF-8 but interpreted by
11
+ the recipient as CP1252 or ISO 8859-1. If iterated, this can lead to
12
+ "Ã‚Â£", "Ãƒâ€šÃ‚Â£", etc.
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.args.pre = --1.9
4
+ #.hashdot.profile += jruby-shortlived
5
+
6
+ #--
7
+ # Copyright (c) 2011 David Kellum
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
10
+ # may not use this file except in compliance with the License. You
11
+ # may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
18
+ # implied. See the License for the specific language governing
19
+ # permissions and limitations under the License.
20
+ #++
21
+
22
+ ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
23
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
24
+
25
+ require 'rubygems'
26
+ require 'minitest/unit'
27
+ require 'minitest/autorun'
28
+
29
+ require 'mojibake'
30
+
31
# Unit tests for MojiBake::Mapper: construction options, the
# char_tree/tree_flatten regexp builders, and recover.
# NOTE(review): several recover() input literals below read the same as
# their expected values; the mis-encoded input characters may have been
# normalized in this copy -- verify against the released gem.
+ class TestMojiBake < MiniTest::Unit::TestCase
32
+ include MojiBake
33
+
34
# A fresh default Mapper is assigned to @mapper before each test.
+ def setup
35
+ @mapper = Mapper.new
36
+ end
37
+
38
# Character tree expected for the sequences "abc", "abd" and "dbf".
+ TEST_TREE = { "a" => { "b" => { "c" => {},
39
+ "d" => {} } },
40
+ "d" => { "b" => { "f" => {} } } }
41
+
42
# map_iso_8859_1 is true by default and can be set via the options Hash.
+ def test_init_options
43
+ assert_equal( true, Mapper.new.map_iso_8859_1 )
44
+ m = Mapper.new( :map_iso_8859_1 => false )
45
+ assert_equal( false, m.map_iso_8859_1 )
46
+ end
47
+
48
# char_tree merges sequences that share a prefix into one branch.
+ def test_char_tree
49
+ assert_equal( TEST_TREE,
50
+ @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
51
+ end
52
+
53
# tree_flatten turns sibling leaves into a character class and joins
# top-level branches with '|'.
+ def test_tree_flaten
54
+ assert_equal( "ab[cd]|dbf",
55
+ @mapper.tree_flatten( TEST_TREE ) )
56
+ end
57
+
58
# A Regexp built from the flattened tree matches the complete sequences
# only, not prefixes or other combinations.
+ def test_regexp
59
+ re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
60
+ assert_match( re, "abc" )
61
+ assert_match( re, "abd" )
62
+ assert_match( re, "dbf" )
63
+
64
+ refute_match( re, "ab" )
65
+ refute_match( re, "abf" )
66
+
67
# "abd" is replaced first, leaving the trailing "bf" unmatched.
+ assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
68
+ assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
69
+ end
70
+
71
# Text containing no mojibake sequence is returned unchanged.
+ def test_nomatch_recover
72
+ assert_equal( '', @mapper.recover( '' ) )
73
+ assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
74
+ assert_equal( 'Â', @mapper.recover( 'Â' ) )
75
+ end
76
+
77
# Recovery of singly mis-encoded text.
+ def test_simple_recover
78
+ assert_equal( '[°]', @mapper.recover( '[°]' ) )
79
+ assert_equal( '“quoted”', @mapper.recover( '“quotedâ€�' ) )
80
+ assert_equal( '“quoted”', @mapper.recover( '“quoted”' ) )
81
+ end
82
+
83
# Recovery that, going by the test name, needs more than one pass.
+ def test_recursive_recover
84
+ assert_equal( '°', @mapper.recover( '°' ) )
85
+ assert_equal( 'AP – Greenlake', @mapper.recover( 'AP – Greenlake' ) )
86
+ assert_equal( 'you’re', @mapper.recover( 'you’re' ) )
87
+ end
88
+
89
+ end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mojibake
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: ruby
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-21 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: minitest
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "2.1"
25
+ - - <
26
+ - !ruby/object:Gem::Version
27
+ version: "2.4"
28
+ type: :development
29
+ version_requirements: *id001
30
+ - !ruby/object:Gem::Dependency
31
+ name: rjack-tarpit
32
+ prerelease: false
33
+ requirement: &id002 !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ~>
37
+ - !ruby/object:Gem::Version
38
+ version: 1.3.2
39
+ type: :development
40
+ version_requirements: *id002
41
+ description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
42
+ bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
43
+ module provides a mojibake sequence to original character mapping\n\
44
+ table, and utility to recover mojibake\xE2\x80\x99d text.\n\n\
45
+ Testing has been with English but other Latin based languages, where\n\
46
+ Windows-1252 is in the wild, should also benefit."
47
+ email:
48
+ - dek-oss@gravitext.com
49
+ executables:
50
+ - mojibake
51
+ extensions: []
52
+
53
+ extra_rdoc_files:
54
+ - Manifest.txt
55
+ - History.rdoc
56
+ - README.rdoc
57
+ files:
58
+ - History.rdoc
59
+ - Manifest.txt
60
+ - README.rdoc
61
+ - Rakefile
62
+ - bin/mojibake
63
+ - lib/mojibake/base.rb
64
+ - lib/mojibake.rb
65
+ - lib/mojibake/mapper.rb
66
+ - test/test.txt
67
+ - test/test_mojibake.rb
68
+ has_rdoc: true
69
+ homepage: http://github.com/dekellum/mojibake
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options:
74
+ - --main
75
+ - README.rdoc
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ none: false
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ none: false
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ requirements: []
91
+
92
+ rubyforge_project: mojibake
93
+ rubygems_version: 1.5.1
94
+ signing_key:
95
+ specification_version: 3
96
+ summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
97
+ test_files:
98
+ - test/test_mojibake.rb