mojibake 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
File without changes
@@ -1,2 +1,9 @@
1
+ === 1.1.0 (2011-10-31)
2
+ * Add table.json output support as a more convenient language
3
+ independent format. Include default version (as well as table.txt)
4
+ as part of gem.
5
+ * Add Ruby 1.8 support for Mapper.recover by using default included
6
+ table.json (avoids need for 1.9 encoding support.)
7
+
1
8
  === 1.0.0 (2011-6-21)
2
9
  * Initial release.
@@ -3,8 +3,12 @@ Manifest.txt
3
3
  README.rdoc
4
4
  Rakefile
5
5
  bin/mojibake
6
+ config/table.json
7
+ config/table.txt
6
8
  lib/mojibake/base.rb
7
9
  lib/mojibake.rb
8
- lib/mojibake/mapper.rb
10
+ lib/mojibake/encoding.rb
11
+ lib/mojibake/json.rb
9
12
  test/test.txt
10
- test/test_mojibake.rb
13
+ test/test_encoding.rb
14
+ test/test_mapper.rb
@@ -15,8 +15,10 @@ Windows-1252 is in the wild, should also benefit.
15
15
 
16
16
  == Dependencies
17
17
 
18
- Requires the String Encoding support of ruby 1.9+ (tested 1.9.2p180
19
- Linux) or jruby 1.6+ (tested 1.6.2, Linux).
18
+ Requires the String Encoding support in ruby 1.9 as provided by:
19
+
20
+ * ruby 1.9.2+ (tested 1.9.2p180, Linux)
21
+ * jruby 1.6.5+ (tested 1.6.5, Linux)
20
22
 
21
23
  == Synopsis
22
24
 
data/Rakefile CHANGED
@@ -3,7 +3,7 @@
3
3
  $LOAD_PATH << './lib'
4
4
 
5
5
  require 'rubygems'
6
- gem 'rjack-tarpit', '~> 1.3.2'
6
+ gem 'rjack-tarpit', '~> 1.4'
7
7
  require 'rjack-tarpit'
8
8
 
9
9
  require 'mojibake/base'
@@ -14,7 +14,8 @@ t.specify do |h|
14
14
  h.developer( 'David Kellum', 'dek-oss@gravitext.com' )
15
15
 
16
16
  h.testlib = :minitest
17
- h.extra_dev_deps += [ [ 'minitest', '>= 2.1', '< 2.4' ] ]
17
+ h.extra_deps += [ [ 'json', '~> 1.6.1' ] ]
18
+ h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ] ]
18
19
 
19
20
  h.url = 'http://github.com/dekellum/mojibake'
20
21
  end
@@ -33,3 +34,15 @@ task :tag => [ :check_history_version, :check_history_date ]
33
34
  task :push => [ :check_history_version, :check_history_date ]
34
35
 
35
36
  t.define_tasks
37
+
38
+ desc "(Re-)generate config output files (requires 1.9)"
39
+ task :generate_config do
40
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
41
+ require 'mojibake'
42
+ mapper = MojiBake::Mapper.new
43
+ open( "config/table.txt", 'w' ) { |fout| fout.puts( mapper.table ) }
44
+ open( "config/table.json", 'w' ) { |fout| fout.puts( mapper.json ) }
45
+ else
46
+ raise "Task generate_config requires Ruby 1.9 encoding support"
47
+ end
48
+ end
@@ -20,6 +20,8 @@
20
20
 
21
21
  $LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
22
22
 
23
+ require 'rubygems'
24
+
23
25
  require 'mojibake'
24
26
  require 'optparse'
25
27
 
@@ -33,23 +35,36 @@ module Script
33
35
  puts "mojibake: #{MojiBake::VERSION}"
34
36
  exit 1
35
37
  end
36
- opts.on( "--no-windows-1252",
37
- "Don't include miscodings from Windows-1252" ) do
38
- mapper.map_windows_1252 = false
39
- end
40
- opts.on( "--no-iso-8859-1",
41
- "Don't include miscodings from ISO-8859-1" ) do
42
- mapper.map_iso_8859_1 = false
43
- end
44
- opts.on( "--no-permutations",
45
- "Don't include ISO/Windows permutations" ) do
46
- mapper.map_permutations = false
47
- end
48
- opts.on_tail( "-t", "--table",
49
- "Write MojiBake Mapper table (UTF-8)" ) do
50
- puts mapper.table
51
- exit 1
38
+
39
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
40
+
41
+ opts.on( "--no-windows-1252",
42
+ "Don't include miscodings from Windows-1252" ) do
43
+ mapper.map_windows_1252 = false
44
+ end
45
+ opts.on( "--no-iso-8859-1",
46
+ "Don't include miscodings from ISO-8859-1" ) do
47
+ mapper.map_iso_8859_1 = false
48
+ end
49
+ opts.on( "--no-permutations",
50
+ "Don't include ISO/Windows permutations" ) do
51
+ mapper.map_permutations = false
52
+ end
53
+ opts.on_tail( "-t", "--table",
54
+ "Write MojiBake Mapper table (UTF-8) and exit" ) do
55
+ puts mapper.table
56
+ exit 1
57
+ end
58
+ opts.on_tail( "-j", "--json",
59
+ "Write MojiBake Mapper json (UTF-8) and exit" ) do
60
+ require 'rubygems'
61
+ require 'mojibake/json'
62
+ puts mapper.json
63
+ exit 1
64
+ end
65
+
52
66
  end
67
+
53
68
  opts.on_tail( "-r", "--regex",
54
69
  "Display MojiBake Mapper regex (UTF-8) and exit" ) do
55
70
  puts mapper.regexp.inspect
@@ -65,7 +80,8 @@ module Script
65
80
 
66
81
  input_file = ARGV.shift
67
82
  if input_file
68
- $stdout.write( mapper.recover( IO.read( input_file ).encode( 'UTF-8' ) ) )
83
+ data = IO.read( input_file )
84
+ $stdout.write( mapper.recover( data ) )
69
85
  end
70
86
 
71
87
  end
@@ -0,0 +1,270 @@
1
+ {
2
+ "mojibake": "1.1.0",
3
+ "url": "https://github.com/dekellum/mojibake",
4
+ "regexp": "Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])",
5
+ "moji": {
6
+ "Â\u0080": "\u0080",
7
+ "Â\u0081": "\u0081",
8
+ "Â\u0082": "\u0082",
9
+ "Â\u0083": "\u0083",
10
+ "Â\u0084": "\u0084",
11
+ "Â\u0085": "\u0085",
12
+ "Â\u0086": "\u0086",
13
+ "Â\u0087": "\u0087",
14
+ "Â\u0088": "\u0088",
15
+ "Â\u0089": "\u0089",
16
+ "Â\u008A": "\u008A",
17
+ "Â\u008B": "\u008B",
18
+ "Â\u008C": "\u008C",
19
+ "Â\u008D": "\u008D",
20
+ "Â\u008E": "\u008E",
21
+ "Â\u008F": "\u008F",
22
+ "Â\u0090": "\u0090",
23
+ "Â\u0091": "\u0091",
24
+ "Â\u0092": "\u0092",
25
+ "Â\u0093": "\u0093",
26
+ "Â\u0094": "\u0094",
27
+ "Â\u0095": "\u0095",
28
+ "Â\u0096": "\u0096",
29
+ "Â\u0097": "\u0097",
30
+ "Â\u0098": "\u0098",
31
+ "Â\u0099": "\u0099",
32
+ "Â\u009A": "\u009A",
33
+ "Â\u009B": "\u009B",
34
+ "Â\u009C": "\u009C",
35
+ "Â\u009D": "\u009D",
36
+ "Â\u009E": "\u009E",
37
+ "Â\u009F": "\u009F",
38
+ "Â\u00A0": "\u00A0",
39
+ "¡": "¡",
40
+ "¢": "¢",
41
+ "£": "£",
42
+ "¤": "¤",
43
+ "Â¥": "¥",
44
+ "¦": "¦",
45
+ "§": "§",
46
+ "¨": "¨",
47
+ "©": "©",
48
+ "ª": "ª",
49
+ "«": "«",
50
+ "¬": "¬",
51
+ "­": "­",
52
+ "®": "®",
53
+ "¯": "¯",
54
+ "°": "°",
55
+ "±": "±",
56
+ "²": "²",
57
+ "³": "³",
58
+ "´": "´",
59
+ "µ": "µ",
60
+ "¶": "¶",
61
+ "·": "·",
62
+ "¸": "¸",
63
+ "¹": "¹",
64
+ "º": "º",
65
+ "»": "»",
66
+ "¼": "¼",
67
+ "½": "½",
68
+ "¾": "¾",
69
+ "¿": "¿",
70
+ "Œ": "\u008C",
71
+ "œ": "\u009C",
72
+ "Š": "\u008A",
73
+ "š": "\u009A",
74
+ "Ÿ": "\u009F",
75
+ "ÂŽ": "\u008E",
76
+ "ž": "\u009E",
77
+ "ƒ": "\u0083",
78
+ "ˆ": "\u0088",
79
+ "˜": "\u0098",
80
+ "–": "\u0096",
81
+ "—": "\u0097",
82
+ "‘": "\u0091",
83
+ "Â’": "\u0092",
84
+ "‚": "\u0082",
85
+ "“": "\u0093",
86
+ "”": "\u0094",
87
+ "„": "\u0084",
88
+ "†": "\u0086",
89
+ "‡": "\u0087",
90
+ "•": "\u0095",
91
+ "Â…": "\u0085",
92
+ "‰": "\u0089",
93
+ "‹": "\u008B",
94
+ "›": "\u009B",
95
+ "€": "\u0080",
96
+ "™": "\u0099",
97
+ "Â\uFFFD": "\u0081",
98
+ "Ã\u0080": "À",
99
+ "Ã\u0081": "Á",
100
+ "Ã\u0082": "Â",
101
+ "Ã\u0083": "Ã",
102
+ "Ã\u0084": "Ä",
103
+ "Ã\u0085": "Å",
104
+ "Ã\u0086": "Æ",
105
+ "Ã\u0087": "Ç",
106
+ "Ã\u0088": "È",
107
+ "Ã\u0089": "É",
108
+ "Ã\u008A": "Ê",
109
+ "Ã\u008B": "Ë",
110
+ "Ã\u008C": "Ì",
111
+ "Ã\u008D": "Í",
112
+ "Ã\u008E": "Î",
113
+ "Ã\u008F": "Ï",
114
+ "Ã\u0090": "Ð",
115
+ "Ã\u0091": "Ñ",
116
+ "Ã\u0092": "Ò",
117
+ "Ã\u0093": "Ó",
118
+ "Ã\u0094": "Ô",
119
+ "Ã\u0095": "Õ",
120
+ "Ã\u0096": "Ö",
121
+ "Ã\u0097": "×",
122
+ "Ã\u0098": "Ø",
123
+ "Ã\u0099": "Ù",
124
+ "Ã\u009A": "Ú",
125
+ "Ã\u009B": "Û",
126
+ "Ã\u009C": "Ü",
127
+ "Ã\u009D": "Ý",
128
+ "Ã\u009E": "Þ",
129
+ "Ã\u009F": "ß",
130
+ "Ã\u00A0": "à",
131
+ "á": "á",
132
+ "â": "â",
133
+ "ã": "ã",
134
+ "ä": "ä",
135
+ "Ã¥": "å",
136
+ "æ": "æ",
137
+ "ç": "ç",
138
+ "è": "è",
139
+ "é": "é",
140
+ "ê": "ê",
141
+ "ë": "ë",
142
+ "ì": "ì",
143
+ "í": "í",
144
+ "î": "î",
145
+ "ï": "ï",
146
+ "ð": "ð",
147
+ "ñ": "ñ",
148
+ "ò": "ò",
149
+ "ó": "ó",
150
+ "ô": "ô",
151
+ "õ": "õ",
152
+ "ö": "ö",
153
+ "÷": "÷",
154
+ "ø": "ø",
155
+ "ù": "ù",
156
+ "ú": "ú",
157
+ "û": "û",
158
+ "ü": "ü",
159
+ "ý": "ý",
160
+ "þ": "þ",
161
+ "ÿ": "ÿ",
162
+ "ÃŒ": "Ì",
163
+ "Ãœ": "Ü",
164
+ "Ê": "Ê",
165
+ "Ú": "Ú",
166
+ "ß": "ß",
167
+ "ÃŽ": "Î",
168
+ "Þ": "Þ",
169
+ "Ã": "Ã",
170
+ "È": "È",
171
+ "Ø": "Ø",
172
+ "Ö": "Ö",
173
+ "×": "×",
174
+ "Ñ": "Ñ",
175
+ "Ã’": "Ò",
176
+ "Â": "Â",
177
+ "Ó": "Ó",
178
+ "Ô": "Ô",
179
+ "Ä": "Ä",
180
+ "Æ": "Æ",
181
+ "Ç": "Ç",
182
+ "Õ": "Õ",
183
+ "Ã…": "Å",
184
+ "É": "É",
185
+ "Ë": "Ë",
186
+ "Û": "Û",
187
+ "À": "À",
188
+ "Ù": "Ù",
189
+ "Ã\uFFFD": "Á",
190
+ "Å\u0092": "Œ",
191
+ "Å\u0093": "œ",
192
+ "Å\u00A0": "Š",
193
+ "Å¡": "š",
194
+ "Ÿ": "Ÿ",
195
+ "Ž": "Ž",
196
+ "ž": "ž",
197
+ "Å’": "Œ",
198
+ "Å“": "œ",
199
+ "Æ\u0092": "ƒ",
200
+ "Æ’": "ƒ",
201
+ "Ë\u0086": "ˆ",
202
+ "Ë\u009C": "˜",
203
+ "Ëœ": "˜",
204
+ "ˆ": "ˆ",
205
+ "â\u0080\u0080": "\u2000",
206
+ "â\u0080\u0081": "\u2001",
207
+ "â\u0080\u0082": "\u2002",
208
+ "â\u0080\u0083": "\u2003",
209
+ "â\u0080\u0084": "\u2004",
210
+ "â\u0080\u0085": "\u2005",
211
+ "â\u0080\u0086": "\u2006",
212
+ "â\u0080\u0087": "\u2007",
213
+ "â\u0080\u0088": "\u2008",
214
+ "â\u0080\u0089": "\u2009",
215
+ "â\u0080\u008A": "\u200A",
216
+ "â\u0080\u008B": "\u200B",
217
+ "â\u0080\u0093": "–",
218
+ "â\u0080\u0094": "—",
219
+ "â\u0080\u0098": "‘",
220
+ "â\u0080\u0099": "’",
221
+ "â\u0080\u009A": "‚",
222
+ "â\u0080\u009C": "“",
223
+ "â\u0080\u009D": "”",
224
+ "â\u0080\u009E": "„",
225
+ "â\u0080\u00A0": "†",
226
+ "â\u0080¡": "‡",
227
+ "â\u0080¢": "•",
228
+ "â\u0080¦": "…",
229
+ "â\u0080°": "‰",
230
+ "â\u0080¹": "‹",
231
+ "â\u0080º": "›",
232
+ "â\u0081\u00A0": "\u2060",
233
+ "â\u0082¬": "€",
234
+ "â\u0084¢": "™",
235
+ "€": "€",
236
+ "â„¢": "™",
237
+ "â€\u0081": "\u2001",
238
+ "â€\u009D": "”",
239
+ "â€\u00A0": "†",
240
+ "‡": "‡",
241
+ "•": "•",
242
+ "…": "…",
243
+ "‰": "‰",
244
+ "‹": "‹",
245
+ "›": "›",
246
+ "“": "“",
247
+ " ": "\u200A",
248
+ "‚": "‚",
249
+ "„": "„",
250
+ " ": "\u2003",
251
+ " ": "\u2008",
252
+ "‘": "‘",
253
+ " ": "\u2002",
254
+ "–": "–",
255
+ "—": "—",
256
+ " ": "\u2004",
257
+ " ": "\u2006",
258
+ " ": "\u2007",
259
+ " ": "\u2005",
260
+ " ": "\u2009",
261
+ "​": "\u200B",
262
+ " ": "\u2000",
263
+ "’": "’",
264
+ "â€\uFFFD": "”",
265
+ "â\uFFFD\u00A0": "\u2060",
266
+ "": "\uFEFF",
267
+ "�": "\uFFFD",
268
+ "￾": "\uFFFE"
269
+ }
270
+ }
@@ -0,0 +1,268 @@
1
+ # -*- coding: utf-8 -*- mojibake: 1.1.0
2
+ /Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])/
3
+
4
+ Moji UNICODE Org CODE
5
+ +---- ---- ---- ---- ----- ---+
6
+ [€] 00C2 0080 [€] 0080
7
+ [] 00C2 0081 [] 0081
8
+ [‚] 00C2 0082 [‚] 0082
9
+ [ƒ] 00C2 0083 [ƒ] 0083
10
+ [„] 00C2 0084 [„] 0084
11
+ […] 00C2 0085 […] 0085
12
+ [†] 00C2 0086 [†] 0086
13
+ [‡] 00C2 0087 [‡] 0087
14
+ [ˆ] 00C2 0088 [ˆ] 0088
15
+ [‰] 00C2 0089 [‰] 0089
16
+ [Š] 00C2 008A [Š] 008A
17
+ [‹] 00C2 008B [‹] 008B
18
+ [Œ] 00C2 008C [Œ] 008C
19
+ [] 00C2 008D [] 008D
20
+ [Ž] 00C2 008E [Ž] 008E
21
+ [] 00C2 008F [] 008F
22
+ [] 00C2 0090 [] 0090
23
+ [‘] 00C2 0091 [‘] 0091
24
+ [’] 00C2 0092 [’] 0092
25
+ [“] 00C2 0093 [“] 0093
26
+ [”] 00C2 0094 [”] 0094
27
+ [•] 00C2 0095 [•] 0095
28
+ [–] 00C2 0096 [–] 0096
29
+ [—] 00C2 0097 [—] 0097
30
+ [˜] 00C2 0098 [˜] 0098
31
+ [™] 00C2 0099 [™] 0099
32
+ [š] 00C2 009A [š] 009A
33
+ [›] 00C2 009B [›] 009B
34
+ [œ] 00C2 009C [œ] 009C
35
+ [] 00C2 009D [] 009D
36
+ [ž] 00C2 009E [ž] 009E
37
+ [Ÿ] 00C2 009F [Ÿ] 009F
38
+ [ ] 00C2 00A0 [ ] 00A0
39
+ [¡] 00C2 00A1 [¡] 00A1
40
+ [¢] 00C2 00A2 [¢] 00A2
41
+ [£] 00C2 00A3 [£] 00A3
42
+ [¤] 00C2 00A4 [¤] 00A4
43
+ [Â¥] 00C2 00A5 [¥] 00A5
44
+ [¦] 00C2 00A6 [¦] 00A6
45
+ [§] 00C2 00A7 [§] 00A7
46
+ [¨] 00C2 00A8 [¨] 00A8
47
+ [©] 00C2 00A9 [©] 00A9
48
+ [ª] 00C2 00AA [ª] 00AA
49
+ [«] 00C2 00AB [«] 00AB
50
+ [¬] 00C2 00AC [¬] 00AC
51
+ [­] 00C2 00AD [­] 00AD
52
+ [®] 00C2 00AE [®] 00AE
53
+ [¯] 00C2 00AF [¯] 00AF
54
+ [°] 00C2 00B0 [°] 00B0
55
+ [±] 00C2 00B1 [±] 00B1
56
+ [²] 00C2 00B2 [²] 00B2
57
+ [³] 00C2 00B3 [³] 00B3
58
+ [´] 00C2 00B4 [´] 00B4
59
+ [µ] 00C2 00B5 [µ] 00B5
60
+ [¶] 00C2 00B6 [¶] 00B6
61
+ [·] 00C2 00B7 [·] 00B7
62
+ [¸] 00C2 00B8 [¸] 00B8
63
+ [¹] 00C2 00B9 [¹] 00B9
64
+ [º] 00C2 00BA [º] 00BA
65
+ [»] 00C2 00BB [»] 00BB
66
+ [¼] 00C2 00BC [¼] 00BC
67
+ [½] 00C2 00BD [½] 00BD
68
+ [¾] 00C2 00BE [¾] 00BE
69
+ [¿] 00C2 00BF [¿] 00BF
70
+ [ÂŒ] 00C2 0152 [Œ] 008C
71
+ [Âœ] 00C2 0153 [œ] 009C
72
+ [Š] 00C2 0160 [Š] 008A
73
+ [š] 00C2 0161 [š] 009A
74
+ [Ÿ] 00C2 0178 [Ÿ] 009F
75
+ [ÂŽ] 00C2 017D [Ž] 008E
76
+ [ž] 00C2 017E [ž] 009E
77
+ [ƒ] 00C2 0192 [ƒ] 0083
78
+ [ˆ] 00C2 02C6 [ˆ] 0088
79
+ [˜] 00C2 02DC [˜] 0098
80
+ [–] 00C2 2013 [–] 0096
81
+ [—] 00C2 2014 [—] 0097
82
+ [‘] 00C2 2018 [‘] 0091
83
+ [Â’] 00C2 2019 [’] 0092
84
+ [‚] 00C2 201A [‚] 0082
85
+ [“] 00C2 201C [“] 0093
86
+ [”] 00C2 201D [”] 0094
87
+ [„] 00C2 201E [„] 0084
88
+ [†] 00C2 2020 [†] 0086
89
+ [‡] 00C2 2021 [‡] 0087
90
+ [•] 00C2 2022 [•] 0095
91
+ [Â…] 00C2 2026 […] 0085
92
+ [‰] 00C2 2030 [‰] 0089
93
+ [‹] 00C2 2039 [‹] 008B
94
+ [›] 00C2 203A [›] 009B
95
+ [€] 00C2 20AC [€] 0080
96
+ [™] 00C2 2122 [™] 0099
97
+ [Â�] 00C2 FFFD [] 0081
98
+ [À] 00C3 0080 [À] 00C0
99
+ [Á] 00C3 0081 [Á] 00C1
100
+ [Â] 00C3 0082 [Â] 00C2
101
+ [Ã] 00C3 0083 [Ã] 00C3
102
+ [Ä] 00C3 0084 [Ä] 00C4
103
+ [Å] 00C3 0085 [Å] 00C5
104
+ [Æ] 00C3 0086 [Æ] 00C6
105
+ [Ç] 00C3 0087 [Ç] 00C7
106
+ [È] 00C3 0088 [È] 00C8
107
+ [É] 00C3 0089 [É] 00C9
108
+ [Ê] 00C3 008A [Ê] 00CA
109
+ [Ë] 00C3 008B [Ë] 00CB
110
+ [Ì] 00C3 008C [Ì] 00CC
111
+ [Í] 00C3 008D [Í] 00CD
112
+ [Î] 00C3 008E [Î] 00CE
113
+ [Ï] 00C3 008F [Ï] 00CF
114
+ [Ð] 00C3 0090 [Ð] 00D0
115
+ [Ñ] 00C3 0091 [Ñ] 00D1
116
+ [Ò] 00C3 0092 [Ò] 00D2
117
+ [Ó] 00C3 0093 [Ó] 00D3
118
+ [Ô] 00C3 0094 [Ô] 00D4
119
+ [Õ] 00C3 0095 [Õ] 00D5
120
+ [Ö] 00C3 0096 [Ö] 00D6
121
+ [×] 00C3 0097 [×] 00D7
122
+ [Ø] 00C3 0098 [Ø] 00D8
123
+ [Ù] 00C3 0099 [Ù] 00D9
124
+ [Ú] 00C3 009A [Ú] 00DA
125
+ [Û] 00C3 009B [Û] 00DB
126
+ [Ü] 00C3 009C [Ü] 00DC
127
+ [Ý] 00C3 009D [Ý] 00DD
128
+ [Þ] 00C3 009E [Þ] 00DE
129
+ [ß] 00C3 009F [ß] 00DF
130
+ [à] 00C3 00A0 [à] 00E0
131
+ [á] 00C3 00A1 [á] 00E1
132
+ [â] 00C3 00A2 [â] 00E2
133
+ [ã] 00C3 00A3 [ã] 00E3
134
+ [ä] 00C3 00A4 [ä] 00E4
135
+ [Ã¥] 00C3 00A5 [å] 00E5
136
+ [æ] 00C3 00A6 [æ] 00E6
137
+ [ç] 00C3 00A7 [ç] 00E7
138
+ [è] 00C3 00A8 [è] 00E8
139
+ [é] 00C3 00A9 [é] 00E9
140
+ [ê] 00C3 00AA [ê] 00EA
141
+ [ë] 00C3 00AB [ë] 00EB
142
+ [ì] 00C3 00AC [ì] 00EC
143
+ [í] 00C3 00AD [í] 00ED
144
+ [î] 00C3 00AE [î] 00EE
145
+ [ï] 00C3 00AF [ï] 00EF
146
+ [ð] 00C3 00B0 [ð] 00F0
147
+ [ñ] 00C3 00B1 [ñ] 00F1
148
+ [ò] 00C3 00B2 [ò] 00F2
149
+ [ó] 00C3 00B3 [ó] 00F3
150
+ [ô] 00C3 00B4 [ô] 00F4
151
+ [õ] 00C3 00B5 [õ] 00F5
152
+ [ö] 00C3 00B6 [ö] 00F6
153
+ [÷] 00C3 00B7 [÷] 00F7
154
+ [ø] 00C3 00B8 [ø] 00F8
155
+ [ù] 00C3 00B9 [ù] 00F9
156
+ [ú] 00C3 00BA [ú] 00FA
157
+ [û] 00C3 00BB [û] 00FB
158
+ [ü] 00C3 00BC [ü] 00FC
159
+ [ý] 00C3 00BD [ý] 00FD
160
+ [þ] 00C3 00BE [þ] 00FE
161
+ [ÿ] 00C3 00BF [ÿ] 00FF
162
+ [ÃŒ] 00C3 0152 [Ì] 00CC
163
+ [Ãœ] 00C3 0153 [Ü] 00DC
164
+ [Ê] 00C3 0160 [Ê] 00CA
165
+ [Ú] 00C3 0161 [Ú] 00DA
166
+ [ß] 00C3 0178 [ß] 00DF
167
+ [ÃŽ] 00C3 017D [Î] 00CE
168
+ [Þ] 00C3 017E [Þ] 00DE
169
+ [Ã] 00C3 0192 [Ã] 00C3
170
+ [È] 00C3 02C6 [È] 00C8
171
+ [Ø] 00C3 02DC [Ø] 00D8
172
+ [Ö] 00C3 2013 [Ö] 00D6
173
+ [×] 00C3 2014 [×] 00D7
174
+ [Ñ] 00C3 2018 [Ñ] 00D1
175
+ [Ã’] 00C3 2019 [Ò] 00D2
176
+ [Â] 00C3 201A [Â] 00C2
177
+ [Ó] 00C3 201C [Ó] 00D3
178
+ [Ô] 00C3 201D [Ô] 00D4
179
+ [Ä] 00C3 201E [Ä] 00C4
180
+ [Æ] 00C3 2020 [Æ] 00C6
181
+ [Ç] 00C3 2021 [Ç] 00C7
182
+ [Õ] 00C3 2022 [Õ] 00D5
183
+ [Ã…] 00C3 2026 [Å] 00C5
184
+ [É] 00C3 2030 [É] 00C9
185
+ [Ë] 00C3 2039 [Ë] 00CB
186
+ [Û] 00C3 203A [Û] 00DB
187
+ [À] 00C3 20AC [À] 00C0
188
+ [Ù] 00C3 2122 [Ù] 00D9
189
+ [Ã�] 00C3 FFFD [Á] 00C1
190
+ [Œ] 00C5 0092 [Œ] 0152
191
+ [œ] 00C5 0093 [œ] 0153
192
+ [Å ] 00C5 00A0 [Š] 0160
193
+ [Å¡] 00C5 00A1 [š] 0161
194
+ [Ÿ] 00C5 00B8 [Ÿ] 0178
195
+ [Ž] 00C5 00BD [Ž] 017D
196
+ [ž] 00C5 00BE [ž] 017E
197
+ [Å’] 00C5 2019 [Œ] 0152
198
+ [Å“] 00C5 201C [œ] 0153
199
+ [ƒ] 00C6 0092 [ƒ] 0192
200
+ [Æ’] 00C6 2019 [ƒ] 0192
201
+ [ˆ] 00CB 0086 [ˆ] 02C6
202
+ [˜] 00CB 009C [˜] 02DC
203
+ [Ëœ] 00CB 0153 [˜] 02DC
204
+ [ˆ] 00CB 2020 [ˆ] 02C6
205
+ [ ] 00E2 0080 0080 [ ] 2000
206
+ [ ] 00E2 0080 0081 [ ] 2001
207
+ [ ] 00E2 0080 0082 [ ] 2002
208
+ [ ] 00E2 0080 0083 [ ] 2003
209
+ [ ] 00E2 0080 0084 [ ] 2004
210
+ [ ] 00E2 0080 0085 [ ] 2005
211
+ [ ] 00E2 0080 0086 [ ] 2006
212
+ [ ] 00E2 0080 0087 [ ] 2007
213
+ [ ] 00E2 0080 0088 [ ] 2008
214
+ [ ] 00E2 0080 0089 [ ] 2009
215
+ [ ] 00E2 0080 008A [ ] 200A
216
+ [​] 00E2 0080 008B [​] 200B
217
+ [–] 00E2 0080 0093 [–] 2013
218
+ [—] 00E2 0080 0094 [—] 2014
219
+ [‘] 00E2 0080 0098 [‘] 2018
220
+ [’] 00E2 0080 0099 [’] 2019
221
+ [‚] 00E2 0080 009A [‚] 201A
222
+ [“] 00E2 0080 009C [“] 201C
223
+ [”] 00E2 0080 009D [”] 201D
224
+ [„] 00E2 0080 009E [„] 201E
225
+ [†] 00E2 0080 00A0 [†] 2020
226
+ [‡] 00E2 0080 00A1 [‡] 2021
227
+ [•] 00E2 0080 00A2 [•] 2022
228
+ […] 00E2 0080 00A6 […] 2026
229
+ [‰] 00E2 0080 00B0 [‰] 2030
230
+ [‹] 00E2 0080 00B9 [‹] 2039
231
+ [›] 00E2 0080 00BA [›] 203A
232
+ [⁠] 00E2 0081 00A0 [⁠] 2060
233
+ [€] 00E2 0082 00AC [€] 20AC
234
+ [™] 00E2 0084 00A2 [™] 2122
235
+ [€] 00E2 201A 00AC [€] 20AC
236
+ [â„¢] 00E2 201E 00A2 [™] 2122
237
+ [ ] 00E2 20AC 0081 [ ] 2001
238
+ [”] 00E2 20AC 009D [”] 201D
239
+ [†] 00E2 20AC 00A0 [†] 2020
240
+ [‡] 00E2 20AC 00A1 [‡] 2021
241
+ [•] 00E2 20AC 00A2 [•] 2022
242
+ […] 00E2 20AC 00A6 […] 2026
243
+ [‰] 00E2 20AC 00B0 [‰] 2030
244
+ [‹] 00E2 20AC 00B9 [‹] 2039
245
+ [›] 00E2 20AC 00BA [›] 203A
246
+ [“] 00E2 20AC 0153 [“] 201C
247
+ [ ] 00E2 20AC 0160 [ ] 200A
248
+ [‚] 00E2 20AC 0161 [‚] 201A
249
+ [„] 00E2 20AC 017E [„] 201E
250
+ [ ] 00E2 20AC 0192 [ ] 2003
251
+ [ ] 00E2 20AC 02C6 [ ] 2008
252
+ [‘] 00E2 20AC 02DC [‘] 2018
253
+ [ ] 00E2 20AC 201A [ ] 2002
254
+ [–] 00E2 20AC 201C [–] 2013
255
+ [—] 00E2 20AC 201D [—] 2014
256
+ [ ] 00E2 20AC 201E [ ] 2004
257
+ [ ] 00E2 20AC 2020 [ ] 2006
258
+ [ ] 00E2 20AC 2021 [ ] 2007
259
+ [ ] 00E2 20AC 2026 [ ] 2005
260
+ [ ] 00E2 20AC 2030 [ ] 2009
261
+ [​] 00E2 20AC 2039 [​] 200B
262
+ [ ] 00E2 20AC 20AC [ ] 2000
263
+ [’] 00E2 20AC 2122 [’] 2019
264
+ [â€�] 00E2 20AC FFFD [”] 201D
265
+ [â� ] 00E2 FFFD 00A0 [⁠] 2060
266
+ [] 00EF 00BB 00BF [] FEFF
267
+ [�] 00EF 00BF 00BD [�] FFFD
268
+ [￾] 00EF 00BF 00BE [￾] FFFE
@@ -14,9 +14,38 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
18
- raise "Requires ruby ~> 1.9 for String.encode support"
19
- end
20
-
21
17
  require 'mojibake/base'
22
- require 'mojibake/mapper'
18
+
19
+ require 'mojibake/json'
20
+
21
+ module MojiBake
22
+
23
+ # Supports recovering Mojibake characters to the original text.
24
+ class Mapper
25
+ include JSONSupport
26
+
27
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
28
+ require 'mojibake/encoding'
29
+ include EncodingSupport
30
+ end
31
+
32
+ def initialize( opts = {} )
33
+ super()
34
+ opts.map { |k,v| send( k.to_s + '=', v ) }
35
+ end
36
+
37
+ # Recover original characters from input using regexp, recursively.
38
+ def recover( input, recursive = true )
39
+ output = input.gsub( regexp ) { |moji| hash[moji] }
40
+
41
+ # Only recurse if requested and substituted something (output
42
+ # shorter) in this run.
43
+ if recursive && ( output.length < input.length )
44
+ recover( output )
45
+ else
46
+ output
47
+ end
48
+ end
49
+
50
+ end
51
+ end
@@ -15,5 +15,5 @@
15
15
  #++
16
16
 
17
17
  module MojiBake
18
- VERSION = "1.0.0"
18
+ VERSION = "1.1.0"
19
19
  end
@@ -16,9 +16,9 @@
16
16
 
17
17
  module MojiBake
18
18
 
19
- # Creates a Map from mojibake sequences to recovered/original
20
- # characters.
21
- class Mapper
19
+ # Mixin for the actual (ruby 1.9 backed) encoding support to define
20
+ # the mojibake mapping table and regex.
21
+ module EncodingSupport
22
22
 
23
23
  W252 = Encoding::WINDOWS_1252
24
24
  ISO8 = Encoding::ISO_8859_1
@@ -30,20 +30,21 @@ module MojiBake
30
30
  # RIGHT DOUBLE QUOTATION MARK. These are the most common problem
31
31
  # chars in English and probably most latin languages.
32
32
  HIGH_ORDER_CHARS =
33
- ( ( 0x80..0xFF ).to_a - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
33
+ ( Array( 0x80..0xFF ) - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
34
34
  map { |i| i.chr( W252 ).encode( UTF8 ) }.
35
35
  sort
36
36
 
37
37
  # Additional Unicode codepoints of mojibake potential, like alt
38
38
  # whitespace, C1 control characters, and BOMs.
39
39
  INTEREST_CODEPOINTS =
40
- [ (0x0080..0x009F).to_a, # ISO/Unicode C1 control codes.
41
- 0x00A0, # NO-BREAK SPACE
42
- (0x2000..0x200B).to_a, # EN QUAD ... ZERO WIDTH SPACE
43
- 0x2060, # WORD JOINER
44
- 0xfeff, # ZERO WIDTH SPACE, BYTE-ORDER-MARK (BOM)
45
- 0xfffd, # REPLACEMENT CHARACTER
46
- 0xfffe ]. # UNASSIGNED, BAD BOM
40
+ [ 0x0080..0x009F, # ISO/Unicode C1 control codes.
41
+ 0x00A0, # NO-BREAK SPACE
42
+ 0x2000..0x200B, # EN QUAD ... ZERO WIDTH SPACE
43
+ 0x2060, # WORD JOINER
44
+ 0xfeff, # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
45
+ 0xfffd, # REPLACEMENT CHARACTER
46
+ 0xfffe ]. # UNASSIGNED, BAD BOM
47
+ map { |i| Array( i ) }.
47
48
  flatten.
48
49
  sort
49
50
 
@@ -63,12 +64,11 @@ module MojiBake
63
64
  # (default: true). This covers ambiguities of C1 control codes.
64
65
  attr_accessor :map_permutations
65
66
 
66
- def initialize( options = {} )
67
+ def initialize
68
+ super
67
69
  @map_windows_1252 = true
68
70
  @map_iso_8859_1 = true
69
71
  @map_permutations = true
70
-
71
- options.map { |k,v| send( k.to_s + '=', v ) }
72
72
  end
73
73
 
74
74
  # Return Hash of mojibake UTF-8 2-3 character sequences to original
@@ -122,19 +122,6 @@ module MojiBake
122
122
  @regexp ||= Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
123
123
  end
124
124
 
125
- # Recover original characters from input using regexp, recursively.
126
- def recover( input, recursive = true )
127
- output = input.gsub( regexp ) { |moji| hash[moji] }
128
-
129
- # Only recurse if requested and substituted something (output
130
- # shorter) in this run.
131
- if recursive && ( output.length < input.length )
132
- recover( output )
133
- else
134
- output
135
- end
136
- end
137
-
138
125
  def char_tree( seqs )
139
126
  seqs.inject( {} ) do |h,seq|
140
127
  seq.chars.inject( h ) do |hs,c|
@@ -158,8 +145,7 @@ module MojiBake
158
145
  o
159
146
  end
160
147
  if cs.find { |o| o =~ /[()|\[\]]/ }
161
- cs.join( '|' ).force_encoding( "UTF-8" )
162
- #FIXME: Join looses encoding so force, jruby bug?
148
+ cs.join( '|' )
163
149
  else
164
150
  if cs.length > 1
165
151
  '[' + cs.inject(:+) + ']'
@@ -0,0 +1,81 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'mojibake/base'
18
+ require 'json'
19
+
20
+ module MojiBake
21
+
22
+ module JSONSupport
23
+
24
+ JSON_CONFIG = File.join( File.dirname( __FILE__ ),
25
+ '..', '..', 'config', 'table.json' )
26
+
27
+ def initialize
28
+ super
29
+ end
30
+
31
+ def config
32
+ @config ||= JSON.parse( IO.read( JSON_CONFIG ) )
33
+ end
34
+
35
+ def hash
36
+ @hash ||= config[ 'moji' ]
37
+ end
38
+
39
+ def regexp
40
+ # Note use of Unicode ('U') mode for ruby 1.8
41
+ @regexp ||= Regexp.new( config[ 'regexp' ], 0, 'U' )
42
+ end
43
+
44
+ # table as self contained json-ready Hash
45
+ def hash_to_json_object
46
+
47
+ # Also use unicode escape for the interesting (effectively,
48
+ # non-printable) subset of moji mappings.
49
+ moji = hash.sort.map do |kv|
50
+ kv.map do |s|
51
+ s.codepoints.inject( '' ) do |r,i|
52
+ if MojiBake::Mapper::INTEREST_CODEPOINTS.include?( i )
53
+ r << sprintf( '\u%04X', i )
54
+ else
55
+ r << i.chr( Encoding::UTF_8 )
56
+ end
57
+ end
58
+ end
59
+ end
60
+
61
+ { :mojibake => MojiBake::VERSION,
62
+ :url => "https://github.com/dekellum/mojibake",
63
+ :regexp => regexp.inspect[1...-1],
64
+ :moji => Hash[ moji ] }
65
+ end
66
+
67
+ # Pretty formatted JSON serialized String for json_object
68
+ def json
69
+ # Generate and replace what become double escaped '\\u' UNICODE
70
+ # escapes with single '\u' escapes. This is a hack but is
71
+ # reasonably safe given that 'u' isn't normally escaped. The
72
+ # alternative would be to hack JSON package or do the JSON
73
+ # formatting ourselves. Ideally JSON package would support
74
+ # serialization using unicode escapes for the non-printable,
75
+ # non-friendly chars. As of 1.6.1 it doesn't.
76
+ JSON.pretty_generate( hash_to_json_object ).gsub( /\\\\u/, '\u' )
77
+ end
78
+
79
+ end
80
+
81
+ end
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.args.pre = --1.9
4
+ #.hashdot.profile += jruby-shortlived
5
+
6
+ #--
7
+ # Copyright (c) 2011 David Kellum
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
10
+ # may not use this file except in compliance with the License. You
11
+ # may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
18
+ # implied. See the License for the specific language governing
19
+ # permissions and limitations under the License.
20
+ #++
21
+
22
+ ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
23
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
24
+
25
+ require 'rubygems'
26
+ require 'minitest/unit'
27
+ require 'minitest/autorun'
28
+
29
+ require 'mojibake'
30
+
31
+ class TestEncoding < MiniTest::Unit::TestCase
32
+ include MojiBake
33
+
34
+ def setup
35
+ @mapper = Mapper.new
36
+ end
37
+
38
+ TEST_TREE = { "a" => { "b" => { "c" => {},
39
+ "d" => {} } },
40
+ "d" => { "b" => { "f" => {} } } }
41
+
42
+ # These only test with Ruby 1.9 support
43
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
44
+
45
+ def test_init_options
46
+ assert_equal( true, Mapper.new.map_iso_8859_1 )
47
+ m = Mapper.new( :map_iso_8859_1 => false )
48
+ assert_equal( false, m.map_iso_8859_1 )
49
+ end
50
+
51
+ def test_char_tree
52
+ assert_equal( TEST_TREE,
53
+ @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
54
+ end
55
+
56
+ def test_tree_flaten
57
+ assert_equal( "ab[cd]|dbf",
58
+ @mapper.tree_flatten( TEST_TREE ) )
59
+ end
60
+
61
+ def test_regexp
62
+ re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
63
+ assert_match( re, "abc" )
64
+ assert_match( re, "abd" )
65
+ assert_match( re, "dbf" )
66
+
67
+ refute_match( re, "ab" )
68
+ refute_match( re, "abf" )
69
+
70
+ assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
71
+ assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
72
+ end
73
+
74
+ end
75
+
76
+ end
@@ -28,46 +28,13 @@ require 'minitest/autorun'
28
28
 
29
29
  require 'mojibake'
30
30
 
31
- class TestMojiBake < MiniTest::Unit::TestCase
31
+ class TestMapper < MiniTest::Unit::TestCase
32
32
  include MojiBake
33
33
 
34
34
  def setup
35
35
  @mapper = Mapper.new
36
36
  end
37
37
 
38
- TEST_TREE = { "a" => { "b" => { "c" => {},
39
- "d" => {} } },
40
- "d" => { "b" => { "f" => {} } } }
41
-
42
- def test_init_options
43
- assert_equal( true, Mapper.new.map_iso_8859_1 )
44
- m = Mapper.new( :map_iso_8859_1 => false )
45
- assert_equal( false, m.map_iso_8859_1 )
46
- end
47
-
48
- def test_char_tree
49
- assert_equal( TEST_TREE,
50
- @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
51
- end
52
-
53
- def test_tree_flaten
54
- assert_equal( "ab[cd]|dbf",
55
- @mapper.tree_flatten( TEST_TREE ) )
56
- end
57
-
58
- def test_regexp
59
- re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
60
- assert_match( re, "abc" )
61
- assert_match( re, "abd" )
62
- assert_match( re, "dbf" )
63
-
64
- refute_match( re, "ab" )
65
- refute_match( re, "abf" )
66
-
67
- assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
68
- assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
69
- end
70
-
71
38
  def test_nomatch_recover
72
39
  assert_equal( '', @mapper.recover( '' ) )
73
40
  assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
metadata CHANGED
@@ -2,42 +2,49 @@
2
2
  name: mojibake
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.0.0
5
+ version: 1.1.0
6
6
  platform: ruby
7
7
  authors:
8
- - David Kellum
8
+ - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-06-21 00:00:00 -07:00
14
- default_executable:
13
+ date: 2011-10-31 00:00:00 Z
15
14
  dependencies:
16
- - !ruby/object:Gem::Dependency
17
- name: minitest
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
20
- none: false
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
24
- version: "2.1"
25
- - - <
26
- - !ruby/object:Gem::Version
27
- version: "2.4"
28
- type: :development
29
- version_requirements: *id001
30
- - !ruby/object:Gem::Dependency
31
- name: rjack-tarpit
32
- prerelease: false
33
- requirement: &id002 !ruby/object:Gem::Requirement
34
- none: false
35
- requirements:
36
- - - ~>
37
- - !ruby/object:Gem::Version
38
- version: 1.3.2
39
- type: :development
40
- version_requirements: *id002
15
+ - !ruby/object:Gem::Dependency
16
+ name: json
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.6.1
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: minitest
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ~>
33
+ - !ruby/object:Gem::Version
34
+ version: "2.3"
35
+ type: :development
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: rjack-tarpit
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.4.0
46
+ type: :development
47
+ version_requirements: *id003
41
48
  description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
42
49
  bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
43
50
  module provides a mojibake sequence to original character mapping\n\
@@ -45,54 +52,60 @@ description: "Mojibake occurs in English most frequently due to misinterpreting
45
52
  Testing has been with English but other Latin based languages, where\n\
46
53
  Windows-1252 is in the wild, should also benefit."
47
54
  email:
48
- - dek-oss@gravitext.com
55
+ - dek-oss@gravitext.com
49
56
  executables:
50
- - mojibake
57
+ - mojibake
51
58
  extensions: []
52
59
 
53
60
  extra_rdoc_files:
54
- - Manifest.txt
55
- - History.rdoc
56
- - README.rdoc
61
+ - Manifest.txt
62
+ - config/table.txt
63
+ - History.rdoc
64
+ - README.rdoc
57
65
  files:
58
- - History.rdoc
59
- - Manifest.txt
60
- - README.rdoc
61
- - Rakefile
62
- - bin/mojibake
63
- - lib/mojibake/base.rb
64
- - lib/mojibake.rb
65
- - lib/mojibake/mapper.rb
66
- - test/test.txt
67
- - test/test_mojibake.rb
68
- has_rdoc: true
66
+ - History.rdoc
67
+ - Manifest.txt
68
+ - README.rdoc
69
+ - Rakefile
70
+ - bin/mojibake
71
+ - config/table.json
72
+ - config/table.txt
73
+ - lib/mojibake/base.rb
74
+ - lib/mojibake.rb
75
+ - lib/mojibake/encoding.rb
76
+ - lib/mojibake/json.rb
77
+ - test/test.txt
78
+ - test/test_encoding.rb
79
+ - test/test_mapper.rb
80
+ - .gemtest
69
81
  homepage: http://github.com/dekellum/mojibake
70
82
  licenses: []
71
83
 
72
84
  post_install_message:
73
85
  rdoc_options:
74
- - --main
75
- - README.rdoc
86
+ - --main
87
+ - README.rdoc
76
88
  require_paths:
77
- - lib
89
+ - lib
78
90
  required_ruby_version: !ruby/object:Gem::Requirement
79
91
  none: false
80
92
  requirements:
81
- - - ">="
82
- - !ruby/object:Gem::Version
83
- version: "0"
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
84
96
  required_rubygems_version: !ruby/object:Gem::Requirement
85
97
  none: false
86
98
  requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: "0"
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: "0"
90
102
  requirements: []
91
103
 
92
104
  rubyforge_project: mojibake
93
- rubygems_version: 1.5.1
105
+ rubygems_version: 1.8.11
94
106
  signing_key:
95
107
  specification_version: 3
96
108
  summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
97
109
  test_files:
98
- - test/test_mojibake.rb
110
+ - test/test_encoding.rb
111
+ - test/test_mapper.rb