mojibake 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -1,2 +1,9 @@
1
+ === 1.1.0 (2011-10-31)
2
+ * Add table.json output support as a more convenient language
3
+ independent format. Include default version (as well as table.txt)
4
+ as part of gem.
5
+ * Add Ruby 1.8 support for Mapper.recover by using default included
6
+ table.json (avoids need for 1.9 encoding support.)
7
+
1
8
  === 1.0.0 (2011-6-21)
2
9
  * Initial release.
@@ -3,8 +3,12 @@ Manifest.txt
3
3
  README.rdoc
4
4
  Rakefile
5
5
  bin/mojibake
6
+ config/table.json
7
+ config/table.txt
6
8
  lib/mojibake/base.rb
7
9
  lib/mojibake.rb
8
- lib/mojibake/mapper.rb
10
+ lib/mojibake/encoding.rb
11
+ lib/mojibake/json.rb
9
12
  test/test.txt
10
- test/test_mojibake.rb
13
+ test/test_encoding.rb
14
+ test/test_mapper.rb
@@ -15,8 +15,10 @@ Windows-1252 is in the wild, should also benefit.
15
15
 
16
16
  == Dependencies
17
17
 
18
- Requires the String Encoding support of ruby 1.9+ (tested 1.9.2p180
19
- Linux) or jruby 1.6+ (tested 1.6.2, Linux).
18
+ Requires the String Encoding support in ruby 1.9 as provided by:
19
+
20
+ * ruby 1.9.2+ (tested 1.9.2p180, Linux)
21
+ * jruby 1.6.5+ (tested 1.6.5, Linux)
20
22
 
21
23
  == Synopsis
22
24
 
data/Rakefile CHANGED
@@ -3,7 +3,7 @@
3
3
  $LOAD_PATH << './lib'
4
4
 
5
5
  require 'rubygems'
6
- gem 'rjack-tarpit', '~> 1.3.2'
6
+ gem 'rjack-tarpit', '~> 1.4'
7
7
  require 'rjack-tarpit'
8
8
 
9
9
  require 'mojibake/base'
@@ -14,7 +14,8 @@ t.specify do |h|
14
14
  h.developer( 'David Kellum', 'dek-oss@gravitext.com' )
15
15
 
16
16
  h.testlib = :minitest
17
- h.extra_dev_deps += [ [ 'minitest', '>= 2.1', '< 2.4' ] ]
17
+ h.extra_deps += [ [ 'json', '~> 1.6.1' ] ]
18
+ h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ] ]
18
19
 
19
20
  h.url = 'http://github.com/dekellum/mojibake'
20
21
  end
@@ -33,3 +34,15 @@ task :tag => [ :check_history_version, :check_history_date ]
33
34
  task :push => [ :check_history_version, :check_history_date ]
34
35
 
35
36
  t.define_tasks
37
+
38
+ desc "(Re-)generate config output files (requires 1.9)"
39
+ task :generate_config do
40
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
41
+ require 'mojibake'
42
+ mapper = MojiBake::Mapper.new
43
+ open( "config/table.txt", 'w' ) { |fout| fout.puts( mapper.table ) }
44
+ open( "config/table.json", 'w' ) { |fout| fout.puts( mapper.json ) }
45
+ else
46
+ raise "Task generate_config requires Ruby 1.9 encoding support"
47
+ end
48
+ end
@@ -20,6 +20,8 @@
20
20
 
21
21
  $LOAD_PATH.unshift File.join( File.dirname( __FILE__ ), "..", "lib" )
22
22
 
23
+ require 'rubygems'
24
+
23
25
  require 'mojibake'
24
26
  require 'optparse'
25
27
 
@@ -33,23 +35,36 @@ module Script
33
35
  puts "mojibake: #{MojiBake::VERSION}"
34
36
  exit 1
35
37
  end
36
- opts.on( "--no-windows-1252",
37
- "Don't include miscodings from Windows-1252" ) do
38
- mapper.map_windows_1252 = false
39
- end
40
- opts.on( "--no-iso-8859-1",
41
- "Don't include miscodings from ISO-8859-1" ) do
42
- mapper.map_iso_8859_1 = false
43
- end
44
- opts.on( "--no-permutations",
45
- "Don't include ISO/Windows permutations" ) do
46
- mapper.map_permutations = false
47
- end
48
- opts.on_tail( "-t", "--table",
49
- "Write MojiBake Mapper table (UTF-8)" ) do
50
- puts mapper.table
51
- exit 1
38
+
39
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
40
+
41
+ opts.on( "--no-windows-1252",
42
+ "Don't include miscodings from Windows-1252" ) do
43
+ mapper.map_windows_1252 = false
44
+ end
45
+ opts.on( "--no-iso-8859-1",
46
+ "Don't include miscodings from ISO-8859-1" ) do
47
+ mapper.map_iso_8859_1 = false
48
+ end
49
+ opts.on( "--no-permutations",
50
+ "Don't include ISO/Windows permutations" ) do
51
+ mapper.map_permutations = false
52
+ end
53
+ opts.on_tail( "-t", "--table",
54
+ "Write MojiBake Mapper table (UTF-8) and exit" ) do
55
+ puts mapper.table
56
+ exit 1
57
+ end
58
+ opts.on_tail( "-j", "--json",
59
+ "Write MojiBake Mapper json (UTF-8) and exit" ) do
60
+ require 'rubygems'
61
+ require 'mojibake/json'
62
+ puts mapper.json
63
+ exit 1
64
+ end
65
+
52
66
  end
67
+
53
68
  opts.on_tail( "-r", "--regex",
54
69
  "Display MojiBake Mapper regex (UTF-8) and exit" ) do
55
70
  puts mapper.regexp.inspect
@@ -65,7 +80,8 @@ module Script
65
80
 
66
81
  input_file = ARGV.shift
67
82
  if input_file
68
- $stdout.write( mapper.recover( IO.read( input_file ).encode( 'UTF-8' ) ) )
83
+ data = IO.read( input_file )
84
+ $stdout.write( mapper.recover( data ) )
69
85
  end
70
86
 
71
87
  end
@@ -0,0 +1,270 @@
1
+ {
2
+ "mojibake": "1.1.0",
3
+ "url": "https://github.com/dekellum/mojibake",
4
+ "regexp": "Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])",
5
+ "moji": {
6
+ "Â\u0080": "\u0080",
7
+ "Â\u0081": "\u0081",
8
+ "Â\u0082": "\u0082",
9
+ "Â\u0083": "\u0083",
10
+ "Â\u0084": "\u0084",
11
+ "Â\u0085": "\u0085",
12
+ "Â\u0086": "\u0086",
13
+ "Â\u0087": "\u0087",
14
+ "Â\u0088": "\u0088",
15
+ "Â\u0089": "\u0089",
16
+ "Â\u008A": "\u008A",
17
+ "Â\u008B": "\u008B",
18
+ "Â\u008C": "\u008C",
19
+ "Â\u008D": "\u008D",
20
+ "Â\u008E": "\u008E",
21
+ "Â\u008F": "\u008F",
22
+ "Â\u0090": "\u0090",
23
+ "Â\u0091": "\u0091",
24
+ "Â\u0092": "\u0092",
25
+ "Â\u0093": "\u0093",
26
+ "Â\u0094": "\u0094",
27
+ "Â\u0095": "\u0095",
28
+ "Â\u0096": "\u0096",
29
+ "Â\u0097": "\u0097",
30
+ "Â\u0098": "\u0098",
31
+ "Â\u0099": "\u0099",
32
+ "Â\u009A": "\u009A",
33
+ "Â\u009B": "\u009B",
34
+ "Â\u009C": "\u009C",
35
+ "Â\u009D": "\u009D",
36
+ "Â\u009E": "\u009E",
37
+ "Â\u009F": "\u009F",
38
+ "Â\u00A0": "\u00A0",
39
+ "¡": "¡",
40
+ "¢": "¢",
41
+ "£": "£",
42
+ "¤": "¤",
43
+ "Â¥": "¥",
44
+ "¦": "¦",
45
+ "§": "§",
46
+ "¨": "¨",
47
+ "©": "©",
48
+ "ª": "ª",
49
+ "«": "«",
50
+ "¬": "¬",
51
+ "­": "­",
52
+ "®": "®",
53
+ "¯": "¯",
54
+ "°": "°",
55
+ "±": "±",
56
+ "²": "²",
57
+ "³": "³",
58
+ "´": "´",
59
+ "µ": "µ",
60
+ "¶": "¶",
61
+ "·": "·",
62
+ "¸": "¸",
63
+ "¹": "¹",
64
+ "º": "º",
65
+ "»": "»",
66
+ "¼": "¼",
67
+ "½": "½",
68
+ "¾": "¾",
69
+ "¿": "¿",
70
+ "Œ": "\u008C",
71
+ "œ": "\u009C",
72
+ "Š": "\u008A",
73
+ "š": "\u009A",
74
+ "Ÿ": "\u009F",
75
+ "ÂŽ": "\u008E",
76
+ "ž": "\u009E",
77
+ "ƒ": "\u0083",
78
+ "ˆ": "\u0088",
79
+ "˜": "\u0098",
80
+ "–": "\u0096",
81
+ "—": "\u0097",
82
+ "‘": "\u0091",
83
+ "Â’": "\u0092",
84
+ "‚": "\u0082",
85
+ "“": "\u0093",
86
+ "”": "\u0094",
87
+ "„": "\u0084",
88
+ "†": "\u0086",
89
+ "‡": "\u0087",
90
+ "•": "\u0095",
91
+ "Â…": "\u0085",
92
+ "‰": "\u0089",
93
+ "‹": "\u008B",
94
+ "›": "\u009B",
95
+ "€": "\u0080",
96
+ "™": "\u0099",
97
+ "Â\uFFFD": "\u0081",
98
+ "Ã\u0080": "À",
99
+ "Ã\u0081": "Á",
100
+ "Ã\u0082": "Â",
101
+ "Ã\u0083": "Ã",
102
+ "Ã\u0084": "Ä",
103
+ "Ã\u0085": "Å",
104
+ "Ã\u0086": "Æ",
105
+ "Ã\u0087": "Ç",
106
+ "Ã\u0088": "È",
107
+ "Ã\u0089": "É",
108
+ "Ã\u008A": "Ê",
109
+ "Ã\u008B": "Ë",
110
+ "Ã\u008C": "Ì",
111
+ "Ã\u008D": "Í",
112
+ "Ã\u008E": "Î",
113
+ "Ã\u008F": "Ï",
114
+ "Ã\u0090": "Ð",
115
+ "Ã\u0091": "Ñ",
116
+ "Ã\u0092": "Ò",
117
+ "Ã\u0093": "Ó",
118
+ "Ã\u0094": "Ô",
119
+ "Ã\u0095": "Õ",
120
+ "Ã\u0096": "Ö",
121
+ "Ã\u0097": "×",
122
+ "Ã\u0098": "Ø",
123
+ "Ã\u0099": "Ù",
124
+ "Ã\u009A": "Ú",
125
+ "Ã\u009B": "Û",
126
+ "Ã\u009C": "Ü",
127
+ "Ã\u009D": "Ý",
128
+ "Ã\u009E": "Þ",
129
+ "Ã\u009F": "ß",
130
+ "Ã\u00A0": "à",
131
+ "á": "á",
132
+ "â": "â",
133
+ "ã": "ã",
134
+ "ä": "ä",
135
+ "Ã¥": "å",
136
+ "æ": "æ",
137
+ "ç": "ç",
138
+ "è": "è",
139
+ "é": "é",
140
+ "ê": "ê",
141
+ "ë": "ë",
142
+ "ì": "ì",
143
+ "í": "í",
144
+ "î": "î",
145
+ "ï": "ï",
146
+ "ð": "ð",
147
+ "ñ": "ñ",
148
+ "ò": "ò",
149
+ "ó": "ó",
150
+ "ô": "ô",
151
+ "õ": "õ",
152
+ "ö": "ö",
153
+ "÷": "÷",
154
+ "ø": "ø",
155
+ "ù": "ù",
156
+ "ú": "ú",
157
+ "û": "û",
158
+ "ü": "ü",
159
+ "ý": "ý",
160
+ "þ": "þ",
161
+ "ÿ": "ÿ",
162
+ "ÃŒ": "Ì",
163
+ "Ü": "Ü",
164
+ "Ê": "Ê",
165
+ "Ú": "Ú",
166
+ "ß": "ß",
167
+ "ÃŽ": "Î",
168
+ "Þ": "Þ",
169
+ "Ã": "Ã",
170
+ "È": "È",
171
+ "Ø": "Ø",
172
+ "Ö": "Ö",
173
+ "×": "×",
174
+ "Ñ": "Ñ",
175
+ "Ã’": "Ò",
176
+ "Â": "Â",
177
+ "Ó": "Ó",
178
+ "Ô": "Ô",
179
+ "Ä": "Ä",
180
+ "Æ": "Æ",
181
+ "Ç": "Ç",
182
+ "Õ": "Õ",
183
+ "Ã…": "Å",
184
+ "É": "É",
185
+ "Ë": "Ë",
186
+ "Û": "Û",
187
+ "À": "À",
188
+ "Ù": "Ù",
189
+ "Ã\uFFFD": "Á",
190
+ "Å\u0092": "Œ",
191
+ "Å\u0093": "œ",
192
+ "Å\u00A0": "Š",
193
+ "Å¡": "š",
194
+ "Ÿ": "Ÿ",
195
+ "Ž": "Ž",
196
+ "ž": "ž",
197
+ "Å’": "Œ",
198
+ "Å“": "œ",
199
+ "Æ\u0092": "ƒ",
200
+ "Æ’": "ƒ",
201
+ "Ë\u0086": "ˆ",
202
+ "Ë\u009C": "˜",
203
+ "Ëœ": "˜",
204
+ "ˆ": "ˆ",
205
+ "â\u0080\u0080": "\u2000",
206
+ "â\u0080\u0081": "\u2001",
207
+ "â\u0080\u0082": "\u2002",
208
+ "â\u0080\u0083": "\u2003",
209
+ "â\u0080\u0084": "\u2004",
210
+ "â\u0080\u0085": "\u2005",
211
+ "â\u0080\u0086": "\u2006",
212
+ "â\u0080\u0087": "\u2007",
213
+ "â\u0080\u0088": "\u2008",
214
+ "â\u0080\u0089": "\u2009",
215
+ "â\u0080\u008A": "\u200A",
216
+ "â\u0080\u008B": "\u200B",
217
+ "â\u0080\u0093": "–",
218
+ "â\u0080\u0094": "—",
219
+ "â\u0080\u0098": "‘",
220
+ "â\u0080\u0099": "’",
221
+ "â\u0080\u009A": "‚",
222
+ "â\u0080\u009C": "“",
223
+ "â\u0080\u009D": "”",
224
+ "â\u0080\u009E": "„",
225
+ "â\u0080\u00A0": "†",
226
+ "â\u0080¡": "‡",
227
+ "â\u0080¢": "•",
228
+ "â\u0080¦": "…",
229
+ "â\u0080°": "‰",
230
+ "â\u0080¹": "‹",
231
+ "â\u0080º": "›",
232
+ "â\u0081\u00A0": "\u2060",
233
+ "â\u0082¬": "€",
234
+ "â\u0084¢": "™",
235
+ "€": "€",
236
+ "â„¢": "™",
237
+ "â€\u0081": "\u2001",
238
+ "â€\u009D": "”",
239
+ "â€\u00A0": "†",
240
+ "‡": "‡",
241
+ "•": "•",
242
+ "…": "…",
243
+ "‰": "‰",
244
+ "‹": "‹",
245
+ "›": "›",
246
+ "“": "“",
247
+ " ": "\u200A",
248
+ "‚": "‚",
249
+ "„": "„",
250
+ " ": "\u2003",
251
+ " ": "\u2008",
252
+ "‘": "‘",
253
+ " ": "\u2002",
254
+ "–": "–",
255
+ "—": "—",
256
+ " ": "\u2004",
257
+ " ": "\u2006",
258
+ " ": "\u2007",
259
+ " ": "\u2005",
260
+ " ": "\u2009",
261
+ "​": "\u200B",
262
+ " ": "\u2000",
263
+ "’": "’",
264
+ "â€\uFFFD": "”",
265
+ "â\uFFFD\u00A0": "\u2060",
266
+ "": "\uFEFF",
267
+ "�": "\uFFFD",
268
+ "￾": "\uFFFE"
269
+ }
270
+ }
@@ -0,0 +1,268 @@
1
+ # -*- coding: utf-8 -*- mojibake: 1.1.0
2
+ /Â[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Ã[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ŒœŠšŸŽžƒˆ˜–—‘’‚“”„†‡•…‰‹›€™\uFFFD]|Å[\u0092\u0093\u00A0¡¸½¾’“]|Æ[\u0092’]|Ë[\u0086\u009Cœ†]|â(\u0080[\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u0093\u0094\u0098\u0099\u009A\u009C\u009D\u009E\u00A0¡¢¦°¹º]|\u0081\u00A0|\u0082¬|\u0084¢|‚¬|„¢|€[\u0081\u009D\u00A0¡¢¦°¹ºœŠšžƒˆ˜‚“”„†‡…‰‹€™\uFFFD]|\uFFFD\u00A0)|ï(»¿|¿[½¾])/
3
+
4
+ Moji UNICODE Org CODE
5
+ +---- ---- ---- ---- ----- ---+
6
+ [€] 00C2 0080 [€] 0080
7
+ [] 00C2 0081 [] 0081
8
+ [‚] 00C2 0082 [‚] 0082
9
+ [ƒ] 00C2 0083 [ƒ] 0083
10
+ [„] 00C2 0084 [„] 0084
11
+ […] 00C2 0085 […] 0085
12
+ [†] 00C2 0086 [†] 0086
13
+ [‡] 00C2 0087 [‡] 0087
14
+ [ˆ] 00C2 0088 [ˆ] 0088
15
+ [‰] 00C2 0089 [‰] 0089
16
+ [Š] 00C2 008A [Š] 008A
17
+ [‹] 00C2 008B [‹] 008B
18
+ [Œ] 00C2 008C [Œ] 008C
19
+ [] 00C2 008D [] 008D
20
+ [Ž] 00C2 008E [Ž] 008E
21
+ [] 00C2 008F [] 008F
22
+ [] 00C2 0090 [] 0090
23
+ [‘] 00C2 0091 [‘] 0091
24
+ [’] 00C2 0092 [’] 0092
25
+ [“] 00C2 0093 [“] 0093
26
+ [”] 00C2 0094 [”] 0094
27
+ [•] 00C2 0095 [•] 0095
28
+ [–] 00C2 0096 [–] 0096
29
+ [—] 00C2 0097 [—] 0097
30
+ [˜] 00C2 0098 [˜] 0098
31
+ [™] 00C2 0099 [™] 0099
32
+ [š] 00C2 009A [š] 009A
33
+ [›] 00C2 009B [›] 009B
34
+ [œ] 00C2 009C [œ] 009C
35
+ [] 00C2 009D [] 009D
36
+ [ž] 00C2 009E [ž] 009E
37
+ [Ÿ] 00C2 009F [Ÿ] 009F
38
+ [ ] 00C2 00A0 [ ] 00A0
39
+ [¡] 00C2 00A1 [¡] 00A1
40
+ [¢] 00C2 00A2 [¢] 00A2
41
+ [£] 00C2 00A3 [£] 00A3
42
+ [¤] 00C2 00A4 [¤] 00A4
43
+ [Â¥] 00C2 00A5 [¥] 00A5
44
+ [¦] 00C2 00A6 [¦] 00A6
45
+ [§] 00C2 00A7 [§] 00A7
46
+ [¨] 00C2 00A8 [¨] 00A8
47
+ [©] 00C2 00A9 [©] 00A9
48
+ [ª] 00C2 00AA [ª] 00AA
49
+ [«] 00C2 00AB [«] 00AB
50
+ [¬] 00C2 00AC [¬] 00AC
51
+ [­] 00C2 00AD [­] 00AD
52
+ [®] 00C2 00AE [®] 00AE
53
+ [¯] 00C2 00AF [¯] 00AF
54
+ [°] 00C2 00B0 [°] 00B0
55
+ [±] 00C2 00B1 [±] 00B1
56
+ [²] 00C2 00B2 [²] 00B2
57
+ [³] 00C2 00B3 [³] 00B3
58
+ [´] 00C2 00B4 [´] 00B4
59
+ [µ] 00C2 00B5 [µ] 00B5
60
+ [¶] 00C2 00B6 [¶] 00B6
61
+ [·] 00C2 00B7 [·] 00B7
62
+ [¸] 00C2 00B8 [¸] 00B8
63
+ [¹] 00C2 00B9 [¹] 00B9
64
+ [º] 00C2 00BA [º] 00BA
65
+ [»] 00C2 00BB [»] 00BB
66
+ [¼] 00C2 00BC [¼] 00BC
67
+ [½] 00C2 00BD [½] 00BD
68
+ [¾] 00C2 00BE [¾] 00BE
69
+ [¿] 00C2 00BF [¿] 00BF
70
+ [ÂŒ] 00C2 0152 [Œ] 008C
71
+ [œ] 00C2 0153 [œ] 009C
72
+ [Š] 00C2 0160 [Š] 008A
73
+ [š] 00C2 0161 [š] 009A
74
+ [Ÿ] 00C2 0178 [Ÿ] 009F
75
+ [ÂŽ] 00C2 017D [Ž] 008E
76
+ [ž] 00C2 017E [ž] 009E
77
+ [ƒ] 00C2 0192 [ƒ] 0083
78
+ [ˆ] 00C2 02C6 [ˆ] 0088
79
+ [˜] 00C2 02DC [˜] 0098
80
+ [–] 00C2 2013 [–] 0096
81
+ [—] 00C2 2014 [—] 0097
82
+ [‘] 00C2 2018 [‘] 0091
83
+ [Â’] 00C2 2019 [’] 0092
84
+ [‚] 00C2 201A [‚] 0082
85
+ [“] 00C2 201C [“] 0093
86
+ [”] 00C2 201D [”] 0094
87
+ [„] 00C2 201E [„] 0084
88
+ [†] 00C2 2020 [†] 0086
89
+ [‡] 00C2 2021 [‡] 0087
90
+ [•] 00C2 2022 [•] 0095
91
+ [Â…] 00C2 2026 […] 0085
92
+ [‰] 00C2 2030 [‰] 0089
93
+ [‹] 00C2 2039 [‹] 008B
94
+ [›] 00C2 203A [›] 009B
95
+ [€] 00C2 20AC [€] 0080
96
+ [™] 00C2 2122 [™] 0099
97
+ [Â�] 00C2 FFFD [] 0081
98
+ [À] 00C3 0080 [À] 00C0
99
+ [Á] 00C3 0081 [Á] 00C1
100
+ [Â] 00C3 0082 [Â] 00C2
101
+ [Ã] 00C3 0083 [Ã] 00C3
102
+ [Ä] 00C3 0084 [Ä] 00C4
103
+ [Å] 00C3 0085 [Å] 00C5
104
+ [Æ] 00C3 0086 [Æ] 00C6
105
+ [Ç] 00C3 0087 [Ç] 00C7
106
+ [È] 00C3 0088 [È] 00C8
107
+ [É] 00C3 0089 [É] 00C9
108
+ [Ê] 00C3 008A [Ê] 00CA
109
+ [Ë] 00C3 008B [Ë] 00CB
110
+ [Ì] 00C3 008C [Ì] 00CC
111
+ [Í] 00C3 008D [Í] 00CD
112
+ [Î] 00C3 008E [Î] 00CE
113
+ [Ï] 00C3 008F [Ï] 00CF
114
+ [Ð] 00C3 0090 [Ð] 00D0
115
+ [Ñ] 00C3 0091 [Ñ] 00D1
116
+ [Ò] 00C3 0092 [Ò] 00D2
117
+ [Ó] 00C3 0093 [Ó] 00D3
118
+ [Ô] 00C3 0094 [Ô] 00D4
119
+ [Õ] 00C3 0095 [Õ] 00D5
120
+ [Ö] 00C3 0096 [Ö] 00D6
121
+ [×] 00C3 0097 [×] 00D7
122
+ [Ø] 00C3 0098 [Ø] 00D8
123
+ [Ù] 00C3 0099 [Ù] 00D9
124
+ [Ú] 00C3 009A [Ú] 00DA
125
+ [Û] 00C3 009B [Û] 00DB
126
+ [Ü] 00C3 009C [Ü] 00DC
127
+ [Ý] 00C3 009D [Ý] 00DD
128
+ [Þ] 00C3 009E [Þ] 00DE
129
+ [ß] 00C3 009F [ß] 00DF
130
+ [à] 00C3 00A0 [à] 00E0
131
+ [á] 00C3 00A1 [á] 00E1
132
+ [â] 00C3 00A2 [â] 00E2
133
+ [ã] 00C3 00A3 [ã] 00E3
134
+ [ä] 00C3 00A4 [ä] 00E4
135
+ [Ã¥] 00C3 00A5 [å] 00E5
136
+ [æ] 00C3 00A6 [æ] 00E6
137
+ [ç] 00C3 00A7 [ç] 00E7
138
+ [è] 00C3 00A8 [è] 00E8
139
+ [é] 00C3 00A9 [é] 00E9
140
+ [ê] 00C3 00AA [ê] 00EA
141
+ [ë] 00C3 00AB [ë] 00EB
142
+ [ì] 00C3 00AC [ì] 00EC
143
+ [í] 00C3 00AD [í] 00ED
144
+ [î] 00C3 00AE [î] 00EE
145
+ [ï] 00C3 00AF [ï] 00EF
146
+ [ð] 00C3 00B0 [ð] 00F0
147
+ [ñ] 00C3 00B1 [ñ] 00F1
148
+ [ò] 00C3 00B2 [ò] 00F2
149
+ [ó] 00C3 00B3 [ó] 00F3
150
+ [ô] 00C3 00B4 [ô] 00F4
151
+ [õ] 00C3 00B5 [õ] 00F5
152
+ [ö] 00C3 00B6 [ö] 00F6
153
+ [÷] 00C3 00B7 [÷] 00F7
154
+ [ø] 00C3 00B8 [ø] 00F8
155
+ [ù] 00C3 00B9 [ù] 00F9
156
+ [ú] 00C3 00BA [ú] 00FA
157
+ [û] 00C3 00BB [û] 00FB
158
+ [ü] 00C3 00BC [ü] 00FC
159
+ [ý] 00C3 00BD [ý] 00FD
160
+ [þ] 00C3 00BE [þ] 00FE
161
+ [ÿ] 00C3 00BF [ÿ] 00FF
162
+ [ÃŒ] 00C3 0152 [Ì] 00CC
163
+ [Ü] 00C3 0153 [Ü] 00DC
164
+ [Ê] 00C3 0160 [Ê] 00CA
165
+ [Ú] 00C3 0161 [Ú] 00DA
166
+ [ß] 00C3 0178 [ß] 00DF
167
+ [ÃŽ] 00C3 017D [Î] 00CE
168
+ [Þ] 00C3 017E [Þ] 00DE
169
+ [Ã] 00C3 0192 [Ã] 00C3
170
+ [È] 00C3 02C6 [È] 00C8
171
+ [Ø] 00C3 02DC [Ø] 00D8
172
+ [Ö] 00C3 2013 [Ö] 00D6
173
+ [×] 00C3 2014 [×] 00D7
174
+ [Ñ] 00C3 2018 [Ñ] 00D1
175
+ [Ã’] 00C3 2019 [Ò] 00D2
176
+ [Â] 00C3 201A [Â] 00C2
177
+ [Ó] 00C3 201C [Ó] 00D3
178
+ [Ô] 00C3 201D [Ô] 00D4
179
+ [Ä] 00C3 201E [Ä] 00C4
180
+ [Æ] 00C3 2020 [Æ] 00C6
181
+ [Ç] 00C3 2021 [Ç] 00C7
182
+ [Õ] 00C3 2022 [Õ] 00D5
183
+ [Ã…] 00C3 2026 [Å] 00C5
184
+ [É] 00C3 2030 [É] 00C9
185
+ [Ë] 00C3 2039 [Ë] 00CB
186
+ [Û] 00C3 203A [Û] 00DB
187
+ [À] 00C3 20AC [À] 00C0
188
+ [Ù] 00C3 2122 [Ù] 00D9
189
+ [Ã�] 00C3 FFFD [Á] 00C1
190
+ [Œ] 00C5 0092 [Œ] 0152
191
+ [œ] 00C5 0093 [œ] 0153
192
+ [Å ] 00C5 00A0 [Š] 0160
193
+ [Å¡] 00C5 00A1 [š] 0161
194
+ [Ÿ] 00C5 00B8 [Ÿ] 0178
195
+ [Ž] 00C5 00BD [Ž] 017D
196
+ [ž] 00C5 00BE [ž] 017E
197
+ [Å’] 00C5 2019 [Œ] 0152
198
+ [Å“] 00C5 201C [œ] 0153
199
+ [ƒ] 00C6 0092 [ƒ] 0192
200
+ [Æ’] 00C6 2019 [ƒ] 0192
201
+ [ˆ] 00CB 0086 [ˆ] 02C6
202
+ [˜] 00CB 009C [˜] 02DC
203
+ [Ëœ] 00CB 0153 [˜] 02DC
204
+ [ˆ] 00CB 2020 [ˆ] 02C6
205
+ [ ] 00E2 0080 0080 [ ] 2000
206
+ [ ] 00E2 0080 0081 [ ] 2001
207
+ [ ] 00E2 0080 0082 [ ] 2002
208
+ [ ] 00E2 0080 0083 [ ] 2003
209
+ [ ] 00E2 0080 0084 [ ] 2004
210
+ [ ] 00E2 0080 0085 [ ] 2005
211
+ [ ] 00E2 0080 0086 [ ] 2006
212
+ [ ] 00E2 0080 0087 [ ] 2007
213
+ [ ] 00E2 0080 0088 [ ] 2008
214
+ [ ] 00E2 0080 0089 [ ] 2009
215
+ [ ] 00E2 0080 008A [ ] 200A
216
+ [​] 00E2 0080 008B [​] 200B
217
+ [–] 00E2 0080 0093 [–] 2013
218
+ [—] 00E2 0080 0094 [—] 2014
219
+ [‘] 00E2 0080 0098 [‘] 2018
220
+ [’] 00E2 0080 0099 [’] 2019
221
+ [‚] 00E2 0080 009A [‚] 201A
222
+ [“] 00E2 0080 009C [“] 201C
223
+ [”] 00E2 0080 009D [”] 201D
224
+ [„] 00E2 0080 009E [„] 201E
225
+ [†] 00E2 0080 00A0 [†] 2020
226
+ [‡] 00E2 0080 00A1 [‡] 2021
227
+ [•] 00E2 0080 00A2 [•] 2022
228
+ […] 00E2 0080 00A6 […] 2026
229
+ [‰] 00E2 0080 00B0 [‰] 2030
230
+ [‹] 00E2 0080 00B9 [‹] 2039
231
+ [›] 00E2 0080 00BA [›] 203A
232
+ [⁠] 00E2 0081 00A0 [⁠] 2060
233
+ [€] 00E2 0082 00AC [€] 20AC
234
+ [™] 00E2 0084 00A2 [™] 2122
235
+ [€] 00E2 201A 00AC [€] 20AC
236
+ [â„¢] 00E2 201E 00A2 [™] 2122
237
+ [ ] 00E2 20AC 0081 [ ] 2001
238
+ [”] 00E2 20AC 009D [”] 201D
239
+ [†] 00E2 20AC 00A0 [†] 2020
240
+ [‡] 00E2 20AC 00A1 [‡] 2021
241
+ [•] 00E2 20AC 00A2 [•] 2022
242
+ […] 00E2 20AC 00A6 […] 2026
243
+ [‰] 00E2 20AC 00B0 [‰] 2030
244
+ [‹] 00E2 20AC 00B9 [‹] 2039
245
+ [›] 00E2 20AC 00BA [›] 203A
246
+ [“] 00E2 20AC 0153 [“] 201C
247
+ [ ] 00E2 20AC 0160 [ ] 200A
248
+ [‚] 00E2 20AC 0161 [‚] 201A
249
+ [„] 00E2 20AC 017E [„] 201E
250
+ [ ] 00E2 20AC 0192 [ ] 2003
251
+ [ ] 00E2 20AC 02C6 [ ] 2008
252
+ [‘] 00E2 20AC 02DC [‘] 2018
253
+ [ ] 00E2 20AC 201A [ ] 2002
254
+ [–] 00E2 20AC 201C [–] 2013
255
+ [—] 00E2 20AC 201D [—] 2014
256
+ [ ] 00E2 20AC 201E [ ] 2004
257
+ [ ] 00E2 20AC 2020 [ ] 2006
258
+ [ ] 00E2 20AC 2021 [ ] 2007
259
+ [ ] 00E2 20AC 2026 [ ] 2005
260
+ [ ] 00E2 20AC 2030 [ ] 2009
261
+ [​] 00E2 20AC 2039 [​] 200B
262
+ [ ] 00E2 20AC 20AC [ ] 2000
263
+ [’] 00E2 20AC 2122 [’] 2019
264
+ [â€�] 00E2 20AC FFFD [”] 201D
265
+ [â� ] 00E2 FFFD 00A0 [⁠] 2060
266
+ [] 00EF 00BB 00BF [] FEFF
267
+ [�] 00EF 00BF 00BD [�] FFFD
268
+ [￾] 00EF 00BF 00BE [￾] FFFE
@@ -14,9 +14,38 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) < 0
18
- raise "Requires ruby ~> 1.9 for String.encode support"
19
- end
20
-
21
17
  require 'mojibake/base'
22
- require 'mojibake/mapper'
18
+
19
+ require 'mojibake/json'
20
+
21
+ module MojiBake
22
+
23
+ # Supports recovering Mojibake characters to the original text.
24
+ class Mapper
25
+ include JSONSupport
26
+
27
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
28
+ require 'mojibake/encoding'
29
+ include EncodingSupport
30
+ end
31
+
32
+ def initialize( opts = {} )
33
+ super()
34
+ opts.map { |k,v| send( k.to_s + '=', v ) }
35
+ end
36
+
37
+ # Recover original characters from input using regexp, recursively.
38
+ def recover( input, recursive = true )
39
+ output = input.gsub( regexp ) { |moji| hash[moji] }
40
+
41
+ # Only recurse if requested and substituted something (output
42
+ # shorter) in this run.
43
+ if recursive && ( output.length < input.length )
44
+ recover( output )
45
+ else
46
+ output
47
+ end
48
+ end
49
+
50
+ end
51
+ end
@@ -15,5 +15,5 @@
15
15
  #++
16
16
 
17
17
  module MojiBake
18
- VERSION = "1.0.0"
18
+ VERSION = "1.1.0"
19
19
  end
@@ -16,9 +16,9 @@
16
16
 
17
17
  module MojiBake
18
18
 
19
- # Creates a Map from mojibake sequences to recovered/original
20
- # characters.
21
- class Mapper
19
+ # Mixin for the actual (ruby 1.9 backed) encoding support to define
20
+ # the mojibake mapping table and regex.
21
+ module EncodingSupport
22
22
 
23
23
  W252 = Encoding::WINDOWS_1252
24
24
  ISO8 = Encoding::ISO_8859_1
@@ -30,20 +30,21 @@ module MojiBake
30
30
  # RIGHT DOUBLE QUOTATION MARK. These are the most common problem
31
31
  # chars in English and probably most latin languages.
32
32
  HIGH_ORDER_CHARS =
33
- ( ( 0x80..0xFF ).to_a - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
33
+ ( Array( 0x80..0xFF ) - [ 0x81, 0x8D, 0x8F, 0x90, 0x9D ] ).
34
34
  map { |i| i.chr( W252 ).encode( UTF8 ) }.
35
35
  sort
36
36
 
37
37
  # Additional Unicode codepoints of mojibake potential, like alt
38
38
  # whitespace, C1 control characters, and BOMs.
39
39
  INTEREST_CODEPOINTS =
40
- [ (0x0080..0x009F).to_a, # ISO/Unicode C1 control codes.
41
- 0x00A0, # NO-BREAK SPACE
42
- (0x2000..0x200B).to_a, # EN QUAD ... ZERO WIDTH SPACE
43
- 0x2060, # WORD JOINER
44
- 0xfeff, # ZERO WIDTH SPACE, BYTE-ORDER-MARK (BOM)
45
- 0xfffd, # REPLACEMENT CHARACTER
46
- 0xfffe ]. # UNASSIGNED, BAD BOM
40
+ [ 0x0080..0x009F, # ISO/Unicode C1 control codes.
41
+ 0x00A0, # NO-BREAK SPACE
42
+ 0x2000..0x200B, # EN QUAD ... ZERO WIDTH SPACE
43
+ 0x2060, # WORD JOINER
44
+ 0xfeff, # ZERO WIDTH NO-BREAK SPACE, BYTE-ORDER-MARK (BOM)
45
+ 0xfffd, # REPLACEMENT CHARACTER
46
+ 0xfffe ]. # UNASSIGNED, BAD BOM
47
+ map { |i| Array( i ) }.
47
48
  flatten.
48
49
  sort
49
50
 
@@ -63,12 +64,11 @@ module MojiBake
63
64
  # (default: true). This covers ambiguities of C1 control codes.
64
65
  attr_accessor :map_permutations
65
66
 
66
- def initialize( options = {} )
67
+ def initialize
68
+ super
67
69
  @map_windows_1252 = true
68
70
  @map_iso_8859_1 = true
69
71
  @map_permutations = true
70
-
71
- options.map { |k,v| send( k.to_s + '=', v ) }
72
72
  end
73
73
 
74
74
  # Return Hash of mojibake UTF-8 2-3 character sequences to original
@@ -122,19 +122,6 @@ module MojiBake
122
122
  @regexp ||= Regexp.new( tree_flatten( char_tree( hash.keys ) ) )
123
123
  end
124
124
 
125
- # Recover original characters from input using regexp, recursively.
126
- def recover( input, recursive = true )
127
- output = input.gsub( regexp ) { |moji| hash[moji] }
128
-
129
- # Only recurse if requested and substituted something (output
130
- # shorter) in this run.
131
- if recursive && ( output.length < input.length )
132
- recover( output )
133
- else
134
- output
135
- end
136
- end
137
-
138
125
  def char_tree( seqs )
139
126
  seqs.inject( {} ) do |h,seq|
140
127
  seq.chars.inject( h ) do |hs,c|
@@ -158,8 +145,7 @@ module MojiBake
158
145
  o
159
146
  end
160
147
  if cs.find { |o| o =~ /[()|\[\]]/ }
161
- cs.join( '|' ).force_encoding( "UTF-8" )
162
- #FIXME: Join looses encoding so force, jruby bug?
148
+ cs.join( '|' )
163
149
  else
164
150
  if cs.length > 1
165
151
  '[' + cs.inject(:+) + ']'
@@ -0,0 +1,81 @@
1
+ #--
2
+ # Copyright (c) 2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'mojibake/base'
18
+ require 'json'
19
+
20
+ module MojiBake
21
+
22
+ module JSONSupport
23
+
24
+ JSON_CONFIG = File.join( File.dirname( __FILE__ ),
25
+ '..', '..', 'config', 'table.json' )
26
+
27
+ def initialize
28
+ super
29
+ end
30
+
31
+ def config
32
+ @config ||= JSON.parse( IO.read( JSON_CONFIG ) )
33
+ end
34
+
35
+ def hash
36
+ @hash ||= config[ 'moji' ]
37
+ end
38
+
39
+ def regexp
40
+ # Note use of Unicode ('U') mode for ruby 1.8's benefit
41
+ @regexp ||= Regexp.new( config[ 'regexp' ], 0, 'U' )
42
+ end
43
+
44
+ # table as self contained json-ready Hash
45
+ def hash_to_json_object
46
+
47
+ # Also use unicode escape for the interesting (effectively,
48
+ # non-printable) subset of moji mappings.
49
+ moji = hash.sort.map do |kv|
50
+ kv.map do |s|
51
+ s.codepoints.inject( '' ) do |r,i|
52
+ if MojiBake::Mapper::INTEREST_CODEPOINTS.include?( i )
53
+ r << sprintf( '\u%04X', i )
54
+ else
55
+ r << i.chr( Encoding::UTF_8 )
56
+ end
57
+ end
58
+ end
59
+ end
60
+
61
+ { :mojibake => MojiBake::VERSION,
62
+ :url => "https://github.com/dekellum/mojibake",
63
+ :regexp => regexp.inspect[1...-1],
64
+ :moji => Hash[ moji ] }
65
+ end
66
+
67
+ # Pretty formatted JSON serialized String for json_object
68
+ def json
69
+ # Generate and replace what become double escaped '\\u' UNICODE
70
+ # escapes with single '\u' escapes. This is a hack but is
71
+ # reasonably safe given that 'u' isn't normally escaped. The
72
+ # alternative would be to hack JSON package or do the JSON
73
+ # formatting ourselves. Ideally JSON package would support
74
+ # serialization using unicode escapes for the non-printable,
75
+ # non-friendly chars. As of 1.6.1 it doesn't.
76
+ JSON.pretty_generate( hash_to_json_object ).gsub( /\\\\u/, '\u' )
77
+ end
78
+
79
+ end
80
+
81
+ end
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.args.pre = --1.9
4
+ #.hashdot.profile += jruby-shortlived
5
+
6
+ #--
7
+ # Copyright (c) 2011 David Kellum
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
10
+ # may not use this file except in compliance with the License. You
11
+ # may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
18
+ # implied. See the License for the specific language governing
19
+ # permissions and limitations under the License.
20
+ #++
21
+
22
+ ldir = File.join( File.dirname( __FILE__ ), "..", "lib" )
23
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
24
+
25
+ require 'rubygems'
26
+ require 'minitest/unit'
27
+ require 'minitest/autorun'
28
+
29
+ require 'mojibake'
30
+
31
+ class TestEncoding < MiniTest::Unit::TestCase
32
+ include MojiBake
33
+
34
+ def setup
35
+ @mapper = Mapper.new
36
+ end
37
+
38
+ TEST_TREE = { "a" => { "b" => { "c" => {},
39
+ "d" => {} } },
40
+ "d" => { "b" => { "f" => {} } } }
41
+
42
+ # These only test with Ruby 1.9 support
43
+ if ( RUBY_VERSION.split( '.' ).map { |d| d.to_i } <=> [ 1, 9 ] ) >= 0
44
+
45
+ def test_init_options
46
+ assert_equal( true, Mapper.new.map_iso_8859_1 )
47
+ m = Mapper.new( :map_iso_8859_1 => false )
48
+ assert_equal( false, m.map_iso_8859_1 )
49
+ end
50
+
51
+ def test_char_tree
52
+ assert_equal( TEST_TREE,
53
+ @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
54
+ end
55
+
56
+ def test_tree_flaten
57
+ assert_equal( "ab[cd]|dbf",
58
+ @mapper.tree_flatten( TEST_TREE ) )
59
+ end
60
+
61
+ def test_regexp
62
+ re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
63
+ assert_match( re, "abc" )
64
+ assert_match( re, "abd" )
65
+ assert_match( re, "dbf" )
66
+
67
+ refute_match( re, "ab" )
68
+ refute_match( re, "abf" )
69
+
70
+ assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
71
+ assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
72
+ end
73
+
74
+ end
75
+
76
+ end
@@ -28,46 +28,13 @@ require 'minitest/autorun'
28
28
 
29
29
  require 'mojibake'
30
30
 
31
- class TestMojiBake < MiniTest::Unit::TestCase
31
+ class TestMapper < MiniTest::Unit::TestCase
32
32
  include MojiBake
33
33
 
34
34
  def setup
35
35
  @mapper = Mapper.new
36
36
  end
37
37
 
38
- TEST_TREE = { "a" => { "b" => { "c" => {},
39
- "d" => {} } },
40
- "d" => { "b" => { "f" => {} } } }
41
-
42
- def test_init_options
43
- assert_equal( true, Mapper.new.map_iso_8859_1 )
44
- m = Mapper.new( :map_iso_8859_1 => false )
45
- assert_equal( false, m.map_iso_8859_1 )
46
- end
47
-
48
- def test_char_tree
49
- assert_equal( TEST_TREE,
50
- @mapper.char_tree( [ "abc", "abd", "dbf" ] ) )
51
- end
52
-
53
- def test_tree_flaten
54
- assert_equal( "ab[cd]|dbf",
55
- @mapper.tree_flatten( TEST_TREE ) )
56
- end
57
-
58
- def test_regexp
59
- re = Regexp.new( @mapper.tree_flatten( TEST_TREE ) )
60
- assert_match( re, "abc" )
61
- assert_match( re, "abd" )
62
- assert_match( re, "dbf" )
63
-
64
- refute_match( re, "ab" )
65
- refute_match( re, "abf" )
66
-
67
- assert_equal( "xbf" , "abdbf".gsub( re, 'x' ) )
68
- assert_equal( "dbf" , "abdbf".gsub( re, 'd' ) )
69
- end
70
-
71
38
  def test_nomatch_recover
72
39
  assert_equal( '', @mapper.recover( '' ) )
73
40
  assert_equal( 'ascii', @mapper.recover( 'ascii' ) )
metadata CHANGED
@@ -2,42 +2,49 @@
2
2
  name: mojibake
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.0.0
5
+ version: 1.1.0
6
6
  platform: ruby
7
7
  authors:
8
- - David Kellum
8
+ - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-06-21 00:00:00 -07:00
14
- default_executable:
13
+ date: 2011-10-31 00:00:00 Z
15
14
  dependencies:
16
- - !ruby/object:Gem::Dependency
17
- name: minitest
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
20
- none: false
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
24
- version: "2.1"
25
- - - <
26
- - !ruby/object:Gem::Version
27
- version: "2.4"
28
- type: :development
29
- version_requirements: *id001
30
- - !ruby/object:Gem::Dependency
31
- name: rjack-tarpit
32
- prerelease: false
33
- requirement: &id002 !ruby/object:Gem::Requirement
34
- none: false
35
- requirements:
36
- - - ~>
37
- - !ruby/object:Gem::Version
38
- version: 1.3.2
39
- type: :development
40
- version_requirements: *id002
15
+ - !ruby/object:Gem::Dependency
16
+ name: json
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.6.1
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: minitest
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ~>
33
+ - !ruby/object:Gem::Version
34
+ version: "2.3"
35
+ type: :development
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: rjack-tarpit
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.4.0
46
+ type: :development
47
+ version_requirements: *id003
41
48
  description: "Mojibake occurs in English most frequently due to misinterpreting and\n\
42
49
  bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8. This\n\
43
50
  module provides a mojibake sequence to original character mapping\n\
@@ -45,54 +52,60 @@ description: "Mojibake occurs in English most frequently due to misinterpreting
45
52
  Testing has been with English but other Latin based languages, where\n\
46
53
  Windows-1252 is in the wild, should also benefit."
47
54
  email:
48
- - dek-oss@gravitext.com
55
+ - dek-oss@gravitext.com
49
56
  executables:
50
- - mojibake
57
+ - mojibake
51
58
  extensions: []
52
59
 
53
60
  extra_rdoc_files:
54
- - Manifest.txt
55
- - History.rdoc
56
- - README.rdoc
61
+ - Manifest.txt
62
+ - config/table.txt
63
+ - History.rdoc
64
+ - README.rdoc
57
65
  files:
58
- - History.rdoc
59
- - Manifest.txt
60
- - README.rdoc
61
- - Rakefile
62
- - bin/mojibake
63
- - lib/mojibake/base.rb
64
- - lib/mojibake.rb
65
- - lib/mojibake/mapper.rb
66
- - test/test.txt
67
- - test/test_mojibake.rb
68
- has_rdoc: true
66
+ - History.rdoc
67
+ - Manifest.txt
68
+ - README.rdoc
69
+ - Rakefile
70
+ - bin/mojibake
71
+ - config/table.json
72
+ - config/table.txt
73
+ - lib/mojibake/base.rb
74
+ - lib/mojibake.rb
75
+ - lib/mojibake/encoding.rb
76
+ - lib/mojibake/json.rb
77
+ - test/test.txt
78
+ - test/test_encoding.rb
79
+ - test/test_mapper.rb
80
+ - .gemtest
69
81
  homepage: http://github.com/dekellum/mojibake
70
82
  licenses: []
71
83
 
72
84
  post_install_message:
73
85
  rdoc_options:
74
- - --main
75
- - README.rdoc
86
+ - --main
87
+ - README.rdoc
76
88
  require_paths:
77
- - lib
89
+ - lib
78
90
  required_ruby_version: !ruby/object:Gem::Requirement
79
91
  none: false
80
92
  requirements:
81
- - - ">="
82
- - !ruby/object:Gem::Version
83
- version: "0"
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
84
96
  required_rubygems_version: !ruby/object:Gem::Requirement
85
97
  none: false
86
98
  requirements:
87
- - - ">="
88
- - !ruby/object:Gem::Version
89
- version: "0"
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: "0"
90
102
  requirements: []
91
103
 
92
104
  rubyforge_project: mojibake
93
- rubygems_version: 1.5.1
105
+ rubygems_version: 1.8.11
94
106
  signing_key:
95
107
  specification_version: 3
96
108
  summary: Mojibake occurs in English most frequently due to misinterpreting and bad-transcoding between Windows-1252, ISO-8859-1, and UTF-8
97
109
  test_files:
98
- - test/test_mojibake.rb
110
+ - test/test_encoding.rb
111
+ - test/test_mapper.rb