cmess 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6388ed1476587af57eb196817003d94d58ecb268
4
+ data.tar.gz: 41551903b954d2de73b62203467d93955ac2516f
5
+ SHA512:
6
+ metadata.gz: 4ffc33d0ed9392797a97a5eb535cb0652687e0b8abcd2b6a30de85d41beb63b65ebfa434a625b6aa91bfa379ad5fe8a2f46286aa3450b930633bea9241ba30e7
7
+ data.tar.gz: e11d90e25d6b5fb5c0f6d81f7d2824e2c7dd662e5feef1f7978cd66588fcd0a5a18cc0d60fbe2d1ee6b3cac254198ff07a64bd985b1d7bb5f3a1f27aa7d7096a
data/ChangeLog CHANGED
@@ -1,5 +1,17 @@
1
+ # markup: rd
2
+
1
3
  = Revision history for cmess
2
4
 
5
+ == 0.4.0 [2013-08-02]
6
+
7
+ * Updated for Ruby 1.9.2+. <b>Ruby 1.8 no longer supported.</b>
8
+ * guess_encoding: Added <tt>--reverse</tt> option.
9
+ * Housekeeping.
10
+
11
+ == 0.3.1 [2011-08-16]
12
+
13
+ * decode_entities: Fixed regression.
14
+
3
15
  == 0.3.0 [2011-07-25]
4
16
 
5
17
  * Extensive refactoring
data/README CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  == VERSION
4
4
 
5
- This documentation refers to cmess version 0.3.1
5
+ This documentation refers to cmess version 0.4.0
6
6
 
7
7
 
8
8
  == DESCRIPTION
@@ -22,21 +22,24 @@ there are:
22
22
  (see CMess::BConv)
23
23
  +decode_entities+:: Decode HTML entities in a string. (see CMess::DecodeEntities)
24
24
 
25
- TODO: well, more of the description... ;-)
25
+
26
+ == SUPPORTED PLATFORMS
27
+
28
+ Requires Ruby version 1.9.2 or higher; use the latest 0.3.x release on older
29
+ Ruby versions. CMess has been tested with ruby 2.0.0p247 on x86_64-linux.
26
30
 
27
31
 
28
32
  == LINKS
29
33
 
30
34
  <b></b>
31
- Documentation:: http://prometheus.rubyforge.org/cmess
32
- Source code:: http://github.com/blackwinter/cmess
33
- RubyForge project:: http://rubyforge.org/projects/prometheus
34
- RubyGem:: http://rubygems.org/gems/cmess
35
+ Documentation:: http://blackwinter.github.com/cmess
36
+ Source code:: http://github.com/blackwinter/cmess
37
+ RubyGem:: http://rubygems.org/gems/cmess
35
38
 
36
39
 
37
40
  == AUTHORS
38
41
 
39
- * Jens Wille <mailto:jens.wille@uni-koeln.de>
42
+ * Jens Wille <mailto:jens.wille@gmail.com>
40
43
 
41
44
 
42
45
  == CREDITS
@@ -48,9 +51,11 @@ RubyGem:: http://rubygems.org/gems/cmess
48
51
 
49
52
  == LICENSE AND COPYRIGHT
50
53
 
51
- Copyright (C) 2007-2011 University of Cologne,
54
+ Copyright (C) 2007-2012 University of Cologne,
52
55
  Albertus-Magnus-Platz, 50923 Cologne, Germany
53
56
 
57
+ Copyright (C) 2013 Jens Wille
58
+
54
59
  cmess is free software: you can redistribute it and/or modify it under the
55
60
  terms of the GNU Affero General Public License as published by the Free
56
61
  Software Foundation, either version 3 of the License, or (at your option)
data/Rakefile CHANGED
@@ -6,21 +6,21 @@ begin
6
6
  require 'hen'
7
7
 
8
8
  Hen.lay! {{
9
- :rubyforge => {
10
- :project => %q{prometheus},
11
- :package => %q{cmess}
12
- },
13
-
14
- :gem => {
15
- :version => CMess::VERSION,
16
- :summary => %Q{
17
- Assist with handling messed up encodings (Currently includes the
18
- following tools: #{Dir['bin/*'].map { |e| File.basename(e) }.sort.join(', ')})
19
- },
20
- :author => %q{Jens Wille},
21
- :email => %q{jens.wille@uni-koeln.de},
22
- :extra_files => FileList['data/**/*'].to_a,
23
- :dependencies => [['ruby-nuggets', '>= 0.3.3'], 'htmlentities']
9
+ gem: {
10
+ name: %q{cmess},
11
+ version: CMess::VERSION,
12
+ summary: <<-EOT,
13
+ Assist with handling messed up encodings (Currently includes the
14
+ following tools: #{Dir['bin/*'].map { |e| File.basename(e) }.sort.join(', ')})
15
+ EOT
16
+ author: %q{Jens Wille},
17
+ email: %q{jens.wille@gmail.com},
18
+ license: %q{AGPL},
19
+ homepage: :blackwinter,
20
+ extra_files: FileList['data/**/*'].to_a,
21
+ dependencies: [['ruby-nuggets', '>= 0.3.3'], 'htmlentities'],
22
+
23
+ required_ruby_version: '>= 1.9.2'
24
24
  }
25
25
  }}
26
26
  rescue LoadError => err
data/bin/bconv CHANGED
@@ -6,12 +6,14 @@
6
6
  # bconv -- Convert between bibliographic (and other) encodings #
7
7
  # [A component of cmess, the encoding tool-box] #
8
8
  # #
9
- # Copyright (C) 2008-2011 University of Cologne, #
9
+ # Copyright (C) 2008-2012 University of Cologne, #
10
10
  # Albertus-Magnus-Platz, #
11
11
  # 50923 Cologne, Germany #
12
12
  # #
13
+ # Copyright (C) 2013 Jens Wille #
14
+ # #
13
15
  # Authors: #
14
- # Jens Wille <jens.wille@uni-koeln.de> #
16
+ # Jens Wille <jens.wille@gmail.com> #
15
17
  # #
16
18
  # cmess is free software; you can redistribute it and/or modify it under the #
17
19
  # terms of the GNU Affero General Public License as published by the Free #
@@ -33,11 +35,11 @@ require 'cmess/bconv'
33
35
  include CMess::CLI
34
36
 
35
37
  options = {
36
- :input => STDIN,
37
- :output => STDOUT,
38
- :source_encoding => determine_system_encoding,
39
- :target_encoding => determine_system_encoding,
40
- :chartab_file => CMess::BConv::DEFAULT_CHARTAB_FILE
38
+ input: STDIN,
39
+ output: STDOUT,
40
+ source_encoding: determine_system_encoding,
41
+ target_encoding: determine_system_encoding,
42
+ chartab_file: CMess::BConv::DEFAULT_CHARTAB_FILE
41
43
  }
42
44
 
43
45
  parse_options { |opts|
@@ -6,12 +6,14 @@
6
6
  # cinderella -- Handle double encoded characters #
7
7
  # [A component of cmess, the encoding tool-box] #
8
8
  # #
9
- # Copyright (C) 2007-2011 University of Cologne, #
9
+ # Copyright (C) 2008-2012 University of Cologne, #
10
10
  # Albertus-Magnus-Platz, #
11
11
  # 50923 Cologne, Germany #
12
12
  # #
13
+ # Copyright (C) 2013 Jens Wille #
14
+ # #
13
15
  # Authors: #
14
- # Jens Wille <jens.wille@uni-koeln.de> #
16
+ # Jens Wille <jens.wille@gmail.com> #
15
17
  # #
16
18
  # cmess is free software; you can redistribute it and/or modify it under the #
17
19
  # terms of the GNU Affero General Public License as published by the Free #
@@ -35,14 +37,14 @@ include CMess::CLI
35
37
  progname = File.basename($0)
36
38
 
37
39
  options = {
38
- :input => STDIN,
39
- :output => STDOUT,
40
- :pot => nil,
41
- :crop => nil,
42
- :source_encoding => nil,
43
- :target_encoding => determine_system_encoding,
44
- :csets => [CMess::Cinderella::DEFAULT_CSETS_DIR],
45
- :repair => false
40
+ input: STDIN,
41
+ output: STDOUT,
42
+ pot: nil,
43
+ crop: nil,
44
+ source_encoding: nil,
45
+ target_encoding: determine_system_encoding,
46
+ csets: [CMess::Cinderella::DEFAULT_CSETS_DIR],
47
+ repair: false
46
48
  }
47
49
 
48
50
  parse_options { |opts|
@@ -169,5 +171,5 @@ cli do
169
171
 
170
172
  trailing_args_as_input(options)
171
173
 
172
- CMess::Cinderella.pick(options.merge(:chars => YAML.load_file(char_file)))
174
+ CMess::Cinderella.pick(options.merge(chars: YAML.load_file(char_file)))
173
175
  end
@@ -6,12 +6,14 @@
6
6
  # decode_entities -- Decode HTML entities #
7
7
  # [A component of cmess, the encoding tool-box] #
8
8
  # #
9
- # Copyright (C) 2007-2011 University of Cologne, #
9
+ # Copyright (C) 2008-2012 University of Cologne, #
10
10
  # Albertus-Magnus-Platz, #
11
11
  # 50923 Cologne, Germany #
12
12
  # #
13
+ # Copyright (C) 2013 Jens Wille #
14
+ # #
13
15
  # Authors: #
14
- # Jens Wille <jens.wille@uni-koeln.de> #
16
+ # Jens Wille <jens.wille@gmail.com> #
15
17
  # #
16
18
  # cmess is free software; you can redistribute it and/or modify it under the #
17
19
  # terms of the GNU Affero General Public License as published by the Free #
@@ -33,11 +35,11 @@ require 'cmess/decode_entities'
33
35
  include CMess::CLI
34
36
 
35
37
  options = {
36
- :input => STDIN,
37
- :output => STDOUT,
38
- :source_encoding => CMess::DecodeEntities::INTERMEDIATE_ENCODING,
39
- :target_encoding => nil,
40
- :flavour => CMess::DecodeEntities::DEFAULT_FLAVOUR
38
+ input: STDIN,
39
+ output: STDOUT,
40
+ source_encoding: CMess::DecodeEntities::ENCODING,
41
+ target_encoding: nil,
42
+ flavour: CMess::DecodeEntities::DEFAULT_FLAVOUR
41
43
  }
42
44
 
43
45
  parse_options { |opts|
@@ -6,12 +6,14 @@
6
6
  # guess_encoding -- Assist with guessing the encoding of some input at hand #
7
7
  # [A component of cmess, the encoding tool-box] #
8
8
  # #
9
- # Copyright (C) 2007-2011 University of Cologne, #
9
+ # Copyright (C) 2008-2012 University of Cologne, #
10
10
  # Albertus-Magnus-Platz, #
11
11
  # 50923 Cologne, Germany #
12
12
  # #
13
+ # Copyright (C) 2013 Jens Wille #
14
+ # #
13
15
  # Authors: #
14
- # Jens Wille <jens.wille@uni-koeln.de> #
16
+ # Jens Wille <jens.wille@gmail.com> #
15
17
  # #
16
18
  # cmess is free software; you can redistribute it and/or modify it under the #
17
19
  # terms of the GNU Affero General Public License as published by the Free #
@@ -33,16 +35,17 @@ require 'cmess/guess_encoding'
33
35
  include CMess::CLI
34
36
 
35
37
  options = {
36
- :input => STDIN,
37
- :line => 1,
38
- :encodings => nil,
39
- :additional_encodings => [],
40
- :target_encoding => determine_system_encoding,
41
- :manual => false,
42
- :chunk_size => nil,
43
- :ignore_bom => false,
44
- :charcodes => nil,
45
- :base => 16
38
+ input: STDIN,
39
+ line: 1,
40
+ encodings: nil,
41
+ additional_encodings: [],
42
+ target_encoding: determine_system_encoding,
43
+ manual: false,
44
+ reverse: false,
45
+ chunk_size: nil,
46
+ ignore_bom: false,
47
+ charcodes: nil,
48
+ base: 16
46
49
  }
47
50
 
48
51
  parse_options { |opts|
@@ -79,12 +82,8 @@ parse_options { |opts|
79
82
  opts.separator ''
80
83
 
81
84
  opts.on('-l', '--line LINE', Integer, "Line number of input file to use for testing [Default: #{options[:line]}]") { |line|
85
+ abort 'Line number must be greater than 0!' unless line > 0
82
86
  options[:line] = line
83
-
84
- unless options[:line] > 0
85
- options[:input].read # prevent 'Broken pipe' error
86
- abort 'Line number must be greater then 0!'
87
- end
88
87
  }
89
88
 
90
89
  opts.separator ''
@@ -111,6 +110,12 @@ parse_options { |opts|
111
110
  exit
112
111
  }
113
112
 
113
+ opts.separator ''
114
+
115
+ opts.on('-R', '--reverse', 'Reverse encoding direction (FROM target TO encodings)') {
116
+ options[:reverse] = true
117
+ }
118
+
114
119
  opts.separator ''
115
120
  opts.separator ' * Charcodes'
116
121
 
@@ -189,7 +194,7 @@ cli do
189
194
  abort "Line not found -- input has only #{$.} line#{'s' if $. != 1}" unless input.is_a?(String)
190
195
  end
191
196
 
192
- CMess::GuessEncoding.manual(options.merge(:input => input))
197
+ CMess::GuessEncoding.manual(options.merge(input: input))
193
198
  else
194
199
  puts CMess::GuessEncoding.automatic(options[:input], options[:chunk_size], options[:ignore_bom])
195
200
  end
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # cmess -- Assist with handling messed up encodings #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # cmess is free software; you can redistribute it and/or modify it under the #
14
16
  # terms of the GNU Affero General Public License as published by the Free #
@@ -27,7 +29,6 @@
27
29
  #++
28
30
 
29
31
  require 'cmess/version'
30
- require 'iconv'
31
32
 
32
33
  # See README for more information.
33
34
 
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2008-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # cmess is free software; you can redistribute it and/or modify it under the #
14
16
  # terms of the GNU Affero General Public License as published by the Free #
@@ -33,9 +35,9 @@ require 'yaml'
33
35
 
34
36
  class CMess::BConv
35
37
 
36
- VERSION = '0.0.3'
38
+ VERSION = '0.1.0'
37
39
 
38
- INTERMEDIATE_ENCODING = 'utf-8'
40
+ ENCODING = 'utf-8'
39
41
 
40
42
  DEFAULT_CHARTAB_FILE = File.join(CMess::DATA_DIR, 'chartab.yaml')
41
43
 
@@ -67,8 +69,6 @@ class CMess::BConv
67
69
 
68
70
  end
69
71
 
70
- attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings
71
-
72
72
  def initialize(options)
73
73
  @input, @output, _ = CMess.ensure_options!(options,
74
74
  :input, :output, :source_encoding, :target_encoding
@@ -79,83 +79,64 @@ class CMess::BConv
79
79
 
80
80
  @chartab = self.class.load_chartab(options[:chartab] || DEFAULT_CHARTAB_FILE)
81
81
  @encodings = self.class.encodings(@chartab)
82
- end
83
82
 
84
- def encoding?(encoding)
85
- encodings.include?(encoding)
83
+ [:source_encoding, :target_encoding].each { |key|
84
+ instance_variable_set("@#{key}", encoding = options[key].upcase)
85
+ instance_variable_set("@have_#{key}", encodings.include?(encoding))
86
+ }
86
87
  end
87
88
 
89
+ attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings
90
+
88
91
  def convert
89
- if encoding?(source_encoding)
90
- if encoding?(target_encoding)
91
- @charmap = chartab.inject({}) { |hash, (code, map)|
92
- hash.update(map[source_encoding] => map[target_encoding].pack('U*'))
93
- }
92
+ source, target, out, charmap = source_encoding, target_encoding, output, {}
94
93
 
95
- input.each_byte { |char|
96
- output.print map(char)
94
+ if @have_source_encoding
95
+ if @have_target_encoding
96
+ chartab.each { |code, map|
97
+ charmap[map[source]] = map[target].pack('U*')
97
98
  }
98
- else
99
- iconv = iconv_to
100
99
 
101
- @charmap = chartab.inject({}) { |hash, (code, map)|
102
- hash.update(map[source_encoding] => [code.to_i(16)].pack('U*'))
100
+ input.each_byte { |char| out.print(map(char, charmap)) }
101
+ else
102
+ chartab.each { |code, map|
103
+ charmap[map[source]] = [code.to_i(16)].pack('U*')
103
104
  }
104
105
 
106
+ source = ENCODING
107
+
105
108
  input.each_byte { |char|
106
- output.print iconv.iconv(map(char))
109
+ out.print(encode(map(char, charmap), source, target))
107
110
  }
108
111
  end
109
112
  else
110
- if encoding?(target_encoding)
111
- iconv = iconv_from
112
-
113
- charmap = chartab.inject({}) { |hash, (code, map)|
114
- hash.update(code.to_i(16) => map[target_encoding].pack('U*'))
113
+ if @have_target_encoding
114
+ chartab.each { |code, map|
115
+ charmap[code.to_i(16)] = map[target].pack('U*')
115
116
  }
116
117
 
118
+ target = ENCODING
119
+
117
120
  input.each { |line|
118
- iconv.iconv(line).unpack('U*').each { |char|
119
- output.print charmap[char]
121
+ encode(line, source, target).unpack('U*').each { |char|
122
+ out.print(charmap[char])
120
123
  }
121
124
  }
122
125
  else
123
- iconv = iconv_from_to
124
-
125
- input.each { |line|
126
- output.puts iconv.iconv(line)
127
- }
126
+ input.each { |line| out.print(encode(line, source, target)) }
128
127
  end
129
128
  end
130
129
  end
131
130
 
132
131
  private
133
132
 
134
- def iconv_from_to(from = source_encoding, to = target_encoding)
135
- iconv = begin
136
- Iconv.new(to, from)
137
- rescue Iconv::InvalidEncoding
138
- raise ArgumentError, "invalid encoding: source encoding = #{from}, target encoding = #{to}"
139
- end
140
-
141
- def iconv.iconv(*args)
142
- super
143
- rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
144
- warn "ILLEGAL INPUT SEQUENCE: #{err}"; ''
145
- end
146
-
147
- iconv
148
- end
149
-
150
- def iconv_from(from = source_encoding)
151
- iconv_from_to(from, INTERMEDIATE_ENCODING)
152
- end
153
-
154
- def iconv_to(to = target_encoding)
155
- iconv_from_to(INTERMEDIATE_ENCODING, to)
133
+ def encode(string, source, target)
134
+ string.encode(target, source)
135
+ rescue Encoding::UndefinedConversionError => err
136
+ warn "ILLEGAL INPUT SEQUENCE: #{err.error_char}"
156
137
  end
157
138
 
158
- def map(char, charmap = @charmap)
139
+ def map(char, charmap)
159
140
  unless map = charmap[[char]]
160
141
  unless map = charmap[[char, c = input.getc]]
161
142
  input.ungetc(c) if c
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # cmess is free software; you can redistribute it and/or modify it under the #
14
16
  # terms of the GNU Affero General Public License as published by the Free #
@@ -39,34 +41,33 @@ module CMess::Cinderella
39
41
 
40
42
  extend self
41
43
 
42
- VERSION = '0.0.5'
44
+ VERSION = '0.1.0'
43
45
 
44
46
  DEFAULT_CSETS_DIR = File.join(CMess::DATA_DIR, 'csets')
45
47
 
46
48
  def pick(options)
47
- CMess.ensure_options!(options,
49
+ input, pot, crop, source, target, chars = CMess.ensure_options!(options,
48
50
  :input, :pot, :crop, :source_encoding, :target_encoding, :chars
49
51
  )
50
52
 
51
53
  encoded = {}
52
- iconv = Iconv.new(*options.values_at(:target_encoding, :source_encoding))
53
-
54
- options[:chars].each { |char|
55
- begin
56
- encoded[iconv.iconv(char)] = char
57
- rescue Iconv::IllegalSequence
58
- end
59
- }
54
+ chars.each { |char| encoded[encode(char, source, target)] = char }
60
55
 
61
56
  regexp = Regexp.union(*encoded.keys)
62
- pot, crop, repair = options.values_at(:pot, :crop, :repair)
63
57
 
64
- options[:input].each { |line|
65
- if out = line =~ regexp ? crop : pot
66
- line.gsub!(regexp) { |m| encoded[m] } if repair
67
- out.puts(line)
68
- end
58
+ input.each { |line|
59
+ out = line =~ regexp ? crop : pot or next
60
+
61
+ line.gsub!(regexp) { |m| encoded[m] } if repair
62
+ out.puts(line)
69
63
  }
70
64
  end
71
65
 
66
+ private
67
+
68
+ def encode(string, source, target)
69
+ string.encode(target, source)
70
+ rescue Encoding::UndefinedConversionError
71
+ end
72
+
72
73
  end
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # cmess is free software; you can redistribute it and/or modify it under the #
14
16
  # terms of the GNU Affero General Public License as published by the Free #
@@ -102,18 +104,12 @@ module CMess::CLI
102
104
  end
103
105
 
104
106
  def determine_system_encoding
105
- ENV.user_encoding || begin
106
- dummy = lambda {
107
- abort <<-EOT
107
+ ENV.user_encoding || lambda {
108
+ abort <<-EOT
108
109
  Your system's encoding couldn't be determined automatically -- please specify
109
110
  it explicitly via the ENCODING environment variable or via the '-t' option.
110
- EOT
111
- }
112
-
113
- def dummy.to_s; 'NOT FOUND' end
114
-
115
- dummy
116
- end
111
+ EOT
112
+ }.tap { |dummy| def dummy.to_s; 'NOT FOUND'; end }
117
113
  end
118
114
 
119
115
  def cli
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # cmess is free software; you can redistribute it and/or modify it under the #
14
16
  # terms of the GNU Affero General Public License as published by the Free #
@@ -33,43 +35,38 @@ module CMess::DecodeEntities
33
35
 
34
36
  extend self
35
37
 
36
- VERSION = '0.0.5'
38
+ VERSION = '0.1.0'
37
39
 
38
40
  # HTMLEntities requires UTF-8
39
- INTERMEDIATE_ENCODING = 'utf-8'
40
-
41
- ICONV_DUMMY = begin
42
- dummy = Object.new
43
-
44
- def dummy.iconv(string)
45
- string
46
- end
47
-
48
- dummy
49
- end
41
+ ENCODING = 'UTF-8'
50
42
 
51
43
  DEFAULT_FLAVOUR = 'xml-safe'
52
44
 
53
45
  def decode(options)
54
- input, output, source_encoding = CMess.ensure_options!(options,
46
+ input, output, source = CMess.ensure_options!(options,
55
47
  :input, :output, :source_encoding
56
48
  )
57
49
 
58
- target_encoding = options[:target_encoding] || source_encoding
59
-
60
- iconv_in = source_encoding != INTERMEDIATE_ENCODING ?
61
- Iconv.new(INTERMEDIATE_ENCODING, source_encoding) : ICONV_DUMMY
50
+ target, entities, encoding = options[:target_encoding] || source,
51
+ HTMLEntities.new(options[:flavour] || DEFAULT_FLAVOUR), ENCODING
62
52
 
63
- iconv_out = target_encoding != INTERMEDIATE_ENCODING ?
64
- Iconv.new(target_encoding, INTERMEDIATE_ENCODING) : ICONV_DUMMY
65
-
66
- html_entities = HTMLEntities.new(options[:flavour] || DEFAULT_FLAVOUR)
53
+ skip_source, skip_target = source == encoding, target == encoding
67
54
 
68
55
  input.each { |line|
69
- output.puts iconv_out.iconv(html_entities.decode(iconv_in.iconv(line)))
56
+ line = encode(line, source, encoding) unless skip_source
57
+ line = entities.decode(line)
58
+ line = encode(line, encoding, target) unless skip_target
59
+
60
+ output.puts(line)
70
61
  }
71
62
  end
72
63
 
64
+ private
65
+
66
+ def encode(string, source, target)
67
+ string.encode(target, source)
68
+ end
69
+
73
70
  end
74
71
 
75
72
  class HTMLEntities # :nodoc:
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # Contributors: #
14
16
  # John Vorhauer <john@vorhauer.de> (idea and original implementation #
@@ -38,7 +40,7 @@ require 'cmess'
38
40
 
39
41
  module CMess::GuessEncoding
40
42
 
41
- VERSION = '0.1.0'
43
+ VERSION = '0.2.0'
42
44
 
43
45
  autoload :Encoding, 'cmess/guess_encoding/encoding'
44
46
  autoload :Manual, 'cmess/guess_encoding/manual'
@@ -5,12 +5,14 @@
5
5
  # #
6
6
  # A component of cmess, the encoding tool-box. #
7
7
  # #
8
- # Copyright (C) 2007-2011 University of Cologne, #
8
+ # Copyright (C) 2008-2012 University of Cologne, #
9
9
  # Albertus-Magnus-Platz, #
10
10
  # 50923 Cologne, Germany #
11
11
  # #
12
+ # Copyright (C) 2013 Jens Wille #
13
+ # #
12
14
  # Authors: #
13
- # Jens Wille <jens.wille@uni-koeln.de> #
15
+ # Jens Wille <jens.wille@gmail.com> #
14
16
  # #
15
17
  # Contributors: #
16
18
  # John Vorhauer <john@vorhauer.de> (idea and original implementation #
@@ -32,8 +34,6 @@
32
34
  ###############################################################################
33
35
  #++
34
36
 
35
- $KCODE = 'u' if RUBY_VERSION < '1.9'
36
-
37
37
  require 'cmess/guess_encoding'
38
38
 
39
39
  require 'yaml'
@@ -57,9 +57,6 @@ class CMess::GuessEncoding::Automatic
57
57
 
58
58
  include CMess::GuessEncoding::Encoding
59
59
 
60
- # Creates a converter for desired encoding (from UTF-8).
61
- ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
62
-
63
60
  # Single-byte encodings to test statistically by TEST_CHARS.
64
61
  TEST_ENCODINGS = [
65
62
  MACINTOSH,
@@ -87,22 +84,13 @@ class CMess::GuessEncoding::Automatic
87
84
  CHARS_TO_TEST = (
88
85
  '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
89
86
  'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
90
- ).split(//)
87
+ ).chars.to_a
91
88
 
92
89
  # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST.
93
- TEST_CHARS = Hash.new { |hash, encoding|
94
- encoding = self[encoding]
95
-
96
- encchars = CHARS_TO_TEST.map { |char|
97
- begin
98
- byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
99
- rescue Iconv::IllegalSequence
100
- end
101
- }.compact
102
-
103
- TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
104
-
105
- hash[encoding] = encchars
90
+ TEST_CHARS = Hash.new { |h, k|
91
+ e, f = self[k], UTF_8
92
+ TEST_ENCODINGS << e unless TEST_ENCODINGS.include?(e)
93
+ h[e] = CHARS_TO_TEST.flat_map { |c| c.encode(e, f).unpack('C') }
106
94
  }.update(YAML.load_file(File.join(CMess::DATA_DIR, 'test_chars.yaml')))
107
95
 
108
96
  # Relative count of TEST_CHARS must exceed this threshold to yield
@@ -134,10 +122,10 @@ class CMess::GuessEncoding::Automatic
134
122
 
135
123
  def encoding(*encodings, &block)
136
124
  encodings.flatten.each { |encoding|
137
- next if @supported_encodings.include?(encoding)
138
-
139
- @supported_encodings << encoding
140
- @encoding_guessers << block
125
+ unless @supported_encodings.include?(encoding)
126
+ @supported_encodings << encoding
127
+ @encoding_guessers << block
128
+ end
141
129
  }
142
130
  end
143
131
 
@@ -146,10 +134,10 @@ class CMess::GuessEncoding::Automatic
146
134
  end
147
135
 
148
136
  def bom_encoding(encoding, &block)
149
- return if @supported_boms.include?(encoding)
150
-
151
- @supported_boms << encoding
152
- @bom_guessers << lambda { |*| encoding if instance_eval(&block) }
137
+ unless @supported_boms.include?(encoding)
138
+ @supported_boms << encoding
139
+ @bom_guessers << lambda { |*| encoding if instance_eval(&block) }
140
+ end
153
141
  end
154
142
 
155
143
  def supported_bom?(encoding)
@@ -158,30 +146,30 @@ class CMess::GuessEncoding::Automatic
158
146
 
159
147
  end
160
148
 
161
- attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
162
-
163
149
  def initialize(input, chunk_size = nil)
164
150
  @input = case input
165
151
  when IO then input
166
152
  when String then StringIO.new(input)
167
- else
168
- raise ArgumentError, "don't know how to handle input of type #{input.class}"
153
+ else raise ArgumentError,
154
+ "don't know how to handle input of type #{input.class}"
169
155
  end
170
156
 
171
157
  @chunk_size = chunk_size
172
158
  end
173
159
 
160
+ attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
161
+
174
162
  def guess(ignore_bom = false)
175
163
  return bom if bom && !ignore_bom
176
164
 
177
165
  while read
178
166
  encoding_guessers.each { |block|
179
- encoding = instance_eval(&block)
180
- return encoding if encoding && supported_encoding?(encoding)
167
+ if encoding = instance_eval(&block) and supported_encoding?(encoding)
168
+ return encoding
169
+ end
181
170
  }
182
171
  end
183
172
 
184
- # nothing suitable found :-(
185
173
  UNKNOWN
186
174
  end
187
175
 
@@ -206,14 +194,13 @@ class CMess::GuessEncoding::Automatic
206
194
  end
207
195
 
208
196
  bom_guessers.each { |block|
209
- encoding = instance_eval(&block)
210
- return encoding if encoding && supported_bom?(encoding)
211
-
212
- # read bytes don't build a BOM, so rewind...
213
- input.rewind
197
+ if encoding = instance_eval(&block) and supported_encoding?(encoding)
198
+ return encoding
199
+ else
200
+ input.rewind
201
+ end
214
202
  }
215
203
 
216
- # nothing suitable found :-(
217
204
  nil
218
205
  end
219
206
 
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # Contributors: #
14
16
  # John Vorhauer <john@vorhauer.de> (idea and original implementation #
@@ -50,9 +52,7 @@ module CMess::GuessEncoding::Encoding
50
52
  private
51
53
 
52
54
  def get_all_encodings
53
- %x{iconv -l}.split($/).map { |encoding|
54
- get_or_set_encoding_const(encoding.sub(%r{/*\z}, ''))
55
- }
55
+ Encoding.name_list.map { |encoding| get_or_set_encoding_const(encoding) }
56
56
  end
57
57
 
58
58
  def const_name_for(encoding)
@@ -3,12 +3,14 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2011 University of Cologne, #
6
+ # Copyright (C) 2008-2012 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
10
+ # Copyright (C) 2013 Jens Wille #
11
+ # #
10
12
  # Authors: #
11
- # Jens Wille <jens.wille@uni-koeln.de> #
13
+ # Jens Wille <jens.wille@gmail.com> #
12
14
  # #
13
15
  # Contributors: #
14
16
  # John Vorhauer <john@vorhauer.de> (idea and original implementation #
@@ -57,13 +59,13 @@ module CMess::GuessEncoding::Manual
57
59
  CP1252,
58
60
  CP850,
59
61
  CP852,
60
- CP856,
61
62
  UTF_8
62
63
  ]
63
64
 
64
65
  # Likely candidates to suggest to the user
65
66
  CANDIDATES = [
66
67
  ANSI_X34,
68
+ CP856,
67
69
  EBCDIC_AT_DE,
68
70
  EBCDIC_US,
69
71
  EUC_JP,
@@ -95,19 +97,18 @@ module CMess::GuessEncoding::Manual
95
97
  # move target encoding to front
96
98
  encodings.in_order!(target)
97
99
 
98
- max_length = encodings.max(:length)
100
+ max_length, reverse = encodings.max(:length), options[:reverse]
99
101
 
100
102
  encodings.each { |encoding|
103
+ args = [target, encoding]
104
+ args.reverse! if reverse
105
+
101
106
  converted = begin
102
- Iconv.conv(target, encoding, input)
103
- rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
104
- "ILLEGAL INPUT SEQUENCE: #{err}"
105
- rescue Iconv::InvalidEncoding
106
- if encoding == target
107
- raise ArgumentError, "invalid encoding: #{encoding}"
108
- else
109
- 'INVALID ENCODING!'
110
- end
107
+ input.encode(*args)
108
+ rescue Encoding::UndefinedConversionError => err
109
+ "ILLEGAL INPUT SEQUENCE: #{err.error_char}"
110
+ rescue Encoding::ConverterNotFoundError => err
111
+ err.to_s
111
112
  end
112
113
 
113
114
  puts "%-#{max_length}s : %s" % [encoding, converted]
@@ -3,8 +3,8 @@ module CMess
3
3
  module Version
4
4
 
5
5
  MAJOR = 0
6
- MINOR = 3
7
- TINY = 1
6
+ MINOR = 4
7
+ TINY = 0
8
8
 
9
9
  class << self
10
10
 
metadata CHANGED
@@ -1,153 +1,137 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: cmess
3
- version: !ruby/object:Gem::Version
4
- hash: 17
5
- prerelease:
6
- segments:
7
- - 0
8
- - 3
9
- - 1
10
- version: 0.3.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Jens Wille
14
8
  autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2011-08-16 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
11
+ date: 2013-08-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
21
14
  name: ruby-nuggets
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- hash: 21
29
- segments:
30
- - 0
31
- - 3
32
- - 3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
33
19
  version: 0.3.3
34
20
  type: :runtime
35
- version_requirements: *id001
36
- - !ruby/object:Gem::Dependency
37
- name: htmlentities
38
21
  prerelease: false
39
- requirement: &id002 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
42
- - - ">="
43
- - !ruby/object:Gem::Version
44
- hash: 3
45
- segments:
46
- - 0
47
- version: "0"
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.3.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: htmlentities
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
48
34
  type: :runtime
49
- version_requirements: *id002
50
- description: "\n Assist with handling messed up encodings (Currently includes the\n following tools: bconv, cinderella, decode_entities, guess_encoding)\n "
51
- email: jens.wille@uni-koeln.de
52
- executables:
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: |
42
+ Assist with handling messed up encodings (Currently includes the
43
+ following tools: bconv, cinderella, decode_entities, guess_encoding)
44
+ email: jens.wille@gmail.com
45
+ executables:
53
46
  - bconv
47
+ - cinderella
54
48
  - decode_entities
55
49
  - guess_encoding
56
- - cinderella
57
50
  extensions: []
58
-
59
- extra_rdoc_files:
51
+ extra_rdoc_files:
60
52
  - README
61
53
  - COPYING
62
54
  - ChangeLog
63
- files:
55
+ files:
64
56
  - lib/cmess.rb
65
- - lib/cmess/guess_encoding/automatic.rb
66
- - lib/cmess/guess_encoding/encoding.rb
67
- - lib/cmess/guess_encoding/manual.rb
68
57
  - lib/cmess/bconv.rb
69
- - lib/cmess/cli.rb
70
58
  - lib/cmess/cinderella.rb
71
- - lib/cmess/guess_encoding.rb
59
+ - lib/cmess/cli.rb
72
60
  - lib/cmess/decode_entities.rb
61
+ - lib/cmess/guess_encoding.rb
62
+ - lib/cmess/guess_encoding/automatic.rb
63
+ - lib/cmess/guess_encoding/encoding.rb
64
+ - lib/cmess/guess_encoding/manual.rb
73
65
  - lib/cmess/version.rb
74
66
  - bin/bconv
67
+ - bin/cinderella
75
68
  - bin/decode_entities
76
69
  - bin/guess_encoding
77
- - bin/cinderella
78
- - data/csets/latin1.yaml
79
- - data/csets/iso_8859-15.yaml
70
+ - data/chartab.yaml
80
71
  - data/csets/iso_8859-1.yaml
81
- - data/csets/unicode/latin_extended_a.yaml
72
+ - data/csets/iso_8859-15.yaml
73
+ - data/csets/latin1.yaml
82
74
  - data/csets/unicode/basic_latin.yaml
75
+ - data/csets/unicode/cyrillic-supplement.yaml
76
+ - data/csets/unicode/cyrillic.yaml
77
+ - data/csets/unicode/greek.yaml
83
78
  - data/csets/unicode/ipa_extensions.yaml
84
- - data/csets/unicode/latin_extended_b.yaml
79
+ - data/csets/unicode/latin-extended-c.yaml
85
80
  - data/csets/unicode/latin-extended-d.yaml
86
- - data/csets/unicode/letterlike_symbols.yaml
81
+ - data/csets/unicode/latin_1_supplement.yaml
82
+ - data/csets/unicode/latin_extended_a.yaml
87
83
  - data/csets/unicode/latin_extended_additional.yaml
88
- - data/csets/unicode/greek.yaml
89
- - data/csets/unicode/latin-extended-c.yaml
84
+ - data/csets/unicode/latin_extended_b.yaml
85
+ - data/csets/unicode/letterlike_symbols.yaml
90
86
  - data/csets/unicode/spacing_modifier_letters.yaml
91
- - data/csets/unicode/cyrillic-supplement.yaml
92
- - data/csets/unicode/cyrillic.yaml
93
- - data/csets/unicode/latin_1_supplement.yaml
94
87
  - data/csets/utf-8.yaml
95
88
  - data/csets/utf8.yaml
96
89
  - data/test_chars.yaml
97
- - data/chartab.yaml
98
- - README
90
+ - COPYING
99
91
  - ChangeLog
92
+ - README
100
93
  - Rakefile
101
- - COPYING
102
- - example/guess_encoding/en.utf-8.txt
103
- - example/guess_encoding/de.utf-8.txt
104
- - example/guess_encoding/it.utf-8.txt
105
- - example/guess_encoding/check_results
106
- - example/guess_encoding/fr.utf-8.txt
107
- - example/cinderella/empty6-slash_repaired.txt
108
- - example/cinderella/empty6-slash.txt
109
94
  - example/cinderella/crop
110
- - example/cinderella/pot
111
95
  - example/cinderella/crop_repaired
112
- homepage: http://prometheus.rubyforge.org/cmess
113
- licenses: []
114
-
96
+ - example/cinderella/empty6-slash.txt
97
+ - example/cinderella/empty6-slash_repaired.txt
98
+ - example/cinderella/pot
99
+ - example/guess_encoding/check_results
100
+ - example/guess_encoding/de.utf-8.txt
101
+ - example/guess_encoding/en.utf-8.txt
102
+ - example/guess_encoding/fr.utf-8.txt
103
+ - example/guess_encoding/it.utf-8.txt
104
+ homepage: http://github.com/blackwinter/cmess
105
+ licenses:
106
+ - AGPL
107
+ metadata: {}
115
108
  post_install_message:
116
- rdoc_options:
117
- - --main
118
- - README
109
+ rdoc_options:
119
110
  - --charset
120
111
  - UTF-8
112
+ - --line-numbers
121
113
  - --all
122
114
  - --title
123
- - cmess Application documentation (v0.3.1)
124
- - --line-numbers
125
- require_paths:
115
+ - cmess Application documentation (v0.4.0)
116
+ - --main
117
+ - README
118
+ require_paths:
126
119
  - lib
127
- required_ruby_version: !ruby/object:Gem::Requirement
128
- none: false
129
- requirements:
130
- - - ">="
131
- - !ruby/object:Gem::Version
132
- hash: 3
133
- segments:
134
- - 0
135
- version: "0"
136
- required_rubygems_version: !ruby/object:Gem::Requirement
137
- none: false
138
- requirements:
139
- - - ">="
140
- - !ruby/object:Gem::Version
141
- hash: 3
142
- segments:
143
- - 0
144
- version: "0"
120
+ required_ruby_version: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '>='
123
+ - !ruby/object:Gem::Version
124
+ version: 1.9.2
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - '>='
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
145
130
  requirements: []
146
-
147
- rubyforge_project: prometheus
148
- rubygems_version: 1.8.8
131
+ rubyforge_project:
132
+ rubygems_version: 2.0.6
149
133
  signing_key:
150
- specification_version: 3
151
- summary: "Assist with handling messed up encodings (Currently includes the following tools: bconv, cinderella, decode_entities, guess_encoding)"
134
+ specification_version: 4
135
+ summary: 'Assist with handling messed up encodings (Currently includes the following
136
+ tools: bconv, cinderella, decode_entities, guess_encoding)'
152
137
  test_files: []
153
-