cmess 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/cmess.rb CHANGED
@@ -3,47 +3,56 @@
3
3
  # #
4
4
  # cmess -- Assist with handling messed up encodings #
5
5
  # #
6
- # Copyright (C) 2007 University of Cologne, #
7
- # Albertus-Magnus-Platz, #
8
- # 50923 Cologne, Germany #
6
+ # Copyright (C) 2007-2011 University of Cologne, #
7
+ # Albertus-Magnus-Platz, #
8
+ # 50923 Cologne, Germany #
9
9
  # #
10
10
  # Authors: #
11
11
  # Jens Wille <jens.wille@uni-koeln.de> #
12
12
  # #
13
13
  # cmess is free software; you can redistribute it and/or modify it under the #
14
- # terms of the GNU General Public License as published by the Free Software #
15
- # Foundation; either version 3 of the License, or (at your option) any later #
16
- # version. #
14
+ # terms of the GNU Affero General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
17
  # #
18
18
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
19
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
- # details. #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
21
+ # more details. #
22
22
  # #
23
- # You should have received a copy of the GNU General Public License along #
24
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
23
+ # You should have received a copy of the GNU Affero General Public License #
24
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
25
  # #
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
- # Bundles several tools that aim at dealing with various problems occurring in
30
- # the context of character sets and encodings. Currently, there are:
31
- #
32
- # guess_encoding:: Simple helper to identify the encoding of a given string.
33
- # Includes the ability to automatically detect the encoding
34
- # of an input. (see GuessEncoding)
35
- # cinderella:: When characters are "double encoded", you can't easily
36
- # convert them back -- this is where cinderella comes in,
37
- # sorting the good ones into the pot and the (potentially)
38
- # bad ones into the crop... (see Cinderella)
39
- # bconv:: Convert between bibliographic (and other) encodings.
40
- # (see BConv)
41
- # decode_entities:: Decode HTML entities in a string. (see DecodeEntities)
29
+ require 'cmess/version'
30
+ require 'iconv'
31
+
32
+ # See README for more information.
42
33
 
43
34
  module CMess
44
35
 
45
- DATA_DIR = File.expand_path(File.join(File.dirname(__FILE__), '..', 'data'))
36
+ autoload :BConv, 'cmess/bconv'
37
+ autoload :Cinderella, 'cmess/cinderella'
38
+ autoload :CLI, 'cmess/cli'
39
+ autoload :DecodeEntities, 'cmess/decode_entities'
40
+ autoload :GuessEncoding, 'cmess/guess_encoding'
46
41
 
47
- end
42
+ DATA_DIR = File.expand_path('../../data', __FILE__)
48
43
 
49
- require 'cmess/version'
44
+ class << self
45
+
46
+ def ensure_options!(options, *required)
47
+ values = options.values_at(*required)
48
+
49
+ missing = values.select { |value| value.nil? }
50
+ return values if missing.empty?
51
+
52
+ msg = "required options missing: #{missing.join(', ')}"
53
+ raise ArgumentError, msg, caller(1)
54
+ end
55
+
56
+ end
57
+
58
+ end
data/lib/cmess/bconv.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2008-2010 University of Cologne, #
6
+ # Copyright (C) 2008-2011 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
@@ -11,39 +11,40 @@
11
11
  # Jens Wille <jens.wille@uni-koeln.de> #
12
12
  # #
13
13
  # cmess is free software; you can redistribute it and/or modify it under the #
14
- # terms of the GNU General Public License as published by the Free Software #
15
- # Foundation; either version 3 of the License, or (at your option) any later #
16
- # version. #
14
+ # terms of the GNU Affero General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
17
  # #
18
18
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
19
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
- # details. #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
21
+ # more details. #
22
22
  # #
23
- # You should have received a copy of the GNU General Public License along #
24
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
23
+ # You should have received a copy of the GNU Affero General Public License #
24
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
25
  # #
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
- require 'yaml'
30
- require 'iconv'
31
29
  require 'cmess'
30
+ require 'yaml'
32
31
 
33
32
  # Convert between bibliographic (and other) encodings.
34
33
 
35
- module CMess
36
- class BConv
34
+ class CMess::BConv
37
35
 
38
- # our version ;-)
39
- VERSION = '0.0.2'
36
+ VERSION = '0.0.3'
40
37
 
41
38
  INTERMEDIATE_ENCODING = 'utf-8'
42
39
 
43
- DEFAULT_CHARTAB_FILE = File.join(DATA_DIR, 'chartab.yaml')
40
+ DEFAULT_CHARTAB_FILE = File.join(CMess::DATA_DIR, 'chartab.yaml')
44
41
 
45
42
  class << self
46
43
 
44
+ def convert(*args)
45
+ new(*args).convert
46
+ end
47
+
47
48
  def encodings(chartab = DEFAULT_CHARTAB_FILE)
48
49
  chartab = load_chartab(chartab)
49
50
 
@@ -52,10 +53,6 @@ module CMess
52
53
  }.compact.sort
53
54
  end
54
55
 
55
- def convert(*args)
56
- new(*args).convert
57
- end
58
-
59
56
  def load_chartab(chartab)
60
57
  case chartab
61
58
  when Hash
@@ -72,13 +69,15 @@ module CMess
72
69
 
73
70
  attr_reader :input, :output, :source_encoding, :target_encoding, :chartab, :encodings
74
71
 
75
- def initialize(input, output, source_encoding, target_encoding, chartab = DEFAULT_CHARTAB_FILE)
76
- @input, @output = input, output
72
+ def initialize(options)
73
+ @input, @output, _ = CMess.ensure_options!(options,
74
+ :input, :output, :source_encoding, :target_encoding
75
+ )
77
76
 
78
- @source_encoding = source_encoding.upcase
79
- @target_encoding = target_encoding.upcase
77
+ @source_encoding = options[:source_encoding].upcase
78
+ @target_encoding = options[:target_encoding].upcase
80
79
 
81
- @chartab = self.class.load_chartab(chartab)
80
+ @chartab = self.class.load_chartab(options[:chartab] || DEFAULT_CHARTAB_FILE)
82
81
  @encodings = self.class.encodings(@chartab)
83
82
  end
84
83
 
@@ -167,5 +166,4 @@ module CMess
167
166
  map
168
167
  end
169
168
 
170
- end
171
169
  end
@@ -3,7 +3,7 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2010 University of Cologne, #
6
+ # Copyright (C) 2007-2011 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
@@ -11,22 +11,21 @@
11
11
  # Jens Wille <jens.wille@uni-koeln.de> #
12
12
  # #
13
13
  # cmess is free software; you can redistribute it and/or modify it under the #
14
- # terms of the GNU General Public License as published by the Free Software #
15
- # Foundation; either version 3 of the License, or (at your option) any later #
16
- # version. #
14
+ # terms of the GNU Affero General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
17
  # #
18
18
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
19
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
- # details. #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
21
+ # more details. #
22
22
  # #
23
- # You should have received a copy of the GNU General Public License along #
24
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
23
+ # You should have received a copy of the GNU Affero General Public License #
24
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
25
  # #
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
- require 'iconv'
30
29
  require 'cmess'
31
30
 
32
31
  # Find (and possibly repair) doubly encoded characters. Here's how it's done:
@@ -36,20 +35,23 @@ require 'cmess'
36
35
  # containing those doubly encoded characters; if asked to repair doubly
37
36
  # encoded characters, substitutes them with their original character.
38
37
 
39
- module CMess
40
- module Cinderella
38
+ module CMess::Cinderella
41
39
 
42
40
  extend self
43
41
 
44
- # our version ;-)
45
- VERSION = '0.0.4'
42
+ VERSION = '0.0.5'
46
43
 
47
- DEFAULT_CSETS_DIR = File.join(DATA_DIR, 'csets')
44
+ DEFAULT_CSETS_DIR = File.join(CMess::DATA_DIR, 'csets')
48
45
 
49
- def pick(input, pot, crop, source_encoding, target_encoding, chars, repair = false)
50
- iconv, encoded = Iconv.new(target_encoding, source_encoding), {}
46
+ def pick(options)
47
+ CMess.ensure_options!(options,
48
+ :input, :pot, :crop, :source_encoding, :target_encoding, :chars
49
+ )
51
50
 
52
- chars.each { |char|
51
+ encoded = {}
52
+ iconv = Iconv.new(*options.values_at(:target_encoding, :source_encoding))
53
+
54
+ options[:chars].each { |char|
53
55
  begin
54
56
  encoded[iconv.iconv(char)] = char
55
57
  rescue Iconv::IllegalSequence
@@ -57,15 +59,14 @@ module CMess
57
59
  }
58
60
 
59
61
  regexp = Regexp.union(*encoded.keys)
62
+ pot, crop, repair = options.values_at(:pot, :crop, :repair)
60
63
 
61
- input.each { |line|
64
+ options[:input].each { |line|
62
65
  if out = line =~ regexp ? crop : pot
63
66
  line.gsub!(regexp) { |m| encoded[m] } if repair
64
-
65
67
  out.puts(line)
66
68
  end
67
69
  }
68
70
  end
69
71
 
70
- end
71
72
  end
data/lib/cmess/cli.rb CHANGED
@@ -3,7 +3,7 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2010 University of Cologne, #
6
+ # Copyright (C) 2007-2011 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
@@ -11,31 +11,42 @@
11
11
  # Jens Wille <jens.wille@uni-koeln.de> #
12
12
  # #
13
13
  # cmess is free software; you can redistribute it and/or modify it under the #
14
- # terms of the GNU General Public License as published by the Free Software #
15
- # Foundation; either version 3 of the License, or (at your option) any later #
16
- # version. #
14
+ # terms of the GNU Affero General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
17
  # #
18
18
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
19
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
- # details. #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
21
+ # more details. #
22
22
  # #
23
- # You should have received a copy of the GNU General Public License along #
24
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
23
+ # You should have received a copy of the GNU Affero General Public License #
24
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
25
  # #
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
- require 'tempfile'
29
+ require 'cmess'
30
30
 
31
- require 'rubygems'
31
+ require 'optparse'
32
+ require 'tempfile'
33
+ require 'yaml'
32
34
  require 'nuggets/env/user_encoding'
35
+ require 'nuggets/string/capitalize_first'
36
+ require 'nuggets/string/word_wrap'
33
37
 
34
- module CMess
35
- module CLI
38
+ module CMess::CLI
36
39
 
37
- # how to split list of arguments
38
- SPLIT_ARG_LIST_RE = /\s*[,\s]\s*/o
40
+ # How to split a list of arguments.
41
+ SPLIT_ARG_LIST_RE = %r{\s*[,\s]\s*}
42
+
43
+ def parse_options(&block)
44
+ OptionParser.new(nil, 40, &block).parse!
45
+ end
46
+
47
+ def arg_list(arg)
48
+ arg.split(SPLIT_ARG_LIST_RE)
49
+ end
39
50
 
40
51
  def ensure_readable(file)
41
52
  abort "Can't find input file: #{file}" unless File.readable?(file)
@@ -55,7 +66,7 @@ module CMess
55
66
  when 'r' then STDIN
56
67
  when 'w' then STDOUT
57
68
  when 'a' then STDERR
58
- else raise ArgumentError, "don't know how to handle mode '#{mode}'"
69
+ else raise ArgumentError, "don't know how to handle mode `#{mode}'"
59
70
  end
60
71
  else
61
72
  ensure_readable(file) unless mode == 'w'
@@ -114,9 +125,8 @@ it explicitly via the ENCODING environment variable or via the '-t' option.
114
125
 
115
126
  abort "#{backtrace.first} #{err} (#{err.class})#{fromtrace}"
116
127
  else
117
- abort "#{err.to_s.capitalize} [#{err.backtrace.first}]"
128
+ abort "#{err.to_s.capitalize_first} [#{err.backtrace.first}]"
118
129
  end
119
130
  end
120
131
 
121
- end
122
132
  end
@@ -3,7 +3,7 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2010 University of Cologne, #
6
+ # Copyright (C) 2007-2011 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
@@ -11,33 +11,29 @@
11
11
  # Jens Wille <jens.wille@uni-koeln.de> #
12
12
  # #
13
13
  # cmess is free software; you can redistribute it and/or modify it under the #
14
- # terms of the GNU General Public License as published by the Free Software #
15
- # Foundation; either version 3 of the License, or (at your option) any later #
16
- # version. #
14
+ # terms of the GNU Affero General Public License as published by the Free #
15
+ # Software Foundation; either version 3 of the License, or (at your option) #
16
+ # any later version. #
17
17
  # #
18
18
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
19
19
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
20
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
21
- # details. #
20
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
21
+ # more details. #
22
22
  # #
23
- # You should have received a copy of the GNU General Public License along #
24
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
23
+ # You should have received a copy of the GNU Affero General Public License #
24
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
25
25
  # #
26
26
  ###############################################################################
27
27
  #++
28
28
 
29
- require 'iconv'
30
-
31
- require 'rubygems'
29
+ require 'cmess'
32
30
  require 'htmlentities'
33
31
 
34
- module CMess
35
- module DecodeEntities
32
+ module CMess::DecodeEntities
36
33
 
37
34
  extend self
38
35
 
39
- # our version ;-)
40
- VERSION = '0.0.4'
36
+ VERSION = '0.0.5'
41
37
 
42
38
  # HTMLEntities requires UTF-8
43
39
  INTERMEDIATE_ENCODING = 'utf-8'
@@ -54,8 +50,12 @@ module CMess
54
50
 
55
51
  DEFAULT_FLAVOUR = 'xml-safe'
56
52
 
57
- def decode(input, output, source_encoding, target_encoding = nil, flavour = nil)
58
- target_encoding ||= source_encoding
53
+ def decode(options)
54
+ input, output, source_encoding = CMess.ensure_options!(options,
55
+ :input, :output, :source_encoding
56
+ )
57
+
58
+ target_encoding = options[:target_encoding] || source_encoding
59
59
 
60
60
  iconv_in = source_encoding != INTERMEDIATE_ENCODING ?
61
61
  Iconv.new(INTERMEDIATE_ENCODING, source_encoding) : ICONV_DUMMY
@@ -63,17 +63,16 @@ module CMess
63
63
  iconv_out = target_encoding != INTERMEDIATE_ENCODING ?
64
64
  Iconv.new(target_encoding, INTERMEDIATE_ENCODING) : ICONV_DUMMY
65
65
 
66
- html_entities = HTMLEntities.new(flavour || DEFAULT_FLAVOUR)
66
+ html_entities = HTMLEntities.new(options[:flavour] || DEFAULT_FLAVOUR)
67
67
 
68
68
  input.each { |line|
69
69
  output.puts iconv_out.iconv(html_entities.decode(iconv_in.iconv(line)))
70
70
  }
71
71
  end
72
72
 
73
- end
74
73
  end
75
74
 
76
- class HTMLEntities
75
+ class HTMLEntities # :nodoc:
77
76
  FLAVORS << 'xml-safe'
78
77
  MAPPINGS['xml-safe'] = MAPPINGS['xhtml1'].dup
79
78
  %w[amp apos gt lt quot].each { |key| MAPPINGS['xml-safe'].delete(key) }
@@ -3,7 +3,7 @@
3
3
  # #
4
4
  # A component of cmess, the encoding tool-box. #
5
5
  # #
6
- # Copyright (C) 2007-2010 University of Cologne, #
6
+ # Copyright (C) 2007-2011 University of Cologne, #
7
7
  # Albertus-Magnus-Platz, #
8
8
  # 50923 Cologne, Germany #
9
9
  # #
@@ -15,30 +15,34 @@
15
15
  # for automatic encoding detection) #
16
16
  # #
17
17
  # cmess is free software; you can redistribute it and/or modify it under the #
18
- # terms of the GNU General Public License as published by the Free Software #
19
- # Foundation; either version 3 of the License, or (at your option) any later #
20
- # version. #
18
+ # terms of the GNU Affero General Public License as published by the Free #
19
+ # Software Foundation; either version 3 of the License, or (at your option) #
20
+ # any later version. #
21
21
  # #
22
22
  # cmess is distributed in the hope that it will be useful, but WITHOUT ANY #
23
23
  # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
24
- # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more #
25
- # details. #
24
+ # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for #
25
+ # more details. #
26
26
  # #
27
- # You should have received a copy of the GNU General Public License along #
28
- # with cmess. If not, see <http://www.gnu.org/licenses/>. #
27
+ # You should have received a copy of the GNU Affero General Public License #
28
+ # along with cmess. If not, see <http://www.gnu.org/licenses/>. #
29
29
  # #
30
30
  ###############################################################################
31
31
  #++
32
32
 
33
+ require 'cmess'
34
+
33
35
  # Allows guessing an input's encoding either manually or automatically.
34
36
  # Actually works pretty well -- for the supported encodings. See Manual
35
37
  # and Automatic for details.
36
38
 
37
- module CMess
38
- module GuessEncoding
39
+ module CMess::GuessEncoding
40
+
41
+ VERSION = '0.1.0'
39
42
 
40
- # our version ;-)
41
- VERSION = '0.0.9'
43
+ autoload :Encoding, 'cmess/guess_encoding/encoding'
44
+ autoload :Manual, 'cmess/guess_encoding/manual'
45
+ autoload :Automatic, 'cmess/guess_encoding/automatic'
42
46
 
43
47
  class << self
44
48
 
@@ -46,16 +50,14 @@ module CMess
46
50
  Manual.display(*args)
47
51
  end
48
52
 
53
+ alias_method :display, :manual
54
+
49
55
  def automatic(*args)
50
56
  Automatic.guess(*args)
51
57
  end
52
58
 
53
- end
59
+ alias_method :guess, :automatic
54
60
 
55
61
  end
56
- end
57
62
 
58
- %w[encoding manual automatic].each { |lib|
59
- lib = "cmess/guess_encoding/#{lib}"
60
- require lib
61
- }
63
+ end