RubyGems - cmess - Versions diffs - 0.2.4 → 0.3.0 - Mend

cmess 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

data/COPYING +68 -81
data/ChangeLog +28 -0
data/README +23 -21
data/Rakefile +15 -16
data/bin/bconv +30 -47
data/bin/cinderella +51 -68
data/bin/decode_entities +28 -36
data/bin/guess_encoding +53 -81
data/lib/cmess.rb +35 -26
data/lib/cmess/bconv.rb +23 -25
data/lib/cmess/cinderella.rb +21 -20
data/lib/cmess/cli.rb +27 -17
data/lib/cmess/decode_entities.rb +19 -20
data/lib/cmess/guess_encoding.rb +20 -18
data/lib/cmess/guess_encoding/automatic.rb +151 -125
data/lib/cmess/guess_encoding/encoding.rb +16 -18
data/lib/cmess/guess_encoding/manual.rb +26 -31
data/lib/cmess/version.rb +2 -2
metadata +25 -28

data/lib/cmess/guess_encoding/automatic.rb CHANGED

@@ -5,7 +5,7 @@
 #                                                                             #
 # A component of cmess, the encoding tool-box.                                #
 #                                                                             #
-# Copyright (C) 2007-2010 University of Cologne,                              #
+# Copyright (C) 2007-2011 University of Cologne,                              #
 #                         Albertus-Magnus-Platz,                              #
 #                         50923 Cologne, Germany                              #
 #                                                                             #
@@ -17,49 +17,50 @@
 #                                       for automatic encoding detection)     #
 #                                                                             #
 # cmess is free software; you can redistribute it and/or modify it under the  #
-# terms of the GNU General Public License as published by the Free Software   #
-# Foundation; either version 3 of the License, or (at your option) any later  #
-# version.                                                                    #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
 #                                                                             #
 # cmess is distributed in the hope that it will be useful, but WITHOUT ANY    #
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more       #
-# details.                                                                    #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
 #                                                                             #
-# You should have received a copy of the GNU General Public License along     #
-# with cmess. If not, see <http://www.gnu.org/licenses/>.                     #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with cmess. If not, see <http://www.gnu.org/licenses/>.               #
 #                                                                             #
 ###############################################################################
 #++
-$KCODE = 'u' unless RUBY_VERSION >= '1.9'
+$KCODE = 'u' if RUBY_VERSION < '1.9'
+require 'cmess/guess_encoding'
 require 'yaml'
-require 'iconv'
 require 'stringio'
 require 'forwardable'
 # Tries to detect the encoding of a given input by applying several
-# heuristics to determine the <b>most likely</b> candidate. If no heuristic
-# catches on, resorts to Encoding::UNKNOWN.
+# heuristics to determine the <b>most likely</b> candidate. If no
+# heuristic catches on, resorts to Encoding::UNKNOWN.
 #
 # If a BOM is found, it may determine the encoding directly.
+#
+# For supported encodings see EncodingGuessers and BOMGuessers.
-module CMess
-  module GuessEncoding
-    class Automatic
+class CMess::GuessEncoding::Automatic
   extend Forwardable
   def_delegators self, :encoding_guessers, :supported_encoding?,
                        :bom_guessers,      :supported_bom?
-  include Encoding
+  include CMess::GuessEncoding::Encoding
-  # Creates a converter for desired encoding (from UTF-8)
+  # Creates a converter for desired encoding (from UTF-8).
   ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
-  # Single-byte encodings to test statistically by TEST_CHARS
+  # Single-byte encodings to test statistically by TEST_CHARS.
   TEST_ENCODINGS = [
     MACINTOSH,
     ISO_8859_1,
@@ -82,15 +83,16 @@ module CMess
     MS_ANSI
   ]
-  # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
+  # Certain (non-ASCII) chars to test for in TEST_ENCODINGS.
   CHARS_TO_TEST = (
     '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
     'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
   ).split(//)
-  # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
+  # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST.
   TEST_CHARS = Hash.new { |hash, encoding|
-    encoding = get_or_set_encoding_const(encoding)
+    encoding = self[encoding]
     encchars = CHARS_TO_TEST.map { |char|
       begin
         byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
@@ -99,19 +101,21 @@ module CMess
     }.compact
     TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
     hash[encoding] = encchars
-  }.update(YAML.load_file(
-    File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
-  ))
+  }.update(YAML.load_file(File.join(CMess::DATA_DIR, 'test_chars.yaml')))
   # Relative count of TEST_CHARS must exceed this threshold to yield
-  # a direct match
+  # a direct match.
   TEST_THRESHOLD_DIRECT = 0.1
   # Relative count of TEST_CHARS must exceed this threshold to yield
-  # an approximate match
+  # an approximate match.
   TEST_THRESHOLD_APPROX = 0.0004
+  # Pattern for method names in EncodingGuessers and BOMGuessers.
+  GUESS_METHOD_RE = %r{\A((?:bom_)?encoding)_\d+_(.+)\z}
   @supported_encodings = []
   @encoding_guessers   = []
   @supported_boms      = []
@@ -128,19 +132,12 @@ module CMess
     private
-    def encoding(encoding, &condition_block)
-      encoding_block = lambda {
-        encoding if instance_eval(&condition_block)
-      }
-      encodings(encoding, &encoding_block)
-    end
+    def encoding(*encodings, &block)
+      encodings.flatten.each { |encoding|
+        next if @supported_encodings.include?(encoding)
-    def encodings(*encodings, &encoding_block)
-      encodings.each { |encoding|
         @supported_encodings << encoding
-        @encoding_guessers   << encoding_block \
-          unless @encoding_guessers.include?(encoding_block)
+        @encoding_guessers   << block
       }
     end
@@ -148,14 +145,11 @@ module CMess
       supported_encodings.include?(encoding)
     end
-    def bom_encoding(encoding, &condition_block)
-      encoding_block = lambda {
-        encoding if instance_eval(&condition_block)
-      }
+    def bom_encoding(encoding, &block)
+      return if @supported_boms.include?(encoding)
       @supported_boms << encoding
-      @bom_guessers   << encoding_block \
-        unless @bom_guessers.include?(encoding_block)
+      @bom_guessers   << lambda { |*| encoding if instance_eval(&block) }
     end
     def supported_bom?(encoding)
@@ -168,11 +162,9 @@ module CMess
   def initialize(input, chunk_size = nil)
     @input = case input
-      when IO      # that's what we want
-        input
-      when String  # convert it to an IO
-        StringIO.new(input)
-      else         # um, what's that...?
+      when IO     then input
+      when String then StringIO.new(input)
+      else
         raise ArgumentError, "don't know how to handle input of type #{input.class}"
     end
@@ -230,9 +222,7 @@ module CMess
   end
   def starts_with?(*bytes)
-    bytes.all? { |byte|
-      next_byte == byte
-    }
+    bytes.all? { |byte| next_byte == byte }
   end
   def next_one_of?(*bytes)
@@ -240,7 +230,6 @@ module CMess
   end
   def read(chunk_size = chunk_size)
-    # => initialize counters
     @byte_count ||= Hash.new(0)
     @byte_total ||= 0
@@ -258,103 +247,140 @@ module CMess
     @byte_total > bytes_before
   end
-  def byte_count_sum(*bytes)
-    bytes = *bytes  # treat arrays/ranges and lists alike
-    bytes.inject(0) { |sum, n| sum + byte_count[n] }
+  def byte_count_sum(bytes)
+    Array(bytes).inject(0) { |sum, n| sum + byte_count[n] }
   end
   def relative_byte_count(count)
     count.to_f / byte_total
   end
-  ### Definition of guessing heuristics. Order matters!
+  # Definition of guessing heuristics. Order matters!
-  # ASCII, if all bytes are within the lower 128 bytes
-  # (Unfortunately, we have to read the *whole* file to make that decision)
-  encoding ASCII do
-    eof? && byte_count_sum(0x0..0x7f) == byte_total
-  end
+  module EncodingGuessers
+    include CMess::GuessEncoding::Encoding
-  # UTF-16, if lots of NULL bytes present
-  encodings UTF_16BE, UTF_16LE, UTF_16 do
-    if relative_byte_count(byte_count[0]) > 0.25
-      case first_byte
-        when 0x0  then UTF_32
-        when 0xfe then UTF_16BE
-        when 0xff then UTF_16LE
-        else           UTF_16
+    # ASCII[http://en.wikipedia.org/wiki/ASCII], if all bytes are
+    # within the lower 128 bytes. Unfortunately, we have to read
+    # the *whole* file to make that decision.
+    def encoding_01_ASCII
+      ASCII if eof? && byte_count_sum(0x00..0x7f) == byte_total
+    end
+    # UTF-16[http://en.wikipedia.org/wiki/UTF-16] /
+    # UTF-32[http://en.wikipedia.org/wiki/UTF-32], if lots of
+    # NULL[http://en.wikipedia.org/wiki/Null_character] bytes present.
+    def encoding_02_UTF_32_and_UTF_16BE_and_UTF_16LE_and_UTF_16
+      if relative_byte_count(byte_count[0]) > 0.25
+        case first_byte
+          when 0x00 then UTF_32
+          when 0xfe then UTF_16BE
+          when 0xff then UTF_16LE
+          else           UTF_16
+        end
       end
     end
-  end
-  # UTF-8, if number of escape-bytes and following bytes
-  # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
-  encoding UTF_8 do
-    esc_bytes = byte_count_sum(0xc0..0xdf)     +
-                # => 110xxxxx 10xxxxxx
-                byte_count_sum(0xe0..0xef) * 2 +
-                # => 1110xxxx 10xxxxxx 10xxxxxx
-                byte_count_sum(0xf0..0xf7) * 3
-                # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-    fol_bytes = byte_count_sum(0x80..0xbf)
-                # => 10xxxxxx
-    esc_bytes > 0 && esc_bytes == fol_bytes
-  end
+    # UTF-8[http://en.wikipedia.org/wiki/UTF-8], if number of escape-bytes
+    # and following bytes is matching.
+    def encoding_03_UTF_8
+      esc_bytes = byte_count_sum(0xc0..0xdf)     +
+                  # => 110xxxxx 10xxxxxx
+                  byte_count_sum(0xe0..0xef) * 2 +
+                  # => 1110xxxx 10xxxxxx 10xxxxxx
+                  byte_count_sum(0xf0..0xf7) * 3
+                  # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      UTF_8 if esc_bytes > 0 && esc_bytes == byte_count_sum(0x80..0xbf)
+    end
+    # TEST_ENCODINGS, if frequency of TEST_CHARS exceeds TEST_THRESHOLD_DIRECT
+    # (direct match) or TEST_THRESHOLD_APPROX (approximate match).
+    def encoding_04_TEST_ENCODINGS
+      ratios = {}
+      TEST_ENCODINGS.find(lambda {
+        ratio, encoding = ratios.sort.last
+        encoding if ratio >= TEST_THRESHOLD_APPROX
+      }) { |encoding|
+        ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
+        ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
+      }
+    end
-  # Analyse statistical appearance of German umlauts and other accented
-  # letters (see TEST_CHARS)
-  encodings(*TEST_ENCODINGS) do
-    ratios = {}
-    TEST_ENCODINGS.find(lambda {
-      ratio, encoding = ratios.sort.last
-      encoding if ratio >= TEST_THRESHOLD_APPROX
-    }) { |encoding|
-      ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
-      #p [encoding, ratio]
-      ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
-    }
   end
-  ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
+  # BOM[http://en.wikipedia.org/wiki/Byte_order_mark] detection.
-  bom_encoding UTF_8 do
-    starts_with?(0xef, 0xbb, 0xbf)
-  end
+  module BOMGuessers
-  bom_encoding UTF_16BE do
-    starts_with?(0xfe, 0xff)
-  end
+    # UTF-8[http://en.wikipedia.org/wiki/UTF-8]
+    def bom_encoding_01_UTF_8
+      starts_with?(0xef, 0xbb, 0xbf)
+    end
-  bom_encoding UTF_16LE do
-    starts_with?(0xff, 0xfe)
-  end
+    # UTF-16[http://en.wikipedia.org/wiki/UTF-16] (Big Endian)
+    def bom_encoding_02_UTF_16BE
+      starts_with?(0xfe, 0xff)
+    end
-  bom_encoding UTF_32BE do
-    starts_with?(0x00, 0x00, 0xfe, 0xff)
-  end
+    # UTF-16[http://en.wikipedia.org/wiki/UTF-16] (Little Endian)
+    def bom_encoding_03_UTF_16LE
+      starts_with?(0xff, 0xfe)
+    end
-  bom_encoding UTF_32LE do
-    starts_with?(0xff, 0xfe, 0x00, 0x00)
-  end
+    # UTF-32[http://en.wikipedia.org/wiki/UTF-32] (Big Endian)
+    def bom_encoding_04_UTF_32BE
+      starts_with?(0x00, 0x00, 0xfe, 0xff)
+    end
-  bom_encoding SCSU do
-    starts_with?(0x0e, 0xfe, 0xff)
-  end
+    # UTF-32[http://en.wikipedia.org/wiki/UTF-32] (Little Endian)
+    def bom_encoding_05_UTF_32LE
+      starts_with?(0xff, 0xfe, 0x00, 0x00)
+    end
-  bom_encoding UTF_7 do
-    starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
-  end
+    # SCSU[http://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode]
+    def bom_encoding_06_SCSU
+      starts_with?(0x0e, 0xfe, 0xff)
+    end
-  bom_encoding UTF_EBCDIC do
-    starts_with?(0xdd, 0x73, 0x66, 0x73)
-  end
+    # UTF-7[http://en.wikipedia.org/wiki/UTF-7]
+    def bom_encoding_07_UTF_7
+      starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
+    end
-  bom_encoding BOCU_1 do
-    starts_with?(0xfb, 0xee, 0x28)
-  end
+    # UTF-1[http://en.wikipedia.org/wiki/UTF-1]
+    def bom_encoding_08_UTF_1
+      starts_with?(0xf7, 0x64, 0x4c)
+    end
+    # UTF-EBCDIC[http://en.wikipedia.org/wiki/UTF-EBCDIC]
+    def bom_encoding_09_UTF_EBCDIC
+      starts_with?(0xdd, 0x73, 0x66, 0x73)
+    end
+    # BOCU-1[http://en.wikipedia.org/wiki/BOCU-1]
+    def bom_encoding_10_BOCU_1
+      starts_with?(0xfb, 0xee, 0x28)
+    end
+    # GB-18030[http://en.wikipedia.org/wiki/GB-18030]
+    def bom_encoding_11_GB_18030
+      starts_with?(0x84, 0x31, 0x95, 0x33)
     end
   end
+  [EncodingGuessers, BOMGuessers].each { |mod|
+    include mod
+    mod.instance_methods(false).sort.each { |method|
+      next unless method =~ GUESS_METHOD_RE
+      name, list = $1, $2.split('_and_')
+      send(name, *list.map { |encoding| const_get(encoding) }) { send(method) }
+    }
+  }
 end

data/lib/cmess/guess_encoding/encoding.rb CHANGED

@@ -3,7 +3,7 @@
 #                                                                             #
 # A component of cmess, the encoding tool-box.                                #
 #                                                                             #
-# Copyright (C) 2007-2010 University of Cologne,                              #
+# Copyright (C) 2007-2011 University of Cologne,                              #
 #                         Albertus-Magnus-Platz,                              #
 #                         50923 Cologne, Germany                              #
 #                                                                             #
@@ -15,26 +15,26 @@
 #                                       for automatic encoding detection)     #
 #                                                                             #
 # cmess is free software; you can redistribute it and/or modify it under the  #
-# terms of the GNU General Public License as published by the Free Software   #
-# Foundation; either version 3 of the License, or (at your option) any later  #
-# version.                                                                    #
+# terms of the GNU Affero General Public License as published by the Free     #
+# Software Foundation; either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
 #                                                                             #
 # cmess is distributed in the hope that it will be useful, but WITHOUT ANY    #
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more       #
-# details.                                                                    #
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for     #
+# more details.                                                               #
 #                                                                             #
-# You should have received a copy of the GNU General Public License along     #
-# with cmess. If not, see <http://www.gnu.org/licenses/>.                     #
+# You should have received a copy of the GNU Affero General Public License    #
+# along with cmess. If not, see <http://www.gnu.org/licenses/>.               #
 #                                                                             #
 ###############################################################################
 #++
+require 'cmess/guess_encoding'
 # Namespace for our encodings.
-module CMess
-  module GuessEncoding
-    module Encoding
+module CMess::GuessEncoding::Encoding
   extend self
@@ -50,8 +50,8 @@ module CMess
   private
   def get_all_encodings
-    %x{iconv -l}.split("\n").map { |e|
-      get_or_set_encoding_const(e.sub(/\/*\z/, ''))
+    %x{iconv -l}.split($/).map { |encoding|
+      get_or_set_encoding_const(encoding.sub(%r{/*\z}, ''))
     }
   end
@@ -75,14 +75,12 @@ module CMess
     ISO-8859-11 ISO-8859-13 ISO-8859-14 ISO-8859-15 ISO-8859-16
     CP1250 CP1251 CP1252 CP850 CP852 CP856
     UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
-    UTF-7 UTF-EBCDIC SCSU BOCU-1
+    UTF-7 UTF-1 UTF-EBCDIC SCSU BOCU-1 GB-18030
     ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
   ].each { |encoding| set_encoding_const(encoding) }
-  def included(base)
-    base.extend self
+  def self.included(base)
+    base.extend(self)
   end
-    end
-  end
 end