RubyGems - cmess - Versions diffs - 0.0.8.274 → 0.0.9.276 - Mend

cmess 0.0.8.274 → 0.0.9.276

Files changed (8)

data/ChangeLog +7 -0
data/README +1 -1
data/lib/cmess/guess_encoding.rb +14 -398
data/lib/cmess/guess_encoding/automatic.rb +341 -0
data/lib/cmess/guess_encoding/encoding.rb +61 -0
data/lib/cmess/guess_encoding/manual.rb +105 -0
data/lib/cmess/version.rb +1 -1
metadata +5 -2

data/ChangeLog CHANGED

@@ -1,5 +1,12 @@
 = Revision history for cmess
+== 0.0.9 [2008-08-15]
+* Reorganized file structure for guess_encoding
+* Added shortcuts GuessEncoding.manual/.automatic
+* GuessEncoding::Automatic now also takes a String
+  as input (will be converted to a StringIO)
 == 0.0.8 [2008-08-14]
 * Require 'cmess' inside libs, so the user doesn't have to

data/README CHANGED

@@ -2,7 +2,7 @@
 == VERSION
-This documentation refers to cmess version 0.0.8
+This documentation refers to cmess version 0.0.9
 == DESCRIPTION

data/lib/cmess/guess_encoding.rb CHANGED

@@ -30,416 +30,32 @@
 ###############################################################################
 #++
-$KCODE = 'u'
-require 'yaml'
-require 'iconv'
-require 'forwardable'
 require 'cmess'
-# Outputs given string (or line), being encoded in target encoding, encoded in
-# various test encodings, thus allowing to identify the (seemingly) correct
-# encoding by visually comparing the input string with its desired appearance.
-#
-# In addition to that manual procedure, may be used to detect the encoding
-# automatically. Works actually pretty good -- for the supported encodings
-# (see Automatic for details).
+# Allows to guess an input's encoding either manually or automatically.
+# Works actually pretty good -- for the supported encodings. See Manual
+# and Automatic for details.
 module CMess::GuessEncoding
   # our version ;-)
-  VERSION = '0.0.6'
-  # Namespace for our encodings.
-  module Encoding
-    extend self
-    def const_name_for(encoding)
-      encoding.tr('-', '_').gsub(/\W/, '').upcase
-    end
-    def set_encoding_const(encoding, const = const_name_for(encoding))
-      const_set(const, encoding.freeze)
-    end
-    def get_or_set_encoding_const(encoding)
-      const_defined?(const = const_name_for(encoding)) ? const_get(const) :
-        set_encoding_const(encoding, const)
-    end
-    %w[
-      UNKNOWN ASCII MACINTOSH
-      ISO-8859-1 ISO-8859-2 ISO-8859-15
-      CP1250 CP1251 CP1252 CP850 CP852 CP856
-      UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
-      UTF-7 UTF-EBCDIC SCSU BOCU-1
-      ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
-    ].each { |encoding| set_encoding_const(encoding) }
-  end
-  module Manual
-    extend self
-    include Encoding
-    # default encodings to try
-    ENCODINGS = [
-      ISO_8859_1,
-      ISO_8859_2,
-      ISO_8859_15,
-      CP1250,
-      CP1251,
-      CP1252,
-      CP850,
-      CP852,
-      CP856,
-      UTF_8
-    ]
-    # likely candidates to suggest to the user
-    CANDIDATES = [
-      ANSI_X34,
-      EBCDIC_AT_DE,
-      EBCDIC_US,
-      EUC_JP,
-      KOI_8,
-      MACINTOSH,
-      MS_ANSI,
-      SHIFT_JIS,
-      UTF_7,
-      UTF_16,
-      UTF_16BE,
-      UTF_16LE,
-      UTF_32,
-      UTF_32BE,
-      UTF_32LE
-    ]
-    def display(input, target_encoding, encodings = nil, additional_encodings = [])
-      target = target_encoding
-      encodings = (encodings || ENCODINGS) + additional_encodings
-      encodings = encodings.reverse.uniq.reverse     # uniq with additional encodings
-                                                     # staying at the end
-      encodings = [target] + (encodings - [target])  # move target encoding to front
-      max_length = encodings.map { |encoding| encoding.length }.max
-      encodings.each { |encoding|
-        converted = begin
-          Iconv.conv(target, encoding, input)
-        rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
-          "ILLEGAL INPUT SEQUENCE: #{err}"
-        rescue Iconv::InvalidEncoding
-          if encoding == target
-            abort "Invalid encoding: #{encoding}"
-          else
-            "INVALID ENCODING!"
-          end
-        end
-        puts "%-#{max_length}s : %s" % [encoding, converted]
-      }
-    end
-  end
-  # Tries to detect the encoding of a given input by applying several
-  # heuristics to determine the <b>most likely</b> candidate. If no heuristic
-  # catches on, resorts to Encoding::UNKNOWN.
-  #
-  # If a BOM is found, it may determine the encoding directly.
-  class Automatic
-    extend Forwardable
-    def_delegators self, :encoding_guessers, :supported_encoding?,
-                         :bom_guessers,      :supported_bom?
-    include Encoding
-    # Creates a converter for desired encoding (from UTF-8)
-    ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
-    # Single-byte encodings to test statistically by TEST_CHARS
-    TEST_ENCODINGS = [
-      MACINTOSH,
-      ISO_8859_1,
-      ISO_8859_15,
-      CP1252,
-      CP850,
-      MS_ANSI
-    ]
-    # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
-    CHARS_TO_TEST = (
-      '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
-      'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
-    ).split(//)
-    # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
-    TEST_CHARS = Hash.new { |hash, encoding|
-      encoding = Encoding.get_or_set_encoding_const(encoding)
-      encchars = CHARS_TO_TEST.map { |char|
-        begin
-          byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
-        rescue Iconv::IllegalSequence
-        end
-      }.compact
-      TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
-      hash[encoding] = encchars
-    }.update(YAML.load_file(
-      File.join(File.dirname(__FILE__), '..', '..', 'data', 'test_chars.yaml')
-    ))
-    # Relative count of TEST_CHARS must exceed this threshold to yield
-    # a direct match
-    TEST_THRESHOLD_DIRECT = 0.1
-    # Relative count of TEST_CHARS must exceed this threshold to yield
-    # an approximate match
-    TEST_THRESHOLD_APPROX = 0.0004
-    @supported_encodings = []
-    @encoding_guessers   = []
-    @supported_boms      = []
-    @bom_guessers        = []
-    class << self
-      attr_reader :supported_encodings, :encoding_guessers,
-                  :supported_boms,      :bom_guessers
-      def guess(input, chunk_size = nil, ignore_bom = false)
-        new(input, chunk_size).guess(ignore_bom)
-      end
-      private
-      def encoding(encoding, &condition_block)
-        encoding_block = lambda {
-          encoding if instance_eval(&condition_block)
-        }
-        encodings(encoding, &encoding_block)
-      end
-      def encodings(*encodings, &encoding_block)
-        encodings.each { |encoding|
-          @supported_encodings << encoding
-          @encoding_guessers   << encoding_block \
-            unless @encoding_guessers.include?(encoding_block)
-        }
-      end
-      def supported_encoding?(encoding)
-        supported_encodings.include?(encoding)
-      end
-      def bom_encoding(encoding, &condition_block)
-        encoding_block = lambda {
-          encoding if instance_eval(&condition_block)
-        }
-        @supported_boms << encoding
-        @bom_guessers   << encoding_block \
-          unless @bom_guessers.include?(encoding_block)
-      end
-      def supported_bom?(encoding)
-        supported_boms.include?(encoding)
-      end
-    end
-    attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
-    def initialize(input, chunk_size = nil)
-      @input      = input
-      @chunk_size = chunk_size
-    end
-    def guess(ignore_bom = false)
-      return bom if bom && !ignore_bom
-      while read
-        encoding_guessers.each { |block|
-          encoding = instance_eval(&block)
-          return encoding if encoding && supported_encoding?(encoding)
-        }
-      end
+  VERSION = '0.0.7'
-      # nothing suitable found :-(
-      UNKNOWN
-    end
-    def bom
-      @bom ||= check_bom
-    end
-    private
-    def eof?
-      input.eof?
-    end
-    def check_bom
-      return if eof?
-      # prevent "Illegal seek" error inside a pipe
-      begin
-        input.pos
-      rescue Errno::ESPIPE
-        return
-      end
-      bom_guessers.each { |block|
-        encoding = instance_eval(&block)
-        return encoding if encoding && supported_bom?(encoding)
-        # read bytes don't build a BOM, so rewind...
-        input.rewind
-      }
-      # nothing suitable found :-(
-      nil
-    end
-    def next_byte
-      input.read(1).unpack('C').first
-    end
-    def starts_with?(*bytes)
-      bytes.all? { |byte|
-        next_byte == byte
-      }
-    end
-    def next_one_of?(*bytes)
-      bytes.include?(next_byte)
-    end
-    def read(chunk_size = chunk_size)
-      # => initialize counters
-      @byte_count ||= Hash.new(0)
-      @byte_total ||= 0
-      return if eof?
-      bytes_before = @byte_total
-      input.read(chunk_size).each_byte { |byte|
-        @byte_count[byte] += 1
-        @byte_total       += 1
-        @first_byte ||= byte
-      }
-      @byte_total > bytes_before
-    end
-    def byte_count_sum(*bytes)
-      bytes = *bytes  # treat arrays/ranges and lists alike
-      bytes.inject(0) { |sum, n| sum + byte_count[n] }
-    end
-    def relative_byte_count(count)
-      count.to_f / byte_total
-    end
-    ### Definition of guessing heuristics. Order matters!
-    # ASCII, if all bytes are within the lower 128 bytes
-    # (Unfortunately, we have to read the *whole* file to make that decision)
-    encoding ASCII do
-      eof? && byte_count_sum(0x0..0x7f) == byte_total
-    end
-    # UTF-16, if lots of NULL bytes present
-    encodings UTF_16BE, UTF_16LE, UTF_16 do
-      if relative_byte_count(byte_count[0]) > 0.25
-        case first_byte
-          when 0x0:  UTF_32
-          when 0xfe: UTF_16BE
-          when 0xff: UTF_16LE
-          else       UTF_16
-        end
-      end
-    end
-    # UTF-8, if number of escape-bytes and following bytes
-    # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
-    encoding UTF_8 do
-      esc_bytes = byte_count_sum(0xc0..0xdf)     \
-                  # => 110xxxxx 10xxxxxx
-                + byte_count_sum(0xe0..0xef) * 2 \
-                  # => 1110xxxx 10xxxxxx 10xxxxxx
-                + byte_count_sum(0xf0..0xf7) * 3
-                  # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-      fol_bytes = byte_count_sum(0x80..0xbf)
-                  # => 10xxxxxx
-      esc_bytes > 0 && esc_bytes == fol_bytes
-    end
-    # Analyse statistical appearance of German umlauts and other accented
-    # letters (see TEST_CHARS)
-    encodings *TEST_ENCODINGS do
-      ratios = {}
-      TEST_ENCODINGS.find(lambda {
-        ratio, encoding = ratios.sort.last
-        encoding if ratio >= TEST_THRESHOLD_APPROX
-      }) { |encoding|
-        ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
-        #p [encoding, ratio]
-        ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
-      }
-    end
-    ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
-    bom_encoding UTF_8 do
-      starts_with?(0xef, 0xbb, 0xbf)
-    end
-    bom_encoding UTF_16BE do
-      starts_with?(0xfe, 0xff)
-    end
-    bom_encoding UTF_16LE do
-      starts_with?(0xff, 0xfe)
-    end
-    bom_encoding UTF_32BE do
-      starts_with?(0x00, 0x00, 0xfe, 0xff)
-    end
+  class << self
-    bom_encoding UTF_32LE do
-      starts_with?(0xff, 0xfe, 0x00, 0x00)
+    def manual(*args)
+      Manual.display(*args)
     end
-    bom_encoding SCSU do
-      starts_with?(0x0e, 0xfe, 0xff)
-    end
-    bom_encoding UTF_7 do
-      starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
-    end
-    bom_encoding UTF_EBCDIC do
-      starts_with?(0xdd, 0x73, 0x66, 0x73)
-    end
-    bom_encoding BOCU_1 do
-      starts_with?(0xfb, 0xee, 0x28)
+    def automatic(*args)
+      Automatic.guess(*args)
     end
   end
 end
+%w[encoding manual automatic].each { |lib|
+  lib = "cmess/guess_encoding/#{lib}"
+  require lib
+}

data/lib/cmess/guess_encoding/automatic.rb ADDED

@@ -0,0 +1,341 @@
+#--
+###############################################################################
+#                                                                             #
+# A component of cmess, the encoding tool-box.                                #
+#                                                                             #
+# Copyright (C) 2007-2008 University of Cologne,                              #
+#                         Albertus-Magnus-Platz,                              #
+#                         50932 Cologne, Germany                              #
+#                                                                             #
+# Authors:                                                                    #
+#     Jens Wille <jens.wille@uni-koeln.de>                                    #
+#                                                                             #
+# Contributors:                                                               #
+#     John Vorhauer <john@vorhauer.de> (idea and original implementation      #
+#                                       for automatic encoding detection)     #
+#                                                                             #
+# cmess is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU General Public License as published by the Free Software   #
+# Foundation; either version 3 of the License, or (at your option) any later  #
+# version.                                                                    #
+#                                                                             #
+# cmess is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more       #
+# details.                                                                    #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with cmess. If not, see <http://www.gnu.org/licenses/>.                     #
+#                                                                             #
+###############################################################################
+#++
+$KCODE = 'u'
+require 'yaml'
+require 'iconv'
+require 'stringio'
+require 'forwardable'
+# Tries to detect the encoding of a given input by applying several
+# heuristics to determine the <b>most likely</b> candidate. If no heuristic
+# catches on, resorts to Encoding::UNKNOWN.
+#
+# If a BOM is found, it may determine the encoding directly.
+class CMess::GuessEncoding::Automatic
+  extend Forwardable
+  def_delegators self, :encoding_guessers, :supported_encoding?,
+                       :bom_guessers,      :supported_bom?
+  include CMess::GuessEncoding::Encoding
+  # Creates a converter for desired encoding (from UTF-8)
+  ICONV_FOR = Hash.new { |h, k| h[k] = Iconv.new(k, UTF_8) }
+  # Single-byte encodings to test statistically by TEST_CHARS
+  TEST_ENCODINGS = [
+    MACINTOSH,
+    ISO_8859_1,
+    ISO_8859_15,
+    CP1252,
+    CP850,
+    MS_ANSI
+  ]
+  # Certain (non-ASCII) chars to test for in TEST_ENCODINGS
+  CHARS_TO_TEST = (
+    '€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂ' <<
+    'ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
+  ).split(//)
+  # Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST
+  TEST_CHARS = Hash.new { |hash, encoding|
+    encoding = Encoding.get_or_set_encoding_const(encoding)
+    encchars = CHARS_TO_TEST.map { |char|
+      begin
+        byte = *ICONV_FOR[encoding].iconv(char).unpack('C')
+      rescue Iconv::IllegalSequence
+      end
+    }.compact
+    TEST_ENCODINGS << encoding unless TEST_ENCODINGS.include?(encoding)
+    hash[encoding] = encchars
+  }.update(YAML.load_file(
+    File.join(File.dirname(__FILE__), *%w[.. .. .. data test_chars.yaml])
+  ))
+  # Relative count of TEST_CHARS must exceed this threshold to yield
+  # a direct match
+  TEST_THRESHOLD_DIRECT = 0.1
+  # Relative count of TEST_CHARS must exceed this threshold to yield
+  # an approximate match
+  TEST_THRESHOLD_APPROX = 0.0004
+  @supported_encodings = []
+  @encoding_guessers   = []
+  @supported_boms      = []
+  @bom_guessers        = []
+  class << self
+    attr_reader :supported_encodings, :encoding_guessers,
+                :supported_boms,      :bom_guessers
+    def guess(input, chunk_size = nil, ignore_bom = false)
+      new(input, chunk_size).guess(ignore_bom)
+    end
+    private
+    def encoding(encoding, &condition_block)
+      encoding_block = lambda {
+        encoding if instance_eval(&condition_block)
+      }
+      encodings(encoding, &encoding_block)
+    end
+    def encodings(*encodings, &encoding_block)
+      encodings.each { |encoding|
+        @supported_encodings << encoding
+        @encoding_guessers   << encoding_block \
+          unless @encoding_guessers.include?(encoding_block)
+      }
+    end
+    def supported_encoding?(encoding)
+      supported_encodings.include?(encoding)
+    end
+    def bom_encoding(encoding, &condition_block)
+      encoding_block = lambda {
+        encoding if instance_eval(&condition_block)
+      }
+      @supported_boms << encoding
+      @bom_guessers   << encoding_block \
+        unless @bom_guessers.include?(encoding_block)
+    end
+    def supported_bom?(encoding)
+      supported_boms.include?(encoding)
+    end
+  end
+  attr_reader :input, :chunk_size, :byte_count, :byte_total, :first_byte
+  def initialize(input, chunk_size = nil)
+    @input = case input
+      when IO      # that's what we want
+        input
+      when String  # convert it to an IO
+        StringIO.new(input)
+      else         # um, what's that...?
+        raise ArgumentError, "don't know how to handle input of type #{input.class}"
+    end
+    @chunk_size = chunk_size
+  end
+  def guess(ignore_bom = false)
+    return bom if bom && !ignore_bom
+    while read
+      encoding_guessers.each { |block|
+        encoding = instance_eval(&block)
+        return encoding if encoding && supported_encoding?(encoding)
+      }
+    end
+    # nothing suitable found :-(
+    UNKNOWN
+  end
+  def bom
+    @bom ||= check_bom
+  end
+  private
+  def eof?
+    input.eof?
+  end
+  def check_bom
+    return if eof?
+    # prevent "Illegal seek" error inside a pipe
+    begin
+      input.pos
+    rescue Errno::ESPIPE
+      return
+    end
+    bom_guessers.each { |block|
+      encoding = instance_eval(&block)
+      return encoding if encoding && supported_bom?(encoding)
+      # read bytes don't build a BOM, so rewind...
+      input.rewind
+    }
+    # nothing suitable found :-(
+    nil
+  end
+  def next_byte
+    input.read(1).unpack('C').first
+  end
+  def starts_with?(*bytes)
+    bytes.all? { |byte|
+      next_byte == byte
+    }
+  end
+  def next_one_of?(*bytes)
+    bytes.include?(next_byte)
+  end
+  def read(chunk_size = chunk_size)
+    # => initialize counters
+    @byte_count ||= Hash.new(0)
+    @byte_total ||= 0
+    return if eof?
+    bytes_before = @byte_total
+    input.read(chunk_size).each_byte { |byte|
+      @byte_count[byte] += 1
+      @byte_total       += 1
+      @first_byte ||= byte
+    }
+    @byte_total > bytes_before
+  end
+  def byte_count_sum(*bytes)
+    bytes = *bytes  # treat arrays/ranges and lists alike
+    bytes.inject(0) { |sum, n| sum + byte_count[n] }
+  end
+  def relative_byte_count(count)
+    count.to_f / byte_total
+  end
+  ### Definition of guessing heuristics. Order matters!
+  # ASCII, if all bytes are within the lower 128 bytes
+  # (Unfortunately, we have to read the *whole* file to make that decision)
+  encoding ASCII do
+    eof? && byte_count_sum(0x0..0x7f) == byte_total
+  end
+  # UTF-16, if lots of NULL bytes present
+  encodings UTF_16BE, UTF_16LE, UTF_16 do
+    if relative_byte_count(byte_count[0]) > 0.25
+      case first_byte
+        when 0x0:  UTF_32
+        when 0xfe: UTF_16BE
+        when 0xff: UTF_16LE
+        else       UTF_16
+      end
+    end
+  end
+  # UTF-8, if number of escape-bytes and following bytes
+  # is matching (cf. http://en.wikipedia.org/wiki/UTF-8)
+  encoding UTF_8 do
+    esc_bytes = byte_count_sum(0xc0..0xdf)     \
+                # => 110xxxxx 10xxxxxx
+              + byte_count_sum(0xe0..0xef) * 2 \
+                # => 1110xxxx 10xxxxxx 10xxxxxx
+              + byte_count_sum(0xf0..0xf7) * 3
+                # => 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    fol_bytes = byte_count_sum(0x80..0xbf)
+                # => 10xxxxxx
+    esc_bytes > 0 && esc_bytes == fol_bytes
+  end
+  # Analyse statistical appearance of German umlauts and other accented
+  # letters (see TEST_CHARS)
+  encodings *TEST_ENCODINGS do
+    ratios = {}
+    TEST_ENCODINGS.find(lambda {
+      ratio, encoding = ratios.sort.last
+      encoding if ratio >= TEST_THRESHOLD_APPROX
+    }) { |encoding|
+      ratio = relative_byte_count(byte_count_sum(TEST_CHARS[encoding]))
+      #p [encoding, ratio]
+      ratio >= TEST_THRESHOLD_DIRECT || (ratios[ratio] ||= encoding; false)
+    }
+  end
+  ### BOM detection. (cf. http://en.wikipedia.org/wiki/Byte-order_mark)
+  bom_encoding UTF_8 do
+    starts_with?(0xef, 0xbb, 0xbf)
+  end
+  bom_encoding UTF_16BE do
+    starts_with?(0xfe, 0xff)
+  end
+  bom_encoding UTF_16LE do
+    starts_with?(0xff, 0xfe)
+  end
+  bom_encoding UTF_32BE do
+    starts_with?(0x00, 0x00, 0xfe, 0xff)
+  end
+  bom_encoding UTF_32LE do
+    starts_with?(0xff, 0xfe, 0x00, 0x00)
+  end
+  bom_encoding SCSU do
+    starts_with?(0x0e, 0xfe, 0xff)
+  end
+  bom_encoding UTF_7 do
+    starts_with?(0x2b, 0x2f, 0x76) && next_one_of?(0x38, 0x39, 0x2b, 0x2f)
+  end
+  bom_encoding UTF_EBCDIC do
+    starts_with?(0xdd, 0x73, 0x66, 0x73)
+  end
+  bom_encoding BOCU_1 do
+    starts_with?(0xfb, 0xee, 0x28)
+  end
+end

data/lib/cmess/guess_encoding/encoding.rb ADDED

@@ -0,0 +1,61 @@
+#--
+###############################################################################
+#                                                                             #
+# A component of cmess, the encoding tool-box.                                #
+#                                                                             #
+# Copyright (C) 2007-2008 University of Cologne,                              #
+#                         Albertus-Magnus-Platz,                              #
+#                         50932 Cologne, Germany                              #
+#                                                                             #
+# Authors:                                                                    #
+#     Jens Wille <jens.wille@uni-koeln.de>                                    #
+#                                                                             #
+# Contributors:                                                               #
+#     John Vorhauer <john@vorhauer.de> (idea and original implementation      #
+#                                       for automatic encoding detection)     #
+#                                                                             #
+# cmess is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU General Public License as published by the Free Software   #
+# Foundation; either version 3 of the License, or (at your option) any later  #
+# version.                                                                    #
+#                                                                             #
+# cmess is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more       #
+# details.                                                                    #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with cmess. If not, see <http://www.gnu.org/licenses/>.                     #
+#                                                                             #
+###############################################################################
+#++
+# Namespace for our encodings.
+module CMess::GuessEncoding::Encoding
+  extend self
+  def const_name_for(encoding)
+    encoding.tr('-', '_').gsub(/\W/, '').upcase
+  end
+  def set_encoding_const(encoding, const = const_name_for(encoding))
+    const_set(const, encoding.freeze)
+  end
+  def get_or_set_encoding_const(encoding)
+    const_defined?(const = const_name_for(encoding)) ?
+      const_get(const) : set_encoding_const(encoding, const)
+  end
+  %w[
+    UNKNOWN ASCII MACINTOSH
+    ISO-8859-1 ISO-8859-2 ISO-8859-15
+    CP1250 CP1251 CP1252 CP850 CP852 CP856
+    UTF-8 UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE
+    UTF-7 UTF-EBCDIC SCSU BOCU-1
+    ANSI_X3.4 EBCDIC-AT-DE EBCDIC-US EUC-JP KOI-8 MS-ANSI SHIFT-JIS
+  ].each { |encoding| set_encoding_const(encoding) }
+end

data/lib/cmess/guess_encoding/manual.rb ADDED

@@ -0,0 +1,105 @@
+#--
+###############################################################################
+#                                                                             #
+# A component of cmess, the encoding tool-box.                                #
+#                                                                             #
+# Copyright (C) 2007-2008 University of Cologne,                              #
+#                         Albertus-Magnus-Platz,                              #
+#                         50932 Cologne, Germany                              #
+#                                                                             #
+# Authors:                                                                    #
+#     Jens Wille <jens.wille@uni-koeln.de>                                    #
+#                                                                             #
+# Contributors:                                                               #
+#     John Vorhauer <john@vorhauer.de> (idea and original implementation      #
+#                                       for automatic encoding detection)     #
+#                                                                             #
+# cmess is free software; you can redistribute it and/or modify it under the  #
+# terms of the GNU General Public License as published by the Free Software   #
+# Foundation; either version 3 of the License, or (at your option) any later  #
+# version.                                                                    #
+#                                                                             #
+# cmess is distributed in the hope that it will be useful, but WITHOUT ANY    #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS   #
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more       #
+# details.                                                                    #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with cmess. If not, see <http://www.gnu.org/licenses/>.                     #
+#                                                                             #
+###############################################################################
+#++
+require 'iconv'
+# Outputs given string (or line), being encoded in target encoding, encoded in
+# various test encodings, thus allowing to identify the (seemingly) correct
+# encoding by visually comparing the input string with its desired appearance.
+module CMess::GuessEncoding::Manual
+  extend self
+  include CMess::GuessEncoding::Encoding
+  # default encodings to try
+  ENCODINGS = [
+    ISO_8859_1,
+    ISO_8859_2,
+    ISO_8859_15,
+    CP1250,
+    CP1251,
+    CP1252,
+    CP850,
+    CP852,
+    CP856,
+    UTF_8
+  ]
+  # likely candidates to suggest to the user
+  CANDIDATES = [
+    ANSI_X34,
+    EBCDIC_AT_DE,
+    EBCDIC_US,
+    EUC_JP,
+    KOI_8,
+    MACINTOSH,
+    MS_ANSI,
+    SHIFT_JIS,
+    UTF_7,
+    UTF_16,
+    UTF_16BE,
+    UTF_16LE,
+    UTF_32,
+    UTF_32BE,
+    UTF_32LE
+  ]
+  def display(input, target_encoding, encodings = nil, additional_encodings = [])
+    target = target_encoding
+    encodings = (encodings || ENCODINGS) + additional_encodings
+    encodings = encodings.reverse.uniq.reverse     # uniq with additional encodings
+                                                   # staying at the end
+    encodings = [target] + (encodings - [target])  # move target encoding to front
+    max_length = encodings.map { |encoding| encoding.length }.max
+    encodings.each { |encoding|
+      converted = begin
+        Iconv.conv(target, encoding, input)
+      rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => err
+        "ILLEGAL INPUT SEQUENCE: #{err}"
+      rescue Iconv::InvalidEncoding
+        if encoding == target
+          abort "Invalid encoding: #{encoding}"
+        else
+          "INVALID ENCODING!"
+        end
+      end
+      puts "%-#{max_length}s : %s" % [encoding, converted]
+    }
+  end
+end

data/lib/cmess/version.rb CHANGED

@@ -30,7 +30,7 @@ module CMess::Version
   MAJOR = 0
   MINOR = 0
-  TINY  = 8
+  TINY  = 9
   class << self

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cmess
 version: !ruby/object:Gem::Version
-  version: 0.0.8.274
+  version: 0.0.9.276
 platform: ruby
 authors:
 - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-08-14 00:00:00 +02:00
+date: 2008-08-15 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -51,6 +51,9 @@ files:
 - lib/cmess/cli.rb
 - lib/cmess/cinderella.rb
 - lib/cmess/decode_entities.rb
+- lib/cmess/guess_encoding/manual.rb
+- lib/cmess/guess_encoding/encoding.rb
+- lib/cmess/guess_encoding/automatic.rb
 - bin/cinderella
 - bin/decode_entities
 - bin/guess_encoding