RubyGems - uchardet - Versions diffs - 0.1.1 - Mend

uchardet 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/History.txt ADDED

@@ -0,0 +1,3 @@
+=== 0.1.1 2009-12-19
+* Initial release

data/Manifest.txt ADDED

@@ -0,0 +1,18 @@
+History.txt
+Manifest.txt
+README.rdoc
+Rakefile
+bin/uchardet
+ext/uchardet/extconf.rb
+ext/uchardet/uchardet.c
+lib/uchardet.rb
+lib/uchardet/cli.rb
+script/console
+script/destroy
+script/generate
+tasks/extconf.rake
+tasks/extconf/uchardet.rake
+test/test_helper.rb
+test/test_uchardet.rb
+test/test_uchardet_cli.rb
+test/test_uchardet_extn.rb

data/README.rdoc ADDED

@@ -0,0 +1,50 @@
+= uchardet
+* http://github.com/invisiblellama/uchardet
+== DESCRIPTION:
+Fast character set encoding detection using the International Components for Unicode (ICU) C++ library.
+== SYNOPSIS:
+  require 'open-uri'
+  require 'uchardet'
+  encoding = ICU::UCharsetDetector.detect open('http://google.jp').read
+  encoding # => {:language=>"ja", :encoding=>"Shift_JIS", :confidence=>100}
+From command line:
+  $ uchardet
+  Usage: uchardet [options] file
+      -l, --list                       Display list of detectable character sets.
+      -s, --strip                      Strip HTML or XML markup before detection.
+      -e, --encoding                   Hint the charset detector about possible encoding.
+      -a, --all                        Show all matching encodings.
+      -h, --help                       Show this help message.
+  $ uchardet `which uchardet`
+  ISO-8859-1 (confidence 60%)
+== REQUIREMENTS:
+ICU[http://site.icu-project.org/] (International Components for Unicode):
+on Mac OS X:
+  sudo port install icu
+on Debian/Ubuntu:
+  sudo apt-get install libicu-dev
+== INSTALL:
+  sudo gem install uchardet
+== LICENSE:
+Copyright (c) 2009 Dmitri Goutnik, released under the MIT license.

data/Rakefile ADDED

@@ -0,0 +1,23 @@
+# Build script: declares the gem specification through Hoe and loads the
+# project's additional rake tasks from the tasks/ directory.
+require 'rubygems'
+gem 'hoe', '>= 2.1.0'
+require 'hoe'
+require 'fileutils'
+# Loads lib/uchardet.rb, which defines Uchardet::VERSION.
+require './lib/uchardet'
+Hoe.plugin :newgem
+# Generate all the Rake tasks
+# Run 'rake -T' to see list of generated tasks (from gem root directory)
+# The spec is kept in the global $hoe because tasks/extconf.rake reads it.
+$hoe = Hoe.spec 'uchardet' do
+  self.developer 'Dmitri Goutnik', 'dg@syrec.org'
+  self.readme_file = 'README.rdoc'
+  self.extra_rdoc_files = ['README.rdoc']
+  self.rubyforge_name = self.name
+end
+require 'newgem/tasks'
+# Load every *.rake file found under tasks/, including subdirectories.
+Dir['tasks/**/*.rake'].each { |t| load t }
+# TODO - want other tests/tasks run by default? Add them to the list
+# remove_task :default
+# task :default => [:spec, :features]

data/bin/uchardet ADDED

@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+# Command-line entry point: loads the library from the lib/ directory next to
+# this script's parent directory and hands ARGV to Uchardet::CLI.
+require 'rubygems'
+require File.expand_path(
+  File.join(File.dirname(__FILE__), %w[.. lib uchardet]))
+require "uchardet/cli"
+# Results are written to STDOUT.
+Uchardet::CLI.execute(STDOUT, ARGV)

data/ext/uchardet/extconf.rb ADDED

@@ -0,0 +1,11 @@
+# Generates the Makefile for the uchardet C extension, asking the icu-config
+# tool for the ICU linker settings.
+require 'mkmf'
+# Look up icu-config on the PATH; an empty string means `which` printed nothing.
+icu_config = `which icu-config`.strip
+if icu_config.empty?
+  abort "ICU seems to be missing. Try 'port install icu' or 'apt-get install libicu-dev'"
+end
+# Append whatever icu-config reports for these flags to mkmf's link settings.
+$LIBS << ' ' + `#{icu_config} --ldflags-system --ldflags-libsonly`.strip
+$LDFLAGS <<  ' ' + `#{icu_config} --ldflags-searchpath`.strip
+create_makefile("uchardet")

data/ext/uchardet/uchardet.c ADDED

@@ -0,0 +1,340 @@
+#include <ruby.h>
+#include <unicode/ucsdet.h>
+#ifndef RSTRING_PTR
+#   define RSTRING_PTR(str) RSTRING(str)->ptr
+#   define RSTRING_LEN(str) RSTRING(str)->len
+#endif
+static VALUE cUChardetError;
+static VALUE cUCharsetDetector;
+/*
+ * Raise ICU::Error when +status+ is an ICU failure code; do nothing otherwise.
+ * The exception message is the ICU error name and the numeric status code is
+ * stored in the exception's @errno instance variable.
+ */
+static void
+assure(UErrorCode status)
+{
+    if (U_FAILURE(status)) {
+        VALUE ex = rb_exc_new2(cUChardetError, u_errorName(status));
+        rb_iv_set(ex, "@errno", INT2FIX(status));
+        rb_exc_raise(ex);
+    }
+}
+/*
+ * Free function registered with Data_Wrap_Struct: closes the wrapped ICU
+ * charset detector.
+ */
+static void
+UCharsetDetector_free(void *detector)
+{
+    ucsdet_close(detector);
+}
+/*
+ * Allocator for ICU::UCharsetDetector: opens a new ICU charset detector and
+ * wraps it in a Ruby object of class +klass+. Raises ICU::Error if ICU
+ * reports a failure while opening the detector.
+ */
+static VALUE
+UCharsetDetector_alloc(VALUE klass)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    UCharsetDetector* detector = ucsdet_open(&status);
+    assure(status);
+    /* No mark function is needed: the struct holds no Ruby VALUEs itself. */
+    return Data_Wrap_Struct(klass, NULL, UCharsetDetector_free, detector);
+}
+/*
+ * call-seq:
+ *   input_filtered?
+ *
+ * Return true if input filtering is enabled for this charset detector,
+ * false otherwise.
+ */
+static VALUE
+UCharsetDetector_get_input_filtered(VALUE self)
+{
+    UCharsetDetector *detector;
+    Data_Get_Struct(self, UCharsetDetector, detector);
+    return ucsdet_isInputFilterEnabled(detector) ? Qtrue : Qfalse;
+}
+/*
+ * call-seq:
+ *   input_filtered=
+ *
+ * Enable or disable filtering of input text, depending on whether the
+ * argument is truthy. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection, which will remove most HTML or XML markup.
+ */
+static VALUE
+UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
+{
+    UCharsetDetector *detector;
+    Data_Get_Struct(self, UCharsetDetector, detector);
+    ucsdet_enableInputFilter(detector, RTEST(flag) ? TRUE : FALSE);
+    return self;
+}
+/*
+ * call-seq:
+ *   text
+ *
+ * Get input text for this detector (the value of the @text instance
+ * variable).
+ */
+static VALUE
+UCharsetDetector_get_text(VALUE self)
+{
+    return rb_iv_get(self, "@text");
+}
+/*
+ * call-seq:
+ *   text=
+ *
+ * Set input text for this detector.
+ *
+ * This only records the value in the @text instance variable; the function
+ * itself does not call into ICU. Returns the stored value.
+ */
+static VALUE
+UCharsetDetector_set_text(VALUE self, VALUE text)
+{
+    return rb_iv_set(self, "@text", text);
+}
+/*
+ * call-seq:
+ *   declared_encoding
+ *
+ * Get the declared encoding for charset detection (the value of the
+ * @declared_encoding instance variable).
+ */
+static VALUE
+UCharsetDetector_get_declared_encoding(VALUE self)
+{
+    return rb_iv_get(self, "@declared_encoding");
+}
+/*
+ * call-seq:
+ *   declared_encoding=
+ *
+ * Set the declared encoding for charset detection.
+ * The declared encoding of an input text is an encoding obtained
+ * by the user from an http header or xml declaration or similar source that
+ * can be provided as an additional hint to the charset detector.
+ *
+ * This only records the value in the @declared_encoding instance variable;
+ * the function itself does not call into ICU. Returns the stored value.
+ */
+static VALUE
+UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
+{
+    return rb_iv_set(self, "@declared_encoding", declared_encoding);
+}
+/*
+ * Pass +text+ to the ICU detector and remember it in @text.
+ * A nil +text+ is ignored. StringValue rejects objects that cannot be
+ * converted to a String; ICU failures are raised as ICU::Error.
+ *
+ * NOTE(review): ICU is handed a pointer into the Ruby string's buffer;
+ * storing the string in @text presumably keeps that buffer alive while the
+ * detector uses it -- confirm against the ucsdet_setText documentation.
+ */
+static void
+set_text(VALUE self, VALUE text)
+{
+    if (!NIL_P(text)) {
+        text = StringValue(text);
+        UErrorCode status = U_ZERO_ERROR;
+        UCharsetDetector *detector;
+        Data_Get_Struct(self, UCharsetDetector, detector);
+        ucsdet_setText(detector, StringValuePtr(text), RSTRING_LEN(text), &status);
+        assure(status);
+        UCharsetDetector_set_text(self, text);
+    }
+}
+/*
+ * Pass +declared_encoding+ to the ICU detector as an encoding hint and
+ * remember it in @declared_encoding. A nil value is ignored. StringValue
+ * rejects objects that cannot be converted to a String; ICU failures are
+ * raised as ICU::Error.
+ */
+static void
+set_declared_encoding(VALUE self, VALUE declared_encoding)
+{
+    if (!NIL_P(declared_encoding)){
+        declared_encoding = StringValue(declared_encoding);
+        UErrorCode status = U_ZERO_ERROR;
+        UCharsetDetector *detector;
+        Data_Get_Struct(self, UCharsetDetector, detector);
+        ucsdet_setDeclaredEncoding(detector, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &status);
+        assure(status);
+        UCharsetDetector_set_declared_encoding(self, declared_encoding);
+    }
+}
+/*
+ * call-seq:
+ *   new(text=nil, declared_encoding=nil)
+ *
+ * Create a new charset detector. Optionally set input text and declared encoding.
+ * Both arguments are optional; omitted arguments are treated as nil.
+ */
+static VALUE
+UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
+{
+    VALUE text;
+    VALUE declared_encoding;
+    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
+    /* set_text() ignores nil, so nil is stored explicitly to define @text. */
+    if (NIL_P(text))
+        UCharsetDetector_set_text(self, Qnil);
+    else
+        set_text(self, text);
+    /* Same for @declared_encoding: set_declared_encoding() ignores nil. */
+    if (NIL_P(declared_encoding))
+        UCharsetDetector_set_declared_encoding(self, Qnil);
+    else
+        set_declared_encoding(self, declared_encoding);
+    return self;
+}
+/*
+ * Hand the text and declared encoding for a detection call to ICU.
+ *
+ * An argument that was not supplied (nil) falls back to the value stored in
+ * the corresponding instance variable (@text / @declared_encoding), so that
+ * values assigned through #text= and #declared_encoding= take effect.
+ * Stored values that cannot be converted to a String are ignored.
+ */
+static void
+prepare_detection(VALUE self, VALUE text, VALUE declared_encoding)
+{
+    if (NIL_P(text))
+        text = rb_check_string_type(rb_iv_get(self, "@text"));
+    if (NIL_P(declared_encoding))
+        declared_encoding = rb_check_string_type(rb_iv_get(self, "@declared_encoding"));
+    set_text(self, text);
+    set_declared_encoding(self, declared_encoding);
+}
+/*
+ * Convert one ICU charset match into a Ruby hash with the keys
+ * :encoding, :confidence and :language. Raises ICU::Error if ICU reports a
+ * failure while reading the match (including when +match+ is NULL).
+ */
+static VALUE
+match_to_hash(const UCharsetMatch *match)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    const char *encoding_name = ucsdet_getName(match, &status);
+    assure(status);
+    int32_t encoding_confidence = ucsdet_getConfidence(match, &status);
+    assure(status);
+    const char *encoding_language = ucsdet_getLanguage(match, &status);
+    assure(status);
+    VALUE hash = rb_hash_new();
+    rb_hash_aset(hash, ID2SYM(rb_intern("encoding")), rb_str_new2(encoding_name));
+    rb_hash_aset(hash, ID2SYM(rb_intern("confidence")), INT2NUM(encoding_confidence));
+    rb_hash_aset(hash, ID2SYM(rb_intern("language")), rb_str_new2(encoding_language));
+    return hash;
+}
+/*
+ * call-seq:
+ *   detect(text=nil, declared_encoding=nil)
+ *
+ * Return the charset that best matches the supplied input data, as a hash
+ * with the keys :encoding, :confidence and :language.
+ *
+ * Arguments that are omitted or nil fall back to the values previously set
+ * with #text= and #declared_encoding= (or passed to the constructor).
+ *
+ * Note though, that because the detection
+ * only looks at the start of the input data,
+ * there is a possibility that the returned charset will fail to handle
+ * the full set of input data.
+ *
+ * The function will fail if
+ * * no charset appears to match the data
+ * * no input text has been provided (with +text+ or set with #text= )
+ */
+static VALUE
+UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
+{
+    VALUE text;
+    VALUE declared_encoding;
+    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
+    prepare_detection(self, text, declared_encoding);
+    UErrorCode status = U_ZERO_ERROR;
+    UCharsetDetector *detector;
+    Data_Get_Struct(self, UCharsetDetector, detector);
+    const UCharsetMatch *match = ucsdet_detect(detector, &status);
+    assure(status);
+    return match_to_hash(match);
+}
+/*
+ * call-seq:
+ *   detect_all(text=nil, declared_encoding=nil)
+ *
+ * Find all charset matches that appear to be consistent with the input,
+ * returning an array of results.  The results are ordered with the
+ * best quality match first. Each result is a hash with the keys
+ * :encoding, :confidence and :language.
+ *
+ * Arguments that are omitted or nil fall back to the values previously set
+ * with #text= and #declared_encoding= (or passed to the constructor).
+ *
+ * Because the detection only looks at a limited amount of the
+ * input byte data, some of the returned charsets may fail to handle
+ * all of the input data.
+ *
+ * Return an error if
+ * * no charset appears to match the data
+ * * no input text has been provided (with +text+ or set with #text= )
+ */
+static VALUE
+UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
+{
+    VALUE text;
+    VALUE declared_encoding;
+    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
+    prepare_detection(self, text, declared_encoding);
+    UCharsetDetector *detector;
+    Data_Get_Struct(self, UCharsetDetector, detector);
+    UErrorCode status = U_ZERO_ERROR;
+    int32_t matches_found;
+    const UCharsetMatch **matches = ucsdet_detectAll(detector, &matches_found, &status);
+    assure(status);
+    VALUE ary = rb_ary_new();
+    int32_t i;
+    for (i = 0; i < matches_found; i++)
+        rb_ary_push(ary, match_to_hash(matches[i]));
+    return ary;
+}
+/*
+ * call-seq:
+ *   detectable_charsets
+ *
+ * Get array of names of all detectable charsets that are known to the
+ * charset detection service.
+ *
+ * Raises ICU::Error if ICU reports a failure while creating or walking the
+ * enumeration.
+ */
+static VALUE
+UCharsetDetector_get_detectable_charsets(VALUE self)
+{
+    UCharsetDetector *detector;
+    Data_Get_Struct(self, UCharsetDetector, detector);
+    UErrorCode status = U_ZERO_ERROR;
+    UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
+    assure(status);
+    VALUE ary = rb_ary_new();
+    int32_t result_length;
+    const char *charset_name;
+    for (;;) {
+        charset_name = uenum_next(charsets, &result_length, &status);
+        /* Stop at the end of the enumeration or on the first ICU failure. */
+        if (U_FAILURE(status) || charset_name == NULL)
+            break;
+        rb_ary_push(ary, rb_str_new2(charset_name));
+    }
+    /* Close the enumeration before a possible raise so it is not leaked. */
+    uenum_close(charsets);
+    assure(status);
+    return ary;
+}
+/*
+ * Extension entry point: defines the ICU module, the ICU::Error exception
+ * class (a StandardError) and the ICU::UCharsetDetector class together with
+ * its allocator and instance methods.
+ */
+void
+Init_uchardet()
+{
+    VALUE mICU = rb_define_module("ICU");
+    cUChardetError = rb_define_class_under(mICU, "Error", rb_eStandardError);
+    cUCharsetDetector = rb_define_class_under(mICU, "UCharsetDetector", rb_cObject);
+    rb_define_alloc_func(cUCharsetDetector, UCharsetDetector_alloc);
+    /* Arity -1: these methods receive their arguments as argc/argv. */
+    rb_define_method(cUCharsetDetector, "initialize", UCharsetDetector_initialize, -1);
+    rb_define_method(cUCharsetDetector, "input_filtered?", UCharsetDetector_get_input_filtered, 0);
+    rb_define_method(cUCharsetDetector, "input_filtered=", UCharsetDetector_set_input_filtered, 1);
+    rb_define_method(cUCharsetDetector, "text", UCharsetDetector_get_text, 0);
+    rb_define_method(cUCharsetDetector, "text=", UCharsetDetector_set_text, 1);
+    rb_define_method(cUCharsetDetector, "declared_encoding", UCharsetDetector_get_declared_encoding, 0);
+    rb_define_method(cUCharsetDetector, "declared_encoding=", UCharsetDetector_set_declared_encoding, 1);
+    rb_define_method(cUCharsetDetector, "detect", UCharsetDetector_detect, -1);
+    rb_define_method(cUCharsetDetector, "detect_all", UCharsetDetector_detect_all, -1);
+    rb_define_method(cUCharsetDetector, "detectable_charsets", UCharsetDetector_get_detectable_charsets, 0);
+}

data/lib/uchardet.rb ADDED

@@ -0,0 +1,37 @@
+# Put this file's directory on the load path unless it is already there
+# (either as given or in expanded form).
+$:.unshift(File.dirname(__FILE__)) unless
+  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+module Uchardet
+  VERSION = '0.1.1'
+end
+# Load the compiled extension. A LoadError is swallowed on purpose here.
+# NOTE(review): the Rakefile requires this file, presumably before the
+# extension has been built -- confirm that is the reason for the rescue.
+# When the load fails, the class below is defined without the extension's
+# methods.
+begin
+  require 'uchardet.so'
+rescue LoadError
+  # uh-oh
+end
+module ICU  # :main: README
+  # Class-level shortcuts that create a throwaway detector and delegate to the
+  # instance method of the same name.
+  class UCharsetDetector  # :main: README
+    ##
+    # Shortcut for ICU::UCharsetDetector#detect
+    #
+    def self.detect(*args)
+      self.new.detect(*args)
+    end
+    ##
+    # Shortcut for ICU::UCharsetDetector#detect_all
+    #
+    def self.detect_all(*args)
+      self.new.detect_all(*args)
+    end
+    ##
+    # Shortcut for ICU::UCharsetDetector#detectable_charsets
+    #
+    def self.detectable_charsets
+      self.new.detectable_charsets
+    end
+  end
+end

data/lib/uchardet/cli.rb ADDED

@@ -0,0 +1,80 @@
+require 'optparse'
+module Uchardet
+  class CLI
+    def self.execute(stdout, args=[])
+      @stdout = stdout
+      @options = {
+        :input_filtered => false,
+        :declared_encoding => nil,
+        :detect_all => false,
+        :path => nil
+      }
+      parser = OptionParser.new do |opts|
+        opts.banner = <<-BANNER.gsub(/^\s*/,'')
+          Usage: #{File.basename($0)} [options] file
+        BANNER
+        opts.on("-l", "--list",
+                "Display list of detectable character sets."
+                ) { self.list; exit }
+        opts.on("-s", "--strip",
+                "Strip HTML or XML markup before detection."
+                ) { @options[:input_filtered] = true }
+        opts.on("-e", "--encoding",
+                "Hint the charset detector about possible encoding."
+                ) { |arg| @options[:declared_encoding] = arg }
+        opts.on("-a", "--all",
+                "Show all matching encodings."
+                ) { @options[:detect_all] = true }
+        opts.on("-h", "--help",
+                "Show this help message."
+                ) { @stdout.puts opts; exit }
+        if args.empty?
+          @stdout.puts opts
+        else
+          begin
+            opts.parse!(args)
+          rescue OptionParser::ParseError => ex
+            STDERR.puts "ERROR: #{ex.to_s}. See #{File.basename($0)} --help"
+            exit
+          end
+          @options[:path] = args.last
+          if @options[:path].nil? || @options[:path].empty?
+            @stdout.puts opts
+            STDERR.puts "ERROR: please specify a file path."
+            exit
+          end
+          self.detect
+        end
+      end
+    end
+    def self.list
+      ICU::UCharsetDetector.detectable_charsets.uniq.sort.each { |name| @stdout.puts name }
+    end
+    def self.detect
+      detector = ICU::UCharsetDetector.new
+      detector.input_filtered = @options[:input_filtered]
+      detector.declared_encoding = @options[:declared_encoding]
+      source = IO.read(@options[:path])
+      matches = if @options[:detect_all]
+        detector.detect_all(source)
+      else
+        [detector.detect(source)]
+      end
+      matches.each do |match|
+        @stdout.puts "#{match[:encoding]} (confidence #{match[:confidence]}%)"
+      end
+    rescue Exception => ex
+      STDERR.puts "ERROR: #{ex.to_s}"
+    end
+  end
+end

data/script/console ADDED

@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+# File: script/console
+irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
+libs =  " -r irb/completion"
+# Perhaps use a console_lib to store any extra methods I may want available in the cosole
+# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
+libs <<  " -r #{File.dirname(__FILE__) + '/../lib/chardet-icu.rb'}"
+puts "Loading chardet-icu gem"
+exec "#{irb} #{libs} --simple-prompt"

data/script/destroy ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Runs RubiGen's destroy script for this project with the given arguments.
+APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+# Try a plain require first; load RubyGems and retry if rubigen is not found.
+begin
+  require 'rubigen'
+rescue LoadError
+  require 'rubygems'
+  require 'rubigen'
+end
+require 'rubigen/scripts/destroy'
+# Drop a leading --help/-h argument before passing ARGV on.
+# NOTE(review): presumably so RubiGen prints its own usage -- confirm.
+ARGV.shift if ['--help', '-h'].include?(ARGV[0])
+RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
+RubiGen::Scripts::Destroy.new.run(ARGV)

data/script/generate ADDED

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Runs RubiGen's generate script for this project with the given arguments.
+APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+# Try a plain require first; load RubyGems and retry if rubigen is not found.
+begin
+  require 'rubigen'
+rescue LoadError
+  require 'rubygems'
+  require 'rubigen'
+end
+require 'rubigen/scripts/generate'
+# Drop a leading --help/-h argument before passing ARGV on.
+# NOTE(review): presumably so RubiGen prints its own usage -- confirm.
+ARGV.shift if ['--help', '-h'].include?(ARGV[0])
+RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
+RubiGen::Scripts::Generate.new.run(ARGV)

data/tasks/extconf.rake ADDED

@@ -0,0 +1,13 @@
+# Shared rake wiring for C extensions: declares the extconf:compile task,
+# hooks it into :compile and :test, and adjusts the Hoe spec (read from the
+# global $hoe) for extension builds.
+namespace :extconf do
+  desc "Compiles the Ruby extension"
+  task :compile
+end
+task :compile => "extconf:compile"
+# Tests depend on the extension having been compiled.
+task :test => :compile
+# Glob of build artifact extensions removed by the clean task.
+BIN = "*.{o,bundle,jar,so,obj,pdb,lib,def,exp}"
+$hoe.clean_globs |= ["ext/**/#{BIN}", "lib/**/#{BIN}", 'ext/**/Makefile']
+# Require paths: lib plus every directory directly under ext/.
+$hoe.spec.require_paths = Dir['{lib,ext/*}']
+$hoe.spec.extensions = FileList["ext/**/extconf.rb"].to_a

data/tasks/extconf/uchardet.rake ADDED

@@ -0,0 +1,43 @@
+namespace :extconf do
+  extension = File.basename(__FILE__, '.rake')
+  ext = "ext/#{extension}"
+  ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
+  ext_files = FileList[
+    "#{ext}/*.c",
+    "#{ext}/*.h",
+    "#{ext}/*.rl",
+    "#{ext}/extconf.rb",
+    "#{ext}/Makefile",
+    # "lib"
+  ]
+  task :compile => extension do
+    if Dir.glob("**/#{extension}.{o,so,dll}").length == 0
+      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
+      STDERR.puts "Gem actually failed to build.  Your system is"
+      STDERR.puts "NOT configured properly to build #{GEM_NAME}."
+      STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
+      exit(1)
+    end
+  end
+  desc "Builds just the #{extension} extension"
+  task extension.to_sym => ["#{ext}/Makefile", ext_so ]
+  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
+    Dir.chdir(ext) do ruby "extconf.rb" end
+  end
+  file ext_so => ext_files do
+    Dir.chdir(ext) do
+      sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make') do |ok, res|
+        if !ok
+          require "fileutils"
+          FileUtils.rm Dir.glob('*.{so,o,dll,bundle}')
+        end
+      end
+    end
+  end
+end

data/test/test_helper.rb ADDED

@@ -0,0 +1,3 @@
+require 'stringio'
+require 'test/unit'
+require File.dirname(__FILE__) + '/../lib/uchardet'

data/test/test_uchardet.rb ADDED

@@ -0,0 +1,22 @@
+# encoding: utf-8
+require File.dirname(__FILE__) + '/test_helper.rb'
+# Checks that the class-level shortcuts on ICU::UCharsetDetector return the
+# same results as the corresponding instance methods.
+class TestUchardet < Test::Unit::TestCase   # :nodoc:
+  # ::detect matches #detect for the same input.
+  def test_detect
+    detector = ICU::UCharsetDetector.new
+    assert_equal(detector.detect(''), ICU::UCharsetDetector.detect(''))
+  end
+  # ::detect_all matches #detect_all for the same input.
+  def test_detect_all
+    detector = ICU::UCharsetDetector.new
+    assert_equal(detector.detect_all('∑'), ICU::UCharsetDetector.detect_all('∑'))
+  end
+  # ::detectable_charsets matches #detectable_charsets.
+  def test_detectable_charsets
+    detector = ICU::UCharsetDetector.new
+    assert_equal(detector.detectable_charsets, ICU::UCharsetDetector.detectable_charsets)
+  end
+end

data/test/test_uchardet_cli.rb ADDED

@@ -0,0 +1,14 @@
+require File.join(File.dirname(__FILE__), "test_helper.rb")
+require 'uchardet/cli'
+# Exercises Uchardet::CLI when it is run without any arguments.
+class TestUchardetCli < Test::Unit::TestCase
+  # Run the CLI with an empty argument list and capture everything it wrote
+  # to the supplied output stream in @stdout.
+  def setup
+    Uchardet::CLI.execute(@stdout_io = StringIO.new, [])
+    @stdout_io.rewind
+    @stdout = @stdout_io.read
+  end
+  # With no arguments the usage banner is printed.
+  def test_print_default_output
+    assert_match(/Usage: .* \[options\] file/, @stdout)
+  end
+end

data/test/test_uchardet_extn.rb ADDED

@@ -0,0 +1,101 @@
+# encoding: utf-8
+require "test/unit"
+# Load the compiled extension straight from ext/uchardet.
+$:.unshift File.dirname(__FILE__) + "/../ext/uchardet"
+require "uchardet.so"
+# Tests for the ICU::UCharsetDetector class defined by the C extension.
+class TestUchardetExtn < Test::Unit::TestCase   # :nodoc:
+  # The constructor accepts no argument, nil or a string, and raises
+  # TypeError for arguments that are not strings.
+  def test_init
+    assert_not_nil(ICU::UCharsetDetector)
+    assert_nothing_raised do
+      detector = ICU::UCharsetDetector.new
+      assert_not_nil(detector)
+      detector = ICU::UCharsetDetector.new nil
+      assert_not_nil(detector)
+      detector = ICU::UCharsetDetector.new 'some text'
+      assert_not_nil(detector)
+    end
+    assert_raise(TypeError) do
+      detector = ICU::UCharsetDetector.new 0
+    end
+    assert_raise(TypeError) do
+      detector = ICU::UCharsetDetector.new Time.now
+    end
+  end
+  # #detect raises ICU::Error without input text and otherwise returns a hash
+  # with :encoding, :confidence and :language keys.
+  def test_detect
+    detector = ICU::UCharsetDetector.new
+    assert_raise(ICU::Error) do
+      detector.detect
+    end
+    e = detector.detect '∂∆∂∆∂∆'
+    assert(e.is_a? Hash)
+    assert(e.has_key? :encoding)
+    assert(e.has_key? :confidence)
+    assert(e.has_key? :language)
+    assert_equal('utf-8', e[:encoding].downcase)
+    # The result is expected to be UTF-8 whatever encoding is declared.
+    e = detector.detect '··', 'utf-8'
+    assert_equal('utf-8', e[:encoding].downcase)
+    e = detector.detect '··', 'Shift_JIS'
+    assert_equal('utf-8', e[:encoding].downcase)
+  end
+  # #detect_all raises ICU::Error without input text and otherwise returns a
+  # non-empty array of match hashes.
+  def test_detect_all
+    detector = ICU::UCharsetDetector.new
+    assert_raise(ICU::Error) do
+      detector.detect_all
+    end
+    a = detector.detect_all '€‹€‹€'
+    assert(a.is_a? Array)
+    assert_equal(false, a.empty?)
+    assert(a[0].is_a? Hash)
+    assert(a[0].has_key? :encoding)
+    assert(a[0].has_key? :confidence)
+    assert(a[0].has_key? :language)
+  end
+  # input_filtered defaults to false and follows the truthiness of the
+  # assigned value (an empty string counts as true, nil as false).
+  def test_input_filtered_accessor
+    detector = ICU::UCharsetDetector.new
+    assert_equal(false, detector.input_filtered?)
+    detector.input_filtered = true
+    assert_equal(true, detector.input_filtered?)
+    detector.input_filtered = ''
+    assert_equal(true, detector.input_filtered?)
+    detector.input_filtered = nil
+    assert_equal(false, detector.input_filtered?)
+  end
+  # #text reflects the constructor argument and later assignments, and is
+  # left unchanged by a #detect call without arguments.
+  def test_text_accessor
+    detector = ICU::UCharsetDetector.new
+    assert_equal(nil, detector.text)
+    detector = ICU::UCharsetDetector.new 'blah'
+    assert_equal('blah', detector.text)
+    detector.text = 'test'
+    assert_equal('test', detector.text)
+    detector.detect
+    assert_equal('test', detector.text)
+  end
+  # #declared_encoding defaults to nil, reflects assignments, and is left
+  # unchanged by a #detect call that passes only text.
+  def test_declared_encoding_accessor
+    detector = ICU::UCharsetDetector.new
+    assert_equal(nil, detector.declared_encoding)
+    detector.declared_encoding = 'iso-8859-15'
+    assert_equal('iso-8859-15', detector.declared_encoding)
+    detector.detect 'test'
+    assert_equal('iso-8859-15', detector.declared_encoding)
+  end
+  # #detectable_charsets returns a non-nil Array.
+  def test_detectable_charsets
+    detector = ICU::UCharsetDetector.new
+    assert_not_nil(detector.detectable_charsets)
+    assert(detector.detectable_charsets.is_a? Array)
+  end
+end

metadata ADDED

@@ -0,0 +1,89 @@
+--- !ruby/object:Gem::Specification
+name: uchardet
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+platform: ruby
+authors:
+- Dmitri Goutnik
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-12-19 00:00:00 +03:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hoe
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.4.0
+    version:
+description: Fast character set encoding detection using International Components for Unicode C++ library.
+email:
+- dg@syrec.org
+executables:
+- uchardet
+extensions:
+- ext/uchardet/extconf.rb
+extra_rdoc_files:
+- History.txt
+- Manifest.txt
+- README.rdoc
+files:
+- History.txt
+- Manifest.txt
+- README.rdoc
+- Rakefile
+- bin/uchardet
+- ext/uchardet/extconf.rb
+- ext/uchardet/uchardet.c
+- lib/uchardet.rb
+- lib/uchardet/cli.rb
+- script/console
+- script/destroy
+- script/generate
+- tasks/extconf.rake
+- tasks/extconf/uchardet.rake
+- test/test_helper.rb
+- test/test_uchardet.rb
+- test/test_uchardet_cli.rb
+- test/test_uchardet_extn.rb
+has_rdoc: true
+homepage: http://github.com/invisiblellama/uchardet
+licenses: []
+post_install_message:
+rdoc_options:
+- --main
+- README.rdoc
+require_paths:
+- lib
+- ext/uchardet
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: uchardet
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Fast character set encoding detection using International Components for Unicode C++ library.
+test_files:
+- test/test_helper.rb
+- test/test_uchardet.rb
+- test/test_uchardet_cli.rb
+- test/test_uchardet_extn.rb