RubyGems - utf8cleaner - Versions diffs - 0.0.1 - Mend

utf8cleaner 0.0.1

Files changed (4) hide show

data/ext/extconf.rb ADDED

@@ -0,0 +1,4 @@
+require 'mkmf'
+# Name shared by the build configuration options and the generated extension.
+extension_name = 'utf8cleaner'
+
+# Register the name so the usual --with-utf8cleaner-* options are recognised.
+dir_config(extension_name)
+
+# Write the Makefile used to compile the extension.
+create_makefile(extension_name)

data/ext/utf8cleaner.c ADDED

@@ -0,0 +1,68 @@
+#include <ruby.h>
+/* Returns non-zero when byte is a UTF-8 continuation byte (10xxxxxx). */
+static int utf8_is_continuation(unsigned char byte)
+{
+  return (byte & 0xc0) == 0x80;
+}
+
+/*
+ * Returns the length in bytes (1-4) of the acceptable sequence that starts at
+ * p, or 0 when the byte at p has to be dropped.
+ *
+ * p      - current position in the input; at least `remain` bytes are readable
+ * remain - number of bytes left in the input starting at p (>= 1)
+ *
+ * Multi-byte sequences are accepted only when they are well-formed UTF-8:
+ * overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points
+ * above U+10FFFF are rejected. The `remain` checks come first so that no
+ * byte past the end of the input is ever read.
+ */
+static long utf8_sequence_length(const unsigned char *p, long remain)
+{
+  unsigned char lead = p[0];
+
+  /* ASCII: keep tab, CR, LF and 0x20..0x7f; drop other control bytes (incl. NUL) */
+  if (lead == '\t' || lead == '\r' || lead == '\n' ||
+      (lead >= ' ' && lead <= 0x7f))
+    return 1;
+
+  /* 2-byte sequence: lead 0xc2..0xdf (0xc0 and 0xc1 can only be overlong) */
+  if (lead >= 0xc2 && lead <= 0xdf)
+  {
+    if (remain >= 2 && utf8_is_continuation(p[1]))
+      return 2;
+    return 0;
+  }
+
+  /* 3-byte sequence */
+  if (lead >= 0xe0 && lead <= 0xef)
+  {
+    if (remain < 3 ||
+        !utf8_is_continuation(p[1]) ||
+        !utf8_is_continuation(p[2]))
+      return 0;
+    if (lead == 0xe0 && p[1] < 0xa0)  /* overlong encoding */
+      return 0;
+    if (lead == 0xed && p[1] > 0x9f)  /* UTF-16 surrogate */
+      return 0;
+    return 3;
+  }
+
+  /* 4-byte sequence: leads above 0xf4 would encode beyond U+10FFFF */
+  if (lead >= 0xf0 && lead <= 0xf4)
+  {
+    if (remain < 4 ||
+        !utf8_is_continuation(p[1]) ||
+        !utf8_is_continuation(p[2]) ||
+        !utf8_is_continuation(p[3]))
+      return 0;
+    if (lead == 0xf0 && p[1] < 0x90)  /* overlong encoding */
+      return 0;
+    if (lead == 0xf4 && p[1] > 0x8f)  /* above U+10FFFF */
+      return 0;
+    return 4;
+  }
+
+  /* stray continuation byte or invalid lead byte */
+  return 0;
+}
+
+/*
+ * UTF8Cleaner.clean(string) -> new string
+ *
+ * Copies `string` while dropping every byte that is neither an accepted
+ * ASCII character (tab, CR, LF, 0x20..0x7f) nor part of a well-formed UTF-8
+ * multi-byte sequence. An invalid byte is dropped on its own and scanning
+ * resumes at the following byte.
+ *
+ * obj    - receiver (unused)
+ * string - the input; converted with StringValue, so a TypeError is raised
+ *          for objects that cannot be treated as a String
+ *
+ * Returns a new String built with rb_str_new from the kept bytes.
+ */
+static VALUE UTF8Cleaner_clean(VALUE obj, VALUE string)
+{
+  const unsigned char *input;
+  char *output;
+  long i = 0, input_len, output_len = 0;
+  VALUE result;
+
+  /* Guard against non-String arguments before touching RSTRING_* */
+  StringValue(string);
+  input_len = RSTRING_LEN(string);
+
+  /* The output can never be longer than the input. ALLOC_N raises on
+   * failure instead of returning NULL, so no NULL check is needed. */
+  output = ALLOC_N(char, input_len);
+
+  /* Work on unsigned bytes so comparisons do not depend on char signedness */
+  input = (const unsigned char *)RSTRING_PTR(string);
+
+  while (i < input_len)
+  {
+    long seq_len = utf8_sequence_length(input + i, input_len - i);
+
+    if (seq_len == 0)
+    {
+      /* drop this byte */
+      ++i;
+      continue;
+    }
+    while (seq_len-- > 0)
+      output[output_len++] = (char)input[i++];
+  }
+
+  result = rb_str_new(output, output_len);
+  /* Memory obtained through ALLOC_N must be released with xfree, not free */
+  xfree(output);
+  return result;
+}
+/*
+ * Extension entry point, called by Ruby when the utf8cleaner library is
+ * loaded. Defines the UTF8Cleaner module and its module function
+ * UTF8Cleaner.clean, which takes exactly one argument.
+ */
+void Init_utf8cleaner(void)
+{
+  VALUE rb_mUTF8Cleaner = rb_define_module("UTF8Cleaner");
+
+  rb_define_module_function(rb_mUTF8Cleaner, "clean", UTF8Cleaner_clean, 1);
+}

data/spec/utf8cleaner_spec.rb ADDED

@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+require File.dirname(__FILE__) + '/../ext/utf8cleaner'
+require 'iconv'
+describe UTF8Cleaner do
+  context "when cleaning valid input" do
+    it "should preserve ASCII" do
+      UTF8Cleaner.clean("foobar").
+        should == "foobar"
+    end
+    it "should preserve Umlauts" do
+      UTF8Cleaner.clean("mäh").
+        should == "mäh"
+    end
+    it "should preserve Umlauts at the front" do
+      UTF8Cleaner.clean("Äusserst").
+        should == "Äusserst"
+    end
+    it "should preserve Umlauts at the end" do
+      UTF8Cleaner.clean("Gauß").
+        should == "Gauß"
+    end
+    it "should not shorten Korean truncated with valid replacement character" do
+      UTF8Cleaner.clean("양 10m 1:01.7 슈�....").
+        should == "양 10m 1:01.7 슈�...."
+    end
+  end
+  context "when cleaning invalid input" do
+    it "should remove 0 bytes" do
+      UTF8Cleaner.clean("foo\0bar").
+        should == "foobar"
+    end
+    def utf8_to_latin1(s)
+      Iconv.open('ISO_8859-1', 'UTF-8') { |cd|
+        cd.iconv(s)
+      }
+    end
+    it "should remove broken Umlauts" do
+      UTF8Cleaner.clean(utf8_to_latin1("Mäuse")).
+        should == "Muse"
+    end
+    it "should remove broken Umlauts at the front" do
+      UTF8Cleaner.clean(utf8_to_latin1("Äusserst")).
+        should == "usserst"
+    end
+    it "should remove broken Umlauts at the end" do
+      UTF8Cleaner.clean(utf8_to_latin1("Gauß")).
+        should == "Gau"
+    end
+    it "should shorten truncated Korean" do
+      UTF8Cleaner.clean("량\354....").
+        should == "량...."
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,57 @@
+--- !ruby/object:Gem::Specification
+name: utf8cleaner
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Astro
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-11-03 00:00:00 +01:00
+default_executable:
+dependencies: []
+description: Removes any non-ASCII/UTF8 bytes from a string
+email: astro@spaceboyz.net
+executables: []
+extensions:
+- ext/extconf.rb
+extra_rdoc_files: []
+files:
+- ext/extconf.rb
+- ext/utf8cleaner.c
+- spec/utf8cleaner_spec.rb
+has_rdoc: true
+homepage:
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- ext
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Efficiently clean your UTF8
+test_files:
+- spec/utf8cleaner_spec.rb