utf8cleaner 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ require 'mkmf'
2
+
3
# Name used both for the --with-utf8cleaner-* configure options and for the
# extension the generated Makefile builds.
extension_name = 'utf8cleaner'

dir_config(extension_name)
create_makefile(extension_name)
@@ -0,0 +1,68 @@
1
+ #include <ruby.h>
2
+
3
+ static VALUE UTF8Cleaner_clean(VALUE obj, VALUE string)
4
+ {
5
+ char *input = RSTRING_PTR(string);
6
+ long i, input_len = RSTRING_LEN(string), output_len = 0;
7
+ char *output = ALLOC_N(char, input_len);
8
+ VALUE result;
9
+
10
+ if (!output)
11
+ return Qnil;
12
+
13
+ for(i = 0; i < input_len; ++i)
14
+ {
15
+ long remain = input_len - i;
16
+
17
+ /* ASCII */
18
+ if (input[i] == '\t' ||
19
+ input[i] == '\r' ||
20
+ input[i] == '\n' ||
21
+ (input[i] >= ' ' && input[i] <= 127))
22
+ {
23
+ output[output_len++] = input[i];
24
+ }
25
+ /* 2-byte sequence */
26
+ else if (remain >= 2 &&
27
+ (input[i] & 0xe0) == 0xc0 &&
28
+ (input[i + 1] & 0xc0) == 0x80)
29
+ {
30
+ output[output_len++] = input[i++];
31
+ output[output_len++] = input[i];
32
+ }
33
+ /* 3-byte sequence */
34
+ else if (remain >= 3 &&
35
+ (input[i] & 0xf0) == 0xe0 &&
36
+ (input[i + 1] & 0xc0) == 0x80 &&
37
+ (input[i + 2] & 0xc0) == 0x80)
38
+ {
39
+ output[output_len++] = input[i++];
40
+ output[output_len++] = input[i++];
41
+ output[output_len++] = input[i];
42
+ }
43
+ /* 4-byte sequence */
44
+ else if (remain >= 4 &&
45
+ (input[i] & 0xf8) == 0xf0 &&
46
+ (input[i + 1] & 0xc0) == 0x80 &&
47
+ (input[i + 2] & 0xc0) == 0x80 &&
48
+ (input[i + 3] & 0xc0) == 0x80)
49
+ {
50
+ output[output_len++] = input[i++];
51
+ output[output_len++] = input[i++];
52
+ output[output_len++] = input[i++];
53
+ output[output_len++] = input[i];
54
+ }
55
+ /*else
56
+ printf("Drop(%i) %X = (%X, %X)\n", remain, (char)input[i], input[i] & 0xe0, input[i + 1] & 0xc0);*/
57
+ }
58
+
59
+ result = rb_str_new(output, output_len);
60
+ free(output);
61
+ return result;
62
+ }
63
+
64
+ void Init_utf8cleaner()
65
+ {
66
+ VALUE rb_mUTF8Cleaner = rb_define_module("UTF8Cleaner");
67
+ rb_define_module_function(rb_mUTF8Cleaner, "clean", &UTF8Cleaner_clean, 1);
68
+ }
@@ -0,0 +1,66 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.dirname(__FILE__) + '/../ext/utf8cleaner'
3
+ require 'iconv'
4
+
5
# Behaviour of UTF8Cleaner.clean: well-formed input is returned unchanged,
# while bytes that are not acceptable ASCII or part of a complete UTF-8
# sequence are removed.
describe UTF8Cleaner do
  context "when cleaning valid input" do
    # Each entry is [example description, text that must survive unchanged].
    [
      ["should preserve ASCII", "foobar"],
      ["should preserve Umlauts", "mäh"],
      ["should preserve Umlauts at the front", "Äusserst"],
      ["should preserve Umlauts at the end", "Gauß"],
      ["should not shorten Korean truncated with valid replacement character",
       "양 10m 1:01.7 슈�...."]
    ].each do |description, text|
      it description do
        UTF8Cleaner.clean(text).should == text
      end
    end
  end

  context "when cleaning invalid input" do
    # Re-encodes a UTF-8 string as ISO-8859-1, turning each umlaut into a
    # single high byte that is not valid UTF-8 on its own.
    def utf8_to_latin1(s)
      Iconv.open('ISO_8859-1', 'UTF-8') do |converter|
        converter.iconv(s)
      end
    end

    it "should remove 0 bytes" do
      cleaned = UTF8Cleaner.clean("foo\0bar")
      cleaned.should == "foobar"
    end

    it "should remove broken Umlauts" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Mäuse"))
      cleaned.should == "Muse"
    end

    it "should remove broken Umlauts at the front" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Äusserst"))
      cleaned.should == "usserst"
    end

    it "should remove broken Umlauts at the end" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Gauß"))
      cleaned.should == "Gau"
    end

    # "\354" is a lone 3-byte lead byte followed by ASCII, i.e. a sequence
    # cut off after its first byte.
    it "should shorten truncated Korean" do
      cleaned = UTF8Cleaner.clean("량\354....")
      cleaned.should == "량...."
    end
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utf8cleaner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Astro
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-03 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Removes any non-ASCII/UTF8 bytes from a string
17
+ email: astro@spaceboyz.net
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/extconf.rb
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - ext/extconf.rb
26
+ - ext/utf8cleaner.c
27
+ - spec/utf8cleaner_spec.rb
28
+ has_rdoc: true
29
+ homepage:
30
+ licenses: []
31
+
32
+ post_install_message:
33
+ rdoc_options: []
34
+
35
+ require_paths:
36
+ - ext
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: "0"
42
+ version:
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: "0"
48
+ version:
49
+ requirements: []
50
+
51
+ rubyforge_project:
52
+ rubygems_version: 1.3.5
53
+ signing_key:
54
+ specification_version: 3
55
+ summary: Efficiently clean your UTF8
56
+ test_files:
57
+ - spec/utf8cleaner_spec.rb