utf8cleaner 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ require 'mkmf'
2
+
3
# Build script for the utf8cleaner C extension: the same name is used for
# the dir_config target and for the Makefile / shared object that is built.
extension_name = 'utf8cleaner'

dir_config(extension_name)
create_makefile(extension_name)
@@ -0,0 +1,68 @@
1
+ #include <ruby.h>
2
+
3
+ static VALUE UTF8Cleaner_clean(VALUE obj, VALUE string)
4
+ {
5
+ char *input = RSTRING_PTR(string);
6
+ long i, input_len = RSTRING_LEN(string), output_len = 0;
7
+ char *output = ALLOC_N(char, input_len);
8
+ VALUE result;
9
+
10
+ if (!output)
11
+ return Qnil;
12
+
13
+ for(i = 0; i < input_len; ++i)
14
+ {
15
+ long remain = input_len - i;
16
+
17
+ /* ASCII */
18
+ if (input[i] == '\t' ||
19
+ input[i] == '\r' ||
20
+ input[i] == '\n' ||
21
+ (input[i] >= ' ' && input[i] <= 127))
22
+ {
23
+ output[output_len++] = input[i];
24
+ }
25
+ /* 2-byte sequence */
26
+ else if (remain >= 2 &&
27
+ (input[i] & 0xe0) == 0xc0 &&
28
+ (input[i + 1] & 0xc0) == 0x80)
29
+ {
30
+ output[output_len++] = input[i++];
31
+ output[output_len++] = input[i];
32
+ }
33
+ /* 3-byte sequence */
34
+ else if (remain >= 3 &&
35
+ (input[i] & 0xf0) == 0xe0 &&
36
+ (input[i + 1] & 0xc0) == 0x80 &&
37
+ (input[i + 2] & 0xc0) == 0x80)
38
+ {
39
+ output[output_len++] = input[i++];
40
+ output[output_len++] = input[i++];
41
+ output[output_len++] = input[i];
42
+ }
43
+ /* 4-byte sequence */
44
+ else if (remain >= 4 &&
45
+ (input[i] & 0xf8) == 0xf0 &&
46
+ (input[i + 1] & 0xc0) == 0x80 &&
47
+ (input[i + 2] & 0xc0) == 0x80 &&
48
+ (input[i + 3] & 0xc0) == 0x80)
49
+ {
50
+ output[output_len++] = input[i++];
51
+ output[output_len++] = input[i++];
52
+ output[output_len++] = input[i++];
53
+ output[output_len++] = input[i];
54
+ }
55
+ /*else
56
+ printf("Drop(%i) %X = (%X, %X)\n", remain, (char)input[i], input[i] & 0xe0, input[i + 1] & 0xc0);*/
57
+ }
58
+
59
+ result = rb_str_new(output, output_len);
60
+ free(output);
61
+ return result;
62
+ }
63
+
64
+ void Init_utf8cleaner()
65
+ {
66
+ VALUE rb_mUTF8Cleaner = rb_define_module("UTF8Cleaner");
67
+ rb_define_module_function(rb_mUTF8Cleaner, "clean", &UTF8Cleaner_clean, 1);
68
+ }
@@ -0,0 +1,66 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.dirname(__FILE__) + '/../ext/utf8cleaner'
3
+ require 'iconv'
4
+
5
# Specs for UTF8Cleaner.clean: valid input must come back unchanged, while
# bytes that do not belong to valid ASCII/UTF-8 must be removed.
describe UTF8Cleaner do
  context "when cleaning valid input" do
    it "should preserve ASCII" do
      cleaned = UTF8Cleaner.clean("foobar")
      cleaned.should == "foobar"
    end

    it "should preserve Umlauts" do
      cleaned = UTF8Cleaner.clean("mäh")
      cleaned.should == "mäh"
    end

    it "should preserve Umlauts at the front" do
      cleaned = UTF8Cleaner.clean("Äusserst")
      cleaned.should == "Äusserst"
    end

    it "should preserve Umlauts at the end" do
      cleaned = UTF8Cleaner.clean("Gauß")
      cleaned.should == "Gauß"
    end

    it "should not shorten Korean truncated with valid replacement character" do
      cleaned = UTF8Cleaner.clean("양 10m 1:01.7 슈�....")
      cleaned.should == "양 10m 1:01.7 슈�...."
    end
  end

  context "when cleaning invalid input" do
    # Converts +s+ from UTF-8 to ISO-8859-1 through Iconv; the examples
    # below feed the converted bytes to the cleaner as broken input.
    def utf8_to_latin1(s)
      Iconv.open('ISO_8859-1', 'UTF-8') do |converter|
        converter.iconv(s)
      end
    end

    it "should remove 0 bytes" do
      cleaned = UTF8Cleaner.clean("foo\0bar")
      cleaned.should == "foobar"
    end

    it "should remove broken Umlauts" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Mäuse"))
      cleaned.should == "Muse"
    end

    it "should remove broken Umlauts at the front" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Äusserst"))
      cleaned.should == "usserst"
    end

    it "should remove broken Umlauts at the end" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Gauß"))
      cleaned.should == "Gau"
    end

    it "should shorten truncated Korean" do
      cleaned = UTF8Cleaner.clean("량\354....")
      cleaned.should == "량...."
    end
  end
end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utf8cleaner
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Astro
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-03 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Removes any non-ASCII/UTF8 bytes from a string
17
+ email: astro@spaceboyz.net
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/extconf.rb
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - ext/extconf.rb
26
+ - ext/utf8cleaner.c
27
+ - spec/utf8cleaner_spec.rb
28
+ has_rdoc: true
29
+ homepage:
30
+ licenses: []
31
+
32
+ post_install_message:
33
+ rdoc_options: []
34
+
35
+ require_paths:
36
+ - ext
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: "0"
42
+ version:
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: "0"
48
+ version:
49
+ requirements: []
50
+
51
+ rubyforge_project:
52
+ rubygems_version: 1.3.5
53
+ signing_key:
54
+ specification_version: 3
55
+ summary: Efficiently clean your UTF8
56
+ test_files:
57
+ - spec/utf8cleaner_spec.rb