utf8cleaner 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/extconf.rb +4 -0
- data/ext/utf8cleaner.c +68 -0
- data/spec/utf8cleaner_spec.rb +66 -0
- metadata +57 -0
data/ext/extconf.rb
ADDED
data/ext/utf8cleaner.c
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
/*
 * Classifies the bytes starting at +p+, of which +remain+ (>= 1) are readable.
 *
 * Returns the number of bytes (1..4) that should be copied to the output:
 *   1 - tab, CR, LF or a byte in the range 0x20..0x7f
 *   2 - lead byte 110xxxxx followed by one continuation byte 10xxxxxx
 *   3 - lead byte 1110xxxx followed by two continuation bytes
 *   4 - lead byte 11110xxx followed by three continuation bytes
 * Returns 0 when the byte at +p+ matches none of these and must be dropped.
 *
 * NOTE: only the lead/continuation bit patterns are checked. Overlong
 * encodings, UTF-16 surrogates and code points above U+10FFFF are not
 * rejected here.
 */
static long utf8cleaner_sequence_length(const unsigned char *p, long remain)
{
    /* ASCII */
    if (p[0] == '\t' || p[0] == '\r' || p[0] == '\n' ||
        (p[0] >= ' ' && p[0] <= 127))
        return 1;

    /* 2-byte sequence */
    if (remain >= 2 &&
        (p[0] & 0xe0) == 0xc0 &&
        (p[1] & 0xc0) == 0x80)
        return 2;

    /* 3-byte sequence */
    if (remain >= 3 &&
        (p[0] & 0xf0) == 0xe0 &&
        (p[1] & 0xc0) == 0x80 &&
        (p[2] & 0xc0) == 0x80)
        return 3;

    /* 4-byte sequence */
    if (remain >= 4 &&
        (p[0] & 0xf8) == 0xf0 &&
        (p[1] & 0xc0) == 0x80 &&
        (p[2] & 0xc0) == 0x80 &&
        (p[3] & 0xc0) == 0x80)
        return 4;

    return 0;
}

/*
 * UTF8Cleaner.clean(string) -> new String
 *
 * Returns a new string holding only those bytes of +string+ that are
 * accepted by utf8cleaner_sequence_length(): tab, CR, LF, bytes 0x20..0x7f
 * and structurally well-formed 2-, 3- and 4-byte UTF-8 sequences. All other
 * bytes (NUL and other control characters, stray lead or continuation
 * bytes, truncated sequences) are silently dropped. The input is not
 * modified.
 *
 * obj    - the receiver; unused.
 * string - the String to clean. Anything that is not a String is converted
 *          via #to_str; a TypeError is raised if that is not possible.
 *
 * The result is created with rb_str_new(), exactly as before; no encoding
 * is explicitly associated with it here.
 */
static VALUE UTF8Cleaner_clean(VALUE obj, VALUE string)
{
    const unsigned char *input;
    char *output;
    long i, input_len, output_len = 0;
    VALUE result;

    /* Reject non-strings up front instead of reading through a bogus
     * pointer in RSTRING_PTR(). */
    StringValue(string);
    input_len = RSTRING_LEN(string);

    /* The output can never be longer than the input, so reserve input_len
     * bytes directly inside the result string. Writing into a Ruby-owned
     * buffer avoids the temporary ALLOC_N buffer that was previously
     * released with libc free() (mismatched with Ruby's allocator) and that
     * leaked if building the result string raised. */
    result = rb_str_new(NULL, input_len);

    /* Fetch both pointers only after the allocation above; nothing below
     * allocates until the copy loop has finished. */
    input = (const unsigned char *)RSTRING_PTR(string);
    output = RSTRING_PTR(result);

    i = 0;
    while (i < input_len)
    {
        long n = utf8cleaner_sequence_length(input + i, input_len - i);

        if (n == 0)
        {
            /* Not ASCII and not the start of a well-formed sequence: drop
             * this single byte and re-examine the next one. */
            ++i;
            continue;
        }

        while (n-- > 0)
            output[output_len++] = (char)input[i++];
    }

    /* Trim the result to the bytes that were actually kept. */
    rb_str_resize(result, output_len);
    return result;
}
|
63
|
+
|
64
|
+
void Init_utf8cleaner()
|
65
|
+
{
|
66
|
+
VALUE rb_mUTF8Cleaner = rb_define_module("UTF8Cleaner");
|
67
|
+
rb_define_module_function(rb_mUTF8Cleaner, "clean", &UTF8Cleaner_clean, 1);
|
68
|
+
}
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.dirname(__FILE__) + '/../ext/utf8cleaner'
|
3
|
+
require 'iconv'
|
4
|
+
|
5
|
+
# Behaviour of UTF8Cleaner.clean: well-formed ASCII/UTF-8 text passes through
# unchanged, while bytes that do not belong to a well-formed sequence are
# removed from the result.
describe UTF8Cleaner do
  context "when cleaning valid input" do
    it "should preserve ASCII" do
      cleaned = UTF8Cleaner.clean("foobar")
      cleaned.should == "foobar"
    end

    it "should preserve Umlauts" do
      cleaned = UTF8Cleaner.clean("mäh")
      cleaned.should == "mäh"
    end

    it "should preserve Umlauts at the front" do
      cleaned = UTF8Cleaner.clean("Äusserst")
      cleaned.should == "Äusserst"
    end

    it "should preserve Umlauts at the end" do
      cleaned = UTF8Cleaner.clean("Gauß")
      cleaned.should == "Gauß"
    end

    # The U+FFFD replacement character is itself a well-formed 3-byte
    # sequence, so it has to survive cleaning.
    it "should not shorten Korean truncated with valid replacement character" do
      cleaned = UTF8Cleaner.clean("양 10m 1:01.7 슈�....")
      cleaned.should == "양 10m 1:01.7 슈�...."
    end
  end

  context "when cleaning invalid input" do
    # Re-encodes the UTF-8 string +s+ as ISO-8859-1, so that its non-ASCII
    # characters become single bytes that are not well-formed UTF-8.
    def utf8_to_latin1(s)
      Iconv.open('ISO_8859-1', 'UTF-8') do |converter|
        converter.iconv(s)
      end
    end

    it "should remove 0 bytes" do
      cleaned = UTF8Cleaner.clean("foo\0bar")
      cleaned.should == "foobar"
    end

    it "should remove broken Umlauts" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Mäuse"))
      cleaned.should == "Muse"
    end

    it "should remove broken Umlauts at the front" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Äusserst"))
      cleaned.should == "usserst"
    end

    it "should remove broken Umlauts at the end" do
      cleaned = UTF8Cleaner.clean(utf8_to_latin1("Gauß"))
      cleaned.should == "Gau"
    end

    # "\354" is the byte 0xEC, a 3-byte lead byte; here it is followed by
    # "." instead of continuation bytes, so only that byte is dropped.
    it "should shorten truncated Korean" do
      cleaned = UTF8Cleaner.clean("량\354....")
      cleaned.should == "량...."
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: utf8cleaner
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Astro
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-11-03 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Removes any non-ASCII/UTF8 bytes from a string
|
17
|
+
email: astro@spaceboyz.net
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- ext/extconf.rb
|
26
|
+
- ext/utf8cleaner.c
|
27
|
+
- spec/utf8cleaner_spec.rb
|
28
|
+
has_rdoc: true
|
29
|
+
homepage:
|
30
|
+
licenses: []
|
31
|
+
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
|
35
|
+
require_paths:
|
36
|
+
- ext
|
37
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: "0"
|
42
|
+
version:
|
43
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: "0"
|
48
|
+
version:
|
49
|
+
requirements: []
|
50
|
+
|
51
|
+
rubyforge_project:
|
52
|
+
rubygems_version: 1.3.5
|
53
|
+
signing_key:
|
54
|
+
specification_version: 3
|
55
|
+
summary: Efficiently clean your UTF8
|
56
|
+
test_files:
|
57
|
+
- spec/utf8cleaner_spec.rb
|