utf8cleaner 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/extconf.rb +4 -0
- data/ext/utf8cleaner.c +68 -0
- data/spec/utf8cleaner_spec.rb +66 -0
- metadata +57 -0
data/ext/extconf.rb
ADDED
data/ext/utf8cleaner.c
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
/*
 * Returns the length in bytes (1-4) of the acceptable sequence that starts
 * at p, or 0 when the byte at p does not start one and must be dropped.
 * `remain` is the number of readable bytes at p and must be at least 1.
 *
 * Accepted single bytes are TAB, CR, LF and 0x20..0x7F; all other control
 * characters, including NUL, are rejected. Multi-byte sequences are checked
 * against the well-formed ranges of RFC 3629, so overlong forms (for example
 * C0 80, an encoded NUL), UTF-16 surrogates and code points above U+10FFFF
 * are rejected instead of being passed through.
 */
static long UTF8Cleaner_sequence_length(const unsigned char *p, long remain)
{
    unsigned char lead = p[0];

    /* ASCII */
    if (lead == '\t' || lead == '\r' || lead == '\n' ||
        (lead >= ' ' && lead <= 0x7f))
        return 1;

    /* 2-byte sequence: C2..DF 80..BF (C0 and C1 can only be overlong) */
    if (lead >= 0xc2 && lead <= 0xdf)
    {
        if (remain < 2 || (p[1] & 0xc0) != 0x80)
            return 0;
        return 2;
    }

    /* 3-byte sequence: E0..EF followed by two continuation bytes */
    if (lead >= 0xe0 && lead <= 0xef)
    {
        if (remain < 3 ||
            (p[1] & 0xc0) != 0x80 ||
            (p[2] & 0xc0) != 0x80)
            return 0;
        if (lead == 0xe0 && p[1] < 0xa0)   /* overlong: below U+0800 */
            return 0;
        if (lead == 0xed && p[1] > 0x9f)   /* surrogates U+D800..U+DFFF */
            return 0;
        return 3;
    }

    /* 4-byte sequence: F0..F4 followed by three continuation bytes */
    if (lead >= 0xf0 && lead <= 0xf4)
    {
        if (remain < 4 ||
            (p[1] & 0xc0) != 0x80 ||
            (p[2] & 0xc0) != 0x80 ||
            (p[3] & 0xc0) != 0x80)
            return 0;
        if (lead == 0xf0 && p[1] < 0x90)   /* overlong: below U+10000 */
            return 0;
        if (lead == 0xf4 && p[1] > 0x8f)   /* above U+10FFFF */
            return 0;
        return 4;
    }

    /* stray continuation byte or a lead byte that is never valid */
    return 0;
}

/*
 * UTF8Cleaner.clean(string) -> new_string
 *
 * Returns a copy of `string` from which every byte that is not part of an
 * accepted ASCII character or a well-formed UTF-8 sequence has been removed.
 * An invalid byte is dropped on its own; scanning resumes at the next byte.
 *
 * `string` is coerced with StringValue, so a non-String argument that does
 * not respond to to_str raises TypeError instead of being read as raw
 * memory. `obj` (the receiver) is unused.
 */
static VALUE UTF8Cleaner_clean(VALUE obj, VALUE string)
{
    const unsigned char *input;
    unsigned char *output;
    long i = 0, input_len, output_len = 0;
    VALUE result;

    StringValue(string);
    input_len = RSTRING_LEN(string);

    /*
     * The output can never be longer than the input, so write straight into
     * a Ruby string of that capacity. This avoids a temporary C buffer that
     * would have to be released with xfree() and would leak if creating the
     * result raised.
     */
    result = rb_str_new(0, input_len);

    /* Fetch the pointers only after the allocation above, which may run GC. */
    input = (const unsigned char *)RSTRING_PTR(string);
    output = (unsigned char *)RSTRING_PTR(result);

    while (i < input_len)
    {
        long n = UTF8Cleaner_sequence_length(input + i, input_len - i);

        if (n == 0)
        {
            /* invalid byte: skip it */
            ++i;
            continue;
        }

        while (n-- > 0)
            output[output_len++] = input[i++];
    }

    /* Trim the result to the number of bytes actually kept. */
    return rb_str_resize(result, output_len);
}
|
63
|
+
|
64
|
+
void Init_utf8cleaner()
|
65
|
+
{
|
66
|
+
VALUE rb_mUTF8Cleaner = rb_define_module("UTF8Cleaner");
|
67
|
+
rb_define_module_function(rb_mUTF8Cleaner, "clean", &UTF8Cleaner_clean, 1);
|
68
|
+
}
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.dirname(__FILE__) + '/../ext/utf8cleaner'
|
3
|
+
require 'iconv'
|
4
|
+
|
5
|
+
describe UTF8Cleaner do
  # Well-formed ASCII/UTF-8 input must come back unchanged.
  context "when cleaning valid input" do
    it "should preserve ASCII" do
      cleaned = UTF8Cleaner.clean("foobar")
      cleaned.should == "foobar"
    end

    it "should preserve Umlauts" do
      cleaned = UTF8Cleaner.clean("mäh")
      cleaned.should == "mäh"
    end

    it "should preserve Umlauts at the front" do
      cleaned = UTF8Cleaner.clean("Äusserst")
      cleaned.should == "Äusserst"
    end

    it "should preserve Umlauts at the end" do
      cleaned = UTF8Cleaner.clean("Gauß")
      cleaned.should == "Gauß"
    end

    # The replacement character in the literal is itself a valid sequence.
    it "should not shorten Korean truncated with valid replacement character" do
      cleaned = UTF8Cleaner.clean("양 10m 1:01.7 슈�....")
      cleaned.should == "양 10m 1:01.7 슈�...."
    end
  end

  # Bytes that are not acceptable must be removed; the rest is kept.
  context "when cleaning invalid input" do
    it "should remove 0 bytes" do
      cleaned = UTF8Cleaner.clean("foo\0bar")
      cleaned.should == "foobar"
    end

    # Converts the UTF-8 string s to ISO-8859-1 with Iconv; the examples
    # below feed the converted text to UTF8Cleaner.clean as broken input.
    def utf8_to_latin1(s)
      Iconv.open('ISO_8859-1', 'UTF-8') do |converter|
        converter.iconv(s)
      end
    end

    it "should remove broken Umlauts" do
      broken = utf8_to_latin1("Mäuse")
      UTF8Cleaner.clean(broken).should == "Muse"
    end

    it "should remove broken Umlauts at the front" do
      broken = utf8_to_latin1("Äusserst")
      UTF8Cleaner.clean(broken).should == "usserst"
    end

    it "should remove broken Umlauts at the end" do
      broken = utf8_to_latin1("Gauß")
      UTF8Cleaner.clean(broken).should == "Gau"
    end

    # \354 is a lone lead byte with no continuation bytes after it.
    it "should shorten truncated Korean" do
      cleaned = UTF8Cleaner.clean("량\354....")
      cleaned.should == "량...."
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: utf8cleaner
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Astro
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-11-03 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Removes any non-ASCII/UTF8 bytes from a string
|
17
|
+
email: astro@spaceboyz.net
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- ext/extconf.rb
|
26
|
+
- ext/utf8cleaner.c
|
27
|
+
- spec/utf8cleaner_spec.rb
|
28
|
+
has_rdoc: true
|
29
|
+
homepage:
|
30
|
+
licenses: []
|
31
|
+
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
|
35
|
+
require_paths:
|
36
|
+
- ext
|
37
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: "0"
|
42
|
+
version:
|
43
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: "0"
|
48
|
+
version:
|
49
|
+
requirements: []
|
50
|
+
|
51
|
+
rubyforge_project:
|
52
|
+
rubygems_version: 1.3.5
|
53
|
+
signing_key:
|
54
|
+
specification_version: 3
|
55
|
+
summary: Efficiently clean your UTF8
|
56
|
+
test_files:
|
57
|
+
- spec/utf8cleaner_spec.rb
|