utf8 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/benchmark/active_support.rb +39 -1
- data/ext/utf8/extconf.rb +1 -1
- data/ext/utf8/string_utf8.c +46 -17
- data/ext/utf8/utf8.c +1 -1
- data/ext/utf8/utf8.h +1 -1
- data/lib/utf8/version.rb +1 -1
- data/spec/spec_helper.rb +6 -1
- data/spec/string_scanner_spec.rb +15 -15
- data/spec/string_spec.rb +91 -93
- data/utf8.gemspec +1 -1
- metadata +6 -8
data/.travis.yml
ADDED
data/benchmark/active_support.rb
CHANGED
@@ -5,6 +5,7 @@ require 'benchmark'
|
|
5
5
|
require 'rubygems'
|
6
6
|
require 'active_support'
|
7
7
|
|
8
|
+
$KCODE = 'UTF8'
|
8
9
|
|
9
10
|
raw = File.read(File.expand_path('../test.txt', __FILE__))
|
10
11
|
utf8 = raw.as_utf8
|
@@ -29,9 +30,15 @@ Benchmark.bmbm { |x|
|
|
29
30
|
x.report("#[-start, len]") {
|
30
31
|
times.times {utf8[-1024, 1024]}
|
31
32
|
}
|
33
|
+
x.report("#clean") {
|
34
|
+
times.times {utf8.clean}
|
35
|
+
}
|
36
|
+
x.report("#valid?") {
|
37
|
+
times.times {utf8.valid?}
|
38
|
+
}
|
32
39
|
}
|
33
40
|
|
34
|
-
puts "\n\nActiveSupport::Multibyte
|
41
|
+
puts "\n\nActiveSupport::Multibyte"
|
35
42
|
Benchmark.bmbm { |x|
|
36
43
|
x.report("#length") {
|
37
44
|
times.times {as_mb.length}
|
@@ -48,4 +55,35 @@ Benchmark.bmbm { |x|
|
|
48
55
|
x.report("#[-start, len]") {
|
49
56
|
times.times {as_mb[-1024, 1024]}
|
50
57
|
}
|
58
|
+
x.report("ActiveSupport::Multibyte.clean") {
|
59
|
+
times.times {ActiveSupport::Multibyte.clean(raw)}
|
60
|
+
}
|
61
|
+
x.report("ActiveSupport::Multibyte.verify") {
|
62
|
+
times.times {ActiveSupport::Multibyte.verify(raw)}
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
require 'iconv'
|
67
|
+
module ActiveSupport::Multibyte
|
68
|
+
class << self
|
69
|
+
OUTSIDE_ASCII = /[^\x00-\x7f]/n
|
70
|
+
ICONV_CLEANER = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
71
|
+
|
72
|
+
def clean_with_iconv(string)
|
73
|
+
if string =~ OUTSIDE_ASCII
|
74
|
+
ICONV_CLEANER.iconv(string + ' ')[0..-2]
|
75
|
+
else
|
76
|
+
string
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
alias_method :clean_without_iconv, :clean
|
81
|
+
alias_method :clean, :clean_with_iconv
|
82
|
+
end
|
83
|
+
end
|
84
|
+
puts "\n\nActiveSupport::Multibyte (patched with Iconv)"
|
85
|
+
Benchmark.bmbm { |x|
|
86
|
+
x.report("ActiveSupport::Multibyte.clean") {
|
87
|
+
times.times {ActiveSupport::Multibyte.clean(raw)}
|
88
|
+
}
|
51
89
|
}
|
data/ext/utf8/extconf.rb
CHANGED
data/ext/utf8/string_utf8.c
CHANGED
@@ -93,7 +93,7 @@ static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
|
|
93
93
|
/*
|
94
94
|
* call-seq: valid?(max_codepoint=nil)
|
95
95
|
*
|
96
|
-
* Iterates over the string,
|
96
|
+
* Iterates over the string, returning true if it's valid UTF-8 and false otherwise
|
97
97
|
*
|
98
98
|
* max_codepoint - an optional Fixnum used to declare this string invalid
|
99
99
|
* if a codepoint higher than that value is found
|
@@ -337,44 +337,73 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
337
337
|
* Returns: a new String
|
338
338
|
*/
|
339
339
|
static VALUE rb_cString_UTF8_clean(VALUE self) {
|
340
|
-
unsigned char *
|
341
|
-
unsigned char *
|
342
|
-
|
343
|
-
size_t len;
|
340
|
+
unsigned char *inBuf, *inBufCur;
|
341
|
+
unsigned char *outBuf, *outBufCur;
|
342
|
+
size_t len, i;
|
344
343
|
int8_t curCharLen;
|
345
|
-
size_t i;
|
346
344
|
VALUE rb_out;
|
347
345
|
|
348
|
-
|
346
|
+
inBuf = (unsigned char *)RSTRING_PTR(self);
|
347
|
+
inBufCur = inBuf;
|
349
348
|
len = RSTRING_LEN(self);
|
350
|
-
|
351
|
-
|
349
|
+
outBuf = malloc(len);
|
350
|
+
outBufCur = outBuf;
|
352
351
|
|
353
352
|
for(i=0; i<len; i+=curCharLen) {
|
354
|
-
curCharLen = utf8CharLen(
|
353
|
+
curCharLen = utf8CharLen(inBufCur, len);
|
355
354
|
if (curCharLen < 0) {
|
356
|
-
|
355
|
+
if (inBufCur-inBuf > 0) {
|
356
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
357
|
+
outBufCur += inBufCur-inBuf;
|
358
|
+
}
|
359
|
+
*outBufCur++ = REPLACEMENT_CHAR;
|
360
|
+
inBuf += (inBufCur-inBuf)+1;
|
357
361
|
curCharLen = 1;
|
358
|
-
} else {
|
359
|
-
memcpy(out+i, str+i, curCharLen);
|
360
362
|
}
|
363
|
+
|
364
|
+
inBufCur += curCharLen;
|
365
|
+
}
|
366
|
+
|
367
|
+
if (inBufCur-inBuf > 0) {
|
368
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
361
369
|
}
|
362
370
|
|
363
|
-
rb_out = rb_str_new((const char*)
|
371
|
+
rb_out = rb_str_new((const char*)outBuf, len);
|
364
372
|
AS_UTF8(rb_out);
|
365
373
|
|
366
|
-
|
374
|
+
free(outBuf);
|
367
375
|
|
368
376
|
return rb_out;
|
369
377
|
}
|
370
378
|
|
379
|
+
/*
|
380
|
+
* call-seq: ascii_only?
|
381
|
+
*
|
382
|
+
* Iterates over the string, returning true if every byte is within the low ASCII range and false otherwise
|
383
|
+
*
|
384
|
+
* Returns: a Boolean - true if the string is within the low ASCII range, false if not
|
385
|
+
*/
|
386
|
+
static VALUE rb_cString_UTF8_ascii_only(VALUE self) {
|
387
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
388
|
+
size_t len = RSTRING_LEN(self), i=0;
|
389
|
+
|
390
|
+
for(; i<len; i+=1) {
|
391
|
+
if (str[i] > 0x7f) {
|
392
|
+
return Qfalse;
|
393
|
+
}
|
394
|
+
}
|
395
|
+
|
396
|
+
return Qtrue;
|
397
|
+
}
|
398
|
+
|
371
399
|
void init_String_UTF8() {
|
372
400
|
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
373
401
|
|
374
|
-
rb_define_method(rb_cString_UTF8, "length",
|
402
|
+
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
|
375
403
|
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
|
376
|
-
rb_define_method(rb_cString_UTF8, "[]",
|
404
|
+
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
377
405
|
rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
|
378
406
|
rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
|
379
407
|
rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
|
408
|
+
rb_define_method(rb_cString_UTF8, "ascii_only?", rb_cString_UTF8_ascii_only, 0);
|
380
409
|
}
|
data/ext/utf8/utf8.c
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
* Scans the current position of the buffer
|
8
8
|
* returning the length of this UTF-8 character
|
9
9
|
*/
|
10
|
-
|
10
|
+
int8_t utf8CharLen(unsigned char *in, size_t in_len) {
|
11
11
|
if (in_len > 0) {
|
12
12
|
unsigned char curChar, *start;
|
13
13
|
|
data/ext/utf8/utf8.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#ifndef UTF8_UTF8_H
|
2
2
|
#define UTF8_UTF8_H
|
3
3
|
|
4
|
-
|
4
|
+
int8_t utf8CharLen(unsigned char *in, size_t in_len);
|
5
5
|
int64_t utf8CharCount(unsigned char *in, size_t in_len);
|
6
6
|
int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len);
|
7
7
|
|
data/lib/utf8/version.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -2,4 +2,9 @@ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
|
2
2
|
require 'utf8'
|
3
3
|
require 'utf8/string_scanner'
|
4
4
|
|
5
|
-
require 'rspec' unless defined?
|
5
|
+
require 'rspec' unless defined? RSpec
|
6
|
+
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.expect_with :stdlib
|
9
|
+
config.alias_example_to :test
|
10
|
+
end
|
data/spec/string_scanner_spec.rb
CHANGED
@@ -2,13 +2,13 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe StringScanner::UTF8 do
|
5
|
-
before
|
5
|
+
before :each do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@scanner = StringScanner.new(@char_array.join)
|
8
8
|
@utf8_scanner = @scanner.as_utf8
|
9
9
|
end
|
10
10
|
|
11
|
-
|
11
|
+
test "should blow up on invalid utf8 chars" do
|
12
12
|
# lets cut right into the middle of a sequence so we know it's bad
|
13
13
|
str = @char_array.join
|
14
14
|
str.force_encoding('binary') if str.respond_to?(:force_encoding)
|
@@ -16,34 +16,34 @@ describe StringScanner::UTF8 do
|
|
16
16
|
str.force_encoding('utf-8') if str.respond_to?(:force_encoding)
|
17
17
|
scanner = StringScanner.new(str).as_utf8
|
18
18
|
|
19
|
-
|
19
|
+
assert_raise ArgumentError do
|
20
20
|
scanner.getch
|
21
|
-
|
21
|
+
end
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
@scanner.
|
26
|
-
@scanner.as_utf8.class
|
24
|
+
test "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
|
25
|
+
assert @scanner.respond_to?(:as_utf8)
|
26
|
+
assert_equal StringScanner::UTF8, @scanner.as_utf8.class
|
27
27
|
end
|
28
28
|
|
29
|
-
|
29
|
+
test "should allow access to a regular (non-utf8-aware) StringScanner based on it's string" do
|
30
30
|
raw = @utf8_scanner.as_raw
|
31
|
-
raw.class
|
32
|
-
|
31
|
+
assert_equal StringScanner, raw.class
|
32
|
+
assert_equal @utf8_scanner.string, raw.string
|
33
33
|
end
|
34
34
|
|
35
|
-
|
35
|
+
test "#getch should be utf8-aware" do
|
36
36
|
i=0
|
37
37
|
while char = @utf8_scanner.getch
|
38
|
-
|
38
|
+
assert_equal @char_array[i], char
|
39
39
|
i+=1
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
-
|
43
|
+
test "should be able to be reset" do
|
44
44
|
i=0
|
45
45
|
while char = @utf8_scanner.getch
|
46
|
-
|
46
|
+
assert_equal @char_array[i], char
|
47
47
|
if i == 4
|
48
48
|
break
|
49
49
|
end
|
@@ -54,7 +54,7 @@ describe StringScanner::UTF8 do
|
|
54
54
|
|
55
55
|
i=0
|
56
56
|
while char = @utf8_scanner.getch
|
57
|
-
|
57
|
+
assert_equal @char_array[i], char
|
58
58
|
i+=1
|
59
59
|
end
|
60
60
|
end
|
data/spec/string_spec.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe String::UTF8 do
|
5
|
-
before
|
5
|
+
before :each do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@str = @char_array.join
|
8
8
|
@utf8 = @str.as_utf8
|
@@ -10,65 +10,69 @@ describe String::UTF8 do
|
|
10
10
|
@codepoints = @char_array.map{|c| c.unpack('U').first}
|
11
11
|
end
|
12
12
|
|
13
|
-
|
13
|
+
test "should blow up on invalid utf8 chars" do
|
14
14
|
# lets cut right into the middle of a sequence so we know it's bad
|
15
15
|
@str.force_encoding('binary') if @str.respond_to?(:force_encoding)
|
16
16
|
utf8 = @str[0..1]
|
17
17
|
utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
|
18
18
|
utf8 = utf8.as_utf8
|
19
19
|
|
20
|
-
|
20
|
+
assert_raise ArgumentError do
|
21
21
|
utf8.length
|
22
|
-
|
22
|
+
end
|
23
23
|
|
24
|
-
|
24
|
+
assert_raise ArgumentError do
|
25
25
|
utf8[0, 10]
|
26
|
-
|
26
|
+
end
|
27
27
|
|
28
|
-
|
28
|
+
assert_raise ArgumentError do
|
29
29
|
utf8.chars.to_a
|
30
|
-
|
30
|
+
end
|
31
31
|
end
|
32
32
|
|
33
|
-
|
34
|
-
"".
|
35
|
-
"".as_utf8.class
|
33
|
+
test "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
|
34
|
+
assert "".respond_to?(:as_utf8)
|
35
|
+
assert_equal String::UTF8, "".as_utf8.class
|
36
36
|
end
|
37
37
|
|
38
|
-
|
38
|
+
test "should allow access to the underlying raw string" do
|
39
39
|
raw = @utf8.as_raw
|
40
|
-
raw.class
|
40
|
+
assert_equal String, raw.class
|
41
41
|
if defined? Encoding
|
42
|
-
raw.length
|
42
|
+
assert_equal @utf8_len, raw.length
|
43
43
|
else
|
44
|
-
|
44
|
+
assert_equal @str.size, raw.length
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
|
-
|
49
|
-
@utf8[0].class
|
50
|
-
@utf8.chars.to_a[0].class
|
48
|
+
test "should wrap all returned strings to be utf8-aware" do
|
49
|
+
assert_equal String::UTF8, @utf8[0].class
|
50
|
+
assert_equal String::UTF8, @utf8.chars.to_a[0].class
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
test "clean should replace invalid utf8 chars with '?'" do
|
54
54
|
orig = "provided by Cristian Rodr\355guez."
|
55
55
|
clean = "provided by Cristian Rodr?guez."
|
56
|
-
orig.as_utf8.clean
|
56
|
+
assert_equal clean, orig.as_utf8.clean
|
57
|
+
assert_equal "asdf24??asdf24", "asdf24\206\222asdf24".as_utf8.clean
|
58
|
+
assert_equal "asdf24?asdf24", "asdf24\342asdf24".as_utf8.clean
|
59
|
+
assert_equal "asdf24??asdf24", "asdf24\342\206asdf24".as_utf8.clean
|
60
|
+
assert_equal "asdf24?asdf24", "asdf24\222asdf24".as_utf8.clean
|
57
61
|
end
|
58
62
|
|
59
|
-
|
60
|
-
|
63
|
+
test "clean should not replace valid utf8 chars with '?'" do
|
64
|
+
assert_equal "asdf24\342\206\222asdf24", "asdf24\342\206\222asdf24".as_utf8.clean
|
61
65
|
end
|
62
66
|
|
63
67
|
context "#length and #size" do
|
64
|
-
|
65
|
-
@utf8.length
|
66
|
-
@utf8.size
|
68
|
+
test "should be utf8-aware" do
|
69
|
+
assert_equal @utf8_len, @utf8.length
|
70
|
+
assert_equal @utf8_len, @utf8.size
|
67
71
|
end
|
68
72
|
end
|
69
73
|
|
70
74
|
context "#chars and #each_char" do
|
71
|
-
|
75
|
+
test "should be utf8-aware" do
|
72
76
|
klass = begin
|
73
77
|
if defined? Encoding
|
74
78
|
Enumerator
|
@@ -77,19 +81,19 @@ describe String::UTF8 do
|
|
77
81
|
end
|
78
82
|
end
|
79
83
|
|
80
|
-
@utf8.chars.class
|
84
|
+
assert_equal klass, @utf8.chars.class
|
81
85
|
@utf8.chars do |char|
|
82
|
-
char.
|
86
|
+
assert !char.nil?
|
83
87
|
end
|
84
88
|
joined = @utf8.chars.to_a.join
|
85
|
-
@utf8
|
86
|
-
@utf8.chars.to_a.size
|
87
|
-
@utf8.chars.to_a
|
89
|
+
assert_equal joined, @utf8
|
90
|
+
assert_equal @utf8_len, @utf8.chars.to_a.size
|
91
|
+
assert_equal @char_array, @utf8.chars.to_a
|
88
92
|
end
|
89
93
|
end
|
90
94
|
|
91
95
|
context "#codepoints and #each_codepoint" do
|
92
|
-
|
96
|
+
test "should be utf8-aware" do
|
93
97
|
klass = begin
|
94
98
|
if defined? Encoding
|
95
99
|
Enumerator
|
@@ -98,124 +102,118 @@ describe String::UTF8 do
|
|
98
102
|
end
|
99
103
|
end
|
100
104
|
|
101
|
-
@utf8.codepoints.class
|
105
|
+
assert_equal klass, @utf8.codepoints.class
|
102
106
|
@utf8.codepoints do |codepoint|
|
103
|
-
codepoint.
|
107
|
+
assert !codepoint.nil?
|
104
108
|
end
|
105
|
-
@
|
106
|
-
@utf8.codepoints.to_a
|
109
|
+
assert_equal @codepoints.size, @utf8.codepoints.to_a.size
|
110
|
+
assert_equal @codepoints, @utf8.codepoints.to_a
|
107
111
|
end
|
108
112
|
end
|
109
113
|
|
110
114
|
context "[offset] syntax" do
|
111
|
-
|
115
|
+
test "should be utf8-aware" do
|
112
116
|
@char_array.each_with_index do |char, i|
|
113
117
|
utf8_char = @utf8[i]
|
114
|
-
|
118
|
+
assert_equal char, utf8_char
|
115
119
|
end
|
116
120
|
end
|
117
121
|
|
118
|
-
|
122
|
+
test "should support negative indices" do
|
119
123
|
utf8_char = @utf8[-5]
|
120
|
-
|
124
|
+
assert_equal @char_array[-5], utf8_char
|
121
125
|
end
|
122
126
|
|
123
|
-
|
124
|
-
@utf8[100].
|
125
|
-
@utf8[-100].
|
127
|
+
test "should return nil for out of range indices" do
|
128
|
+
assert @utf8[100].nil?
|
129
|
+
assert @utf8[-100].nil?
|
126
130
|
end
|
127
131
|
end
|
128
132
|
|
129
133
|
context "[offset, length] syntax" do
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
utf8_char = @utf8[0, 6]
|
135
|
-
utf8_char.should eql(@char_array[0, 6].join)
|
134
|
+
test "should be utf8-aware" do
|
135
|
+
assert_equal @char_array[1, 4].join, @utf8[1, 4]
|
136
|
+
assert_equal @char_array[0, 6].join, @utf8[0, 6]
|
136
137
|
|
137
138
|
# this will fail due to a bug in 1.9
|
138
139
|
unless defined? Encoding
|
139
|
-
|
140
|
-
utf8_char.should eql(@char_array[6, 100].join)
|
140
|
+
assert_equal @char_array[6, 100].join, @utf8[6, 100]
|
141
141
|
end
|
142
142
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
utf8_char = @utf8[-1, 100]
|
147
|
-
utf8_char.should eql(@char_array[-1, 100].join)
|
148
|
-
|
149
|
-
utf8_char = @utf8[0, 0]
|
150
|
-
utf8_char.should eql(@char_array[0, 0].join)
|
143
|
+
assert_equal @char_array[-1, 2].join, @utf8[-1, 2]
|
144
|
+
assert_equal @char_array[-1, 100].join, @utf8[-1, 100]
|
145
|
+
assert_equal @char_array[0, 0].join, @utf8[0, 0]
|
151
146
|
end
|
152
147
|
|
153
|
-
|
154
|
-
@utf8[100, 100].
|
155
|
-
@utf8[-100, 100].
|
156
|
-
@utf8[0, -100].
|
148
|
+
test "should return nil for an out of range offset or length" do
|
149
|
+
assert @utf8[100, 100].nil?
|
150
|
+
assert @utf8[-100, 100].nil?
|
151
|
+
assert @utf8[0, -100].nil?
|
157
152
|
end
|
158
153
|
end
|
159
154
|
|
160
155
|
context "[Range] syntax" do
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
utf8_char = @utf8[0..6]
|
166
|
-
utf8_char.should eql(@char_array[0..6].join)
|
156
|
+
test "should be utf8-aware" do
|
157
|
+
assert_equal @char_array[1..4].join, @utf8[1..4]
|
158
|
+
assert_equal @char_array[0..6].join, @utf8[0..6]
|
167
159
|
|
168
160
|
# this will fail due to a bug in 1.9
|
169
161
|
unless defined? Encoding
|
170
|
-
|
171
|
-
utf8_char.should eql(@char_array[6..100].join)
|
162
|
+
assert_equal @char_array[6..100].join, @utf8[6..100]
|
172
163
|
end
|
173
164
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
utf8_char = @utf8[-1..100]
|
178
|
-
utf8_char.should eql(@char_array[-1..100].join)
|
165
|
+
assert_equal @char_array[-1..2].join, @utf8[-1..2]
|
166
|
+
assert_equal @char_array[-1..100].join, @utf8[-1..100]
|
179
167
|
end
|
180
168
|
|
181
|
-
|
182
|
-
@utf8[100..100].
|
183
|
-
@utf8[-100..100].
|
184
|
-
@utf8[0..-100]
|
169
|
+
test "should return nil for an out of range offset or length" do
|
170
|
+
assert @utf8[100..100].nil?
|
171
|
+
assert @utf8[-100..100].nil?
|
172
|
+
assert_equal "", @utf8[0..-100]
|
185
173
|
end
|
186
174
|
end
|
187
175
|
|
188
176
|
context "#valid?" do
|
189
|
-
|
177
|
+
test "should test validity" do
|
190
178
|
# lets cut right into the middle of a sequence so we know it's bad
|
191
179
|
@str.force_encoding('binary') if @str.respond_to?(:force_encoding)
|
192
180
|
utf8 = @str[0..1]
|
193
181
|
utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
|
194
182
|
utf8 = utf8.as_utf8
|
195
183
|
|
196
|
-
utf8.valid
|
197
|
-
@utf8.valid
|
184
|
+
assert !utf8.valid?
|
185
|
+
assert @utf8.valid?
|
198
186
|
|
199
|
-
"provided by Cristian Rodr\355guez.".as_utf8.
|
187
|
+
assert !"provided by Cristian Rodr\355guez.".as_utf8.valid?
|
200
188
|
end
|
201
189
|
|
202
|
-
|
190
|
+
test "should test validity using a maximum codepoint" do
|
203
191
|
highest_codepoint = @utf8.codepoints.to_a.max
|
204
192
|
|
205
|
-
@utf8.valid?(highest_codepoint)
|
206
|
-
|
193
|
+
assert @utf8.valid?(highest_codepoint)
|
194
|
+
assert !@utf8.valid?(highest_codepoint-1)
|
207
195
|
end
|
208
196
|
end
|
209
197
|
|
210
|
-
|
211
|
-
|
198
|
+
test "[Regexp] syntax shouldn't be supported yet" do
|
199
|
+
assert_raise ArgumentError do
|
212
200
|
@utf8[/a/]
|
213
|
-
|
201
|
+
end
|
214
202
|
end
|
215
203
|
|
216
|
-
|
217
|
-
|
204
|
+
test "[Regexp, match_index] syntax shouldn't be supported yet" do
|
205
|
+
assert_raise ArgumentError do
|
218
206
|
@utf8[/(a)/, 1]
|
219
|
-
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
context "#ascii_only" do
|
211
|
+
test "should return true for a string within the low ascii range" do
|
212
|
+
assert "asdf".as_utf8.ascii_only?
|
213
|
+
end
|
214
|
+
|
215
|
+
test "should return false for a string within the low ascii range" do
|
216
|
+
assert !@char_array.first.as_utf8.ascii_only?
|
217
|
+
end
|
220
218
|
end
|
221
219
|
end
|
data/utf8.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
|
|
13
13
|
s.files = `git ls-files`.split("\n")
|
14
14
|
s.homepage = %q{http://github.com/brianmario/utf8}
|
15
15
|
s.rdoc_options = ["--charset=UTF-8"]
|
16
|
-
s.require_paths = ["lib"
|
16
|
+
s.require_paths = ["lib"]
|
17
17
|
s.rubygems_version = %q{1.4.2}
|
18
18
|
s.summary = %q{A lightweight UTF8-aware String class meant for use with Ruby 1.8}
|
19
19
|
s.test_files = `git ls-files spec`.split("\n")
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 7
|
10
|
+
version: 0.1.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-11-22 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: rake-compiler
|
@@ -75,6 +74,7 @@ extra_rdoc_files:
|
|
75
74
|
files:
|
76
75
|
- .gitignore
|
77
76
|
- .rspec
|
77
|
+
- .travis.yml
|
78
78
|
- Gemfile
|
79
79
|
- MIT-LICENSE
|
80
80
|
- README.rdoc
|
@@ -98,7 +98,6 @@ files:
|
|
98
98
|
- spec/string_scanner_spec.rb
|
99
99
|
- spec/string_spec.rb
|
100
100
|
- utf8.gemspec
|
101
|
-
has_rdoc: true
|
102
101
|
homepage: http://github.com/brianmario/utf8
|
103
102
|
licenses: []
|
104
103
|
|
@@ -107,7 +106,6 @@ rdoc_options:
|
|
107
106
|
- --charset=UTF-8
|
108
107
|
require_paths:
|
109
108
|
- lib
|
110
|
-
- ext
|
111
109
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
110
|
none: false
|
113
111
|
requirements:
|
@@ -129,7 +127,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
127
|
requirements: []
|
130
128
|
|
131
129
|
rubyforge_project:
|
132
|
-
rubygems_version: 1.
|
130
|
+
rubygems_version: 1.8.11
|
133
131
|
signing_key:
|
134
132
|
specification_version: 3
|
135
133
|
summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8
|