utf8 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/benchmark/active_support.rb +39 -1
- data/ext/utf8/extconf.rb +1 -1
- data/ext/utf8/string_utf8.c +46 -17
- data/ext/utf8/utf8.c +1 -1
- data/ext/utf8/utf8.h +1 -1
- data/lib/utf8/version.rb +1 -1
- data/spec/spec_helper.rb +6 -1
- data/spec/string_scanner_spec.rb +15 -15
- data/spec/string_spec.rb +91 -93
- data/utf8.gemspec +1 -1
- metadata +6 -8
data/.travis.yml
ADDED
data/benchmark/active_support.rb
CHANGED
@@ -5,6 +5,7 @@ require 'benchmark'
|
|
5
5
|
require 'rubygems'
|
6
6
|
require 'active_support'
|
7
7
|
|
8
|
+
$KCODE = 'UTF8'
|
8
9
|
|
9
10
|
raw = File.read(File.expand_path('../test.txt', __FILE__))
|
10
11
|
utf8 = raw.as_utf8
|
@@ -29,9 +30,15 @@ Benchmark.bmbm { |x|
|
|
29
30
|
x.report("#[-start, len]") {
|
30
31
|
times.times {utf8[-1024, 1024]}
|
31
32
|
}
|
33
|
+
x.report("#clean") {
|
34
|
+
times.times {utf8.clean}
|
35
|
+
}
|
36
|
+
x.report("#valid?") {
|
37
|
+
times.times {utf8.valid?}
|
38
|
+
}
|
32
39
|
}
|
33
40
|
|
34
|
-
puts "\n\nActiveSupport::Multibyte
|
41
|
+
puts "\n\nActiveSupport::Multibyte"
|
35
42
|
Benchmark.bmbm { |x|
|
36
43
|
x.report("#length") {
|
37
44
|
times.times {as_mb.length}
|
@@ -48,4 +55,35 @@ Benchmark.bmbm { |x|
|
|
48
55
|
x.report("#[-start, len]") {
|
49
56
|
times.times {as_mb[-1024, 1024]}
|
50
57
|
}
|
58
|
+
x.report("ActiveSupport::Multibyte.clean") {
|
59
|
+
times.times {ActiveSupport::Multibyte.clean(raw)}
|
60
|
+
}
|
61
|
+
x.report("ActiveSupport::Multibyte.verify") {
|
62
|
+
times.times {ActiveSupport::Multibyte.verify(raw)}
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
require 'iconv'
|
67
|
+
module ActiveSupport::Multibyte
|
68
|
+
class << self
|
69
|
+
OUTSIDE_ASCII = /[^\x00-\x7f]/n
|
70
|
+
ICONV_CLEANER = Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
71
|
+
|
72
|
+
def clean_with_iconv(string)
|
73
|
+
if string =~ OUTSIDE_ASCII
|
74
|
+
ICONV_CLEANER.iconv(string + ' ')[0..-2]
|
75
|
+
else
|
76
|
+
string
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
alias_method :clean_without_iconv, :clean
|
81
|
+
alias_method :clean, :clean_with_iconv
|
82
|
+
end
|
83
|
+
end
|
84
|
+
puts "\n\nActiveSupport::Multibyte (patched with Iconv)"
|
85
|
+
Benchmark.bmbm { |x|
|
86
|
+
x.report("ActiveSupport::Multibyte.clean") {
|
87
|
+
times.times {ActiveSupport::Multibyte.clean(raw)}
|
88
|
+
}
|
51
89
|
}
|
data/ext/utf8/extconf.rb
CHANGED
data/ext/utf8/string_utf8.c
CHANGED
@@ -93,7 +93,7 @@ static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
|
|
93
93
|
/*
|
94
94
|
* call-seq: valid?(max_codepoint=nil)
|
95
95
|
*
|
96
|
-
* Iterates over the string,
|
96
|
+
* Iterates over the string, returning true/false if it's valid UTF-8
|
97
97
|
*
|
98
98
|
* max_codepoint - an optional Fixnum used to declare this string invalid
|
99
99
|
* if a codepoint higher than that value is found
|
@@ -337,44 +337,73 @@ static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
|
|
337
337
|
* Returns: a new String
|
338
338
|
*/
|
339
339
|
static VALUE rb_cString_UTF8_clean(VALUE self) {
|
340
|
-
unsigned char *
|
341
|
-
unsigned char *
|
342
|
-
|
343
|
-
size_t len;
|
340
|
+
unsigned char *inBuf, *inBufCur;
|
341
|
+
unsigned char *outBuf, *outBufCur;
|
342
|
+
size_t len, i;
|
344
343
|
int8_t curCharLen;
|
345
|
-
size_t i;
|
346
344
|
VALUE rb_out;
|
347
345
|
|
348
|
-
|
346
|
+
inBuf = (unsigned char *)RSTRING_PTR(self);
|
347
|
+
inBufCur = inBuf;
|
349
348
|
len = RSTRING_LEN(self);
|
350
|
-
|
351
|
-
|
349
|
+
outBuf = malloc(len);
|
350
|
+
outBufCur = outBuf;
|
352
351
|
|
353
352
|
for(i=0; i<len; i+=curCharLen) {
|
354
|
-
curCharLen = utf8CharLen(
|
353
|
+
curCharLen = utf8CharLen(inBufCur, len);
|
355
354
|
if (curCharLen < 0) {
|
356
|
-
|
355
|
+
if (inBufCur-inBuf > 0) {
|
356
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
357
|
+
outBufCur += inBufCur-inBuf;
|
358
|
+
}
|
359
|
+
*outBufCur++ = REPLACEMENT_CHAR;
|
360
|
+
inBuf += (inBufCur-inBuf)+1;
|
357
361
|
curCharLen = 1;
|
358
|
-
} else {
|
359
|
-
memcpy(out+i, str+i, curCharLen);
|
360
362
|
}
|
363
|
+
|
364
|
+
inBufCur += curCharLen;
|
365
|
+
}
|
366
|
+
|
367
|
+
if (inBufCur-inBuf > 0) {
|
368
|
+
memcpy(outBufCur, inBuf, inBufCur-inBuf);
|
361
369
|
}
|
362
370
|
|
363
|
-
rb_out = rb_str_new((const char*)
|
371
|
+
rb_out = rb_str_new((const char*)outBuf, len);
|
364
372
|
AS_UTF8(rb_out);
|
365
373
|
|
366
|
-
|
374
|
+
free(outBuf);
|
367
375
|
|
368
376
|
return rb_out;
|
369
377
|
}
|
370
378
|
|
379
|
+
/*
|
380
|
+
* call-seq: ascii_only?
|
381
|
+
*
|
382
|
+
* Iterates over the string, returning true/false if it's within the low ASCII range
|
383
|
+
*
|
384
|
+
* Returns: a Boolean - true if the string is within the low ASCII range, false if not
|
385
|
+
*/
|
386
|
+
static VALUE rb_cString_UTF8_ascii_only(VALUE self) {
|
387
|
+
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
388
|
+
size_t len = RSTRING_LEN(self), i=0;
|
389
|
+
|
390
|
+
for(; i<len; i+=1) {
|
391
|
+
if (str[i] > 0x7f) {
|
392
|
+
return Qfalse;
|
393
|
+
}
|
394
|
+
}
|
395
|
+
|
396
|
+
return Qtrue;
|
397
|
+
}
|
398
|
+
|
371
399
|
void init_String_UTF8() {
|
372
400
|
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
373
401
|
|
374
|
-
rb_define_method(rb_cString_UTF8, "length",
|
402
|
+
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
|
375
403
|
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
|
376
|
-
rb_define_method(rb_cString_UTF8, "[]",
|
404
|
+
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
377
405
|
rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
|
378
406
|
rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
|
379
407
|
rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
|
408
|
+
rb_define_method(rb_cString_UTF8, "ascii_only?", rb_cString_UTF8_ascii_only, 0);
|
380
409
|
}
|
data/ext/utf8/utf8.c
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
* Scans the current position of the buffer
|
8
8
|
* returning the length of this UTF-8 character
|
9
9
|
*/
|
10
|
-
|
10
|
+
int8_t utf8CharLen(unsigned char *in, size_t in_len) {
|
11
11
|
if (in_len > 0) {
|
12
12
|
unsigned char curChar, *start;
|
13
13
|
|
data/ext/utf8/utf8.h
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#ifndef UTF8_UTF8_H
|
2
2
|
#define UTF8_UTF8_H
|
3
3
|
|
4
|
-
|
4
|
+
int8_t utf8CharLen(unsigned char *in, size_t in_len);
|
5
5
|
int64_t utf8CharCount(unsigned char *in, size_t in_len);
|
6
6
|
int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len);
|
7
7
|
|
data/lib/utf8/version.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -2,4 +2,9 @@ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
|
2
2
|
require 'utf8'
|
3
3
|
require 'utf8/string_scanner'
|
4
4
|
|
5
|
-
require 'rspec' unless defined?
|
5
|
+
require 'rspec' unless defined? RSpec
|
6
|
+
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.expect_with :stdlib
|
9
|
+
config.alias_example_to :test
|
10
|
+
end
|
data/spec/string_scanner_spec.rb
CHANGED
@@ -2,13 +2,13 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe StringScanner::UTF8 do
|
5
|
-
before
|
5
|
+
before :each do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@scanner = StringScanner.new(@char_array.join)
|
8
8
|
@utf8_scanner = @scanner.as_utf8
|
9
9
|
end
|
10
10
|
|
11
|
-
|
11
|
+
test "should blow up on invalid utf8 chars" do
|
12
12
|
# lets cut right into the middle of a sequence so we know it's bad
|
13
13
|
str = @char_array.join
|
14
14
|
str.force_encoding('binary') if str.respond_to?(:force_encoding)
|
@@ -16,34 +16,34 @@ describe StringScanner::UTF8 do
|
|
16
16
|
str.force_encoding('utf-8') if str.respond_to?(:force_encoding)
|
17
17
|
scanner = StringScanner.new(str).as_utf8
|
18
18
|
|
19
|
-
|
19
|
+
assert_raise ArgumentError do
|
20
20
|
scanner.getch
|
21
|
-
|
21
|
+
end
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
@scanner.
|
26
|
-
@scanner.as_utf8.class
|
24
|
+
test "should extend StringScanner, adding an as_utf8 method that returns a StringScanner::UTF8 instance" do
|
25
|
+
assert @scanner.respond_to?(:as_utf8)
|
26
|
+
assert_equal StringScanner::UTF8, @scanner.as_utf8.class
|
27
27
|
end
|
28
28
|
|
29
|
-
|
29
|
+
test "should allow access to a regular (non-utf8-aware) StringScanner based on it's string" do
|
30
30
|
raw = @utf8_scanner.as_raw
|
31
|
-
raw.class
|
32
|
-
|
31
|
+
assert_equal StringScanner, raw.class
|
32
|
+
assert_equal @utf8_scanner.string, raw.string
|
33
33
|
end
|
34
34
|
|
35
|
-
|
35
|
+
test "#getch should be utf8-aware" do
|
36
36
|
i=0
|
37
37
|
while char = @utf8_scanner.getch
|
38
|
-
|
38
|
+
assert_equal @char_array[i], char
|
39
39
|
i+=1
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
-
|
43
|
+
test "should be able to be reset" do
|
44
44
|
i=0
|
45
45
|
while char = @utf8_scanner.getch
|
46
|
-
|
46
|
+
assert_equal @char_array[i], char
|
47
47
|
if i == 4
|
48
48
|
break
|
49
49
|
end
|
@@ -54,7 +54,7 @@ describe StringScanner::UTF8 do
|
|
54
54
|
|
55
55
|
i=0
|
56
56
|
while char = @utf8_scanner.getch
|
57
|
-
|
57
|
+
assert_equal @char_array[i], char
|
58
58
|
i+=1
|
59
59
|
end
|
60
60
|
end
|
data/spec/string_spec.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe String::UTF8 do
|
5
|
-
before
|
5
|
+
before :each do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@str = @char_array.join
|
8
8
|
@utf8 = @str.as_utf8
|
@@ -10,65 +10,69 @@ describe String::UTF8 do
|
|
10
10
|
@codepoints = @char_array.map{|c| c.unpack('U').first}
|
11
11
|
end
|
12
12
|
|
13
|
-
|
13
|
+
test "should blow up on invalid utf8 chars" do
|
14
14
|
# lets cut right into the middle of a sequence so we know it's bad
|
15
15
|
@str.force_encoding('binary') if @str.respond_to?(:force_encoding)
|
16
16
|
utf8 = @str[0..1]
|
17
17
|
utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
|
18
18
|
utf8 = utf8.as_utf8
|
19
19
|
|
20
|
-
|
20
|
+
assert_raise ArgumentError do
|
21
21
|
utf8.length
|
22
|
-
|
22
|
+
end
|
23
23
|
|
24
|
-
|
24
|
+
assert_raise ArgumentError do
|
25
25
|
utf8[0, 10]
|
26
|
-
|
26
|
+
end
|
27
27
|
|
28
|
-
|
28
|
+
assert_raise ArgumentError do
|
29
29
|
utf8.chars.to_a
|
30
|
-
|
30
|
+
end
|
31
31
|
end
|
32
32
|
|
33
|
-
|
34
|
-
"".
|
35
|
-
"".as_utf8.class
|
33
|
+
test "should extend String, adding an as_utf8 method that returns a String::UTF8 instance" do
|
34
|
+
assert "".respond_to?(:as_utf8)
|
35
|
+
assert_equal String::UTF8, "".as_utf8.class
|
36
36
|
end
|
37
37
|
|
38
|
-
|
38
|
+
test "should allow access to the underlying raw string" do
|
39
39
|
raw = @utf8.as_raw
|
40
|
-
raw.class
|
40
|
+
assert_equal String, raw.class
|
41
41
|
if defined? Encoding
|
42
|
-
raw.length
|
42
|
+
assert_equal @utf8_len, raw.length
|
43
43
|
else
|
44
|
-
|
44
|
+
assert_equal @str.size, raw.length
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
|
-
|
49
|
-
@utf8[0].class
|
50
|
-
@utf8.chars.to_a[0].class
|
48
|
+
test "should wrap all returned strings to be utf8-aware" do
|
49
|
+
assert_equal String::UTF8, @utf8[0].class
|
50
|
+
assert_equal String::UTF8, @utf8.chars.to_a[0].class
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
test "clean should replace invalid utf8 chars with '?'" do
|
54
54
|
orig = "provided by Cristian Rodr\355guez."
|
55
55
|
clean = "provided by Cristian Rodr?guez."
|
56
|
-
orig.as_utf8.clean
|
56
|
+
assert_equal clean, orig.as_utf8.clean
|
57
|
+
assert_equal "asdf24??asdf24", "asdf24\206\222asdf24".as_utf8.clean
|
58
|
+
assert_equal "asdf24?asdf24", "asdf24\342asdf24".as_utf8.clean
|
59
|
+
assert_equal "asdf24??asdf24", "asdf24\342\206asdf24".as_utf8.clean
|
60
|
+
assert_equal "asdf24?asdf24", "asdf24\222asdf24".as_utf8.clean
|
57
61
|
end
|
58
62
|
|
59
|
-
|
60
|
-
|
63
|
+
test "clean should not replace valid utf8 chars with '?'" do
|
64
|
+
assert_equal "asdf24\342\206\222asdf24", "asdf24\342\206\222asdf24".as_utf8.clean
|
61
65
|
end
|
62
66
|
|
63
67
|
context "#length and #size" do
|
64
|
-
|
65
|
-
@utf8.length
|
66
|
-
@utf8.size
|
68
|
+
test "should be utf8-aware" do
|
69
|
+
assert_equal @utf8_len, @utf8.length
|
70
|
+
assert_equal @utf8_len, @utf8.size
|
67
71
|
end
|
68
72
|
end
|
69
73
|
|
70
74
|
context "#chars and #each_char" do
|
71
|
-
|
75
|
+
test "should be utf8-aware" do
|
72
76
|
klass = begin
|
73
77
|
if defined? Encoding
|
74
78
|
Enumerator
|
@@ -77,19 +81,19 @@ describe String::UTF8 do
|
|
77
81
|
end
|
78
82
|
end
|
79
83
|
|
80
|
-
@utf8.chars.class
|
84
|
+
assert_equal klass, @utf8.chars.class
|
81
85
|
@utf8.chars do |char|
|
82
|
-
char.
|
86
|
+
assert !char.nil?
|
83
87
|
end
|
84
88
|
joined = @utf8.chars.to_a.join
|
85
|
-
@utf8
|
86
|
-
@utf8.chars.to_a.size
|
87
|
-
@utf8.chars.to_a
|
89
|
+
assert_equal joined, @utf8
|
90
|
+
assert_equal @utf8_len, @utf8.chars.to_a.size
|
91
|
+
assert_equal @char_array, @utf8.chars.to_a
|
88
92
|
end
|
89
93
|
end
|
90
94
|
|
91
95
|
context "#codepoints and #each_codepoint" do
|
92
|
-
|
96
|
+
test "should be utf8-aware" do
|
93
97
|
klass = begin
|
94
98
|
if defined? Encoding
|
95
99
|
Enumerator
|
@@ -98,124 +102,118 @@ describe String::UTF8 do
|
|
98
102
|
end
|
99
103
|
end
|
100
104
|
|
101
|
-
@utf8.codepoints.class
|
105
|
+
assert_equal klass, @utf8.codepoints.class
|
102
106
|
@utf8.codepoints do |codepoint|
|
103
|
-
codepoint.
|
107
|
+
assert !codepoint.nil?
|
104
108
|
end
|
105
|
-
@
|
106
|
-
@utf8.codepoints.to_a
|
109
|
+
assert_equal @codepoints.size, @utf8.codepoints.to_a.size
|
110
|
+
assert_equal @codepoints, @utf8.codepoints.to_a
|
107
111
|
end
|
108
112
|
end
|
109
113
|
|
110
114
|
context "[offset] syntax" do
|
111
|
-
|
115
|
+
test "should be utf8-aware" do
|
112
116
|
@char_array.each_with_index do |char, i|
|
113
117
|
utf8_char = @utf8[i]
|
114
|
-
|
118
|
+
assert_equal char, utf8_char
|
115
119
|
end
|
116
120
|
end
|
117
121
|
|
118
|
-
|
122
|
+
test "should support negative indices" do
|
119
123
|
utf8_char = @utf8[-5]
|
120
|
-
|
124
|
+
assert_equal @char_array[-5], utf8_char
|
121
125
|
end
|
122
126
|
|
123
|
-
|
124
|
-
@utf8[100].
|
125
|
-
@utf8[-100].
|
127
|
+
test "should return nil for out of range indices" do
|
128
|
+
assert @utf8[100].nil?
|
129
|
+
assert @utf8[-100].nil?
|
126
130
|
end
|
127
131
|
end
|
128
132
|
|
129
133
|
context "[offset, length] syntax" do
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
utf8_char = @utf8[0, 6]
|
135
|
-
utf8_char.should eql(@char_array[0, 6].join)
|
134
|
+
test "should be utf8-aware" do
|
135
|
+
assert_equal @char_array[1, 4].join, @utf8[1, 4]
|
136
|
+
assert_equal @char_array[0, 6].join, @utf8[0, 6]
|
136
137
|
|
137
138
|
# this will fail due to a bug in 1.9
|
138
139
|
unless defined? Encoding
|
139
|
-
|
140
|
-
utf8_char.should eql(@char_array[6, 100].join)
|
140
|
+
assert_equal @char_array[6, 100].join, @utf8[6, 100]
|
141
141
|
end
|
142
142
|
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
utf8_char = @utf8[-1, 100]
|
147
|
-
utf8_char.should eql(@char_array[-1, 100].join)
|
148
|
-
|
149
|
-
utf8_char = @utf8[0, 0]
|
150
|
-
utf8_char.should eql(@char_array[0, 0].join)
|
143
|
+
assert_equal @char_array[-1, 2].join, @utf8[-1, 2]
|
144
|
+
assert_equal @char_array[-1, 100].join, @utf8[-1, 100]
|
145
|
+
assert_equal @char_array[0, 0].join, @utf8[0, 0]
|
151
146
|
end
|
152
147
|
|
153
|
-
|
154
|
-
@utf8[100, 100].
|
155
|
-
@utf8[-100, 100].
|
156
|
-
@utf8[0, -100].
|
148
|
+
test "should return nil for an out of range offset or length" do
|
149
|
+
assert @utf8[100, 100].nil?
|
150
|
+
assert @utf8[-100, 100].nil?
|
151
|
+
assert @utf8[0, -100].nil?
|
157
152
|
end
|
158
153
|
end
|
159
154
|
|
160
155
|
context "[Range] syntax" do
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
utf8_char = @utf8[0..6]
|
166
|
-
utf8_char.should eql(@char_array[0..6].join)
|
156
|
+
test "should be utf8-aware" do
|
157
|
+
assert_equal @char_array[1..4].join, @utf8[1..4]
|
158
|
+
assert_equal @char_array[0..6].join, @utf8[0..6]
|
167
159
|
|
168
160
|
# this will fail due to a bug in 1.9
|
169
161
|
unless defined? Encoding
|
170
|
-
|
171
|
-
utf8_char.should eql(@char_array[6..100].join)
|
162
|
+
assert_equal @char_array[6..100].join, @utf8[6..100]
|
172
163
|
end
|
173
164
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
utf8_char = @utf8[-1..100]
|
178
|
-
utf8_char.should eql(@char_array[-1..100].join)
|
165
|
+
assert_equal @char_array[-1..2].join, @utf8[-1..2]
|
166
|
+
assert_equal @char_array[-1..100].join, @utf8[-1..100]
|
179
167
|
end
|
180
168
|
|
181
|
-
|
182
|
-
@utf8[100..100].
|
183
|
-
@utf8[-100..100].
|
184
|
-
@utf8[0..-100]
|
169
|
+
test "should return nil for an out of range offset or length" do
|
170
|
+
assert @utf8[100..100].nil?
|
171
|
+
assert @utf8[-100..100].nil?
|
172
|
+
assert_equal "", @utf8[0..-100]
|
185
173
|
end
|
186
174
|
end
|
187
175
|
|
188
176
|
context "#valid?" do
|
189
|
-
|
177
|
+
test "should test validity" do
|
190
178
|
# lets cut right into the middle of a sequence so we know it's bad
|
191
179
|
@str.force_encoding('binary') if @str.respond_to?(:force_encoding)
|
192
180
|
utf8 = @str[0..1]
|
193
181
|
utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
|
194
182
|
utf8 = utf8.as_utf8
|
195
183
|
|
196
|
-
utf8.valid
|
197
|
-
@utf8.valid
|
184
|
+
assert !utf8.valid?
|
185
|
+
assert @utf8.valid?
|
198
186
|
|
199
|
-
"provided by Cristian Rodr\355guez.".as_utf8.
|
187
|
+
assert !"provided by Cristian Rodr\355guez.".as_utf8.valid?
|
200
188
|
end
|
201
189
|
|
202
|
-
|
190
|
+
test "should test validity using a maximum codepoint" do
|
203
191
|
highest_codepoint = @utf8.codepoints.to_a.max
|
204
192
|
|
205
|
-
@utf8.valid?(highest_codepoint)
|
206
|
-
|
193
|
+
assert @utf8.valid?(highest_codepoint)
|
194
|
+
assert !@utf8.valid?(highest_codepoint-1)
|
207
195
|
end
|
208
196
|
end
|
209
197
|
|
210
|
-
|
211
|
-
|
198
|
+
test "[Regexp] syntax shouldn't be supported yet" do
|
199
|
+
assert_raise ArgumentError do
|
212
200
|
@utf8[/a/]
|
213
|
-
|
201
|
+
end
|
214
202
|
end
|
215
203
|
|
216
|
-
|
217
|
-
|
204
|
+
test "[Regexp, match_index] syntax shouldn't be supported yet" do
|
205
|
+
assert_raise ArgumentError do
|
218
206
|
@utf8[/(a)/, 1]
|
219
|
-
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
context "#ascii_only" do
|
211
|
+
test "should return true for a string within the low ascii range" do
|
212
|
+
assert "asdf".as_utf8.ascii_only?
|
213
|
+
end
|
214
|
+
|
215
|
+
test "should return false for a string within the low ascii range" do
|
216
|
+
assert !@char_array.first.as_utf8.ascii_only?
|
217
|
+
end
|
220
218
|
end
|
221
219
|
end
|
data/utf8.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |s|
|
|
13
13
|
s.files = `git ls-files`.split("\n")
|
14
14
|
s.homepage = %q{http://github.com/brianmario/utf8}
|
15
15
|
s.rdoc_options = ["--charset=UTF-8"]
|
16
|
-
s.require_paths = ["lib"
|
16
|
+
s.require_paths = ["lib"]
|
17
17
|
s.rubygems_version = %q{1.4.2}
|
18
18
|
s.summary = %q{A lightweight UTF8-aware String class meant for use with Ruby 1.8}
|
19
19
|
s.test_files = `git ls-files spec`.split("\n")
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 7
|
10
|
+
version: 0.1.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
default_executable:
|
18
|
+
date: 2011-11-22 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: rake-compiler
|
@@ -75,6 +74,7 @@ extra_rdoc_files:
|
|
75
74
|
files:
|
76
75
|
- .gitignore
|
77
76
|
- .rspec
|
77
|
+
- .travis.yml
|
78
78
|
- Gemfile
|
79
79
|
- MIT-LICENSE
|
80
80
|
- README.rdoc
|
@@ -98,7 +98,6 @@ files:
|
|
98
98
|
- spec/string_scanner_spec.rb
|
99
99
|
- spec/string_spec.rb
|
100
100
|
- utf8.gemspec
|
101
|
-
has_rdoc: true
|
102
101
|
homepage: http://github.com/brianmario/utf8
|
103
102
|
licenses: []
|
104
103
|
|
@@ -107,7 +106,6 @@ rdoc_options:
|
|
107
106
|
- --charset=UTF-8
|
108
107
|
require_paths:
|
109
108
|
- lib
|
110
|
-
- ext
|
111
109
|
required_ruby_version: !ruby/object:Gem::Requirement
|
112
110
|
none: false
|
113
111
|
requirements:
|
@@ -129,7 +127,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
127
|
requirements: []
|
130
128
|
|
131
129
|
rubyforge_project:
|
132
|
-
rubygems_version: 1.
|
130
|
+
rubygems_version: 1.8.11
|
133
131
|
signing_key:
|
134
132
|
specification_version: 3
|
135
133
|
summary: A lightweight UTF8-aware String class meant for use with Ruby 1.8
|