utf8 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -1
- data/README.rdoc +2 -0
- data/ext/utf8/string_scanner_utf8.c +30 -10
- data/ext/utf8/string_utf8.c +5 -3
- data/lib/utf8/string.rb +1 -1
- data/spec/string_scanner_spec.rb +6 -2
- data/spec/string_spec.rb +5 -2
- data/utf8.gemspec +4 -4
- metadata +7 -5
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
|
4
4
|
|
5
|
+
At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
|
6
|
+
|
5
7
|
== String::UTF8 Example
|
6
8
|
|
7
9
|
The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
|
@@ -1,9 +1,9 @@
|
|
1
1
|
#include "ext.h"
|
2
|
-
#include "ruby/regex.h"
|
3
2
|
#include "utf8.h"
|
4
3
|
|
5
4
|
extern ID intern_as_utf8;
|
6
5
|
|
6
|
+
#ifndef RUBINIUS
|
7
7
|
struct strscanner {
|
8
8
|
/* multi-purpose flags */
|
9
9
|
unsigned long flags;
|
@@ -15,13 +15,19 @@ struct strscanner {
|
|
15
15
|
long prev; /* legal only when MATCHED_P(s) */
|
16
16
|
long curr; /* always legal */
|
17
17
|
|
18
|
+
/*
|
19
|
+
* We never access this member, and would require a shitload of other patching
|
20
|
+
* to work right on other ruby versions
|
21
|
+
*
|
22
|
+
*/
|
18
23
|
/* the regexp register; legal only when MATCHED_P(s) */
|
19
|
-
struct re_registers regs;
|
24
|
+
/* struct re_registers regs; */
|
20
25
|
};
|
21
26
|
|
22
27
|
#define GET_SCANNER(obj, var) \
|
23
28
|
Data_Get_Struct(obj, struct strscanner, var); \
|
24
29
|
if (NIL_P(var->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");
|
30
|
+
#endif
|
25
31
|
|
26
32
|
/*
|
27
33
|
* Document-class: StringScanner::UTF8
|
@@ -34,22 +40,36 @@ struct strscanner {
|
|
34
40
|
*/
|
35
41
|
static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
|
36
42
|
unsigned char *str;
|
37
|
-
|
38
|
-
|
39
|
-
VALUE utf8Str;
|
43
|
+
long len = 0, pos = 0;
|
44
|
+
VALUE utf8Str, curStr;
|
40
45
|
int8_t lastCharLen=0;
|
46
|
+
|
47
|
+
#ifndef RUBINIUS
|
48
|
+
struct strscanner *scanner;
|
41
49
|
GET_SCANNER(self, scanner);
|
42
50
|
|
43
|
-
|
44
|
-
|
51
|
+
curStr = scanner->str;
|
52
|
+
pos = scanner->curr;
|
53
|
+
#else
|
54
|
+
curStr = rb_iv_get(self, "@string");
|
55
|
+
pos = FIX2LONG(rb_iv_get(self, "@pos"));
|
56
|
+
#endif
|
57
|
+
|
58
|
+
str = (unsigned char *)RSTRING_PTR(curStr);
|
59
|
+
len = RSTRING_LEN(curStr);
|
45
60
|
|
46
|
-
if (len > 0 && len >
|
61
|
+
if (len > 0 && len > pos) {
|
47
62
|
lastCharLen = utf8CharLen(str, len);
|
48
63
|
if (lastCharLen < 0) {
|
49
64
|
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
50
65
|
}
|
51
|
-
utf8Str = rb_str_new((char *)str+
|
52
|
-
|
66
|
+
utf8Str = rb_str_new((char *)str+pos, lastCharLen);
|
67
|
+
pos += lastCharLen;
|
68
|
+
#ifndef RUBINIUS
|
69
|
+
scanner->curr = pos;
|
70
|
+
#else
|
71
|
+
rb_iv_set(self, "@pos", LONG2FIX(pos));
|
72
|
+
#endif
|
53
73
|
AS_UTF8(utf8Str);
|
54
74
|
return utf8Str;
|
55
75
|
} else {
|
data/ext/utf8/string_utf8.c
CHANGED
@@ -30,7 +30,7 @@ static VALUE rb_cString_UTF8_length(VALUE self) {
|
|
30
30
|
*
|
31
31
|
* Iterates over the string, yielding one UTF8 character at a time
|
32
32
|
*/
|
33
|
-
static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
33
|
+
static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
|
34
34
|
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
35
35
|
size_t len = RSTRING_LEN(self), i=0;
|
36
36
|
int8_t lastCharLen=0;
|
@@ -38,7 +38,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
|
38
38
|
|
39
39
|
// this will return an Enumerator wrapping this string, yielding this method
|
40
40
|
// when Enumerator#each is called
|
41
|
-
|
41
|
+
if (!rb_block_given_p()) {
|
42
|
+
return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_char")));
|
43
|
+
}
|
42
44
|
|
43
45
|
for(; i<len; i+=lastCharLen) {
|
44
46
|
lastCharLen = utf8CharLen(str, len);
|
@@ -259,6 +261,6 @@ void init_String_UTF8() {
|
|
259
261
|
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
260
262
|
|
261
263
|
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
|
262
|
-
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char,
|
264
|
+
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
|
263
265
|
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
264
266
|
}
|
data/lib/utf8/string.rb
CHANGED
data/spec/string_scanner_spec.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe StringScanner::UTF8 do
|
5
|
-
before(:
|
5
|
+
before(:each) do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@scanner = StringScanner.new(@char_array.join)
|
8
8
|
@utf8_scanner = @scanner.as_utf8
|
@@ -10,7 +10,11 @@ describe StringScanner::UTF8 do
|
|
10
10
|
|
11
11
|
it "should blow up on invalid utf8 chars" do
|
12
12
|
# lets cut right into the middle of a sequence so we know it's bad
|
13
|
-
|
13
|
+
str = @char_array.join
|
14
|
+
str.force_encoding('binary') if str.respond_to?(:force_encoding)
|
15
|
+
str = str[0..1]
|
16
|
+
str.force_encoding('utf-8') if str.respond_to?(:force_encoding)
|
17
|
+
scanner = StringScanner.new(str).as_utf8
|
14
18
|
|
15
19
|
lambda {
|
16
20
|
scanner.getch
|
data/spec/string_spec.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe String::UTF8 do
|
5
|
-
before(:
|
5
|
+
before(:each) do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@str = @char_array.join
|
8
8
|
@utf8 = @str.as_utf8
|
@@ -11,7 +11,10 @@ describe String::UTF8 do
|
|
11
11
|
|
12
12
|
it "should blow up on invalid utf8 chars" do
|
13
13
|
# lets cut right into the middle of a sequence so we know it's bad
|
14
|
-
|
14
|
+
@str.force_encoding('binary') if @str.respond_to?(:force_encoding)
|
15
|
+
utf8 = @str[0..1]
|
16
|
+
utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
|
17
|
+
utf8 = utf8.as_utf8
|
15
18
|
|
16
19
|
lambda {
|
17
20
|
utf8.length
|
data/utf8.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{utf8}
|
5
|
-
s.version = "0.1.
|
5
|
+
s.version = "0.1.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Brian Lopez"]
|
@@ -24,14 +24,14 @@ Gem::Specification.new do |s|
|
|
24
24
|
|
25
25
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
26
26
|
s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
27
|
-
s.add_development_dependency(%q<rspec>, [">= 0"])
|
27
|
+
s.add_development_dependency(%q<rspec>, [">= 2.0.0"])
|
28
28
|
else
|
29
29
|
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
30
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
30
|
+
s.add_dependency(%q<rspec>, [">= 2.0.0"])
|
31
31
|
end
|
32
32
|
else
|
33
33
|
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
34
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
34
|
+
s.add_dependency(%q<rspec>, [">= 2.0.0"])
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -42,10 +42,12 @@ dependencies:
|
|
42
42
|
requirements:
|
43
43
|
- - ">="
|
44
44
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
45
|
+
hash: 15
|
46
46
|
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
47
49
|
- 0
|
48
|
-
version:
|
50
|
+
version: 2.0.0
|
49
51
|
type: :development
|
50
52
|
version_requirements: *id002
|
51
53
|
description:
|