utf8 0.1.1 → 0.1.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- data/.gitignore +2 -1
- data/README.rdoc +2 -0
- data/ext/utf8/string_scanner_utf8.c +30 -10
- data/ext/utf8/string_utf8.c +5 -3
- data/lib/utf8/string.rb +1 -1
- data/spec/string_scanner_spec.rb +6 -2
- data/spec/string_spec.rb +5 -2
- data/utf8.gemspec +4 -4
- metadata +7 -5
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
|
4
4
|
|
5
|
+
At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
|
6
|
+
|
5
7
|
== String::UTF8 Example
|
6
8
|
|
7
9
|
The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
|
data/ext/utf8/string_scanner_utf8.c
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
#include "ext.h"
|
2
|
-
#include "ruby/regex.h"
|
3
2
|
#include "utf8.h"
|
4
3
|
|
5
4
|
extern ID intern_as_utf8;
|
6
5
|
|
6
|
+
#ifndef RUBINIUS
|
7
7
|
struct strscanner {
|
8
8
|
/* multi-purpose flags */
|
9
9
|
unsigned long flags;
|
@@ -15,13 +15,19 @@ struct strscanner {
|
|
15
15
|
long prev; /* legal only when MATCHED_P(s) */
|
16
16
|
long curr; /* always legal */
|
17
17
|
|
18
|
+
/*
|
19
|
+
* We never access this member, and would require a shitload of other patching
|
20
|
+
* to work right on other ruby versions
|
21
|
+
*
|
22
|
+
*/
|
18
23
|
/* the regexp register; legal only when MATCHED_P(s) */
|
19
|
-
struct re_registers regs;
|
24
|
+
/* struct re_registers regs; */
|
20
25
|
};
|
21
26
|
|
22
27
|
#define GET_SCANNER(obj, var) \
|
23
28
|
Data_Get_Struct(obj, struct strscanner, var); \
|
24
29
|
if (NIL_P(var->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");
|
30
|
+
#endif
|
25
31
|
|
26
32
|
/*
|
27
33
|
* Document-class: StringScanner::UTF8
|
@@ -34,22 +40,36 @@ struct strscanner {
|
|
34
40
|
*/
|
35
41
|
static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
|
36
42
|
unsigned char *str;
|
37
|
-
|
38
|
-
|
39
|
-
VALUE utf8Str;
|
43
|
+
long len = 0, pos = 0;
|
44
|
+
VALUE utf8Str, curStr;
|
40
45
|
int8_t lastCharLen=0;
|
46
|
+
|
47
|
+
#ifndef RUBINIUS
|
48
|
+
struct strscanner *scanner;
|
41
49
|
GET_SCANNER(self, scanner);
|
42
50
|
|
43
|
-
|
44
|
-
|
51
|
+
curStr = scanner->str;
|
52
|
+
pos = scanner->curr;
|
53
|
+
#else
|
54
|
+
curStr = rb_iv_get(self, "@string");
|
55
|
+
pos = FIX2LONG(rb_iv_get(self, "@pos"));
|
56
|
+
#endif
|
57
|
+
|
58
|
+
str = (unsigned char *)RSTRING_PTR(curStr);
|
59
|
+
len = RSTRING_LEN(curStr);
|
45
60
|
|
46
|
-
if (len > 0 && len >
|
61
|
+
if (len > 0 && len > pos) {
|
47
62
|
lastCharLen = utf8CharLen(str, len);
|
48
63
|
if (lastCharLen < 0) {
|
49
64
|
rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
|
50
65
|
}
|
51
|
-
utf8Str = rb_str_new((char *)str+
|
52
|
-
|
66
|
+
utf8Str = rb_str_new((char *)str+pos, lastCharLen);
|
67
|
+
pos += lastCharLen;
|
68
|
+
#ifndef RUBINIUS
|
69
|
+
scanner->curr = pos;
|
70
|
+
#else
|
71
|
+
rb_iv_set(self, "@pos", LONG2FIX(pos));
|
72
|
+
#endif
|
53
73
|
AS_UTF8(utf8Str);
|
54
74
|
return utf8Str;
|
55
75
|
} else {
|
data/ext/utf8/string_utf8.c
CHANGED
@@ -30,7 +30,7 @@ static VALUE rb_cString_UTF8_length(VALUE self) {
|
|
30
30
|
*
|
31
31
|
* Iterates over the string, yielding one UTF8 character at a time
|
32
32
|
*/
|
33
|
-
static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
33
|
+
static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
|
34
34
|
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
|
35
35
|
size_t len = RSTRING_LEN(self), i=0;
|
36
36
|
int8_t lastCharLen=0;
|
@@ -38,7 +38,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
|
|
38
38
|
|
39
39
|
// this will return an Enumerator wrapping this string, yielding this method
|
40
40
|
// when Enumerator#each is called
|
41
|
-
|
41
|
+
if (!rb_block_given_p()) {
|
42
|
+
return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_char")));
|
43
|
+
}
|
42
44
|
|
43
45
|
for(; i<len; i+=lastCharLen) {
|
44
46
|
lastCharLen = utf8CharLen(str, len);
|
@@ -259,6 +261,6 @@ void init_String_UTF8() {
|
|
259
261
|
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
|
260
262
|
|
261
263
|
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
|
262
|
-
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char,
|
264
|
+
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
|
263
265
|
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
|
264
266
|
}
|
data/lib/utf8/string.rb
CHANGED
data/spec/string_scanner_spec.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe StringScanner::UTF8 do
|
5
|
-
before(:
|
5
|
+
before(:each) do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@scanner = StringScanner.new(@char_array.join)
|
8
8
|
@utf8_scanner = @scanner.as_utf8
|
@@ -10,7 +10,11 @@ describe StringScanner::UTF8 do
|
|
10
10
|
|
11
11
|
it "should blow up on invalid utf8 chars" do
|
12
12
|
# lets cut right into the middle of a sequence so we know it's bad
|
13
|
-
|
13
|
+
str = @char_array.join
|
14
|
+
str.force_encoding('binary') if str.respond_to?(:force_encoding)
|
15
|
+
str = str[0..1]
|
16
|
+
str.force_encoding('utf-8') if str.respond_to?(:force_encoding)
|
17
|
+
scanner = StringScanner.new(str).as_utf8
|
14
18
|
|
15
19
|
lambda {
|
16
20
|
scanner.getch
|
data/spec/string_spec.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require File.expand_path('../spec_helper', __FILE__)
|
3
3
|
|
4
4
|
describe String::UTF8 do
|
5
|
-
before(:
|
5
|
+
before(:each) do
|
6
6
|
@char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
|
7
7
|
@str = @char_array.join
|
8
8
|
@utf8 = @str.as_utf8
|
@@ -11,7 +11,10 @@ describe String::UTF8 do
|
|
11
11
|
|
12
12
|
it "should blow up on invalid utf8 chars" do
|
13
13
|
# lets cut right into the middle of a sequence so we know it's bad
|
14
|
-
|
14
|
+
@str.force_encoding('binary') if @str.respond_to?(:force_encoding)
|
15
|
+
utf8 = @str[0..1]
|
16
|
+
utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
|
17
|
+
utf8 = utf8.as_utf8
|
15
18
|
|
16
19
|
lambda {
|
17
20
|
utf8.length
|
data/utf8.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{utf8}
|
5
|
-
s.version = "0.1.
|
5
|
+
s.version = "0.1.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Brian Lopez"]
|
@@ -24,14 +24,14 @@ Gem::Specification.new do |s|
|
|
24
24
|
|
25
25
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
26
26
|
s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
27
|
-
s.add_development_dependency(%q<rspec>, [">= 0"])
|
27
|
+
s.add_development_dependency(%q<rspec>, [">= 2.0.0"])
|
28
28
|
else
|
29
29
|
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
30
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
30
|
+
s.add_dependency(%q<rspec>, [">= 2.0.0"])
|
31
31
|
end
|
32
32
|
else
|
33
33
|
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
|
34
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
34
|
+
s.add_dependency(%q<rspec>, [">= 2.0.0"])
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Lopez
|
@@ -42,10 +42,12 @@ dependencies:
|
|
42
42
|
requirements:
|
43
43
|
- - ">="
|
44
44
|
- !ruby/object:Gem::Version
|
45
|
-
hash:
|
45
|
+
hash: 15
|
46
46
|
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
47
49
|
- 0
|
48
|
-
version:
|
50
|
+
version: 2.0.0
|
49
51
|
type: :development
|
50
52
|
version_requirements: *id002
|
51
53
|
description:
|