utf8 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
1
  .DS_Store
2
2
  *.bundle
3
3
  *.o
4
- tmp/
4
+ *.rbc
5
+ tmp/
data/README.rdoc CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  The scanning code is implemented in C (would love a Java version as well, if any of you want to help with that).
4
4
 
5
+ At the moment, this gem is tested on 1.8.7, 1.9.2 and Rubinius 1.2.1dev - it may work on others but ymmv.
6
+
5
7
  == String::UTF8 Example
6
8
 
7
9
  The String::UTF8#[] API isn't fully supported yet, and may never support regular expressions. I'll update this readme as I make progress.
@@ -1,9 +1,9 @@
1
1
  #include "ext.h"
2
- #include "ruby/regex.h"
3
2
  #include "utf8.h"
4
3
 
5
4
  extern ID intern_as_utf8;
6
5
 
6
+ #ifndef RUBINIUS
7
7
  struct strscanner {
8
8
  /* multi-purpose flags */
9
9
  unsigned long flags;
@@ -15,13 +15,19 @@ struct strscanner {
15
15
  long prev; /* legal only when MATCHED_P(s) */
16
16
  long curr; /* always legal */
17
17
 
18
+ /*
19
+ * We never access this member, and would require a shitload of other patching
20
+ * to work right on other ruby versions
21
+ *
22
+ */
18
23
  /* the regexp register; legal only when MATCHED_P(s) */
19
- struct re_registers regs;
24
+ /* struct re_registers regs; */
20
25
  };
21
26
 
22
27
  #define GET_SCANNER(obj, var) \
23
28
  Data_Get_Struct(obj, struct strscanner, var); \
24
29
  if (NIL_P(var->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");
30
+ #endif
25
31
 
26
32
  /*
27
33
  * Document-class: StringScanner::UTF8
@@ -34,22 +40,36 @@ struct strscanner {
34
40
  */
35
41
  static VALUE rb_cStringScanner_UTF8_getch(VALUE self) {
36
42
  unsigned char *str;
37
- size_t len;
38
- struct strscanner *scanner;
39
- VALUE utf8Str;
43
+ long len = 0, pos = 0;
44
+ VALUE utf8Str, curStr;
40
45
  int8_t lastCharLen=0;
46
+
47
+ #ifndef RUBINIUS
48
+ struct strscanner *scanner;
41
49
  GET_SCANNER(self, scanner);
42
50
 
43
- str = (unsigned char *)RSTRING_PTR(scanner->str);
44
- len = RSTRING_LEN(scanner->str);
51
+ curStr = scanner->str;
52
+ pos = scanner->curr;
53
+ #else
54
+ curStr = rb_iv_get(self, "@string");
55
+ pos = FIX2LONG(rb_iv_get(self, "@pos"));
56
+ #endif
57
+
58
+ str = (unsigned char *)RSTRING_PTR(curStr);
59
+ len = RSTRING_LEN(curStr);
45
60
 
46
- if (len > 0 && len > scanner->curr) {
61
+ if (len > 0 && len > pos) {
47
62
  lastCharLen = utf8CharLen(str, len);
48
63
  if (lastCharLen < 0) {
49
64
  rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
50
65
  }
51
- utf8Str = rb_str_new((char *)str+scanner->curr, lastCharLen);
52
- scanner->curr += lastCharLen;
66
+ utf8Str = rb_str_new((char *)str+pos, lastCharLen);
67
+ pos += lastCharLen;
68
+ #ifndef RUBINIUS
69
+ scanner->curr = pos;
70
+ #else
71
+ rb_iv_set(self, "@pos", LONG2FIX(pos));
72
+ #endif
53
73
  AS_UTF8(utf8Str);
54
74
  return utf8Str;
55
75
  } else {
@@ -30,7 +30,7 @@ static VALUE rb_cString_UTF8_length(VALUE self) {
30
30
  *
31
31
  * Iterates over the string, yielding one UTF8 character at a time
32
32
  */
33
- static VALUE rb_cString_UTF8_each_char(VALUE self) {
33
+ static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
34
34
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
35
35
  size_t len = RSTRING_LEN(self), i=0;
36
36
  int8_t lastCharLen=0;
@@ -38,7 +38,9 @@ static VALUE rb_cString_UTF8_each_char(VALUE self) {
38
38
 
39
39
  // this will return an Enumerator wrapping this string, yielding this method
40
40
  // when Enumerator#each is called
41
- RETURN_ENUMERATOR(self, 0, 0);
41
+ if (!rb_block_given_p()) {
42
+ return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_char")));
43
+ }
42
44
 
43
45
  for(; i<len; i+=lastCharLen) {
44
46
  lastCharLen = utf8CharLen(str, len);
@@ -259,6 +261,6 @@ void init_String_UTF8() {
259
261
  VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
260
262
 
261
263
  rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
262
- rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, 0);
264
+ rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
263
265
  rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
264
266
  }
data/lib/utf8/string.rb CHANGED
@@ -5,7 +5,7 @@ class String
5
5
  end
6
6
 
7
7
  class UTF8
8
- VERSION = "0.1.1"
8
+ VERSION = "0.1.2"
9
9
 
10
10
  # Gives you access to the raw non-UTF8-aware version of the string
11
11
  def as_raw
@@ -2,7 +2,7 @@
2
2
  require File.expand_path('../spec_helper', __FILE__)
3
3
 
4
4
  describe StringScanner::UTF8 do
5
- before(:all) do
5
+ before(:each) do
6
6
  @char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
7
7
  @scanner = StringScanner.new(@char_array.join)
8
8
  @utf8_scanner = @scanner.as_utf8
@@ -10,7 +10,11 @@ describe StringScanner::UTF8 do
10
10
 
11
11
  it "should blow up on invalid utf8 chars" do
12
12
  # lets cut right into the middle of a sequence so we know it's bad
13
- scanner = StringScanner.new(@char_array.join[0..1]).as_utf8
13
+ str = @char_array.join
14
+ str.force_encoding('binary') if str.respond_to?(:force_encoding)
15
+ str = str[0..1]
16
+ str.force_encoding('utf-8') if str.respond_to?(:force_encoding)
17
+ scanner = StringScanner.new(str).as_utf8
14
18
 
15
19
  lambda {
16
20
  scanner.getch
data/spec/string_spec.rb CHANGED
@@ -2,7 +2,7 @@
2
2
  require File.expand_path('../spec_helper', __FILE__)
3
3
 
4
4
  describe String::UTF8 do
5
- before(:all) do
5
+ before(:each) do
6
6
  @char_array = ["怎", "麼", "也", "沒", "人", "寫", "了", "這", "個", "嗎"]
7
7
  @str = @char_array.join
8
8
  @utf8 = @str.as_utf8
@@ -11,7 +11,10 @@ describe String::UTF8 do
11
11
 
12
12
  it "should blow up on invalid utf8 chars" do
13
13
  # lets cut right into the middle of a sequence so we know it's bad
14
- utf8 = @str[0..1].as_utf8
14
+ @str.force_encoding('binary') if @str.respond_to?(:force_encoding)
15
+ utf8 = @str[0..1]
16
+ utf8.force_encoding('utf-8') if utf8.respond_to?(:force_encoding)
17
+ utf8 = utf8.as_utf8
15
18
 
16
19
  lambda {
17
20
  utf8.length
data/utf8.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{utf8}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Brian Lopez"]
@@ -24,14 +24,14 @@ Gem::Specification.new do |s|
24
24
 
25
25
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
26
26
  s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
27
- s.add_development_dependency(%q<rspec>, [">= 0"])
27
+ s.add_development_dependency(%q<rspec>, [">= 2.0.0"])
28
28
  else
29
29
  s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
30
- s.add_dependency(%q<rspec>, [">= 0"])
30
+ s.add_dependency(%q<rspec>, [">= 2.0.0"])
31
31
  end
32
32
  else
33
33
  s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
34
- s.add_dependency(%q<rspec>, [">= 0"])
34
+ s.add_dependency(%q<rspec>, [">= 2.0.0"])
35
35
  end
36
36
  end
37
37
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 31
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 2
10
+ version: 0.1.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -42,10 +42,12 @@ dependencies:
42
42
  requirements:
43
43
  - - ">="
44
44
  - !ruby/object:Gem::Version
45
- hash: 3
45
+ hash: 15
46
46
  segments:
47
+ - 2
48
+ - 0
47
49
  - 0
48
- version: "0"
50
+ version: 2.0.0
49
51
  type: :development
50
52
  version_requirements: *id002
51
53
  description: