character-encodings 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.collate module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_collate(UNUSED(VALUE self), VALUE str, VALUE other)
|
11
|
+
{
|
12
|
+
return INT2FIX(utf_collate(StringValuePtr(str), StringValuePtr(other)));
|
13
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.count module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_tr.h"
|
9
|
+
|
10
|
+
VALUE
|
11
|
+
rb_utf_count(int argc, VALUE *argv, UNUSED(VALUE self))
|
12
|
+
{
|
13
|
+
need_at_least_n_arguments(argc, 2);
|
14
|
+
|
15
|
+
VALUE str = argv[0];
|
16
|
+
StringValue(str);
|
17
|
+
if (RSTRING(str)->len == 0)
|
18
|
+
return INT2FIX(0);
|
19
|
+
|
20
|
+
unsigned int table[TR_TABLE_SIZE];
|
21
|
+
tr_setup_table_from_strings(table, argc - 1, &argv[1]);
|
22
|
+
|
23
|
+
long count = 0;
|
24
|
+
char const *p_end = RSTRING(str)->ptr + RSTRING(str)->len;
|
25
|
+
for (char const *p = RSTRING(str)->ptr; p < p_end; p = utf_next(p))
|
26
|
+
if (tr_table_lookup(table, _utf_char_validated(p, p_end)))
|
27
|
+
count++;
|
28
|
+
|
29
|
+
return LONG2NUM(count);
|
30
|
+
}
|
@@ -0,0 +1,60 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.delete module functions.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_tr.h"
|
9
|
+
|
10
|
+
VALUE
|
11
|
+
rb_utf_delete_bang(int argc, VALUE *argv, UNUSED(VALUE self))
|
12
|
+
{
|
13
|
+
need_at_least_n_arguments(argc, 2);
|
14
|
+
|
15
|
+
VALUE str = argv[0];
|
16
|
+
StringValue(str);
|
17
|
+
if (RSTRING(str)->len == 0)
|
18
|
+
return Qnil;
|
19
|
+
|
20
|
+
unsigned int table[TR_TABLE_SIZE];
|
21
|
+
tr_setup_table_from_strings(table, argc - 1, &argv[1]);
|
22
|
+
|
23
|
+
rb_str_modify(str);
|
24
|
+
|
25
|
+
bool modified = false;
|
26
|
+
char *s = RSTRING(str)->ptr;
|
27
|
+
char const *s_end = s + RSTRING(str)->len;
|
28
|
+
char *t = s;
|
29
|
+
while (s < s_end) {
|
30
|
+
unichar c = utf_char(s);
|
31
|
+
|
32
|
+
char *next = rb_utf_next_validated(s, s_end);
|
33
|
+
if (tr_table_lookup(table, c)) {
|
34
|
+
modified = true;
|
35
|
+
} else {
|
36
|
+
memmove(t, s, next - s);
|
37
|
+
t += next - s;
|
38
|
+
}
|
39
|
+
|
40
|
+
s = next;
|
41
|
+
}
|
42
|
+
*t = '\0';
|
43
|
+
RSTRING(str)->len = t - RSTRING(str)->ptr;
|
44
|
+
|
45
|
+
if (modified)
|
46
|
+
return str;
|
47
|
+
|
48
|
+
return Qnil;
|
49
|
+
}
|
50
|
+
|
51
|
+
VALUE
|
52
|
+
rb_utf_delete(int argc, VALUE *argv, VALUE self)
|
53
|
+
{
|
54
|
+
need_at_least_n_arguments(argc, 2);
|
55
|
+
|
56
|
+
StringValue(argv[0]);
|
57
|
+
argv[0] = rb_utf_dup(argv[0]);
|
58
|
+
rb_utf_delete_bang(argc, argv, self);
|
59
|
+
return argv[0];
|
60
|
+
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.downcase module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_downcase(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
return rb_utf_alloc_using(utf_downcase(StringValuePtr(str)));
|
13
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.each_char module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_each_char(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
#if 0
|
13
|
+
RETURN_ENUMERATOR(str, 0, 0);
|
14
|
+
#endif
|
15
|
+
|
16
|
+
const char *s = RSTRING(str)->ptr;
|
17
|
+
const char *s_end = s + RSTRING(str)->len;
|
18
|
+
while (s < s_end) {
|
19
|
+
char buf[MAX_UNICHAR_BYTE_LENGTH];
|
20
|
+
int len = unichar_to_utf(_utf_char_validated(s, s_end), buf);
|
21
|
+
VALUE c = rb_utf_new(buf, len);
|
22
|
+
rb_yield(c);
|
23
|
+
s = utf_next(s);
|
24
|
+
}
|
25
|
+
|
26
|
+
return str;
|
27
|
+
}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.foldcase module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_foldcase(UNUSED(VALUE self), VALUE str)
|
11
|
+
{
|
12
|
+
return rb_utf_alloc_using(utf_foldcase(StringValuePtr(str)));
|
13
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.hex module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_bignum.h"
|
9
|
+
|
10
|
+
VALUE
|
11
|
+
rb_utf_hex(UNUSED(VALUE self), VALUE str)
|
12
|
+
{
|
13
|
+
return rb_utf_to_inum(str, 16, false);
|
14
|
+
}
|
@@ -0,0 +1,50 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.index module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_index_m(int argc, VALUE *argv, UNUSED(VALUE self))
|
11
|
+
{
|
12
|
+
VALUE str, sub, rboffset;
|
13
|
+
|
14
|
+
long offset = 0;
|
15
|
+
if (rb_scan_args(argc, argv, "21", &str, &sub, &rboffset) == 3)
|
16
|
+
offset = NUM2LONG(rboffset);
|
17
|
+
|
18
|
+
StringValue(str);
|
19
|
+
|
20
|
+
char *begin, *end;
|
21
|
+
if (!rb_utf_begin_from_offset(str, offset, &begin, &end)) {
|
22
|
+
if (TYPE(sub) == T_REGEXP)
|
23
|
+
rb_backref_set(Qnil);
|
24
|
+
|
25
|
+
return Qnil;
|
26
|
+
}
|
27
|
+
|
28
|
+
switch (TYPE(sub)) {
|
29
|
+
case T_REGEXP:
|
30
|
+
offset = rb_utf_index_regexp(str, begin, end, sub, offset, false);
|
31
|
+
break;
|
32
|
+
default: {
|
33
|
+
VALUE tmp = rb_check_string_type(sub);
|
34
|
+
if (NIL_P(tmp))
|
35
|
+
rb_raise(rb_eTypeError, "type mismatch: %s given",
|
36
|
+
rb_obj_classname(sub));
|
37
|
+
|
38
|
+
sub = tmp;
|
39
|
+
}
|
40
|
+
/* fall through */
|
41
|
+
case T_STRING:
|
42
|
+
offset = rb_utf_index(str, sub, offset);
|
43
|
+
break;
|
44
|
+
}
|
45
|
+
|
46
|
+
if (offset < 0)
|
47
|
+
return Qnil;
|
48
|
+
|
49
|
+
return LONG2NUM(offset);
|
50
|
+
}
|
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.insert module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
/* TODO: Update to use new offset-calculating functions. */
|
10
|
+
VALUE
|
11
|
+
rb_utf_insert(UNUSED(VALUE self), VALUE str, VALUE index, VALUE other)
|
12
|
+
{
|
13
|
+
long offset = NUM2LONG(index);
|
14
|
+
|
15
|
+
StringValue(str);
|
16
|
+
|
17
|
+
long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
18
|
+
|
19
|
+
if (abs(offset) > n_chars) {
|
20
|
+
if (offset < 0)
|
21
|
+
offset -= n_chars;
|
22
|
+
rb_raise(rb_eIndexError, "index %ld out of string", offset);
|
23
|
+
}
|
24
|
+
|
25
|
+
long byte_index;
|
26
|
+
|
27
|
+
if (offset == -1) {
|
28
|
+
byte_index = RSTRING(str)->len;
|
29
|
+
} else {
|
30
|
+
if (offset < 0)
|
31
|
+
offset++;
|
32
|
+
|
33
|
+
char *s = RSTRING(str)->ptr;
|
34
|
+
|
35
|
+
if (offset < 0)
|
36
|
+
s += RSTRING(str)->len;
|
37
|
+
byte_index = utf_offset_to_pointer(s, offset) - s;
|
38
|
+
}
|
39
|
+
|
40
|
+
rb_str_update(str, byte_index, 0, other);
|
41
|
+
|
42
|
+
return str;
|
43
|
+
}
|
@@ -0,0 +1,331 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Internal functionality for turning strings into Bignums.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include "rb_utf_internal_bignum.h"
|
9
|
+
|
10
|
+
/* Bignum digit helpers, taken over unchanged in meaning from Ruby's
 * bignum.c. */

/* The digit array of Bignum BIG. */
#define BDIGITS(big) ((BDIGIT *)RBIGNUM(big)->digits)
/* Number of bits in a single BDIGIT. */
#define BITSPERDIG (SIZEOF_BDIGITS * CHAR_BIT)
/* The radix of the digit array: one more than the largest BDIGIT. */
#define BIGRAD ((BDIGIT_DBL)1 << BITSPERDIG)
/* The part of N that does not fit into one digit (the carry). */
#define BIGDN(n) RSHIFT(n, BITSPERDIG)
/* The part of N that does fit into one digit. */
#define BIGLO(n) ((BDIGIT)((n) & (BIGRAD - 1)))
|
16
|
+
|
17
|
+
static VALUE
|
18
|
+
bignew_1(VALUE klass, long len, int sign)
|
19
|
+
{
|
20
|
+
NEWOBJ(big, struct RBignum);
|
21
|
+
OBJSETUP(big, klass, T_BIGNUM);
|
22
|
+
big->sign = sign ? 1 : 0;
|
23
|
+
big->len = len;
|
24
|
+
big->digits = ALLOC_N(BDIGIT, len);
|
25
|
+
|
26
|
+
return (VALUE)big;
|
27
|
+
}
|
28
|
+
|
29
|
+
#define bignew(len, sign) bignew_1(rb_cBignum, len, sign)
|
30
|
+
|
31
|
+
/* Reads an optional leading sign from S.
 *
 * *SIGN is set to 0 if S starts with '-' and to 1 otherwise.  Returns a
 * pointer just past the sign character if there was one ('+' or '-'), or S
 * itself if there was none. */
static const char *
rb_utf_to_inum_sign(const char *s, int *sign)
{
        switch (*s) {
        case '-':
                *sign = 0;
                return s + 1;
        case '+':
                *sign = 1;
                return s + 1;
        default:
                *sign = 1;
                return s;
        }
}
|
44
|
+
|
45
|
+
/* Works out the radix suggested by the start of S and skips any radix
 * prefix.
 *
 * If S starts with '0', the following character selects the radix that is
 * stored in *BASE: x/X → 16, b/B → 2, o/O → 8, d/D → 10, and anything else
 * → 8.  The return value then points past the two-character prefix, or
 * past just the '0' in the "anything else" case.
 *
 * If S does not start with '0', S is returned unchanged and *BASE becomes
 * -*BASE when it was less than -1, and 10 in every other case. */
static const char *
rb_utf_to_inum_base(const char *s, int *base)
{
        if (s[0] != '0') {
                *base = (*base < -1) ? -*base : 10;
                return s;
        }

        switch (s[1]) {
        case 'x': case 'X':
                *base = 16;
                return s + 2;
        case 'b': case 'B':
                *base = 2;
                return s + 2;
        case 'o': case 'O':
                *base = 8;
                return s + 2;
        case 'd': case 'D':
                *base = 10;
                return s + 2;
        default:
                /* A lone leading zero means octal; only the zero itself
                 * is consumed. */
                *base = 8;
                return s + 1;
        }
}
|
77
|
+
|
78
|
+
static size_t
|
79
|
+
rb_utf_to_inum_base_bit_length(const char *s, int base)
|
80
|
+
{
|
81
|
+
if (base < 2 || base > 36)
|
82
|
+
rb_raise(rb_eArgError, "illegal radix %d", base);
|
83
|
+
|
84
|
+
size_t bit_length;
|
85
|
+
switch (base) {
|
86
|
+
case 2:
|
87
|
+
bit_length = 1;
|
88
|
+
case 3:
|
89
|
+
bit_length = 2;
|
90
|
+
case 4: case 5: case 6: case 7: case 8:
|
91
|
+
bit_length = 3;
|
92
|
+
case 9: case 10: case 11: case 12: case 13: case 14: case 15: case 16:
|
93
|
+
bit_length = 4;
|
94
|
+
default:
|
95
|
+
if (base <= 32)
|
96
|
+
bit_length = 5;
|
97
|
+
|
98
|
+
bit_length = 6;
|
99
|
+
}
|
100
|
+
|
101
|
+
return bit_length * utf_length(s);
|
102
|
+
}
|
103
|
+
|
104
|
+
static bool
|
105
|
+
rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
|
106
|
+
unichar c, unichar *non_digit)
|
107
|
+
{
|
108
|
+
if (c != '_')
|
109
|
+
return false;
|
110
|
+
|
111
|
+
if (!verify)
|
112
|
+
return true;
|
113
|
+
|
114
|
+
if (*non_digit != 0)
|
115
|
+
rb_raise(rb_eArgError,
|
116
|
+
"unexpected ‘%lc’ found at position %ld", c, s - str);
|
117
|
+
|
118
|
+
*non_digit = c;
|
119
|
+
|
120
|
+
return true;
|
121
|
+
}
|
122
|
+
|
123
|
+
static bool
|
124
|
+
rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
|
125
|
+
int base, bool verify, int *digit_value)
|
126
|
+
{
|
127
|
+
/* If we stumble upon a space, return false so that we may end our
|
128
|
+
* processing and skip over any trailing white-space. */
|
129
|
+
if (unichar_isspace(c))
|
130
|
+
return false;
|
131
|
+
|
132
|
+
int value = unichar_xdigit_value(c);
|
133
|
+
if (value == -1) {
|
134
|
+
if (!verify)
|
135
|
+
return false;
|
136
|
+
rb_raise(rb_eArgError,
|
137
|
+
"non-digit character ‘%lc’ found at position %ld",
|
138
|
+
c, s - str);
|
139
|
+
}
|
140
|
+
|
141
|
+
if (value >= base) {
|
142
|
+
if (!verify)
|
143
|
+
return false;
|
144
|
+
|
145
|
+
rb_raise(rb_eArgError,
|
146
|
+
"value (%d) greater than base (%d) at position %ld",
|
147
|
+
value, base, s - str);
|
148
|
+
}
|
149
|
+
|
150
|
+
*digit_value = value;
|
151
|
+
|
152
|
+
return true;
|
153
|
+
}
|
154
|
+
|
155
|
+
static VALUE
|
156
|
+
rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
|
157
|
+
bool verify)
|
158
|
+
{
|
159
|
+
unsigned long value = 0;
|
160
|
+
|
161
|
+
unichar non_digit = 0;
|
162
|
+
while (*s != '\0') {
|
163
|
+
unichar c = utf_char(s);
|
164
|
+
s = utf_next(s);
|
165
|
+
|
166
|
+
if (rb_utf_to_inum_num_separator(str, s, verify, c, &non_digit))
|
167
|
+
continue;
|
168
|
+
|
169
|
+
int digit_value;
|
170
|
+
if (!rb_utf_to_inum_digit_value(str, s, c, base, verify, &digit_value))
|
171
|
+
break;
|
172
|
+
value *= base;
|
173
|
+
value += digit_value;
|
174
|
+
|
175
|
+
non_digit = 0;
|
176
|
+
}
|
177
|
+
|
178
|
+
if (verify) {
|
179
|
+
while (*s != '\0' && unichar_isspace(utf_char(s)))
|
180
|
+
s = utf_next(s);
|
181
|
+
if (*s != '\0')
|
182
|
+
rb_raise(rb_eArgError,
|
183
|
+
"trailing garbage found at position %ld",
|
184
|
+
s - str);
|
185
|
+
}
|
186
|
+
|
187
|
+
if (POSFIXABLE(value)) {
|
188
|
+
if (sign)
|
189
|
+
return LONG2FIX(value);
|
190
|
+
else
|
191
|
+
return LONG2FIX(-(long)value);
|
192
|
+
}
|
193
|
+
|
194
|
+
VALUE big = rb_uint2big(value);
|
195
|
+
RBIGNUM(big)->sign = sign;
|
196
|
+
return rb_big_norm(big);
|
197
|
+
}
|
198
|
+
|
199
|
+
static VALUE
|
200
|
+
rb_cutf_to_inum(const char * const str, int base, bool verify)
|
201
|
+
{
|
202
|
+
/* FIXME: How can this even happen? */
|
203
|
+
if (str == NULL) {
|
204
|
+
if (verify)
|
205
|
+
rb_invalid_str(str, "Integer");
|
206
|
+
return INT2FIX(0);
|
207
|
+
}
|
208
|
+
|
209
|
+
const char *s = str;
|
210
|
+
|
211
|
+
/* Skip any leading whitespace. */
|
212
|
+
while (unichar_isspace(utf_char(s)))
|
213
|
+
s = utf_next(s);
|
214
|
+
|
215
|
+
/* Figure out what sign this number uses. */
|
216
|
+
int sign;
|
217
|
+
s = rb_utf_to_inum_sign(s, &sign);
|
218
|
+
|
219
|
+
/* Do we have another sign? If so, that’s not correct. */
|
220
|
+
if (*s == '+' || *s == '-') {
|
221
|
+
if (verify)
|
222
|
+
rb_raise(rb_eArgError,
|
223
|
+
"extra sign ‘%c’ found at position %ld",
|
224
|
+
*s, s - str);
|
225
|
+
return INT2FIX(0);
|
226
|
+
}
|
227
|
+
|
228
|
+
int tmp_base = base;
|
229
|
+
s = rb_utf_to_inum_base(s, &tmp_base);
|
230
|
+
if (base <= 0)
|
231
|
+
base = tmp_base;
|
232
|
+
|
233
|
+
/* Remove preceeding 0s. */
|
234
|
+
while (*s == '0')
|
235
|
+
s++;
|
236
|
+
|
237
|
+
/* Figure out how many bits we need to represent the number. */
|
238
|
+
size_t bit_length = rb_utf_to_inum_base_bit_length(str, base);
|
239
|
+
|
240
|
+
/* If the bit_length is less than the number of bits in a VALUE we can
|
241
|
+
* try to store it as a FIXNUM. */
|
242
|
+
if (bit_length <= sizeof(VALUE) * CHAR_BIT)
|
243
|
+
return rb_utf_to_inum_as_fix(str, s, sign, base, verify);
|
244
|
+
|
245
|
+
if (verify && *str == '_')
|
246
|
+
rb_raise(rb_eArgError,
|
247
|
+
"leading digit-separator ‘_’ found at position %ld",
|
248
|
+
s - str);
|
249
|
+
|
250
|
+
bit_length = bit_length / BITSPERDIG + 1;
|
251
|
+
|
252
|
+
/* TODO: Rename these variables. */
|
253
|
+
VALUE z = bignew(bit_length, sign);
|
254
|
+
BDIGIT *zds = BDIGITS(z);
|
255
|
+
MEMZERO(zds, BDIGIT, bit_length);
|
256
|
+
int big_len = 1;
|
257
|
+
|
258
|
+
unichar non_digit = 0;
|
259
|
+
while (true) {
|
260
|
+
unichar c = utf_char(s);
|
261
|
+
s = utf_next(s);
|
262
|
+
|
263
|
+
if (rb_utf_to_inum_num_separator(str, s, verify, c, &non_digit))
|
264
|
+
continue;
|
265
|
+
|
266
|
+
int digit_value;
|
267
|
+
if (!rb_utf_to_inum_digit_value(str, s, c, base, verify, &digit_value))
|
268
|
+
break;
|
269
|
+
|
270
|
+
bool more_to_shift = true;
|
271
|
+
while (more_to_shift) {
|
272
|
+
BDIGIT_DBL num = c;
|
273
|
+
|
274
|
+
for (int i = 0; i < big_len; i++) {
|
275
|
+
num += (BDIGIT_DBL)zds[i] * base;
|
276
|
+
zds[i] = BIGLO(num);
|
277
|
+
num = BIGDN(num);
|
278
|
+
}
|
279
|
+
|
280
|
+
more_to_shift = false;
|
281
|
+
if (num != 0) {
|
282
|
+
big_len++;
|
283
|
+
more_to_shift = true;
|
284
|
+
}
|
285
|
+
}
|
286
|
+
|
287
|
+
non_digit = 0;
|
288
|
+
}
|
289
|
+
|
290
|
+
if (!verify)
|
291
|
+
return rb_big_norm(z);
|
292
|
+
|
293
|
+
s--;
|
294
|
+
if (str + 1 < s && s[-1] == '_')
|
295
|
+
rb_raise(rb_eArgError,
|
296
|
+
"trailing digit-separator ‘_’ found at position %ld",
|
297
|
+
s - str);
|
298
|
+
|
299
|
+
if (*s != '\0')
|
300
|
+
rb_raise(rb_eArgError,
|
301
|
+
"trailing garbage found at position %ld",
|
302
|
+
s - str);
|
303
|
+
|
304
|
+
return rb_big_norm(z);
|
305
|
+
}
|
306
|
+
|
307
|
+
VALUE
|
308
|
+
rb_utf_to_inum(VALUE str, int base, bool verify)
|
309
|
+
{
|
310
|
+
StringValue(str);
|
311
|
+
|
312
|
+
char *s;
|
313
|
+
if (verify)
|
314
|
+
s = StringValueCStr(str);
|
315
|
+
else
|
316
|
+
s = RSTRING(str)->ptr;
|
317
|
+
|
318
|
+
if (s != NULL) {
|
319
|
+
long len = RSTRING(str)->len;
|
320
|
+
/* no sentinel somehow */
|
321
|
+
if (s[len] != '\0') {
|
322
|
+
char *p = ALLOCA_N(char, len + 1);
|
323
|
+
|
324
|
+
MEMCPY(p, s, char, len);
|
325
|
+
p[len] = '\0';
|
326
|
+
s = p;
|
327
|
+
}
|
328
|
+
}
|
329
|
+
|
330
|
+
return rb_cutf_to_inum(s, base, verify);
|
331
|
+
}
|