u 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +38 -0
- data/Rakefile +64 -0
- data/ext/encoding/character/utf-8/break.c +25 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
- data/ext/encoding/character/utf-8/decompose.c +444 -0
- data/ext/encoding/character/utf-8/depend +65 -0
- data/ext/encoding/character/utf-8/extconf.rb +67 -0
- data/ext/encoding/character/utf-8/private.c +62 -0
- data/ext/encoding/character/utf-8/private.h +51 -0
- data/ext/encoding/character/utf-8/properties.c +1056 -0
- data/ext/encoding/character/utf-8/rb_includes.h +19 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_private.h +52 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/tables.h +38 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +216 -0
- data/ext/encoding/character/utf-8/utf.c +1334 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/lib/u.rb +16 -0
- data/lib/u/string.rb +185 -0
- data/lib/u/version.rb +5 -0
- data/test/unit/u.rb +5 -0
- data/test/unit/u/string.rb +91 -0
- metadata +174 -0
@@ -0,0 +1,19 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Standard includes for method definitions.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef RB_INCLUDES_H
|
8
|
+
#define RB_INCLUDES_H
|
9
|
+
|
10
|
+
#include <ruby.h>
|
11
|
+
#include <stdbool.h>
|
12
|
+
#include <stddef.h>
|
13
|
+
#include <stdint.h>
|
14
|
+
#include "unicode.h"
|
15
|
+
#include "private.h"
|
16
|
+
#include "rb_private.h"
|
17
|
+
#include "rb_methods.h"
|
18
|
+
|
19
|
+
#endif /* RB_INCLUDES_H */
|
@@ -0,0 +1,49 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Method declarations.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef RB_METHODS_H
|
8
|
+
#define RB_METHODS_H
|
9
|
+
|
10
|
+
VALUE rb_utf_collate(UNUSED(VALUE self), VALUE str, VALUE other) HIDDEN;
|
11
|
+
VALUE rb_utf_downcase(UNUSED(VALUE self), VALUE str) HIDDEN;
|
12
|
+
VALUE rb_utf_length(UNUSED(VALUE self), VALUE str) HIDDEN;
|
13
|
+
VALUE rb_utf_reverse(UNUSED(VALUE self), VALUE str) HIDDEN;
|
14
|
+
VALUE rb_utf_upcase(UNUSED(VALUE self), VALUE str) HIDDEN;
|
15
|
+
VALUE rb_utf_aref_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
16
|
+
VALUE rb_utf_aset_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
17
|
+
VALUE rb_utf_casecmp(UNUSED(VALUE self), VALUE str1, VALUE str2) HIDDEN;
|
18
|
+
VALUE rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
19
|
+
VALUE rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
20
|
+
VALUE rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
21
|
+
VALUE rb_utf_chomp_bang(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
22
|
+
VALUE rb_utf_chomp(int argc, VALUE *argv, VALUE self) HIDDEN;
|
23
|
+
VALUE rb_utf_chop_bang(UNUSED(VALUE self), VALUE str) HIDDEN;
|
24
|
+
VALUE rb_utf_chop(VALUE self, VALUE str) HIDDEN;
|
25
|
+
VALUE rb_utf_count(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
26
|
+
VALUE rb_utf_delete_bang(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
27
|
+
VALUE rb_utf_delete(int argc, VALUE *argv, VALUE self) HIDDEN;
|
28
|
+
VALUE rb_utf_each_char(UNUSED(VALUE self), VALUE str) HIDDEN;
|
29
|
+
VALUE rb_utf_index_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
30
|
+
VALUE rb_utf_insert(UNUSED(VALUE self), VALUE str, VALUE index,
|
31
|
+
VALUE other) HIDDEN;
|
32
|
+
VALUE rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str) HIDDEN;
|
33
|
+
VALUE rb_utf_lstrip(VALUE self, VALUE str) HIDDEN;
|
34
|
+
VALUE rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
35
|
+
VALUE rb_utf_rstrip_bang(UNUSED(VALUE self), VALUE str) HIDDEN;
|
36
|
+
VALUE rb_utf_rstrip(VALUE self, VALUE str) HIDDEN;
|
37
|
+
VALUE rb_utf_squeeze_bang(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
38
|
+
VALUE rb_utf_squeeze(int argc, VALUE *argv, VALUE self) HIDDEN;
|
39
|
+
VALUE rb_utf_strip_bang(VALUE self, VALUE str) HIDDEN;
|
40
|
+
VALUE rb_utf_strip(VALUE self, VALUE str) HIDDEN;
|
41
|
+
VALUE rb_utf_to_i(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
42
|
+
VALUE rb_utf_hex(UNUSED(VALUE self), VALUE str) HIDDEN;
|
43
|
+
VALUE rb_utf_oct(UNUSED(VALUE self), VALUE str) HIDDEN;
|
44
|
+
VALUE rb_utf_tr(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to) HIDDEN;
|
45
|
+
VALUE rb_utf_tr_s(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to) HIDDEN;
|
46
|
+
VALUE rb_utf_foldcase(UNUSED(VALUE self), VALUE str) HIDDEN;
|
47
|
+
VALUE rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
|
48
|
+
|
49
|
+
#endif /* RB_METHODS_H */
|
@@ -0,0 +1,52 @@
|
|
1
|
+
/*
|
2
|
+
* contents: Private Ruby-related functions.
|
3
|
+
*
|
4
|
+
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#ifndef RB_PRIVATE_H
|
8
|
+
#define RB_PRIVATE_H
|
9
|
+
|
10
|
+
|
11
|
+
void need_at_least_n_arguments(int argc, int n) HIDDEN;
|
12
|
+
|
13
|
+
unichar _utf_char_validated(char const *const str,
|
14
|
+
char const *const str_end) HIDDEN;
|
15
|
+
char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
|
16
|
+
const char *limit, bool noisy) HIDDEN;
|
17
|
+
|
18
|
+
char *_utf_offset_to_pointer_validated(const char *str, long offset,
|
19
|
+
const char *end) HIDDEN;
|
20
|
+
|
21
|
+
char *_utf_offset_to_pointer_failable(const char *str, long offset,
|
22
|
+
const char *end) HIDDEN;
|
23
|
+
|
24
|
+
VALUE rb_utf_new(const char *str, long len) HIDDEN;
|
25
|
+
|
26
|
+
VALUE rb_utf_new2(const char *str) HIDDEN;
|
27
|
+
|
28
|
+
VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
|
29
|
+
|
30
|
+
VALUE rb_utf_alloc_using(char *str) HIDDEN;
|
31
|
+
|
32
|
+
VALUE rb_utf_dup(VALUE str) HIDDEN;
|
33
|
+
|
34
|
+
long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
|
35
|
+
|
36
|
+
bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
|
37
|
+
char **limit) HIDDEN;
|
38
|
+
|
39
|
+
void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
|
40
|
+
char **limit) HIDDEN;
|
41
|
+
|
42
|
+
char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
|
43
|
+
|
44
|
+
VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
|
45
|
+
|
46
|
+
char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
|
47
|
+
|
48
|
+
long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
|
49
|
+
long offset, bool reverse) HIDDEN;
|
50
|
+
|
51
|
+
|
52
|
+
#endif /* RB_PRIVATE_H */
|
@@ -0,0 +1,111 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.aref module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include <re.h>
|
9
|
+
|
10
|
+
static VALUE
|
11
|
+
rb_utf_substr(VALUE str, long offset, long len)
|
12
|
+
{
|
13
|
+
if (len < 0)
|
14
|
+
return Qnil;
|
15
|
+
|
16
|
+
char *begin, *limit;
|
17
|
+
if (!rb_utf_begin_from_offset(str, offset, &begin, &limit))
|
18
|
+
return Qnil;
|
19
|
+
char *end = _utf_offset_to_pointer_failable(begin, len, limit);
|
20
|
+
if (end == NULL)
|
21
|
+
end = limit;
|
22
|
+
|
23
|
+
VALUE substr = (begin == end) ?
|
24
|
+
rb_utf_new5(str, NULL, 0) :
|
25
|
+
rb_utf_new5(str, begin, end - begin);
|
26
|
+
|
27
|
+
OBJ_INFECT(substr, str);
|
28
|
+
|
29
|
+
return substr;
|
30
|
+
}
|
31
|
+
|
32
|
+
static VALUE
|
33
|
+
rb_utf_substr_and_infect(VALUE str, long offset, long len, VALUE source)
|
34
|
+
{
|
35
|
+
VALUE substr = rb_utf_substr(str, offset, len);
|
36
|
+
OBJ_INFECT(substr, source);
|
37
|
+
return substr;
|
38
|
+
}
|
39
|
+
|
40
|
+
/* XXX: Stolen straight from string.c. */
|
41
|
+
static VALUE
|
42
|
+
rb_str_subpat(VALUE str, VALUE re, int nth)
|
43
|
+
{
|
44
|
+
if (rb_reg_search(re, str, 0, 0) >= 0)
|
45
|
+
return rb_reg_nth_match(nth, rb_backref_get());
|
46
|
+
|
47
|
+
return Qnil;
|
48
|
+
}
|
49
|
+
|
50
|
+
static VALUE
|
51
|
+
rb_utf_aref_num(VALUE str, long offset)
|
52
|
+
{
|
53
|
+
char *begin, *limit;
|
54
|
+
if (!rb_utf_begin_from_offset(str, offset, &begin, &limit))
|
55
|
+
return Qnil;
|
56
|
+
|
57
|
+
char *end = rb_utf_next_validated(begin, limit);
|
58
|
+
|
59
|
+
return rb_utf_new(begin, end - begin);
|
60
|
+
}
|
61
|
+
|
62
|
+
static VALUE
|
63
|
+
rb_utf_aref_default(VALUE str, VALUE index)
|
64
|
+
{
|
65
|
+
long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
66
|
+
|
67
|
+
long begin, len;
|
68
|
+
switch (rb_range_beg_len(index, &begin, &len, n_chars, 0)) {
|
69
|
+
case Qfalse:
|
70
|
+
return rb_utf_aref_num(str, NUM2LONG(index));
|
71
|
+
case Qnil:
|
72
|
+
return Qnil;
|
73
|
+
default:
|
74
|
+
return rb_utf_substr_and_infect(str, begin, len, index);
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
static VALUE
|
79
|
+
rb_utf_aref(VALUE str, VALUE index)
|
80
|
+
{
|
81
|
+
switch (TYPE(index)) {
|
82
|
+
case T_FIXNUM:
|
83
|
+
return rb_utf_aref_num(str, FIX2LONG(index));
|
84
|
+
case T_REGEXP:
|
85
|
+
return rb_str_subpat(str, index, 0);
|
86
|
+
case T_STRING:
|
87
|
+
if (rb_utf_index(str, index, 0) != -1)
|
88
|
+
return rb_utf_dup(index);
|
89
|
+
return Qnil;
|
90
|
+
default:
|
91
|
+
return rb_utf_aref_default(str, index);
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
VALUE
|
96
|
+
rb_utf_aref_m(int argc, VALUE *argv, UNUSED(VALUE self))
|
97
|
+
{
|
98
|
+
StringValue(argv[0]);
|
99
|
+
|
100
|
+
if (argc > 3 || argc < 2)
|
101
|
+
rb_raise(rb_eArgError,
|
102
|
+
"wrong number of arguments (%d for 2)", argc);
|
103
|
+
|
104
|
+
if (argc == 2)
|
105
|
+
return rb_utf_aref(argv[0], argv[1]);
|
106
|
+
|
107
|
+
if (TYPE(argv[1]) == T_REGEXP)
|
108
|
+
return rb_str_subpat(argv[0], argv[1], NUM2INT(argv[2]));
|
109
|
+
|
110
|
+
return rb_utf_substr(argv[0], NUM2INT(argv[1]), NUM2INT(argv[2]));
|
111
|
+
}
|
@@ -0,0 +1,105 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.aset module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
#include <re.h>
|
9
|
+
|
10
|
+
/* XXX: Stolen straight from string.c. */
|
11
|
+
#define BEG(no) regs->beg[no]
|
12
|
+
#define END(no) regs->end[no]
|
13
|
+
|
14
|
+
static VALUE
|
15
|
+
rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
|
16
|
+
{
|
17
|
+
VALUE match;
|
18
|
+
long start, end, len;
|
19
|
+
|
20
|
+
if (rb_reg_search(re, str, 0, 0) < 0) {
|
21
|
+
rb_raise(rb_eIndexError, "regexp not matched");
|
22
|
+
}
|
23
|
+
match = rb_backref_get();
|
24
|
+
if (nth >= RMATCH(match)->regs->num_regs) {
|
25
|
+
out_of_range:
|
26
|
+
rb_raise(rb_eIndexError, "index %d out of regexp", nth);
|
27
|
+
}
|
28
|
+
if (nth < 0) {
|
29
|
+
if (-nth >= RMATCH(match)->regs->num_regs) {
|
30
|
+
goto out_of_range;
|
31
|
+
}
|
32
|
+
nth += RMATCH(match)->regs->num_regs;
|
33
|
+
}
|
34
|
+
|
35
|
+
start = RMATCH(match)->BEG(nth);
|
36
|
+
if (start == -1) {
|
37
|
+
rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
|
38
|
+
}
|
39
|
+
end = RMATCH(match)->END(nth);
|
40
|
+
len = end - start;
|
41
|
+
rb_str_update(str, start, len, val);
|
42
|
+
|
43
|
+
return val;
|
44
|
+
}
|
45
|
+
|
46
|
+
static VALUE
|
47
|
+
rb_utf_aset_num(VALUE str, long offset, VALUE replacement)
|
48
|
+
{
|
49
|
+
return rb_utf_update(str, offset, 1, replacement);
|
50
|
+
}
|
51
|
+
|
52
|
+
static VALUE
|
53
|
+
rb_utf_aset_default(VALUE str, VALUE index, VALUE replacement)
|
54
|
+
{
|
55
|
+
long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
|
56
|
+
|
57
|
+
long begin, len;
|
58
|
+
if (rb_range_beg_len(index, &begin, &len, n_chars, 2))
|
59
|
+
return rb_utf_update(str, begin, len, replacement);
|
60
|
+
|
61
|
+
return rb_utf_aset_num(str, NUM2LONG(index), replacement);
|
62
|
+
}
|
63
|
+
|
64
|
+
static VALUE
|
65
|
+
rb_utf_aset(VALUE str, VALUE index, VALUE replacement)
|
66
|
+
{
|
67
|
+
switch (TYPE(index)) {
|
68
|
+
case T_FIXNUM:
|
69
|
+
return rb_utf_aset_num(str, FIX2LONG(index), replacement);
|
70
|
+
case T_BIGNUM:
|
71
|
+
return rb_utf_aset_num(str, NUM2LONG(index), replacement);
|
72
|
+
case T_REGEXP:
|
73
|
+
return rb_str_subpat_set(str, index, 0, replacement);
|
74
|
+
case T_STRING: {
|
75
|
+
long begin = rb_utf_index(str, index, 0);
|
76
|
+
if (begin < 0)
|
77
|
+
rb_raise(rb_eIndexError, "string not matched");
|
78
|
+
return rb_utf_update(str,
|
79
|
+
begin,
|
80
|
+
utf_length_n(RSTRING(index)->ptr,
|
81
|
+
RSTRING(index)->len),
|
82
|
+
replacement);
|
83
|
+
}
|
84
|
+
default:
|
85
|
+
return rb_utf_aset_default(str, index, replacement);
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
VALUE
|
90
|
+
rb_utf_aset_m(int argc, VALUE *argv, UNUSED(VALUE self))
|
91
|
+
{
|
92
|
+
if (argc > 4 || argc < 3)
|
93
|
+
rb_raise(rb_eArgError,
|
94
|
+
"wrong number of arguments (%d for 3)", argc);
|
95
|
+
|
96
|
+
StringValue(argv[0]);
|
97
|
+
|
98
|
+
if (argc == 3)
|
99
|
+
return rb_utf_aset(argv[0], argv[1], argv[2]);
|
100
|
+
|
101
|
+
if (TYPE(argv[1]) == T_REGEXP)
|
102
|
+
return rb_str_subpat_set(argv[0], argv[1], NUM2INT(argv[2]), argv[3]);
|
103
|
+
|
104
|
+
return rb_utf_update(argv[0], NUM2LONG(argv[1]), NUM2LONG(argv[2]), argv[3]);
|
105
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.casecmp module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
VALUE
|
10
|
+
rb_utf_casecmp(UNUSED(VALUE self), VALUE str1, VALUE str2)
|
11
|
+
{
|
12
|
+
StringValue(str1);
|
13
|
+
StringValue(str2);
|
14
|
+
|
15
|
+
char *folded1 = utf_foldcase(RSTRING(str1)->ptr);
|
16
|
+
char *folded2 = utf_foldcase(RSTRING(str2)->ptr);
|
17
|
+
|
18
|
+
int result = utf_collate(folded1, folded2);
|
19
|
+
|
20
|
+
free(folded2);
|
21
|
+
free(folded1);
|
22
|
+
|
23
|
+
return INT2FIX(result);
|
24
|
+
}
|
@@ -0,0 +1,114 @@
|
|
1
|
+
/*
|
2
|
+
* contents: UTF8.chomp module function.
|
3
|
+
*
|
4
|
+
* Copyright © 2006 Nikolai Weibull <now@bitwi.se>
|
5
|
+
*/
|
6
|
+
|
7
|
+
#include "rb_includes.h"
|
8
|
+
|
9
|
+
static VALUE
|
10
|
+
rb_utf_chomp_default(VALUE str)
|
11
|
+
{
|
12
|
+
rb_str_modify(str);
|
13
|
+
|
14
|
+
const char *end = RSTRING(str)->ptr + RSTRING(str)->len;
|
15
|
+
|
16
|
+
char *last = utf_find_prev(RSTRING(str)->ptr, end);
|
17
|
+
if (last == NULL)
|
18
|
+
return Qnil;
|
19
|
+
|
20
|
+
if (_utf_char_validated(last, end) == '\n') {
|
21
|
+
char *last_but_one = utf_find_prev(RSTRING(str)->ptr, last);
|
22
|
+
|
23
|
+
if (last_but_one != NULL && utf_char(last_but_one) == '\r')
|
24
|
+
last = last_but_one;
|
25
|
+
} else if (!unichar_isnewline(utf_char(last))) {
|
26
|
+
return Qnil;
|
27
|
+
}
|
28
|
+
|
29
|
+
RSTRING(str)->len -= (RSTRING(str)->ptr + RSTRING(str)->len) - last;
|
30
|
+
*last = '\0';
|
31
|
+
|
32
|
+
return str;
|
33
|
+
}
|
34
|
+
|
35
|
+
static VALUE
|
36
|
+
rb_utf_chomp_newlines(VALUE str)
|
37
|
+
{
|
38
|
+
char *begin = RSTRING(str)->ptr;
|
39
|
+
char *end = begin + RSTRING(str)->len;
|
40
|
+
|
41
|
+
char *last = end;
|
42
|
+
while (last > begin) {
|
43
|
+
char *last_but_one = utf_find_prev(begin, last);
|
44
|
+
if (last == NULL || !unichar_isnewline(utf_char(last_but_one)))
|
45
|
+
break;
|
46
|
+
last = last_but_one;
|
47
|
+
}
|
48
|
+
|
49
|
+
if (last == end)
|
50
|
+
return Qnil;
|
51
|
+
|
52
|
+
rb_str_modify(str);
|
53
|
+
RSTRING(str)->len -= end - last;
|
54
|
+
*last = '\0';
|
55
|
+
|
56
|
+
return str;
|
57
|
+
}
|
58
|
+
|
59
|
+
VALUE
|
60
|
+
rb_utf_chomp_bang(int argc, VALUE *argv, UNUSED(VALUE self))
|
61
|
+
{
|
62
|
+
VALUE str, rs;
|
63
|
+
|
64
|
+
rb_scan_args(argc, argv, "11", &str, &rs);
|
65
|
+
|
66
|
+
if (RSTRING(str)->len == 0)
|
67
|
+
return Qnil;
|
68
|
+
|
69
|
+
if (argc == 1) {
|
70
|
+
rs = rb_rs;
|
71
|
+
if (rs == rb_default_rs)
|
72
|
+
rb_utf_chomp_default(str);
|
73
|
+
}
|
74
|
+
|
75
|
+
if (NIL_P(rs))
|
76
|
+
return Qnil;
|
77
|
+
|
78
|
+
StringValue(rs);
|
79
|
+
|
80
|
+
long rs_len = RSTRING(rs)->len;
|
81
|
+
if (rs_len == 0)
|
82
|
+
return rb_utf_chomp_newlines(str);
|
83
|
+
|
84
|
+
long len = RSTRING(str)->len;
|
85
|
+
if (rs_len > len)
|
86
|
+
return Qnil;
|
87
|
+
|
88
|
+
char last_char = RSTRING(rs)->ptr[rs_len - 1];
|
89
|
+
if (rs_len == 1 && last_char == '\n')
|
90
|
+
rb_utf_chomp_default(str);
|
91
|
+
|
92
|
+
char *p = RSTRING(str)->ptr;
|
93
|
+
|
94
|
+
if (p[len - 1] != last_char ||
|
95
|
+
(rs_len > 1 &&
|
96
|
+
rb_memcmp(RSTRING(rs)->ptr, p + len - rs_len, rs_len) != 0))
|
97
|
+
return Qnil;
|
98
|
+
|
99
|
+
rb_str_modify(str);
|
100
|
+
RSTRING(str)->len -= rs_len;
|
101
|
+
RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
|
102
|
+
|
103
|
+
return str;
|
104
|
+
}
|
105
|
+
|
106
|
+
VALUE
|
107
|
+
rb_utf_chomp(int argc, VALUE *argv, VALUE self)
|
108
|
+
{
|
109
|
+
StringValue(argv[0]);
|
110
|
+
argv[0] = rb_utf_dup(argv[0]);
|
111
|
+
rb_utf_chomp_bang(argc, argv, self);
|
112
|
+
return argv[0];
|
113
|
+
}
|
114
|
+
|