character-encodings 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +26 -0
- data/Rakefile +157 -0
- data/ext/encoding/character/unicode/codepoint.c +48 -0
- data/ext/encoding/character/utf-8/break.c +38 -0
- data/ext/encoding/character/utf-8/data/break.h +22931 -0
- data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
- data/ext/encoding/character/utf-8/data/compose.h +1607 -0
- data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
- data/ext/encoding/character/utf-8/decompose.c +476 -0
- data/ext/encoding/character/utf-8/depend +64 -0
- data/ext/encoding/character/utf-8/extconf.rb +47 -0
- data/ext/encoding/character/utf-8/private.h +68 -0
- data/ext/encoding/character/utf-8/properties.c +1061 -0
- data/ext/encoding/character/utf-8/rb_includes.h +18 -0
- data/ext/encoding/character/utf-8/rb_methods.h +49 -0
- data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
- data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
- data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
- data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
- data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
- data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
- data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
- data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
- data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
- data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
- data/ext/encoding/character/utf-8/unicode.c +319 -0
- data/ext/encoding/character/utf-8/unicode.h +208 -0
- data/ext/encoding/character/utf-8/utf.c +1332 -0
- data/lib/encoding/character/utf-8.rb +201 -0
- data/specifications/aref.rb +45 -0
- data/specifications/count.rb +29 -0
- data/specifications/delete.rb +25 -0
- data/specifications/each_char.rb +28 -0
- data/specifications/index.rb +35 -0
- data/specifications/insert.rb +67 -0
- data/specifications/length.rb +45 -0
- data/specifications/rindex.rb +52 -0
- data/specifications/squeeze.rb +25 -0
- data/specifications/to_i.rb +54 -0
- data/specifications/tr.rb +39 -0
- data/tests/foldcase.rb +28 -0
- data/tests/normalize.rb +101 -0
- data/tests/unicodedatatestbase.rb +45 -0
- metadata +112 -0
| @@ -0,0 +1,12 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: Internal functionality for turning strings into Bignums.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #ifndef RB_UTF_INTERNAL_BIGNUM_H
         | 
| 8 | 
            +
            #define RB_UTF_INTERNAL_BIGNUM_H
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            VALUE rb_utf_to_inum(VALUE str, int base, bool verify) HIDDEN;
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            #endif /* RB_UTF_INTERNAL_BIGNUM_H */
         | 
| @@ -0,0 +1,142 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: Translation (#tr) related functions.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
            #include "rb_utf_internal_tr.h"
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            void
         | 
| 11 | 
            +
            tr_init(struct tr *tr, char *p, char *p_end)
         | 
| 12 | 
            +
            {
         | 
| 13 | 
            +
                    tr->p = p;
         | 
| 14 | 
            +
                    tr->p_end = p_end;
         | 
| 15 | 
            +
                    tr->inside_range = false;
         | 
| 16 | 
            +
            }
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            bool
         | 
| 19 | 
            +
            tr_should_exclude(struct tr *tr)
         | 
| 20 | 
            +
            {
         | 
| 21 | 
            +
                    if (tr->p + 1 < tr->p_end && *tr->p == '^') {
         | 
| 22 | 
            +
                            tr->p++;
         | 
| 23 | 
            +
                            return true;
         | 
| 24 | 
            +
                    }
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                    return false;
         | 
| 27 | 
            +
            }
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            static enum tr_state
         | 
| 30 | 
            +
            tr_next_char(struct tr *t)
         | 
| 31 | 
            +
            {
         | 
| 32 | 
            +
                    if (t->p == t->p_end)
         | 
| 33 | 
            +
                            return TR_FINISHED;
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    if (_utf_char_validated(t->p, t->p_end) == '\\') {
         | 
| 36 | 
            +
                            char *next = utf_find_next(t->p, t->p_end);
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                            if (next == NULL) {
         | 
| 39 | 
            +
                                    t->now = '\\';
         | 
| 40 | 
            +
                                    t->p = t->p_end;
         | 
| 41 | 
            +
                                    return TR_FOUND;
         | 
| 42 | 
            +
                            }
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                            t->p = next;
         | 
| 45 | 
            +
                    }
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                    t->now = _utf_char_validated(t->p, t->p_end);
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                    char *next = utf_find_next(t->p, t->p_end);
         | 
| 50 | 
            +
                    if (next == NULL) {
         | 
| 51 | 
            +
                            t->p = t->p_end;
         | 
| 52 | 
            +
                            return TR_FOUND;
         | 
| 53 | 
            +
                    }
         | 
| 54 | 
            +
                    t->p = next;
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    if (_utf_char_validated(t->p, t->p_end) == '-') {
         | 
| 57 | 
            +
                            next = utf_find_next(t->p, t->p_end);
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                            if (next != NULL) {
         | 
| 60 | 
            +
                                    unichar max = utf_char(next);
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                                    if (max < t->now) {
         | 
| 63 | 
            +
                                            t->p = next;
         | 
| 64 | 
            +
                                            return TR_READ_ANOTHER;
         | 
| 65 | 
            +
                                    }
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                                    t->inside_range = true;
         | 
| 68 | 
            +
                                    t->max = max;
         | 
| 69 | 
            +
                            }
         | 
| 70 | 
            +
                    }
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                    return TR_FOUND;
         | 
| 73 | 
            +
            }
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            enum tr_state
         | 
| 76 | 
            +
            tr_next(struct tr *t)
         | 
| 77 | 
            +
            {
         | 
| 78 | 
            +
                    while (true) {
         | 
| 79 | 
            +
                            if (!t->inside_range) {
         | 
| 80 | 
            +
                                    enum tr_state state;
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                                    if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
         | 
| 83 | 
            +
                                            continue;
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                                    return state;
         | 
| 86 | 
            +
                            } else if (++t->now < t->max) {
         | 
| 87 | 
            +
                                    return TR_FOUND;
         | 
| 88 | 
            +
                            } else {
         | 
| 89 | 
            +
                                    t->inside_range = false;
         | 
| 90 | 
            +
                                    return TR_FOUND;
         | 
| 91 | 
            +
                            }
         | 
| 92 | 
            +
                    }
         | 
| 93 | 
            +
            }
         | 
| 94 | 
            +
             | 
| 95 | 
            +
            static void
         | 
| 96 | 
            +
            tr_table_set(unsigned int *table, unichar c, unsigned int value)
         | 
| 97 | 
            +
            {
         | 
| 98 | 
            +
                    unsigned int offset = c / WORD_BIT;
         | 
| 99 | 
            +
                    unsigned int bit = c % WORD_BIT;
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                    table[offset] = (table[offset] & ~(1U << bit)) | ((value & 1U) << bit);
         | 
| 102 | 
            +
            }
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            void
         | 
| 105 | 
            +
            tr_setup_table(VALUE str, unsigned int *table, bool initialize)
         | 
| 106 | 
            +
            {
         | 
| 107 | 
            +
                    unsigned int buf[TR_TABLE_SIZE];
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                    struct tr tr;
         | 
| 110 | 
            +
                    tr_init(&tr, RSTRING(str)->ptr, RSTRING(str)->ptr + RSTRING(str)->len);
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                    bool exclude = tr_should_exclude(&tr);
         | 
| 113 | 
            +
             | 
| 114 | 
            +
                    if (initialize)
         | 
| 115 | 
            +
                            for (int i = 0; i < TR_TABLE_SIZE; i++)
         | 
| 116 | 
            +
                                    table[i] = ~0U;
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                    unsigned int buf_initializer = exclude ? ~0U : 0U;
         | 
| 119 | 
            +
                    for (int i = 0; i < TR_TABLE_SIZE; i++)
         | 
| 120 | 
            +
                            buf[i] = buf_initializer;
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                    unsigned int buf_setter = !exclude;
         | 
| 123 | 
            +
                    while (tr_next(&tr) != TR_FINISHED)
         | 
| 124 | 
            +
                            tr_table_set(buf, tr.now, buf_setter);
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                    for (int i = 0; i < TR_TABLE_SIZE; i++)
         | 
| 127 | 
            +
                            table[i] &= buf[i];
         | 
| 128 | 
            +
            }
         | 
| 129 | 
            +
             | 
| 130 | 
            +
            void
         | 
| 131 | 
            +
            tr_setup_table_from_strings(unsigned int *table, int argc, VALUE *argv)
         | 
| 132 | 
            +
            {
         | 
| 133 | 
            +
                bool initialize = true;
         | 
| 134 | 
            +
                for (int i = 0; i < argc; i++) {
         | 
| 135 | 
            +
                        VALUE s = argv[i];
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                        StringValue(s);
         | 
| 138 | 
            +
                        tr_setup_table(s, table, initialize);
         | 
| 139 | 
            +
                        initialize = false;
         | 
| 140 | 
            +
                }
         | 
| 141 | 
            +
            }
         | 
| 142 | 
            +
             | 
| @@ -0,0 +1,41 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: Translation (#tr) related functions
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #ifndef TR_H
         | 
| 8 | 
            +
            #define TR_H
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            /* Number of bits in an int, i.e. in one word of a lookup table. */
            #ifndef WORD_BIT
            #  define WORD_BIT              (sizeof(int) * CHAR_BIT)
            #endif

            /* Number of table words needed for UNICODE_N_CODEPOINTS bits. */
            #define TR_TABLE_SIZE           ((int)(UNICODE_N_CODEPOINTS / WORD_BIT))

            /* Non-zero if the bit numbered OFFSET is set in TABLE.  OFFSET is
             * evaluated twice, so it must not have side effects. */
            #define tr_table_lookup(table, offset)       \
                    ((table)[(offset) / WORD_BIT] & (1U << ((offset) % WORD_BIT)))
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            struct tr {
         | 
| 20 | 
            +
                    bool inside_range;
         | 
| 21 | 
            +
                    unichar now;
         | 
| 22 | 
            +
                    unichar max;
         | 
| 23 | 
            +
                    char *p;
         | 
| 24 | 
            +
                    char *p_end;
         | 
| 25 | 
            +
            };
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            /* Outcome of one step over a #tr specification. */
            enum tr_state {
                    TR_FOUND = 0,           /* a character was produced */
                    TR_READ_ANOTHER = 1,    /* nothing produced; step again */
                    TR_FINISHED = 2         /* the specification is exhausted */
            };
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            void tr_init(struct tr *tr, char *p, char *p_end) HIDDEN;
         | 
| 35 | 
            +
            bool tr_should_exclude(struct tr *tr) HIDDEN;
         | 
| 36 | 
            +
            enum tr_state tr_next(struct tr *t) HIDDEN;
         | 
| 37 | 
            +
            void tr_setup_table(VALUE str, unsigned int *table, bool initialize) HIDDEN;
         | 
| 38 | 
            +
            void tr_setup_table_from_strings(unsigned int *table, int argc,
         | 
| 39 | 
            +
                                             VALUE *argv) HIDDEN;
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            #endif /* TR_H */
         | 
| @@ -0,0 +1,96 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: UTF8.center, UTF8.ljust, and UTF8.rjust.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            /*
             * Write N characters of padding at P and return the position just after
             * them.  The padding string F holds F_LEN characters in F_SIZE bytes and
             * is repeated as often as needed.
             */
            static char *
            rb_utf_justify_one_side(char *p, const char *f, long f_len, long f_size, long n)
            {
                    long done = 0;

                    /* Whole copies of the padding, for as long as more than one copy's
                     * worth of characters is still missing. */
                    while (done + f_len < n) {
                            memcpy(p, f, f_size);
                            p += f_size;
                            done += f_len;
                    }

                    /* The rest (at most F_LEN characters) is copied one character at a
                     * time from the start of the padding. */
                    for (const char *src = f; done < n; done++) {
                            const char *src_end = utf_next(src);
                            const long size = src_end - src;

                            memcpy(p, src, size);
                            p += size;
                            src = src_end;
                    }

                    return p;
            }
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            static VALUE
         | 
| 29 | 
            +
            rb_utf_justify(int argc, VALUE *argv, char jflag)
         | 
| 30 | 
            +
            {
         | 
| 31 | 
            +
                    VALUE str, w, pad;
         | 
| 32 | 
            +
                    const char *f = " ";
         | 
| 33 | 
            +
                    long f_len = 1;
         | 
| 34 | 
            +
                    long f_size = 1;
         | 
| 35 | 
            +
                    bool infect_from_pad = false;
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                    if (rb_scan_args(argc, argv, "21", &str, &w, &pad) == 3) {
         | 
| 38 | 
            +
                            StringValue(pad);
         | 
| 39 | 
            +
                            f = RSTRING(pad)->ptr;
         | 
| 40 | 
            +
                            f_len = utf_length_n(f, RSTRING(pad)->len);
         | 
| 41 | 
            +
                            if (f_len == 0)
         | 
| 42 | 
            +
                                    rb_raise(rb_eArgError, "zero width padding");
         | 
| 43 | 
            +
                            f_size = RSTRING(pad)->len;
         | 
| 44 | 
            +
                            infect_from_pad = true;
         | 
| 45 | 
            +
                    }
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                    long len = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
         | 
| 48 | 
            +
             | 
| 49 | 
            +
                    long width = NUM2LONG(w);
         | 
| 50 | 
            +
                    if (width < 0 || len >= width)
         | 
| 51 | 
            +
                            return rb_utf_dup(str);
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                    VALUE res = rb_utf_new5(str, 0, RSTRING(str)->len + (width - len) * f_size);
         | 
| 54 | 
            +
                    char *p = RSTRING(res)->ptr;
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    long n_remaining = width - len;
         | 
| 57 | 
            +
                    if (jflag != 'l') {
         | 
| 58 | 
            +
                            long n = n_remaining;
         | 
| 59 | 
            +
                            if (jflag == 'c')
         | 
| 60 | 
            +
                                    n /= 2;
         | 
| 61 | 
            +
                            n_remaining -= n;
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                            p = rb_utf_justify_one_side(p, f, f_len, f_size, n);
         | 
| 64 | 
            +
                    }
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    memcpy(p, RSTRING(str)->ptr, RSTRING(str)->len);
         | 
| 67 | 
            +
                    p += RSTRING(str)->len;
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                    if (jflag != 'r')
         | 
| 70 | 
            +
                            p = rb_utf_justify_one_side(p, f, f_len, f_size, n_remaining);
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                    OBJ_INFECT(res, str);
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                    if (infect_from_pad)
         | 
| 75 | 
            +
                            OBJ_INFECT(res, pad);
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                    return res;
         | 
| 78 | 
            +
            }
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            VALUE
         | 
| 81 | 
            +
            rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self))
         | 
| 82 | 
            +
            {
         | 
| 83 | 
            +
                    return rb_utf_justify(argc, argv, 'c');
         | 
| 84 | 
            +
            }
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            VALUE
         | 
| 87 | 
            +
            rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self))
         | 
| 88 | 
            +
            {
         | 
| 89 | 
            +
                    return rb_utf_justify(argc, argv, 'l');
         | 
| 90 | 
            +
            }
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            VALUE
         | 
| 93 | 
            +
            rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self))
         | 
| 94 | 
            +
            {
         | 
| 95 | 
            +
                    return rb_utf_justify(argc, argv, 'r');
         | 
| 96 | 
            +
            }
         | 
| @@ -0,0 +1,14 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: UTF8.length module function.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            VALUE
         | 
| 10 | 
            +
            rb_utf_length(UNUSED(VALUE self), VALUE str)
         | 
| 11 | 
            +
            {
         | 
| 12 | 
            +
                    StringValue(str);
         | 
| 13 | 
            +
                    return UINT2NUM(utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len));
         | 
| 14 | 
            +
            }
         | 
| @@ -0,0 +1,41 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: UTF8.lstrip module function.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            VALUE
         | 
| 10 | 
            +
            rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str)
         | 
| 11 | 
            +
            {
         | 
| 12 | 
            +
                    StringValue(str);
         | 
| 13 | 
            +
                    char *s = RSTRING(str)->ptr;
         | 
| 14 | 
            +
                    if (s == NULL || RSTRING(str)->len == 0)
         | 
| 15 | 
            +
                            return Qnil;
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                    char *end = s + RSTRING(str)->len;
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                    /* Remove spaces at head. */
         | 
| 20 | 
            +
                    while (s < end && unichar_isspace(_utf_char_validated(s, end)))
         | 
| 21 | 
            +
                            s = utf_next(s);
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                    /* If there weren’t any spaces at head, return Qnil. */
         | 
| 24 | 
            +
                    if (s == RSTRING(str)->ptr)
         | 
| 25 | 
            +
                            return Qnil;
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                    rb_str_modify(str);
         | 
| 28 | 
            +
                    RSTRING(str)->len = end - s;
         | 
| 29 | 
            +
                    memmove(RSTRING(str)->ptr, s, RSTRING(str)->len);
         | 
| 30 | 
            +
                    RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                    return str;
         | 
| 33 | 
            +
            }
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            VALUE
         | 
| 36 | 
            +
            rb_utf_lstrip(VALUE self, VALUE str)
         | 
| 37 | 
            +
            {
         | 
| 38 | 
            +
                    str = rb_utf_dup(str);
         | 
| 39 | 
            +
                    rb_utf_lstrip_bang(self, str);
         | 
| 40 | 
            +
                    return str;
         | 
| 41 | 
            +
            }
         | 
| @@ -0,0 +1,51 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: Encoding::Character::UTF8.normalize module function.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            /*
             * Return MODE from the enclosing function if ID is the ID of the symbol
             * named SYMBOL.  The ID for SYMBOL is looked up with rb_intern() on first
             * use and then kept in a static variable.
             */
            #define SYMBOL2MODE(symbol, mode, id) do {              \
                    static ID id_##symbol = 0;                      \
                    if (id_##symbol == 0)                           \
                            id_##symbol = rb_intern(#symbol);       \
                    if ((id) == id_##symbol)                        \
                            return (mode);                          \
            } while (0)
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            static NormalizeMode
         | 
| 18 | 
            +
            symbol_to_mode(VALUE symbol)
         | 
| 19 | 
            +
            {
         | 
| 20 | 
            +
                    if (!SYMBOL_P(symbol))
         | 
| 21 | 
            +
                            rb_raise(rb_eTypeError, "not a symbol");
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                    ID id = SYM2ID(symbol);
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                    SYMBOL2MODE(default, NORMALIZE_DEFAULT, id);
         | 
| 26 | 
            +
                    SYMBOL2MODE(nfd, NORMALIZE_NFD, id);
         | 
| 27 | 
            +
                    SYMBOL2MODE(default_compose, NORMALIZE_DEFAULT_COMPOSE, id);
         | 
| 28 | 
            +
                    SYMBOL2MODE(nfc, NORMALIZE_NFC, id);
         | 
| 29 | 
            +
                    SYMBOL2MODE(all, NORMALIZE_ALL, id);
         | 
| 30 | 
            +
                    SYMBOL2MODE(nfkd, NORMALIZE_NFKD, id);
         | 
| 31 | 
            +
                    SYMBOL2MODE(all_compose, NORMALIZE_ALL_COMPOSE, id);
         | 
| 32 | 
            +
                    SYMBOL2MODE(nfkc, NORMALIZE_NFKC, id);
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    rb_raise(rb_eArgError, "unknown symbol");
         | 
| 35 | 
            +
            }
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            VALUE
         | 
| 38 | 
            +
            rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self))
         | 
| 39 | 
            +
            {
         | 
| 40 | 
            +
                    VALUE str, rbmode;
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                    NormalizeMode mode = NORMALIZE_DEFAULT;
         | 
| 43 | 
            +
                    if (rb_scan_args(argc, argv, "11", &str, &rbmode) == 2)
         | 
| 44 | 
            +
                            mode = symbol_to_mode(rbmode);
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                    StringValue(str);
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                    return rb_utf_alloc_using(utf_normalize_n(RSTRING(str)->ptr,
         | 
| 49 | 
            +
                                                              mode,
         | 
| 50 | 
            +
                                                              RSTRING(str)->len));
         | 
| 51 | 
            +
            }
         | 
| @@ -0,0 +1,14 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: UTF8.oct module function.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
            #include "rb_utf_internal_bignum.h"
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            VALUE
         | 
| 11 | 
            +
            rb_utf_oct(UNUSED(VALUE self), VALUE str)
         | 
| 12 | 
            +
            {
         | 
| 13 | 
            +
                    return rb_utf_to_inum(str, -8, false);
         | 
| 14 | 
            +
            }
         | 
| @@ -0,0 +1,13 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: UTF8.reverse module function.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            VALUE
         | 
| 10 | 
            +
            rb_utf_reverse(UNUSED(VALUE self), VALUE str)
         | 
| 11 | 
            +
            {
         | 
| 12 | 
            +
                    return rb_utf_alloc_using(utf_reverse(StringValuePtr(str)));
         | 
| 13 | 
            +
            }
         | 
| @@ -0,0 +1,88 @@ | |
| 1 | 
            +
            /*
         | 
| 2 | 
            +
             * contents: UTF8.rindex module function.
         | 
| 3 | 
            +
             *
         | 
| 4 | 
            +
             * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
         | 
| 5 | 
            +
             */
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            #include "rb_includes.h"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            static long
         | 
| 10 | 
            +
            rb_utf_rindex(VALUE str, VALUE sub, long offset)
         | 
| 11 | 
            +
            {
         | 
| 12 | 
            +
                    if (RSTRING(str)->len < RSTRING(sub)->len)
         | 
| 13 | 
            +
                            return -1;
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                    char *s, *end;
         | 
| 16 | 
            +
                    rb_utf_begin_from_offset_validated(str, offset, &s, &end);
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    if (RSTRING(sub)->len == 0)
         | 
| 19 | 
            +
                            return utf_pointer_to_offset(RSTRING(str)->ptr, s);
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                    char *s_begin = RSTRING(str)->ptr;
         | 
| 22 | 
            +
                    char *t = RSTRING(sub)->ptr;
         | 
| 23 | 
            +
                    long len = RSTRING(sub)->len;
         | 
| 24 | 
            +
                    while (s >= s_begin) {
         | 
| 25 | 
            +
                            if (rb_memcmp(s, t, len) == 0)
         | 
| 26 | 
            +
                                    return utf_pointer_to_offset(s_begin, s);
         | 
| 27 | 
            +
                            s--;
         | 
| 28 | 
            +
                    }
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                    return -1;
         | 
| 31 | 
            +
            }
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            VALUE
         | 
| 34 | 
            +
            rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self))
         | 
| 35 | 
            +
            {
         | 
| 36 | 
            +
                    VALUE str, sub, rboffset;
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                    rb_scan_args(argc, argv, "21", &str, &sub, &rboffset);
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                    StringValue(str);
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                    long offset = (argc == 3) ? NUM2LONG(rboffset) : RSTRING(str)->len;
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                    char *begin, *end;
         | 
| 45 | 
            +
                    rb_utf_begin_from_offset(str, offset, &begin, &end);
         | 
| 46 | 
            +
                    if (begin == NULL) {
         | 
| 47 | 
            +
                            if (offset <= 0) {
         | 
| 48 | 
            +
                                    if (TYPE(sub) == T_REGEXP)
         | 
| 49 | 
            +
                                            rb_backref_set(Qnil);
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                                    return Qnil;
         | 
| 52 | 
            +
                            }
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                            begin = end;
         | 
| 55 | 
            +
                            /* TODO: this converting back and forward can be optimized away
         | 
| 56 | 
            +
                             * if rb_utf_index_regexp() and rb_utf_rindex() were split up
         | 
| 57 | 
            +
                             * into two additional functions, adding
         | 
| 58 | 
            +
                             * rb_utf_index_regexp_pointer() and rb_utf_rindex_pointer(),
         | 
| 59 | 
            +
                             * so that one can pass a pointer to start at immediately
         | 
| 60 | 
            +
                             * instead of an offset that gets calculated into a pointer. */
         | 
| 61 | 
            +
                            offset = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
         | 
| 62 | 
            +
                    }
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                    switch (TYPE(sub)) {
         | 
| 65 | 
            +
                    case T_REGEXP:
         | 
| 66 | 
            +
                            if (RREGEXP(sub)->len > 0)
         | 
| 67 | 
            +
                                    offset = rb_utf_index_regexp(str, begin, end, sub,
         | 
| 68 | 
            +
                                                                 offset, true);
         | 
| 69 | 
            +
                            break;
         | 
| 70 | 
            +
                    default: {
         | 
| 71 | 
            +
                            VALUE tmp = rb_check_string_type(sub);
         | 
| 72 | 
            +
                            if (NIL_P(tmp))
         | 
| 73 | 
            +
                                    rb_raise(rb_eTypeError, "type mismatch: %s given",
         | 
| 74 | 
            +
                                             rb_obj_classname(sub));
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                            sub = tmp;
         | 
| 77 | 
            +
                    }
         | 
| 78 | 
            +
                            /* fall through */
         | 
| 79 | 
            +
                    case T_STRING:
         | 
| 80 | 
            +
                            offset = rb_utf_rindex(str, sub, offset);
         | 
| 81 | 
            +
                            break;
         | 
| 82 | 
            +
                    }
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                    if (offset < 0)
         | 
| 85 | 
            +
                            return Qnil;
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                    return LONG2NUM(offset);
         | 
| 88 | 
            +
            }
         |