RubyGems - character-encodings - Versions diffs - 0.2.0 - Mend

character-encodings 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66) hide show

data/README +26 -0
data/Rakefile +157 -0
data/ext/encoding/character/unicode/codepoint.c +48 -0
data/ext/encoding/character/utf-8/break.c +38 -0
data/ext/encoding/character/utf-8/data/break.h +22931 -0
data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
data/ext/encoding/character/utf-8/data/compose.h +1607 -0
data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
data/ext/encoding/character/utf-8/decompose.c +476 -0
data/ext/encoding/character/utf-8/depend +64 -0
data/ext/encoding/character/utf-8/extconf.rb +47 -0
data/ext/encoding/character/utf-8/private.h +68 -0
data/ext/encoding/character/utf-8/properties.c +1061 -0
data/ext/encoding/character/utf-8/rb_includes.h +18 -0
data/ext/encoding/character/utf-8/rb_methods.h +49 -0
data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
data/ext/encoding/character/utf-8/unicode.c +319 -0
data/ext/encoding/character/utf-8/unicode.h +208 -0
data/ext/encoding/character/utf-8/utf.c +1332 -0
data/lib/encoding/character/utf-8.rb +201 -0
data/specifications/aref.rb +45 -0
data/specifications/count.rb +29 -0
data/specifications/delete.rb +25 -0
data/specifications/each_char.rb +28 -0
data/specifications/index.rb +35 -0
data/specifications/insert.rb +67 -0
data/specifications/length.rb +45 -0
data/specifications/rindex.rb +52 -0
data/specifications/squeeze.rb +25 -0
data/specifications/to_i.rb +54 -0
data/specifications/tr.rb +39 -0
data/tests/foldcase.rb +28 -0
data/tests/normalize.rb +101 -0
data/tests/unicodedatatestbase.rb +45 -0
metadata +112 -0

data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h ADDED Viewed

@@ -0,0 +1,12 @@
+/*
+ * contents: Internal functionality for turning strings into Bignums.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#ifndef RB_UTF_INTERNAL_BIGNUM_H
+#define RB_UTF_INTERNAL_BIGNUM_H
+/* Turn the string STR into a number, interpreting it in base BASE.
+ * NOTE(review): the meaning of VERIFY and of a negative BASE is not visible
+ * from this header; presumably they mirror Ruby's
+ * rb_str_to_inum(str, base, badcheck) -- confirm against the definition. */
+VALUE rb_utf_to_inum(VALUE str, int base, bool verify) HIDDEN;
+#endif /* RB_UTF_INTERNAL_BIGNUM_H */

data/ext/encoding/character/utf-8/rb_utf_internal_tr.c ADDED Viewed

@@ -0,0 +1,142 @@
+/*
+ * contents: Translation (#tr) related functions.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+#include "rb_utf_internal_tr.h"
+/* Prepare TR for scanning the specification stored in the bytes from P up to
+ * (but not including) P_END.  No range expansion is in progress afterwards. */
+void
+tr_init(struct tr *tr, char *p, char *p_end)
+{
+        tr->inside_range = false;
+        tr->p_end = p_end;
+        tr->p = p;
+}
+/* Report whether the specification in TR starts with a negating '^'.  The
+ * caret only counts when at least one more byte follows it; in that case it
+ * is consumed by stepping TR->p past it. */
+bool
+tr_should_exclude(struct tr *tr)
+{
+        bool negated = tr->p + 1 < tr->p_end && *tr->p == '^';
+        if (negated)
+                tr->p++;
+        return negated;
+}
+/* Read the next item of the specification into T.
+ *
+ * Returns TR_FINISHED when the input is exhausted, TR_FOUND when T->now holds
+ * the next character, and TR_READ_ANOTHER when a reversed range (upper bound
+ * below lower bound) was dropped and the caller should simply call again.
+ *
+ * When a valid range "a-z" is recognized, T->now is its lower bound, T->max
+ * its upper bound, T->inside_range is set so that tr_next() can generate the
+ * remaining characters, and T->p is moved past the whole range.  (Previously
+ * T->p was left on the '-', so the hyphen and the upper bound were scanned a
+ * second time as ordinary characters once the range had been expanded.) */
+static enum tr_state
+tr_next_char(struct tr *t)
+{
+        if (t->p == t->p_end)
+                return TR_FINISHED;
+        /* A backslash escapes the character following it; a backslash that
+         * ends the specification stands for itself. */
+        if (_utf_char_validated(t->p, t->p_end) == '\\') {
+                char *next = utf_find_next(t->p, t->p_end);
+                if (next == NULL) {
+                        t->now = '\\';
+                        t->p = t->p_end;
+                        return TR_FOUND;
+                }
+                t->p = next;
+        }
+        t->now = _utf_char_validated(t->p, t->p_end);
+        char *next = utf_find_next(t->p, t->p_end);
+        if (next == NULL) {
+                t->p = t->p_end;
+                return TR_FOUND;
+        }
+        t->p = next;
+        if (_utf_char_validated(t->p, t->p_end) != '-')
+                return TR_FOUND;
+        /* A '-' with nothing after it is an ordinary character; leave it for
+         * the next call. */
+        next = utf_find_next(t->p, t->p_end);
+        if (next == NULL)
+                return TR_FOUND;
+        unichar max = utf_char(next);
+        if (max < t->now) {
+                /* Reversed range: drop the lower bound and the '-', and resume
+                 * scanning at the upper bound (unchanged behaviour). */
+                t->p = next;
+                return TR_READ_ANOTHER;
+        }
+        t->inside_range = true;
+        t->max = max;
+        /* Consume both the '-' and the upper bound. */
+        char *after = utf_find_next(next, t->p_end);
+        t->p = (after != NULL) ? after : t->p_end;
+        return TR_FOUND;
+}
+/* Advance T to the next character described by the specification.
+ *
+ * Returns TR_FOUND with the character in T->now, or TR_FINISHED once the
+ * specification is exhausted; TR_READ_ANOTHER is handled here and never
+ * returned.  While a range is being expanded, successive calls step T->now up
+ * to and including T->max.
+ *
+ * The last character of a range is always reported as exactly T->max.
+ * Without the clamp a one-character range such as "a-a" incremented T->now
+ * past T->max and reported the character after the upper bound. */
+enum tr_state
+tr_next(struct tr *t)
+{
+        for (;;) {
+                if (!t->inside_range) {
+                        enum tr_state state = tr_next_char(t);
+                        if (state != TR_READ_ANOTHER)
+                                return state;
+                } else {
+                        if (++t->now >= t->max) {
+                                t->inside_range = false;
+                                t->now = t->max;
+                        }
+                        return TR_FOUND;
+                }
+        }
+}
+/* Store the low bit of VALUE as the bit for code point C in the bitset TABLE,
+ * which keeps WORD_BIT bits in each element. */
+static void
+tr_table_set(unsigned int *table, unichar c, unsigned int value)
+{
+        unsigned int word = c / WORD_BIT;
+        unsigned int mask = 1U << (c % WORD_BIT);
+        if (value & 1U)
+                table[word] |= mask;
+        else
+                table[word] &= ~mask;
+}
+/* Intersect TABLE with the character set described by the specification STR.
+ *
+ * A temporary bitset is built from STR: the listed characters are set in an
+ * otherwise empty set or, when STR starts with a negating '^', cleared in an
+ * otherwise full set.  TABLE is then and-ed with that set word by word.  When
+ * INITIALIZE is true TABLE is first filled with ones, so the result is just
+ * the set described by STR.  TABLE must hold TR_TABLE_SIZE elements. */
+void
+tr_setup_table(VALUE str, unsigned int *table, bool initialize)
+{
+        struct tr tr;
+        unsigned int set[TR_TABLE_SIZE];
+        char *spec = RSTRING(str)->ptr;
+        tr_init(&tr, spec, spec + RSTRING(str)->len);
+        bool negate = tr_should_exclude(&tr);
+        if (initialize) {
+                for (int i = 0; i < TR_TABLE_SIZE; i++)
+                        table[i] = ~0U;
+        }
+        /* Background of the temporary set, and the bit written for every
+         * character that the specification mentions. */
+        unsigned int background = 0U;
+        unsigned int member_bit = 1U;
+        if (negate) {
+                background = ~0U;
+                member_bit = 0U;
+        }
+        for (int i = 0; i < TR_TABLE_SIZE; i++)
+                set[i] = background;
+        while (tr_next(&tr) != TR_FINISHED)
+                tr_table_set(set, tr.now, member_bit);
+        for (int i = 0; i < TR_TABLE_SIZE; i++)
+                table[i] &= set[i];
+}
+/* Build TABLE as the intersection of the character sets described by the
+ * ARGC specifications in ARGV.  Each argument is converted with StringValue()
+ * first; only the first one initializes TABLE.  With ARGC == 0 TABLE is left
+ * untouched. */
+void
+tr_setup_table_from_strings(unsigned int *table, int argc, VALUE *argv)
+{
+        for (int i = 0; i < argc; i++) {
+                VALUE spec = argv[i];
+                StringValue(spec);
+                tr_setup_table(spec, table, i == 0);
+        }
+}

data/ext/encoding/character/utf-8/rb_utf_internal_tr.h ADDED Viewed

@@ -0,0 +1,41 @@
+/*
+ * contents: Translation (#tr) related functions
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#ifndef TR_H
+#define TR_H
+/* Number of bits in an int, unless WORD_BIT has already been defined. */
+#ifndef WORD_BIT
+#  define WORD_BIT              (sizeof(int) * CHAR_BIT)
+#endif
+/* Number of unsigned int elements in a translation bitset: one bit for each
+ * of the UNICODE_N_CODEPOINTS code points (integer division, so this assumes
+ * UNICODE_N_CODEPOINTS is a multiple of WORD_BIT -- TODO confirm). */
+#define TR_TABLE_SIZE           ((int)(UNICODE_N_CODEPOINTS / WORD_BIT))
+/* Non-zero if the bit for code point OFFSET is set in TABLE.  The value is
+ * the masked word, not necessarily 1.  OFFSET is evaluated twice. */
+#define tr_table_lookup(table, offset)       \
+        ((table)[(offset) / WORD_BIT] & (1U << (offset) % WORD_BIT))
+/* Scanner state used while walking a #tr-style character specification. */
+struct tr {
+        /* True while a range such as "a-z" is being expanded by tr_next(). */
+        bool inside_range;
+        /* Character most recently produced by the scanner. */
+        unichar now;
+        /* Upper bound of the range currently being expanded. */
+        unichar max;
+        /* Current read position in the specification. */
+        char *p;
+        /* End of the specification; scanning stops when p reaches it. */
+        char *p_end;
+};
+/* Result of one scanning step over a translation specification. */
+enum tr_state
+{
+        /* A character is available in the scanner's `now' field. */
+        TR_FOUND,
+        /* Nothing was produced by this step; scan again.  Used between
+         * tr_next_char() and tr_next(); tr_next() does not return it. */
+        TR_READ_ANOTHER,
+        /* The specification has been read completely. */
+        TR_FINISHED
+};
+/* Set TR up to scan the specification in the bytes from P up to P_END. */
+void tr_init(struct tr *tr, char *p, char *p_end) HIDDEN;
+/* True, consuming the caret, if the specification starts with a '^' that is
+ * followed by at least one more byte. */
+bool tr_should_exclude(struct tr *tr) HIDDEN;
+/* Step to the next character of the specification, expanding ranges; the
+ * character is left in T->now.  Returns TR_FOUND or TR_FINISHED. */
+enum tr_state tr_next(struct tr *t) HIDDEN;
+/* And TABLE with the set described by STR, first filling TABLE with ones if
+ * INITIALIZE is true. */
+void tr_setup_table(VALUE str, unsigned int *table, bool initialize) HIDDEN;
+/* Build TABLE from the ARGC specifications in ARGV, intersecting them. */
+void tr_setup_table_from_strings(unsigned int *table, int argc,
+                                 VALUE *argv) HIDDEN;
+#endif /* TR_H */

data/ext/encoding/character/utf-8/rb_utf_justify.c ADDED Viewed

@@ -0,0 +1,96 @@
+/*
+ * contents: UTF8.center, UTF8.ljust, and UTF8.rjust.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+/* Write N characters of padding to P and return the position just past the
+ * bytes written.
+ *
+ * F is the padding string, F_LEN its length in characters and F_SIZE its size
+ * in bytes.  Whole copies of F are written for as long as strictly fewer than
+ * N characters would have been produced; the remaining characters are then
+ * copied one at a time from the start of F.  Nothing is written when N <= 0. */
+static char *
+rb_utf_justify_one_side(char *p, const char *f, long f_len, long f_size, long n)
+{
+        long written = 0;
+        while (written + f_len < n) {
+                memcpy(p, f, f_size);
+                p += f_size;
+                written += f_len;
+        }
+        for (const char *q = f; written < n; written++) {
+                const char *q_next = utf_next(q);
+                long size = q_next - q;
+                memcpy(p, q, size);
+                p += size;
+                q = q_next;
+        }
+        return p;
+}
+/* Shared implementation of UTF8.center, UTF8.ljust and UTF8.rjust.
+ *
+ * ARGV holds the string, the target width in characters and an optional
+ * padding string (a single space by default).  JFLAG selects where the
+ * padding goes: 'l' pads on the right, 'r' pads on the left and 'c' puts half
+ * of the padding (rounded down) on the left and the rest on the right.
+ *
+ * Returns a copy of the string when the width is negative or the string is
+ * already at least that many characters long.  Raises ArgumentError for an
+ * empty padding string.
+ *
+ * Two defects of the earlier version are fixed here:
+ *  - the string argument is now converted with StringValue() before RSTRING()
+ *    is applied to it, as the other module functions do;
+ *  - the result is allocated with a worst-case byte size (every padding
+ *    character costing the byte size of the whole padding string), so its
+ *    length is now set to the number of bytes actually written.  Before, a
+ *    padding string of more than one character left uninitialized bytes at
+ *    the end of the result. */
+static VALUE
+rb_utf_justify(int argc, VALUE *argv, char jflag)
+{
+        VALUE str, w, pad;
+        const char *f = " ";
+        long f_len = 1;
+        long f_size = 1;
+        bool infect_from_pad = false;
+        int n_args = rb_scan_args(argc, argv, "21", &str, &w, &pad);
+        StringValue(str);
+        if (n_args == 3) {
+                StringValue(pad);
+                f = RSTRING(pad)->ptr;
+                f_len = utf_length_n(f, RSTRING(pad)->len);
+                if (f_len == 0)
+                        rb_raise(rb_eArgError, "zero width padding");
+                f_size = RSTRING(pad)->len;
+                infect_from_pad = true;
+        }
+        long len = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
+        long width = NUM2LONG(w);
+        if (width < 0 || len >= width)
+                return rb_utf_dup(str);
+        /* Upper bound on the size: no padding character can take more bytes
+         * than the whole padding string. */
+        VALUE res = rb_utf_new5(str, 0, RSTRING(str)->len + (width - len) * f_size);
+        char *p = RSTRING(res)->ptr;
+        long n_remaining = width - len;
+        if (jflag != 'l') {
+                long n = n_remaining;
+                if (jflag == 'c')
+                        n /= 2;
+                n_remaining -= n;
+                p = rb_utf_justify_one_side(p, f, f_len, f_size, n);
+        }
+        memcpy(p, RSTRING(str)->ptr, RSTRING(str)->len);
+        p += RSTRING(str)->len;
+        if (jflag != 'r')
+                p = rb_utf_justify_one_side(p, f, f_len, f_size, n_remaining);
+        /* Shrink the result to what was written.  This assumes the buffer has
+         * room for a terminating NUL after its allocated length, as Ruby
+         * string buffers do. */
+        RSTRING(res)->len = p - RSTRING(res)->ptr;
+        RSTRING(res)->ptr[RSTRING(res)->len] = '\0';
+        OBJ_INFECT(res, str);
+        if (infect_from_pad)
+                OBJ_INFECT(res, pad);
+        return res;
+}
+/* UTF8.center: takes a string, a width and an optional padding string and
+ * delegates to rb_utf_justify() with the 'c' flag, which splits the padding
+ * between both sides of the string. */
+VALUE
+rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self))
+{
+        return rb_utf_justify(argc, argv, 'c');
+}
+/* UTF8.ljust: takes a string, a width and an optional padding string and
+ * delegates to rb_utf_justify() with the 'l' flag, which puts all padding
+ * after the string. */
+VALUE
+rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self))
+{
+        return rb_utf_justify(argc, argv, 'l');
+}
+/* UTF8.rjust: takes a string, a width and an optional padding string and
+ * delegates to rb_utf_justify() with the 'r' flag, which puts all padding
+ * before the string. */
+VALUE
+rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self))
+{
+        return rb_utf_justify(argc, argv, 'r');
+}

data/ext/encoding/character/utf-8/rb_utf_length.c ADDED Viewed

@@ -0,0 +1,14 @@
+/*
+ * contents: UTF8.length module function.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+/* UTF8.length: convert STR with StringValue() and return its length as
+ * reported by utf_length_n() over all of its bytes. */
+VALUE
+rb_utf_length(UNUSED(VALUE self), VALUE str)
+{
+        StringValue(str);
+        const char *bytes = RSTRING(str)->ptr;
+        long n_bytes = RSTRING(str)->len;
+        return UINT2NUM(utf_length_n(bytes, n_bytes));
+}

data/ext/encoding/character/utf-8/rb_utf_lstrip.c ADDED Viewed

@@ -0,0 +1,41 @@
+/*
+ * contents: UTF8.lstrip module function.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+/* UTF8.lstrip!: remove leading whitespace (as determined by
+ * unichar_isspace()) from STR in place.
+ *
+ * Returns STR if anything was removed and nil if STR is empty or does not
+ * start with whitespace.
+ *
+ * The amount to strip is carried across rb_str_modify() as byte counts
+ * instead of raw pointers, because rb_str_modify() may give STR a different
+ * buffer (for example when it shares its buffer with another string), which
+ * would leave pointers into the old buffer stale. */
+VALUE
+rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str)
+{
+        StringValue(str);
+        char *begin = RSTRING(str)->ptr;
+        if (begin == NULL || RSTRING(str)->len == 0)
+                return Qnil;
+        char *end = begin + RSTRING(str)->len;
+        /* Find the first character that is not whitespace. */
+        char *s = begin;
+        while (s < end && unichar_isspace(_utf_char_validated(s, end)))
+                s = utf_next(s);
+        /* Nothing to strip. */
+        if (s == begin)
+                return Qnil;
+        long skipped = s - begin;
+        long remaining = end - s;
+        rb_str_modify(str);
+        memmove(RSTRING(str)->ptr, RSTRING(str)->ptr + skipped, remaining);
+        RSTRING(str)->len = remaining;
+        RSTRING(str)->ptr[remaining] = '\0';
+        return str;
+}
+/* UTF8.lstrip: return a duplicate of STR with leading whitespace removed.
+ * The duplicate is returned even when there was nothing to strip. */
+VALUE
+rb_utf_lstrip(VALUE self, VALUE str)
+{
+        VALUE stripped = rb_utf_dup(str);
+        rb_utf_lstrip_bang(self, stripped);
+        return stripped;
+}

data/ext/encoding/character/utf-8/rb_utf_normalize.c ADDED Viewed

@@ -0,0 +1,51 @@
+/*
+ * contents: Encoding::Character::UTF8.normalize module function.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+/* Map a Ruby symbol to the normalization mode it names.
+ *
+ * Raises TypeError if SYMBOL is not a symbol and ArgumentError if it names no
+ * known mode.  The ID of each name is interned the first time it is needed
+ * and cached for later calls; names are tried in table order. */
+static NormalizeMode
+symbol_to_mode(VALUE symbol)
+{
+        static const struct {
+                const char *name;
+                NormalizeMode mode;
+        } modes[] = {
+                { "default", NORMALIZE_DEFAULT },
+                { "nfd", NORMALIZE_NFD },
+                { "default_compose", NORMALIZE_DEFAULT_COMPOSE },
+                { "nfc", NORMALIZE_NFC },
+                { "all", NORMALIZE_ALL },
+                { "nfkd", NORMALIZE_NFKD },
+                { "all_compose", NORMALIZE_ALL_COMPOSE },
+                { "nfkc", NORMALIZE_NFKC },
+        };
+        enum { N_MODES = sizeof(modes) / sizeof(modes[0]) };
+        static ID ids[N_MODES];
+        if (!SYMBOL_P(symbol))
+                rb_raise(rb_eTypeError, "not a symbol");
+        ID id = SYM2ID(symbol);
+        for (int i = 0; i < N_MODES; i++) {
+                if (ids[i] == 0)
+                        ids[i] = rb_intern(modes[i].name);
+                if (id == ids[i])
+                        return modes[i].mode;
+        }
+        rb_raise(rb_eArgError, "unknown symbol");
+}
+/* UTF8.normalize: takes a string and an optional mode symbol.
+ *
+ * The mode is NORMALIZE_DEFAULT unless a second argument is given, in which
+ * case it is looked up with symbol_to_mode() (which raises for anything it
+ * does not recognize).  The string is converted with StringValue() and the
+ * result of utf_normalize_n() over all of its bytes is wrapped with
+ * rb_utf_alloc_using(). */
+VALUE
+rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self))
+{
+        VALUE str, rbmode;
+        int n_args = rb_scan_args(argc, argv, "11", &str, &rbmode);
+        NormalizeMode mode = (n_args == 2) ?
+                symbol_to_mode(rbmode) : NORMALIZE_DEFAULT;
+        StringValue(str);
+        char *bytes = RSTRING(str)->ptr;
+        long n_bytes = RSTRING(str)->len;
+        return rb_utf_alloc_using(utf_normalize_n(bytes, mode, n_bytes));
+}

data/ext/encoding/character/utf-8/rb_utf_oct.c ADDED Viewed

@@ -0,0 +1,14 @@
+/*
+ * contents: UTF8.oct module function.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+#include "rb_utf_internal_bignum.h"
+/* UTF8.oct: convert STR to a number by calling rb_utf_to_inum() with base -8
+ * and verification turned off.
+ * NOTE(review): presumably the negative base follows the convention of Ruby's
+ * rb_str_to_inum(), where it means "octal unless the string carries its own
+ * base prefix" -- confirm against rb_utf_internal_bignum.c. */
+VALUE
+rb_utf_oct(UNUSED(VALUE self), VALUE str)
+{
+        return rb_utf_to_inum(str, -8, false);
+}

data/ext/encoding/character/utf-8/rb_utf_reverse.c ADDED Viewed

@@ -0,0 +1,13 @@
+/*
+ * contents: UTF8.reverse module function.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+/* UTF8.reverse: pass STR's bytes to utf_reverse() and wrap the result with
+ * rb_utf_alloc_using().
+ * NOTE(review): only a pointer obtained through StringValuePtr() is passed,
+ * without a length, so utf_reverse() presumably stops at the first NUL byte;
+ * a string with embedded NULs would then be cut short -- confirm. */
+VALUE
+rb_utf_reverse(UNUSED(VALUE self), VALUE str)
+{
+        return rb_utf_alloc_using(utf_reverse(StringValuePtr(str)));
+}

data/ext/encoding/character/utf-8/rb_utf_rindex.c ADDED Viewed

@@ -0,0 +1,88 @@
+/*
+ * contents: UTF8.rindex module function.
+ *
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+ */
+#include "rb_includes.h"
+/* Find the last occurrence of SUB in STR that starts at or before character
+ * offset OFFSET.
+ *
+ * Returns the character offset of the match, or -1 if there is none or SUB
+ * is longer than STR (in bytes).  An empty SUB matches at OFFSET itself.
+ *
+ * The search start is clamped so that the comparison never extends beyond
+ * the end of STR; before, starting at (or near) the end of STR made
+ * rb_memcmp() read past the string's buffer.  The loop also no longer steps
+ * the pointer below the start of the buffer. */
+static long
+rb_utf_rindex(VALUE str, VALUE sub, long offset)
+{
+        long len = RSTRING(sub)->len;
+        if (RSTRING(str)->len < len)
+                return -1;
+        char *s, *end;
+        rb_utf_begin_from_offset_validated(str, offset, &s, &end);
+        char *s_begin = RSTRING(str)->ptr;
+        if (len == 0)
+                return utf_pointer_to_offset(s_begin, s);
+        /* Last position at which SUB still fits inside STR; it is not below
+         * s_begin because STR is at least as long as SUB. */
+        char *last_start = s_begin + RSTRING(str)->len - len;
+        if (s > last_start)
+                s = last_start;
+        char *t = RSTRING(sub)->ptr;
+        /* Walk backwards one byte at a time. */
+        while (s >= s_begin) {
+                if (rb_memcmp(s, t, len) == 0)
+                        return utf_pointer_to_offset(s_begin, s);
+                if (s == s_begin)
+                        break;
+                s--;
+        }
+        return -1;
+}
+/* UTF8.rindex: takes a string, a pattern (a Regexp, or something convertible
+ * to a String) and an optional starting offset.
+ *
+ * Returns the offset of the last match at or before the starting offset, or
+ * nil if there is none.  Raises TypeError if the pattern is neither a Regexp
+ * nor convertible to a String. */
+VALUE
+rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self))
+{
+        VALUE str, sub, rboffset;
+        rb_scan_args(argc, argv, "21", &str, &sub, &rboffset);
+        StringValue(str);
+        /* NOTE(review): the default is the length in bytes, not characters;
+         * for strings with multi-byte characters this presumably relies on the
+         * out-of-range handling below to fall back to the end of the string
+         * -- confirm against rb_utf_begin_from_offset(). */
+        long offset = (argc == 3) ? NUM2LONG(rboffset) : RSTRING(str)->len;
+        char *begin, *end;
+        rb_utf_begin_from_offset(str, offset, &begin, &end);
+        /* A NULL begin is treated as "offset outside of the string". */
+        if (begin == NULL) {
+                if (offset <= 0) {
+                        /* No match is possible; a Regexp search also resets
+                         * the back-reference. */
+                        if (TYPE(sub) == T_REGEXP)
+                                rb_backref_set(Qnil);
+                        return Qnil;
+                }
+                /* Positive out-of-range offset: search from the end. */
+                begin = end;
+                /* TODO: this converting back and forward can be optimized away
+                 * if rb_utf_index_regexp() and rb_utf_rindex() were split up
+                 * into two additional functions, adding
+                 * rb_utf_index_regexp_pointer() and rb_utf_rindex_pointer(),
+                 * so that one can pass a pointer to start at immediately
+                 * instead of an offset that gets calculated into a pointer. */
+                offset = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
+        }
+        switch (TYPE(sub)) {
+        case T_REGEXP:
+                /* For an empty Regexp the offset itself is the answer. */
+                if (RREGEXP(sub)->len > 0)
+                        offset = rb_utf_index_regexp(str, begin, end, sub,
+                                                     offset, true);
+                break;
+        default: {
+                /* Anything else has to be convertible to a String. */
+                VALUE tmp = rb_check_string_type(sub);
+                if (NIL_P(tmp))
+                        rb_raise(rb_eTypeError, "type mismatch: %s given",
+                                 rb_obj_classname(sub));
+                sub = tmp;
+        }
+                /* fall through */
+        case T_STRING:
+                offset = rb_utf_rindex(str, sub, offset);
+                break;
+        }
+        /* A negative offset from either search means "not found". */
+        if (offset < 0)
+                return Qnil;
+        return LONG2NUM(offset);
+}