RubyGems - u - Versions diffs - 0.5.0 → 1.0.0 - Mend

u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (316) hide show

checksums.yaml +7 -0
data/build/ext/u/data/attributes.rb +39 -0
data/build/ext/u/data/bidi-mirroring.rb +27 -0
data/build/ext/u/data/canonical-combining-class.rb +15 -0
data/build/ext/u/data/case-folding.rb +39 -0
data/build/ext/u/data/cased.rb +19 -0
data/build/ext/u/data/compose.rb +304 -0
data/build/ext/u/data/constants.rb +31 -0
data/build/ext/u/data/decompose.rb +85 -0
data/build/ext/u/data/general-category.rb +61 -0
data/build/ext/u/data/grapheme-word-break.rb +15 -0
data/build/ext/u/data/marshalled.rb +5 -0
data/build/ext/u/data/script.rb +91 -0
data/build/ext/u/data/soft-dotted.rb +17 -0
data/build/ext/u/data/title-table.rb +30 -0
data/build/ext/u/data/wide.rb +17 -0
data/build/lib/u/build.rb +8 -0
data/build/lib/u/build/data.rb +16 -0
data/build/lib/u/build/data/bidimirroring.rb +26 -0
data/build/lib/u/build/data/break.rb +14 -0
data/build/lib/u/build/data/casefolding.rb +77 -0
data/build/lib/u/build/data/compositionexclusions.rb +14 -0
data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
data/build/lib/u/build/data/file.rb +88 -0
data/build/lib/u/build/data/linebreak.rb +14 -0
data/build/lib/u/build/data/proplist.rb +18 -0
data/build/lib/u/build/data/scripts.rb +22 -0
data/build/lib/u/build/data/specialcasing.rb +106 -0
data/build/lib/u/build/data/unicode.rb +41 -0
data/build/lib/u/build/data/unicode/entry.rb +27 -0
data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
data/build/lib/u/build/data/unicode/points.rb +32 -0
data/build/lib/u/build/header.rb +11 -0
data/build/lib/u/build/header/table.rb +19 -0
data/build/lib/u/build/header/table/row.rb +64 -0
data/build/lib/u/build/header/tables.rb +6 -0
data/build/lib/u/build/header/tables/intervals.rb +50 -0
data/build/lib/u/build/header/tables/split.rb +20 -0
data/build/lib/u/build/header/tables/split/data.rb +29 -0
data/build/lib/u/build/header/tables/split/part1.rb +28 -0
data/build/lib/u/build/header/tables/split/part2.rb +13 -0
data/build/lib/u/build/header/tables/split/row.rb +34 -0
data/build/lib/u/build/header/tables/split/rows.rb +22 -0
data/build/test/unit/break.rb +45 -0
data/build/test/unit/case.rb +178 -0
data/build/test/unit/foldcase.rb +44 -0
data/build/test/unit/normalize.rb +81 -0
data/ext/u/attributes.c +62 -0
data/ext/u/attributes.h +5 -0
data/ext/u/case.h +41 -0
data/ext/u/data/attributes.h +3070 -0
data/ext/u/data/bidi-mirroring.h +373 -0
data/ext/u/data/canonical-combining-class.h +2157 -0
data/ext/u/data/case-folding.h +171 -0
data/ext/u/data/cased.h +42 -0
data/ext/u/data/compose.h +1714 -0
data/ext/u/data/constants.h +17 -0
data/ext/u/data/decompose.h +9356 -0
data/ext/u/data/general-category.h +28959 -0
data/ext/u/data/grapheme-break.h +13201 -0
data/ext/u/data/line-break.h +26501 -0
data/ext/u/data/normalization-quick-check.h +3002 -0
data/ext/u/data/script.h +2928 -0
data/ext/u/data/soft-dotted.h +55 -0
data/ext/u/data/title-table.h +41 -0
data/ext/u/data/types.h +11117 -0
data/ext/u/data/wide-cjk.h +197 -0
data/ext/u/data/wide.h +59 -0
data/ext/u/data/word-break.h +10001 -0
data/ext/u/depend +281 -0
data/ext/u/extconf.rb +158 -0
data/ext/u/output.h +51 -0
data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
data/ext/u/private.h +58 -0
data/ext/u/rb_includes.h +10 -0
data/ext/u/rb_private.c +98 -0
data/ext/u/rb_private.h +67 -0
data/ext/u/rb_u.c +251 -0
data/ext/u/rb_u_buffer.c +443 -0
data/ext/u/rb_u_buffer.h +24 -0
data/ext/u/rb_u_re.c +43 -0
data/ext/u/rb_u_re.h +15 -0
data/ext/u/rb_u_string.c +478 -0
data/ext/u/rb_u_string.h +173 -0
data/ext/u/rb_u_string_alnum.c +10 -0
data/ext/u/rb_u_string_alpha.c +10 -0
data/ext/u/rb_u_string_aref.c +142 -0
data/ext/u/rb_u_string_ascii_only.c +13 -0
data/ext/u/rb_u_string_assigned.c +10 -0
data/ext/u/rb_u_string_b.c +18 -0
data/ext/u/rb_u_string_bytesize.c +10 -0
data/ext/u/rb_u_string_byteslice.c +103 -0
data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
data/ext/u/rb_u_string_case_ignorable.c +25 -0
data/ext/u/rb_u_string_casecmp.c +61 -0
data/ext/u/rb_u_string_cased.c +17 -0
data/ext/u/rb_u_string_chomp.c +107 -0
data/ext/u/rb_u_string_chop.c +33 -0
data/ext/u/rb_u_string_chr.c +9 -0
data/ext/u/rb_u_string_cntrl.c +10 -0
data/ext/u/rb_u_string_collate.c +46 -0
data/ext/u/rb_u_string_collation_key.c +18 -0
data/ext/u/rb_u_string_count.c +38 -0
data/ext/u/rb_u_string_defined.c +10 -0
data/ext/u/rb_u_string_delete.c +62 -0
data/ext/u/rb_u_string_digit.c +10 -0
data/ext/u/rb_u_string_downcase.c +13 -0
data/ext/u/rb_u_string_dump.c +153 -0
data/ext/u/rb_u_string_each_byte.c +46 -0
data/ext/u/rb_u_string_each_char.c +49 -0
data/ext/u/rb_u_string_each_codepoint.c +45 -0
data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
data/ext/u/rb_u_string_each_line.c +142 -0
data/ext/u/rb_u_string_each_word.c +34 -0
data/ext/u/rb_u_string_empty.c +11 -0
data/ext/u/rb_u_string_end_with.c +31 -0
data/ext/u/rb_u_string_eql.c +30 -0
data/ext/u/rb_u_string_equal.c +33 -0
data/ext/u/rb_u_string_foldcase.c +12 -0
data/ext/u/rb_u_string_folded.c +13 -0
data/ext/u/rb_u_string_format.c +1745 -0
data/ext/u/rb_u_string_general_category.c +109 -0
data/ext/u/rb_u_string_getbyte.c +21 -0
data/ext/u/rb_u_string_graph.c +21 -0
data/ext/u/rb_u_string_grapheme_break.c +61 -0
data/ext/u/rb_u_string_gsub.c +164 -0
data/ext/u/rb_u_string_hash.c +10 -0
data/ext/u/rb_u_string_hex.c +9 -0
data/ext/u/rb_u_string_include.c +10 -0
data/ext/u/rb_u_string_index.c +110 -0
data/ext/u/rb_u_string_inspect.c +189 -0
data/ext/u/rb_u_string_internal_tr.c +148 -0
data/ext/u/rb_u_string_internal_tr.h +29 -0
data/ext/u/rb_u_string_justify.c +169 -0
data/ext/u/rb_u_string_length.c +10 -0
data/ext/u/rb_u_string_line_break.c +115 -0
data/ext/u/rb_u_string_lower.c +13 -0
data/ext/u/rb_u_string_lstrip.c +24 -0
data/ext/u/rb_u_string_match.c +65 -0
data/ext/u/rb_u_string_mirror.c +16 -0
data/ext/u/rb_u_string_newline.c +21 -0
data/ext/u/rb_u_string_normalize.c +70 -0
data/ext/u/rb_u_string_normalized.c +28 -0
data/ext/u/rb_u_string_oct.c +11 -0
data/ext/u/rb_u_string_ord.c +14 -0
data/ext/u/rb_u_string_partition.c +80 -0
data/ext/u/rb_u_string_plus.c +33 -0
data/ext/u/rb_u_string_print.c +10 -0
data/ext/u/rb_u_string_punct.c +10 -0
data/ext/u/rb_u_string_reverse.c +13 -0
data/ext/u/rb_u_string_rindex.c +104 -0
data/ext/u/rb_u_string_rpartition.c +81 -0
data/ext/u/rb_u_string_rstrip.c +29 -0
data/ext/u/rb_u_string_scan.c +109 -0
data/ext/u/rb_u_string_script.c +253 -0
data/ext/u/rb_u_string_soft_dotted.c +13 -0
data/ext/u/rb_u_string_space.c +24 -0
data/ext/u/rb_u_string_split.c +245 -0
data/ext/u/rb_u_string_squeeze.c +75 -0
data/ext/u/rb_u_string_start_with.c +31 -0
data/ext/u/rb_u_string_strip.c +36 -0
data/ext/u/rb_u_string_sub.c +147 -0
data/ext/u/rb_u_string_times.c +35 -0
data/ext/u/rb_u_string_title.c +10 -0
data/ext/u/rb_u_string_titlecase.c +13 -0
data/ext/u/rb_u_string_to_i.c +45 -0
data/ext/u/rb_u_string_to_inum.c +364 -0
data/ext/u/rb_u_string_to_inum.h +1 -0
data/ext/u/rb_u_string_to_str.c +17 -0
data/ext/u/rb_u_string_to_sym.c +12 -0
data/ext/u/rb_u_string_tr.c +290 -0
data/ext/u/rb_u_string_upcase.c +12 -0
data/ext/u/rb_u_string_upper.c +13 -0
data/ext/u/rb_u_string_valid.c +10 -0
data/ext/u/rb_u_string_valid_encoding.c +12 -0
data/ext/u/rb_u_string_wide.c +21 -0
data/ext/u/rb_u_string_wide_cjk.c +21 -0
data/ext/u/rb_u_string_width.c +19 -0
data/ext/u/rb_u_string_word_break.c +63 -0
data/ext/u/rb_u_string_xdigit.c +22 -0
data/ext/u/rb_u_string_zero_width.c +16 -0
data/ext/u/titled.c +55 -0
data/ext/u/titled.h +1 -0
data/ext/u/u.c +23 -0
data/ext/u/u.h +458 -0
data/ext/u/u_char_canonical_combining_class.c +31 -0
data/ext/u/u_char_digit_value.c +21 -0
data/ext/u/u_char_downcase.c +27 -0
data/ext/u/u_char_general_category.c +31 -0
data/ext/u/u_char_grapheme_break.c +28 -0
data/ext/u/u_char_isalnum.c +24 -0
data/ext/u/u_char_isalpha.c +21 -0
data/ext/u/u_char_isassigned.c +16 -0
data/ext/u/u_char_iscased.c +22 -0
data/ext/u/u_char_iscaseignorable.c +29 -0
data/ext/u/u_char_iscntrl.c +17 -0
data/ext/u/u_char_isdefined.c +15 -0
data/ext/u/u_char_isdigit.c +16 -0
data/ext/u/u_char_isgraph.c +22 -0
data/ext/u/u_char_islower.c +16 -0
data/ext/u/u_char_isnewline.c +24 -0
data/ext/u/u_char_isprint.c +21 -0
data/ext/u/u_char_ispunct.c +27 -0
data/ext/u/u_char_issoftdotted.c +18 -0
data/ext/u/u_char_isspace.c +28 -0
data/ext/u/u_char_isupper.c +16 -0
data/ext/u/u_char_isvalid.c +18 -0
data/ext/u/u_char_iswide.c +18 -0
data/ext/u/u_char_iswide_cjk.c +22 -0
data/ext/u/u_char_isxdigit.c +27 -0
data/ext/u/u_char_iszerowidth.c +29 -0
data/ext/u/u_char_line_break.c +29 -0
data/ext/u/u_char_mirror.c +16 -0
data/ext/u/u_char_normalized.c +23 -0
data/ext/u/u_char_script.c +41 -0
data/ext/u/u_char_to_u.c +48 -0
data/ext/u/u_char_upcase.c +24 -0
data/ext/u/u_char_width.c +12 -0
data/ext/u/u_char_word_break.c +28 -0
data/ext/u/u_char_xdigit_value.c +31 -0
data/ext/u/u_collate.c +83 -0
data/ext/u/u_collation_key.c +132 -0
data/ext/u/u_decode.c +156 -0
data/ext/u/u_downcase.c +201 -0
data/ext/u/u_foldcase.c +68 -0
data/ext/u/u_grapheme_clusters.c +57 -0
data/ext/u/u_has_prefix.c +27 -0
data/ext/u/u_index.c +93 -0
data/ext/u/u_is_ascii_only.c +33 -0
data/ext/u/u_locale.c +40 -0
data/ext/u/u_locale.h +14 -0
data/ext/u/u_mirror.c +20 -0
data/ext/u/u_n_bytes.c +16 -0
data/ext/u/u_n_chars.c +43 -0
data/ext/u/u_normalize.c +232 -0
data/ext/u/u_normalized.c +28 -0
data/ext/u/u_offset_to_pointer.c +62 -0
data/ext/u/u_pointer_to_offset.c +23 -0
data/ext/u/u_recode.c +73 -0
data/ext/u/u_reverse.c +21 -0
data/ext/u/u_rindex.c +132 -0
data/ext/u/u_titlecase.c +68 -0
data/ext/u/u_upcase.c +89 -0
data/ext/u/u_width.c +35 -0
data/ext/u/u_words.c +82 -0
data/ext/u/yield.h +27 -0
data/lib/u-1.0.rb +20 -0
data/lib/u-1.0/buffer.rb +10 -0
data/lib/u-1.0/string.rb +9 -0
data/lib/u-1.0/version.rb +287 -0
data/test/unit/case.rb +2080 -0
data/test/unit/foldcase.rb +1136 -0
data/test/unit/graphemebreak.rb +407 -0
data/test/unit/normalize.rb +367545 -0
data/test/unit/u-1.0.rb +10 -0
data/test/unit/u-1.0/buffer.rb +52 -0
data/test/unit/u-1.0/string.rb +1439 -0
data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
data/test/unit/wordbreak.rb +1083 -0
metadata +603 -148
data/README +0 -38
data/Rakefile +0 -64
data/ext/encoding/character/utf-8/break.c +0 -25
data/ext/encoding/character/utf-8/data/break.h +0 -22931
data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
data/ext/encoding/character/utf-8/data/compose.h +0 -1607
data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
data/ext/encoding/character/utf-8/decompose.c +0 -444
data/ext/encoding/character/utf-8/depend +0 -65
data/ext/encoding/character/utf-8/extconf.rb +0 -67
data/ext/encoding/character/utf-8/private.h +0 -51
data/ext/encoding/character/utf-8/properties.c +0 -1056
data/ext/encoding/character/utf-8/rb_includes.h +0 -19
data/ext/encoding/character/utf-8/rb_methods.h +0 -49
data/ext/encoding/character/utf-8/rb_private.h +0 -52
data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
data/ext/encoding/character/utf-8/tables.h +0 -38
data/ext/encoding/character/utf-8/unicode.c +0 -319
data/ext/encoding/character/utf-8/unicode.h +0 -216
data/ext/encoding/character/utf-8/utf.c +0 -1334
data/lib/encoding/character/utf-8.rb +0 -201
data/lib/u.rb +0 -16
data/lib/u/string.rb +0 -185
data/lib/u/version.rb +0 -5
data/test/unit/u/string.rb +0 -91

data/ext/u/rb_u_string_to_inum.h ADDED

	@@ -0,0 +1 @@
1	+ VALUE rb_u_string_to_inum(VALUE str, int base, bool verify);

data/ext/u/rb_u_string_to_str.c ADDED

@@ -0,0 +1,17 @@
+#include "rb_includes.h"
+/* Converts the receiver to a plain Ruby String.
+ *
+ * If the receiver already wraps a Ruby String (its rb member is non-nil), that
+ * very object is returned; otherwise a new String is created from the bytes of
+ * the receiver with rb_u_str_new.  Either way, the result is infected by the
+ * receiver before it is returned.
+ *
+ * @return The String representation of the receiver, inheriting any taint and
+ *   untrust, encoded as UTF-8 */
+VALUE
+rb_u_string_to_str(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        VALUE result;
+        if (NIL_P(string->rb))
+                result = rb_u_str_new(USTRING_STR(string), USTRING_LENGTH(string));
+        else
+                result = string->rb;
+        OBJ_INFECT(result, self);
+        return result;
+}

data/ext/u/rb_u_string_to_sym.c ADDED

@@ -0,0 +1,12 @@
+#include "rb_includes.h"
+/* Converts the receiver to a Symbol by first converting it with StringValue
+ * and then interning the result with rb_str_intern.
+ *
+ * @raise [EncodingError] If the receiver contains an invalid UTF-8 sequence
+ * @raise [RuntimeError] If there’s no more room for a new Symbol in Ruby’s
+ *   Symbol table
+ * @return [Symbol] The Symbol representation of the receiver */
+VALUE
+rb_u_string_to_sym(VALUE self)
+{
+        /* NOTE: Lazy, but MRI makes it hard to implement this method. */
+        return rb_str_intern(StringValue(self));
+}

data/ext/u/rb_u_string_tr.c ADDED

@@ -0,0 +1,290 @@
+#include "rb_includes.h"
+#include "rb_u_string_internal_tr.h"
+#include "output.h"
+/* An inclusive range of code points, begin..end, as filled in by
+ * tr_ranges_setup.  A single character is stored with begin == end. */
+struct tr_range
+{
+        uint32_t begin; /* First code point of the range. */
+        uint32_t end;   /* Last code point of the range (inclusive). */
+};
+/* Collects the characters that TR yields into RANGES.
+ *
+ * TR is advanced with tr_next until it reports TR_FINISHED.  Characters
+ * yielded while TR is inside a range open (or continue) an entry; the first
+ * character yielded outside of a range either closes the open entry or, if
+ * none is open, is stored as an entry of its own.
+ *
+ * No bounds checking is done on RANGES; the caller must provide enough room.
+ *
+ * @return The number of entries stored in RANGES */
+static int
+tr_ranges_setup(struct tr *tr, struct tr_range *ranges)
+{
+        int count = 0;
+        bool range_open = false;
+        while (tr_next(tr) != TR_FINISHED) {
+                if (tr->inside_range) {
+                        /* Only the first character of a range is recorded. */
+                        if (range_open)
+                                continue;
+                        ranges[count].begin = tr->now;
+                        range_open = true;
+                        continue;
+                }
+                if (!range_open)
+                        ranges[count].begin = tr->now;
+                ranges[count].end = tr->now;
+                count++;
+                range_open = false;
+        }
+        return count;
+}
+/* Lookup data handed to tr_trans_replace_include: the ranges of characters to
+ * translate from and the ranges of characters to translate to. */
+struct tr_trans_closure
+{
+        struct tr_range *from; /* Ranges of characters to translate. */
+        int n_from;            /* Number of entries in from. */
+        struct tr_range *to;   /* Ranges of replacement characters. */
+        int n_to;              /* Number of entries in to. */
+};
+/* Replacement callback used when the set of characters to translate is
+ * complemented: every translated character maps to the single code point that
+ * CLOSURE points to; the character being replaced is ignored.
+ *
+ * @return The code point stored at CLOSURE */
+static uint32_t
+tr_trans_replace_exclude(UNUSED(uint32_t c), void *closure)
+{
+        const uint32_t *replacement = closure;
+        return *replacement;
+}
+/* Computes the position of C within the flattened sequence of characters
+ * described by RANGES, given that C lies in RANGES[RANGE].
+ *
+ * @return The sum of the sizes of all entries before RANGE plus the distance
+ *   of C from the beginning of RANGES[RANGE] */
+static int
+tr_trans_replace_include_offset_of(struct tr_range *ranges, int range, uint32_t c)
+{
+        int offset = 0;
+        int i = 0;
+        while (i < range) {
+                const struct tr_range *r = &ranges[i++];
+                offset += r->end - r->begin + 1;
+        }
+        offset += c - ranges[range].begin;
+        return offset;
+}
+/* Finds the entry among the from-ranges of CLOSURE that contains C, searching
+ * from the last entry towards the first, so that later entries win over
+ * earlier ones.
+ *
+ * @return The index of the matching entry, or -1 if no entry contains C */
+static int
+tr_trans_replace_include_find_from_range(struct tr_trans_closure *closure, uint32_t c)
+{
+        int i = closure->n_from;
+        while (i-- > 0) {
+                const struct tr_range *r = &closure->from[i];
+                if (c < r->begin || r->end < c)
+                        continue;
+                return i;
+        }
+        return -1;
+}
+/* Finds the character at position OFFSET in the flattened sequence of
+ * characters described by the to-ranges of CLOSURE.
+ *
+ * @return The character at OFFSET, or the end of the last to-range if OFFSET
+ *   lies beyond the characters that the to-ranges describe */
+static uint32_t
+tr_trans_replace_include_find_to_u_char(struct tr_trans_closure *closure, int offset)
+{
+        int seen = 0;
+        for (int i = 0; i < closure->n_to; i++) {
+                const struct tr_range *r = &closure->to[i];
+                int size = r->end - r->begin + 1;
+                int next = seen + size;
+                if (next > offset)
+                        return r->begin + (offset - seen);
+                seen = next;
+        }
+        return closure->to[closure->n_to - 1].end;
+}
+/* Replacement callback used when the set of characters to translate is not
+ * complemented.  V_CLOSURE must point to a struct tr_trans_closure.
+ *
+ * C is located among the from-ranges, its position in the flattened from-set
+ * is computed, and the character at the same position in the to-set is
+ * returned.
+ *
+ * @return The replacement for C, or the end of the last to-range if C isn’t
+ *   found in any from-range */
+static uint32_t
+tr_trans_replace_include(uint32_t c, void *v_closure)
+{
+        struct tr_trans_closure *closure = v_closure;
+        int from = tr_trans_replace_include_find_from_range(closure, c);
+        if (from != -1)
+                return tr_trans_replace_include_find_to_u_char(
+                        closure,
+                        tr_trans_replace_include_offset_of(closure->from, from, c));
+        return closure->to[closure->n_to - 1].end;
+}
+/* Translates the characters in [STR, END) into OUTPUT, emitting only the
+ * first of any run of translated characters that map to the same replacement.
+ *
+ * Characters found in TRANSLATION are replaced by REPLACE(c, CLOSURE); all
+ * other characters are copied as is and end the current run.  *MODIFIED is
+ * set to true if a replacement differs from the character it replaces or if
+ * OUTPUT grew by less than the length of the input; it is never set to false
+ * here. */
+static void
+tr_trans_real_squeeze(const char *str, const char *end,
+                      struct tr_table *translation,
+                      uint32_t replace(uint32_t, void *), void *closure,
+                      struct output *output, bool *modified)
+{
+        /* Remember where OUTPUT stood so that growth can be measured below. */
+        size_t n = output->n;
+        const char *p = str;
+        /* NOTE(review): U_N_CODEPOINTS is used as a “no previous replacement”
+         * marker; presumably no decoded character can equal it – confirm. */
+        uint32_t prev_c = U_N_CODEPOINTS;
+        while (p < end) {
+                const char *prev = p;
+                uint32_t c0 = u_decode(&p, p, end);
+                if (tr_table_lookup(translation, c0)) {
+                        uint32_t c = replace(c0, closure);
+                        /* Same replacement as last time: squeeze it away. */
+                        if (prev_c == c)
+                                continue;
+                        prev_c = c;
+                        output_char(output, c);
+                        if (c != c0)
+                                *modified = true;
+                } else {
+                        output_string(output, prev, p - prev);
+                        prev_c = U_N_CODEPOINTS;
+                }
+        }
+        /* Less was output than was input, so something was squeezed, even if
+         * no replacement differed from its original.  (Assumes output->n
+         * counts bytes – TODO confirm against output.h.) */
+        if ((size_t)(end - str) > (output->n - n))
+                *modified = true;
+}
+/* Translates the characters in [STR, END) into OUTPUT without squeezing.
+ *
+ * Characters found in TRANSLATION are replaced by REPLACE(c, CLOSURE); all
+ * other characters are copied as is.  *MODIFIED is set to true if a
+ * replacement differs from the character it replaces; it is never set to
+ * false here. */
+static void
+tr_trans_real_standard(const char *str, const char *end,
+                       struct tr_table *translation,
+                       uint32_t replace(uint32_t, void *), void *closure,
+                       struct output *output, bool *modified)
+{
+        const char *p = str;
+        while (p < end) {
+                /* Keep the start of the character so that its original bytes
+                 * can be copied if it isn’t translated. */
+                const char *prev = p;
+                uint32_t c = u_decode(&p, p, end);
+                if (tr_table_lookup(translation, c)) {
+                        uint32_t replacement = replace(c, closure);
+                        output_char(output, replacement);
+                        if (replacement != c)
+                                *modified = true;
+                } else
+                        output_string(output, prev, p - prev);
+        }
+}
+/* Translates the characters in [STR, END) into OUTPUT, dispatching to
+ * tr_trans_real_squeeze if SQUEEZE is true and to tr_trans_real_standard
+ * otherwise.  All other arguments are passed on unchanged. */
+static void
+tr_trans_real(const char *str, const char *end,
+              struct tr_table *translation,
+              uint32_t replace(uint32_t, void *), void *closure, bool squeeze,
+              struct output *output, bool *modified)
+{
+        if (!squeeze) {
+                tr_trans_real_standard(str, end, translation, replace, closure,
+                                       output, modified);
+                return;
+        }
+        tr_trans_real_squeeze(str, end, translation, replace, closure,
+                              output, modified);
+}
+/* Runs the translation described by TRANSLATION, REPLACE and CLOSURE over the
+ * content of SELF, squeezing if SQUEEZE is true.
+ *
+ * The translation is run twice: once against an output without a buffer to
+ * find out whether anything changes and how much room is needed, and then
+ * again against a buffer of that size.
+ *
+ * @return SELF if the translation doesn’t modify anything, otherwise a new
+ *   string created from the translated content with rb_u_string_new_c_own */
+static VALUE
+tr_trans_do(VALUE self, struct tr_table *translation,
+            uint32_t (*replace)(uint32_t, void *), void *closure, bool squeeze)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const char *begin = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        bool modified = false;
+        /* NOTE(review): with a NULL buffer the output presumably only counts
+         * what would have been written – confirm against output.h. */
+        struct output output = OUTPUT_INIT(NULL, 0);
+        tr_trans_real(begin, end,
+                      translation,
+                      replace, closure, squeeze,
+                      &output, &modified);
+        if (!modified)
+                return self;
+        /* One extra byte beyond the counted size; presumably for the
+         * terminator that output_finalize adds – TODO confirm. */
+        output.result = ALLOC_N(char, output.n + 1);
+        output.m = output.n + 1;
+        output.n = 0;
+        tr_trans_real(begin, end,
+                      translation,
+                      replace, closure, squeeze,
+                      &output, &modified);
+        output_finalize(&output);
+        /* NOTE(review): judging by its name, rb_u_string_new_c_own takes over
+         * ownership of output.result – confirm. */
+        return rb_u_string_new_c_own(self, output.result, output.n);
+}
+/* Implements #tr and #tr_s: translates the characters of SELF that are in the
+ * set described by RBFROM to the corresponding characters of the set
+ * described by RBTO, squeezing runs of identical replacements if SQUEEZE is
+ * true.
+ *
+ * Special cases, checked in this order:
+ *
+ * * An empty receiver is returned as is.
+ * * An empty RBTO turns the operation into rb_u_string_delete of RBFROM.
+ * * An empty RBFROM selects no characters, so the receiver is returned as is.
+ * * A complemented RBFROM maps every selected character to the last character
+ *   of RBTO.
+ *
+ * @return SELF if nothing was translated, otherwise the translated string */
+static VALUE
+tr_trans(VALUE self, VALUE rbfrom, VALUE rbto, bool squeeze)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const struct rb_u_string *from = RVAL2USTRING_ANY(rbfrom);
+        const struct rb_u_string *to = RVAL2USTRING_ANY(rbto);
+        if (USTRING_STR(string) == NULL || USTRING_LENGTH(string) == 0)
+                return self;
+        if (USTRING_LENGTH(to) == 0)
+                return rb_u_string_delete(1, &rbfrom, self);
+        /* An empty set selects nothing, so there’s nothing to translate.
+         * Returning here also keeps from_ranges below from being declared
+         * with a length of zero, which is undefined behavior for a
+         * variable-length array. */
+        if (USTRING_LENGTH(from) == 0)
+                return self;
+        struct tr tr_from;
+        tr_init(&tr_from, USTRING_STR(from), USTRING_END(from));
+        struct tr tr_to;
+        tr_init(&tr_to, USTRING_STR(to), USTRING_END(to));
+        struct tr_table translation;
+        tr_table_initialize(&translation, rbfrom);
+        if (tr_should_exclude(&tr_from)) {
+                /* This case is easy.  Just include everything by default and
+                 * exclude the rest as always.  Replace characters found by the
+                 * last character found in tr_to. */
+                while (tr_next(&tr_to) != TR_FINISHED)
+                       ; /* We just need the last replacement character. */
+                return tr_trans_do(self, &translation, tr_trans_replace_exclude,
+                                   &tr_to.now, squeeze);
+        }
+        /* This case is hard.  We need a full-fledged lookup of what character
+         * to translate to, not simply a check whether to include it or not.
+         * A set can’t describe more ranges than it has characters, so the
+         * character counts are used as upper bounds for the range arrays. */
+        struct tr_trans_closure trans_closure;
+        struct tr_range from_ranges[u_n_chars_n(USTRING_STR(from), USTRING_LENGTH(from))];
+        trans_closure.from = from_ranges;
+        trans_closure.n_from = tr_ranges_setup(&tr_from, from_ranges);
+        struct tr_range to_ranges[u_n_chars_n(USTRING_STR(to), USTRING_LENGTH(to))];
+        trans_closure.to = to_ranges;
+        trans_closure.n_to = tr_ranges_setup(&tr_to, to_ranges);
+        return tr_trans_do(self, &translation, tr_trans_replace_include,
+                           &trans_closure, squeeze);
+}
+/* @overload tr(from, to)
+ *
+ *   Returns the receiver, translating characters in FROM to their equivalent
+ *   character, by index, in TO, inheriting any taint and untrust.  If
+ *   TO{#length} < FROM{#length}, TO[-1] will be used for any index _i_ >=
+ *   TO{#length}.
+ *
+ *   The complement of all Unicode characters and a given set of characters may
+ *   be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
+ *   ACCENT).
+ *
+ *   Any sequence of characters _a_-_b_ inside a set will expand to also
+ *   include all characters whose code points lie between those of _a_ and _b_.
+ *
+ *   @param [#to_str] from
+ *   @param [#to_str] to
+ *   @return [U::String] */
+VALUE
+rb_u_string_tr(VALUE self, VALUE from, VALUE to)
+{
+        /* Same as #tr_s, but without squeezing. */
+        return tr_trans(self, from, to, false);
+}
+/* @overload tr_s(from, to)
+ *
+ *   Returns the receiver, translating characters in FROM to their equivalent
+ *   character, by index, in TO and then squeezing any substrings of
+ *   {#length} > 1 consisting of the same character _c_ with _c_, inheriting
+ *   any taint and untrust.  If TO{#length} < FROM{#length}, TO[-1] will be
+ *   used for any index _i_ >= TO{#length}.
+ *
+ *   The complement of all Unicode characters and a given set of characters may
+ *   be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
+ *   ACCENT).
+ *
+ *   Any sequence of characters _a_-_b_ inside a set will expand to also
+ *   include all characters whose code points lie between those of _a_ and _b_.
+ *
+ *   @param [#to_str] from
+ *   @param [#to_str] to
+ *   @return [U::String] */
+VALUE
+rb_u_string_tr_s(VALUE self, VALUE from, VALUE to)
+{
+        /* Same as #tr, but with squeezing. */
+        return tr_trans(self, from, to, true);
+}

data/ext/u/rb_u_string_upcase.c ADDED

@@ -0,0 +1,12 @@
+#include "rb_includes.h"
+/* @overload upcase(locale = ENV['LC_CTYPE'])
+ *   @param [#to_str] locale
+ *   @return [U::String] The upcasing of the receiver according to the rules
+ *     of the language of LOCALE, which may be empty to specifically use the
+ *     default, language-independent, rules, inheriting any taint and untrust */
+VALUE
+rb_u_string_upcase(int argc, VALUE *argv, VALUE self)
+{
+        /* Argument handling and conversion are delegated to the shared
+         * locale-conversion helper, with u_upcase doing the actual work. */
+        return _rb_u_string_convert_locale(argc, argv, self, u_upcase, NULL);
+}

data/ext/u/rb_u_string_upper.c ADDED

@@ -0,0 +1,13 @@
+#include "rb_includes.h"
+/* @overload upper?(locale = ENV['LC_CTYPE'])
+ *   @param [#to_str] locale
+ *   @return [Boolean] True if the receiver has been upcased according to the
+ *     rules of the language of LOCALE, which may be empty to specifically use
+ *     the default, language-independent, rules, that is, if _a_ =
+ *     _a_{#upcase}(LOCALE), where _a_ = {#normalize}(`:nfd`) */
+VALUE
+rb_u_string_upper(int argc, VALUE *argv, VALUE self)
+{
+        /* Argument handling and the test itself are delegated to the shared
+         * locale-test helper, with u_upcase as the conversion to test. */
+        return _rb_u_string_test_locale(argc, argv, self, u_upcase);
+}

data/ext/u/rb_u_string_valid.c ADDED

@@ -0,0 +1,10 @@
+#include "rb_includes.h"
+/* @overload valid?
+ *   @return [Boolean] True if the receiver contains only valid Unicode
+ *     characters
+ *   @see #valid_encoding? */
+VALUE
+rb_u_string_valid(VALUE self)
+{
+        /* Delegates to the shared character-test helper, with u_char_isvalid
+         * as the test. */
+        return _rb_u_character_test(self, u_char_isvalid);
+}

data/ext/u/rb_u_string_valid_encoding.c ADDED

@@ -0,0 +1,12 @@
+#include "rb_includes.h"
+/* @overload valid_encoding?
+ *
+ *   Checks the bytes of the receiver with u_valid.
+ *
+ *   @return [Boolean] True if the receiver contains only valid UTF-8
+ *     sequences */
+VALUE
+rb_u_string_valid_encoding(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        if (u_valid(USTRING_STR(string), USTRING_LENGTH(string), NULL))
+                return Qtrue;
+        return Qfalse;
+}

data/ext/u/rb_u_string_wide.c ADDED

@@ -0,0 +1,21 @@
+#include "rb_includes.h"
+/* @overload wide?
+ *
+ *   Returns true if the receiver contains only “wide” characters.  Wide
+ *   characters are those that have their East_Asian_Width property set to
+ *   Wide or Fullwidth.
+ *
+ *   This is mostly useful for determining how many “cells” a character will
+ *   take up on a terminal or similar cell-based display.
+ *
+ *   @return [Boolean]
+ *   @see http://www.unicode.org/reports/tr11/
+ *     Unicode Standard Annex #11: East Asian Width
+ *   @see #wide_cjk?
+ *   @see #width */
+VALUE
+rb_u_string_wide(VALUE self)
+{
+        /* Delegates to the shared character-test helper, with u_char_iswide
+         * as the test. */
+        return _rb_u_character_test(self, u_char_iswide);
+}

data/ext/u/rb_u_string_wide_cjk.c ADDED

@@ -0,0 +1,21 @@
+#include "rb_includes.h"
+/* @overload wide_cjk?
+ *
+ *   Returns true if the receiver contains only “wide” and “ambiguously wide”
+ *   characters.  Wide and ambiguously wide characters are those that have
+ *   their East_Asian_Width property set to Ambiguous, Wide or Fullwidth.
+ *
+ *   This is mostly useful for determining how many “cells” a character will
+ *   take up on a terminal or similar cell-based display.
+ *
+ *   @return [Boolean]
+ *   @see http://www.unicode.org/reports/tr11/
+ *     Unicode Standard Annex #11: East Asian Width
+ *   @see #wide?
+ *   @see #width */
+VALUE
+rb_u_string_wide_cjk(VALUE self)
+{
+        /* Delegates to the shared character-test helper, with
+         * u_char_iswide_cjk as the test. */
+        return _rb_u_character_test(self, u_char_iswide_cjk);
+}

data/ext/u/rb_u_string_width.c ADDED

@@ -0,0 +1,19 @@
+#include "rb_includes.h"
+/* Returns the width of the receiver.  The width is defined as the sum of the
+ * number of “cells” on a terminal or similar cell-based display that the
+ * characters in the string will require.
+ *
+ * Characters that are {#wide?} have a width of 2.  Characters that are
+ * {#zero_width?} have a width of 0.  Other characters have a width of 1.
+ *
+ * @return [Integer]
+ * @see http://www.unicode.org/reports/tr11/
+ *   Unicode Standard Annex #11: East Asian Width */
+VALUE
+rb_u_string_width(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        /* NOTE(review): UINT2NUM converts an unsigned int; if u_width_n
+         * returns a wider type, such as size_t, very large widths would be
+         * truncated here – confirm against the declaration in u.h. */
+        return UINT2NUM(u_width_n(USTRING_STR(string), USTRING_LENGTH(string)));
+}

data/ext/u/rb_u_string_word_break.c ADDED

@@ -0,0 +1,63 @@
+#include "rb_includes.h"
+/* Expands to a switch case for U_WORD_BREAK_<value> that returns the Symbol
+ * :<symbol>.  The ID of the Symbol is interned on first use and then cached
+ * in a static variable local to the case’s block. */
+#define BREAK2ID(value, symbol) \
+        case U_WORD_BREAK_##value: { \
+                static ID id_##symbol; \
+                if (id_##symbol == 0) \
+                        id_##symbol = rb_intern(#symbol); \
+                return ID2SYM(id_##symbol); \
+        }
+/* Maps a word break property value to the Symbol that represents it in Ruby.
+ *
+ * @raise [NotImplementedError] If VALUE isn’t one of the values listed below
+ * @return [Symbol] */
+static VALUE
+break_to_symbol(enum u_word_break value)
+{
+        switch (value) {
+	BREAK2ID(ALETTER, aletter);
+	BREAK2ID(CR, cr);
+	BREAK2ID(EXTEND, extend);
+	BREAK2ID(EXTENDNUMLET, extendnumlet);
+	BREAK2ID(FORMAT, format);
+	BREAK2ID(KATAKANA, katakana);
+	BREAK2ID(LF, lf);
+	BREAK2ID(MIDLETTER, midletter);
+	BREAK2ID(MIDNUM, midnum);
+	BREAK2ID(MIDNUMLET, midnumlet);
+	BREAK2ID(NEWLINE, newline);
+	BREAK2ID(NUMERIC, numeric);
+	BREAK2ID(OTHER, other);
+	BREAK2ID(REGIONAL_INDICATOR, regional_indicator);
+        default:
+                /* NOTE(review): there’s no return after this call, so
+                 * rb_u_raise presumably never returns – confirm that it’s
+                 * declared as such. */
+                rb_u_raise(rb_eNotImpError, "unknown word break value: %d", value);
+        }
+}
+/* Returns the word break property value of the characters of the receiver.
+ *
+ * The possible word break values are
+ *
+ * * :aletter
+ * * :cr
+ * * :extend
+ * * :extendnumlet
+ * * :format
+ * * :katakana
+ * * :lf
+ * * :midletter
+ * * :midnum
+ * * :midnumlet
+ * * :newline
+ * * :numeric
+ * * :other
+ * * :regional_indicator
+ *
+ * @raise [ArgumentError] If the string consists of more than one break type
+ * @return [Symbol]
+ * @see http://www.unicode.org/reports/tr29/
+ *   Unicode Standard Annex #29: Unicode Text Segmentation */
+VALUE
+rb_u_string_word_break(VALUE self)
+{
+        /* The shared property helper works in terms of int, so the lookup
+         * and the conversion to a Symbol are cast to its callback types.
+         * NOTE(review): this relies on enum u_word_break and int being
+         * interchangeable in calls on the target ABI – confirm. */
+        return _rb_u_string_property(self, "word break", U_WORD_BREAK_OTHER,
+                                     (int (*)(uint32_t))u_char_word_break,
+                                     (VALUE (*)(int))break_to_symbol);
+}