RubyGems - u - Versions diffs - 0.5.0 → 1.0.0 - Mend

u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (316) hide show

checksums.yaml +7 -0
data/build/ext/u/data/attributes.rb +39 -0
data/build/ext/u/data/bidi-mirroring.rb +27 -0
data/build/ext/u/data/canonical-combining-class.rb +15 -0
data/build/ext/u/data/case-folding.rb +39 -0
data/build/ext/u/data/cased.rb +19 -0
data/build/ext/u/data/compose.rb +304 -0
data/build/ext/u/data/constants.rb +31 -0
data/build/ext/u/data/decompose.rb +85 -0
data/build/ext/u/data/general-category.rb +61 -0
data/build/ext/u/data/grapheme-word-break.rb +15 -0
data/build/ext/u/data/marshalled.rb +5 -0
data/build/ext/u/data/script.rb +91 -0
data/build/ext/u/data/soft-dotted.rb +17 -0
data/build/ext/u/data/title-table.rb +30 -0
data/build/ext/u/data/wide.rb +17 -0
data/build/lib/u/build.rb +8 -0
data/build/lib/u/build/data.rb +16 -0
data/build/lib/u/build/data/bidimirroring.rb +26 -0
data/build/lib/u/build/data/break.rb +14 -0
data/build/lib/u/build/data/casefolding.rb +77 -0
data/build/lib/u/build/data/compositionexclusions.rb +14 -0
data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
data/build/lib/u/build/data/file.rb +88 -0
data/build/lib/u/build/data/linebreak.rb +14 -0
data/build/lib/u/build/data/proplist.rb +18 -0
data/build/lib/u/build/data/scripts.rb +22 -0
data/build/lib/u/build/data/specialcasing.rb +106 -0
data/build/lib/u/build/data/unicode.rb +41 -0
data/build/lib/u/build/data/unicode/entry.rb +27 -0
data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
data/build/lib/u/build/data/unicode/points.rb +32 -0
data/build/lib/u/build/header.rb +11 -0
data/build/lib/u/build/header/table.rb +19 -0
data/build/lib/u/build/header/table/row.rb +64 -0
data/build/lib/u/build/header/tables.rb +6 -0
data/build/lib/u/build/header/tables/intervals.rb +50 -0
data/build/lib/u/build/header/tables/split.rb +20 -0
data/build/lib/u/build/header/tables/split/data.rb +29 -0
data/build/lib/u/build/header/tables/split/part1.rb +28 -0
data/build/lib/u/build/header/tables/split/part2.rb +13 -0
data/build/lib/u/build/header/tables/split/row.rb +34 -0
data/build/lib/u/build/header/tables/split/rows.rb +22 -0
data/build/test/unit/break.rb +45 -0
data/build/test/unit/case.rb +178 -0
data/build/test/unit/foldcase.rb +44 -0
data/build/test/unit/normalize.rb +81 -0
data/ext/u/attributes.c +62 -0
data/ext/u/attributes.h +5 -0
data/ext/u/case.h +41 -0
data/ext/u/data/attributes.h +3070 -0
data/ext/u/data/bidi-mirroring.h +373 -0
data/ext/u/data/canonical-combining-class.h +2157 -0
data/ext/u/data/case-folding.h +171 -0
data/ext/u/data/cased.h +42 -0
data/ext/u/data/compose.h +1714 -0
data/ext/u/data/constants.h +17 -0
data/ext/u/data/decompose.h +9356 -0
data/ext/u/data/general-category.h +28959 -0
data/ext/u/data/grapheme-break.h +13201 -0
data/ext/u/data/line-break.h +26501 -0
data/ext/u/data/normalization-quick-check.h +3002 -0
data/ext/u/data/script.h +2928 -0
data/ext/u/data/soft-dotted.h +55 -0
data/ext/u/data/title-table.h +41 -0
data/ext/u/data/types.h +11117 -0
data/ext/u/data/wide-cjk.h +197 -0
data/ext/u/data/wide.h +59 -0
data/ext/u/data/word-break.h +10001 -0
data/ext/u/depend +281 -0
data/ext/u/extconf.rb +158 -0
data/ext/u/output.h +51 -0
data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
data/ext/u/private.h +58 -0
data/ext/u/rb_includes.h +10 -0
data/ext/u/rb_private.c +98 -0
data/ext/u/rb_private.h +67 -0
data/ext/u/rb_u.c +251 -0
data/ext/u/rb_u_buffer.c +443 -0
data/ext/u/rb_u_buffer.h +24 -0
data/ext/u/rb_u_re.c +43 -0
data/ext/u/rb_u_re.h +15 -0
data/ext/u/rb_u_string.c +478 -0
data/ext/u/rb_u_string.h +173 -0
data/ext/u/rb_u_string_alnum.c +10 -0
data/ext/u/rb_u_string_alpha.c +10 -0
data/ext/u/rb_u_string_aref.c +142 -0
data/ext/u/rb_u_string_ascii_only.c +13 -0
data/ext/u/rb_u_string_assigned.c +10 -0
data/ext/u/rb_u_string_b.c +18 -0
data/ext/u/rb_u_string_bytesize.c +10 -0
data/ext/u/rb_u_string_byteslice.c +103 -0
data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
data/ext/u/rb_u_string_case_ignorable.c +25 -0
data/ext/u/rb_u_string_casecmp.c +61 -0
data/ext/u/rb_u_string_cased.c +17 -0
data/ext/u/rb_u_string_chomp.c +107 -0
data/ext/u/rb_u_string_chop.c +33 -0
data/ext/u/rb_u_string_chr.c +9 -0
data/ext/u/rb_u_string_cntrl.c +10 -0
data/ext/u/rb_u_string_collate.c +46 -0
data/ext/u/rb_u_string_collation_key.c +18 -0
data/ext/u/rb_u_string_count.c +38 -0
data/ext/u/rb_u_string_defined.c +10 -0
data/ext/u/rb_u_string_delete.c +62 -0
data/ext/u/rb_u_string_digit.c +10 -0
data/ext/u/rb_u_string_downcase.c +13 -0
data/ext/u/rb_u_string_dump.c +153 -0
data/ext/u/rb_u_string_each_byte.c +46 -0
data/ext/u/rb_u_string_each_char.c +49 -0
data/ext/u/rb_u_string_each_codepoint.c +45 -0
data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
data/ext/u/rb_u_string_each_line.c +142 -0
data/ext/u/rb_u_string_each_word.c +34 -0
data/ext/u/rb_u_string_empty.c +11 -0
data/ext/u/rb_u_string_end_with.c +31 -0
data/ext/u/rb_u_string_eql.c +30 -0
data/ext/u/rb_u_string_equal.c +33 -0
data/ext/u/rb_u_string_foldcase.c +12 -0
data/ext/u/rb_u_string_folded.c +13 -0
data/ext/u/rb_u_string_format.c +1745 -0
data/ext/u/rb_u_string_general_category.c +109 -0
data/ext/u/rb_u_string_getbyte.c +21 -0
data/ext/u/rb_u_string_graph.c +21 -0
data/ext/u/rb_u_string_grapheme_break.c +61 -0
data/ext/u/rb_u_string_gsub.c +164 -0
data/ext/u/rb_u_string_hash.c +10 -0
data/ext/u/rb_u_string_hex.c +9 -0
data/ext/u/rb_u_string_include.c +10 -0
data/ext/u/rb_u_string_index.c +110 -0
data/ext/u/rb_u_string_inspect.c +189 -0
data/ext/u/rb_u_string_internal_tr.c +148 -0
data/ext/u/rb_u_string_internal_tr.h +29 -0
data/ext/u/rb_u_string_justify.c +169 -0
data/ext/u/rb_u_string_length.c +10 -0
data/ext/u/rb_u_string_line_break.c +115 -0
data/ext/u/rb_u_string_lower.c +13 -0
data/ext/u/rb_u_string_lstrip.c +24 -0
data/ext/u/rb_u_string_match.c +65 -0
data/ext/u/rb_u_string_mirror.c +16 -0
data/ext/u/rb_u_string_newline.c +21 -0
data/ext/u/rb_u_string_normalize.c +70 -0
data/ext/u/rb_u_string_normalized.c +28 -0
data/ext/u/rb_u_string_oct.c +11 -0
data/ext/u/rb_u_string_ord.c +14 -0
data/ext/u/rb_u_string_partition.c +80 -0
data/ext/u/rb_u_string_plus.c +33 -0
data/ext/u/rb_u_string_print.c +10 -0
data/ext/u/rb_u_string_punct.c +10 -0
data/ext/u/rb_u_string_reverse.c +13 -0
data/ext/u/rb_u_string_rindex.c +104 -0
data/ext/u/rb_u_string_rpartition.c +81 -0
data/ext/u/rb_u_string_rstrip.c +29 -0
data/ext/u/rb_u_string_scan.c +109 -0
data/ext/u/rb_u_string_script.c +253 -0
data/ext/u/rb_u_string_soft_dotted.c +13 -0
data/ext/u/rb_u_string_space.c +24 -0
data/ext/u/rb_u_string_split.c +245 -0
data/ext/u/rb_u_string_squeeze.c +75 -0
data/ext/u/rb_u_string_start_with.c +31 -0
data/ext/u/rb_u_string_strip.c +36 -0
data/ext/u/rb_u_string_sub.c +147 -0
data/ext/u/rb_u_string_times.c +35 -0
data/ext/u/rb_u_string_title.c +10 -0
data/ext/u/rb_u_string_titlecase.c +13 -0
data/ext/u/rb_u_string_to_i.c +45 -0
data/ext/u/rb_u_string_to_inum.c +364 -0
data/ext/u/rb_u_string_to_inum.h +1 -0
data/ext/u/rb_u_string_to_str.c +17 -0
data/ext/u/rb_u_string_to_sym.c +12 -0
data/ext/u/rb_u_string_tr.c +290 -0
data/ext/u/rb_u_string_upcase.c +12 -0
data/ext/u/rb_u_string_upper.c +13 -0
data/ext/u/rb_u_string_valid.c +10 -0
data/ext/u/rb_u_string_valid_encoding.c +12 -0
data/ext/u/rb_u_string_wide.c +21 -0
data/ext/u/rb_u_string_wide_cjk.c +21 -0
data/ext/u/rb_u_string_width.c +19 -0
data/ext/u/rb_u_string_word_break.c +63 -0
data/ext/u/rb_u_string_xdigit.c +22 -0
data/ext/u/rb_u_string_zero_width.c +16 -0
data/ext/u/titled.c +55 -0
data/ext/u/titled.h +1 -0
data/ext/u/u.c +23 -0
data/ext/u/u.h +458 -0
data/ext/u/u_char_canonical_combining_class.c +31 -0
data/ext/u/u_char_digit_value.c +21 -0
data/ext/u/u_char_downcase.c +27 -0
data/ext/u/u_char_general_category.c +31 -0
data/ext/u/u_char_grapheme_break.c +28 -0
data/ext/u/u_char_isalnum.c +24 -0
data/ext/u/u_char_isalpha.c +21 -0
data/ext/u/u_char_isassigned.c +16 -0
data/ext/u/u_char_iscased.c +22 -0
data/ext/u/u_char_iscaseignorable.c +29 -0
data/ext/u/u_char_iscntrl.c +17 -0
data/ext/u/u_char_isdefined.c +15 -0
data/ext/u/u_char_isdigit.c +16 -0
data/ext/u/u_char_isgraph.c +22 -0
data/ext/u/u_char_islower.c +16 -0
data/ext/u/u_char_isnewline.c +24 -0
data/ext/u/u_char_isprint.c +21 -0
data/ext/u/u_char_ispunct.c +27 -0
data/ext/u/u_char_issoftdotted.c +18 -0
data/ext/u/u_char_isspace.c +28 -0
data/ext/u/u_char_isupper.c +16 -0
data/ext/u/u_char_isvalid.c +18 -0
data/ext/u/u_char_iswide.c +18 -0
data/ext/u/u_char_iswide_cjk.c +22 -0
data/ext/u/u_char_isxdigit.c +27 -0
data/ext/u/u_char_iszerowidth.c +29 -0
data/ext/u/u_char_line_break.c +29 -0
data/ext/u/u_char_mirror.c +16 -0
data/ext/u/u_char_normalized.c +23 -0
data/ext/u/u_char_script.c +41 -0
data/ext/u/u_char_to_u.c +48 -0
data/ext/u/u_char_upcase.c +24 -0
data/ext/u/u_char_width.c +12 -0
data/ext/u/u_char_word_break.c +28 -0
data/ext/u/u_char_xdigit_value.c +31 -0
data/ext/u/u_collate.c +83 -0
data/ext/u/u_collation_key.c +132 -0
data/ext/u/u_decode.c +156 -0
data/ext/u/u_downcase.c +201 -0
data/ext/u/u_foldcase.c +68 -0
data/ext/u/u_grapheme_clusters.c +57 -0
data/ext/u/u_has_prefix.c +27 -0
data/ext/u/u_index.c +93 -0
data/ext/u/u_is_ascii_only.c +33 -0
data/ext/u/u_locale.c +40 -0
data/ext/u/u_locale.h +14 -0
data/ext/u/u_mirror.c +20 -0
data/ext/u/u_n_bytes.c +16 -0
data/ext/u/u_n_chars.c +43 -0
data/ext/u/u_normalize.c +232 -0
data/ext/u/u_normalized.c +28 -0
data/ext/u/u_offset_to_pointer.c +62 -0
data/ext/u/u_pointer_to_offset.c +23 -0
data/ext/u/u_recode.c +73 -0
data/ext/u/u_reverse.c +21 -0
data/ext/u/u_rindex.c +132 -0
data/ext/u/u_titlecase.c +68 -0
data/ext/u/u_upcase.c +89 -0
data/ext/u/u_width.c +35 -0
data/ext/u/u_words.c +82 -0
data/ext/u/yield.h +27 -0
data/lib/u-1.0.rb +20 -0
data/lib/u-1.0/buffer.rb +10 -0
data/lib/u-1.0/string.rb +9 -0
data/lib/u-1.0/version.rb +287 -0
data/test/unit/case.rb +2080 -0
data/test/unit/foldcase.rb +1136 -0
data/test/unit/graphemebreak.rb +407 -0
data/test/unit/normalize.rb +367545 -0
data/test/unit/u-1.0.rb +10 -0
data/test/unit/u-1.0/buffer.rb +52 -0
data/test/unit/u-1.0/string.rb +1439 -0
data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
data/test/unit/wordbreak.rb +1083 -0
metadata +603 -148
data/README +0 -38
data/Rakefile +0 -64
data/ext/encoding/character/utf-8/break.c +0 -25
data/ext/encoding/character/utf-8/data/break.h +0 -22931
data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
data/ext/encoding/character/utf-8/data/compose.h +0 -1607
data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
data/ext/encoding/character/utf-8/decompose.c +0 -444
data/ext/encoding/character/utf-8/depend +0 -65
data/ext/encoding/character/utf-8/extconf.rb +0 -67
data/ext/encoding/character/utf-8/private.h +0 -51
data/ext/encoding/character/utf-8/properties.c +0 -1056
data/ext/encoding/character/utf-8/rb_includes.h +0 -19
data/ext/encoding/character/utf-8/rb_methods.h +0 -49
data/ext/encoding/character/utf-8/rb_private.h +0 -52
data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
data/ext/encoding/character/utf-8/tables.h +0 -38
data/ext/encoding/character/utf-8/unicode.c +0 -319
data/ext/encoding/character/utf-8/unicode.h +0 -216
data/ext/encoding/character/utf-8/utf.c +0 -1334
data/lib/encoding/character/utf-8.rb +0 -201
data/lib/u.rb +0 -16
data/lib/u/string.rb +0 -185
data/lib/u/version.rb +0 -5
data/test/unit/u/string.rb +0 -91

data/ext/u/rb_u_string_canonical_combining_class.c ADDED

@@ -0,0 +1,33 @@
+#include "rb_includes.h"
+/* Returns the canonical combining class of the characters of the receiver.
+ *
+ * The canonical combining class of a character is a number in the range [0,
+ * 254].  The canonical combining class is used when generating a canonical
+ * ordering of the characters in a string.
+ *
+ * The empty string has a canonical combining class of 0.
+ *
+ * @raise [ArgumentError] If the receiver contains two characters belonging to
+ *   different combining classes
+ * @raise [ArgumentError] If the receiver contains an incomplete UTF-8 sequence
+ * @raise [ArgumentError] If the receiver contains an invalid UTF-8 sequence
+ * @return [Fixnum] */
+VALUE
+rb_u_string_canonical_combining_class(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const char *p = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        /* A bare 0 used as a VALUE is Qfalse, not the Fixnum 0, so the
+         * empty-string result must be boxed like every other result. */
+        if (p == end)
+                return INT2FIX(0);
+        /* The class of the first character is the reference that every
+         * following character has to agree with. */
+        int first = u_char_canonical_combining_class(u_decode(&p, p, end));
+        while (p < end) {
+                int value = u_char_canonical_combining_class(u_decode(&p, p, end));
+                if (value != first)
+                        rb_u_raise(rb_eArgError,
+                                   "string consists of characters with different canonical combining class values: %d+, %d",
+                                   first, value);
+        }
+        return INT2FIX(first);
+}

data/ext/u/rb_u_string_case_ignorable.c ADDED

@@ -0,0 +1,25 @@
+#include "rb_includes.h"
+/* @overload case_ignorable?
+ *
+ *   @return [Boolean] True if the receiver contains only “case ignorable”
+ *     characters, that is, characters in the general categories
+ *
+ *     * Other, format (Cf)
+ *     * Letter, modifier (Lm)
+ *     * Mark, enclosing (Me)
+ *     * Mark, nonspacing (Mn)
+ *     * Symbol, modifier (Sk)
+ *
+ *     and the characters
+ *
+ *     * U+0027 APOSTROPHE
+ *     * U+00AD SOFT HYPHEN
+ *     * U+2019 RIGHT SINGLE QUOTATION MARK
+ *   @see http://unicode.org/reports/tr21/tr21-5.html
+ *     Unicode Standard Annex #21: Case Mappings
+ *   @note Implementation: delegates to _rb_u_character_test, passing
+ *     u_char_iscaseignorable as the test to apply. */
+VALUE
+rb_u_string_case_ignorable(VALUE self)
+{
+        return _rb_u_character_test(self, u_char_iscaseignorable);
+}

data/ext/u/rb_u_string_casecmp.c ADDED

@@ -0,0 +1,61 @@
+#include <errno.h>
+#include "rb_includes.h"
+/* Case-folds STRING for LOCALE into a newly allocated buffer stored in
+ * *RESULT and returns whatever the filling u_foldcase call returns.
+ *
+ * u_foldcase is called twice: first with a NULL buffer to learn the size that
+ * is needed, then again to fill a buffer of that size plus one byte.
+ *
+ * PREVIOUS is handed to _rb_u_guarded_alloc together with the allocation
+ * request; presumably it is released if the allocation fails, so that an
+ * earlier buffer is not leaked (confirm against _rb_u_guarded_alloc).  The
+ * caller owns *RESULT and frees it. */
+static size_t
+foldcase(char **result, const struct rb_u_string *string, const char *locale,
+         char *previous)
+{
+        /* Sizing pass: no output buffer, only the required size is wanted. */
+        size_t needed = u_foldcase(NULL, 0,
+                                   USTRING_STR(string), USTRING_LENGTH(string),
+                                   locale);
+        size_t capacity = needed + 1;
+
+        *result = _rb_u_guarded_alloc(capacity, previous, NULL);
+
+        /* Filling pass into the buffer that was just allocated. */
+        size_t written = u_foldcase(*result, capacity,
+                                    USTRING_STR(string), USTRING_LENGTH(string),
+                                    locale);
+        return written;
+}
+/* @overload casecmp(other, locale = ENV['LC_COLLATE'])
+ *
+ *   Returns the comparison of {#foldcase} to _other_{#foldcase} using the
+ *   linguistically correct rules of LOCALE.  This is, however, only an
+ *   approximation of a case-insensitive comparison.  The LOCALE must be given
+ *   as a language, region, and encoding, for example, “en_US.UTF-8”.
+ *
+ *   This operation is known as “collation” and you can find more information
+ *   about the collation algorithm employed in the
+ *   Unicode Technical Standard #10, see http://unicode.org/reports/tr10/.
+ *
+ *   @param [U::String, #to_str] other
+ *   @param [#to_str] locale
+ *   @raise [SystemCallError] If collating the two case-folded strings sets
+ *     errno
+ *   @return [Fixnum] */
+VALUE
+rb_u_string_casecmp(int argc, VALUE *argv, VALUE self)
+{
+        const char *locale = NULL;
+        VALUE rbother, rblocale;
+        if (rb_scan_args(argc, argv, "11", &rbother, &rblocale) == 2)
+                locale = StringValuePtr(rblocale);
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const struct rb_u_string *other = RVAL2USTRING_ANY(rbother);
+        char *folded;
+        size_t folded_n = foldcase(&folded, string, locale, NULL);
+        /* The first buffer is passed along so that the helper can deal with
+         * it should the second allocation fail. */
+        char *folded_other;
+        size_t folded_other_n = foldcase(&folded_other, other, locale, folded);
+        errno = 0;
+        int r = u_collate(folded, folded_n,
+                          folded_other, folded_other_n,
+                          locale);
+        /* Capture errno before releasing the buffers: free() is allowed to
+         * modify errno on many C libraries, which could hide a collation
+         * failure or report a bogus one. */
+        int collate_errno = errno;
+        free(folded_other);
+        free(folded);
+        if (collate_errno != 0)
+                rb_u_raise_errno(collate_errno, "can’t collate strings");
+        return INT2FIX(r);
+}

data/ext/u/rb_u_string_cased.c ADDED

@@ -0,0 +1,17 @@
+#include "rb_includes.h"
+/* @overload cased?
+ *
+ *   @return [Boolean] True if the receiver only contains characters in the
+ *     general categories
+ *
+ *     * Letter, uppercase (Lu)
+ *     * Letter, lowercase (Ll)
+ *     * Letter, titlecase (Lt)
+ *
+ *     or has the derived properties Other_Uppercase or Other_Lowercase
+ *   @note Implementation: delegates to _rb_u_character_test, passing
+ *     u_char_iscased as the test to apply. */
+VALUE
+rb_u_string_cased(VALUE self)
+{
+        return _rb_u_character_test(self, u_char_iscased);
+}

data/ext/u/rb_u_string_chomp.c ADDED

@@ -0,0 +1,107 @@
+#include "rb_includes.h"
+/* Default chomp: removes one trailing line terminator from SELF.
+ *
+ * If the last character is U+000A LINE FEED, it is removed together with a
+ * directly preceding U+000D CARRIAGE RETURN byte, if there is one.  Otherwise
+ * the last character is removed only if u_char_isnewline accepts it; if it
+ * does not, SELF is returned unchanged.
+ *
+ * The caller is expected to pass a non-empty receiver (rb_u_string_chomp
+ * returns nil for empty receivers before getting here). */
+static VALUE
+rb_u_string_chomp_default(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const char *begin = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        const char *last;
+        /* LAST receives the position where the final character starts; it
+         * doubles as the end of the chomped result. */
+        uint32_t c = u_decode_r(&last, begin, end);
+        if (c == '\n') {
+                /* Only look for a preceding CR when there is a byte before
+                 * the LF.  For a receiver that is just "\n", last == begin
+                 * and reading last[-1] would be out of bounds. */
+                if (last > begin && *(last - 1) == '\r')
+                        last--;
+        } else if (!u_char_isnewline(c))
+                return self;
+        return rb_u_string_new_c(self, begin, last - begin);
+}
+/* Strips every trailing character that u_char_isnewline accepts from SELF.
+ *
+ * Walks backwards from the end one character at a time for as long as the
+ * character is a newline.  Returns SELF itself when nothing was stripped,
+ * otherwise a new string covering the remaining prefix. */
+static VALUE
+rb_u_string_chomp_newlines(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const char *begin = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        const char *tail = end;
+        for (;;) {
+                if (tail <= begin)
+                        break;
+                const char *previous;
+                if (!u_char_isnewline(u_decode_r(&previous, begin, tail)))
+                        break;
+                tail = previous;
+        }
+        return tail == end ?
+                self :
+                rb_u_string_new_c(self, begin, tail - begin);
+}
+/* @overload chomp(separator = $/)
+ *
+ *   Returns the receiver, minus any SEPARATOR suffix, inheriting any taint and
+ *   untrust, unless {#length} = 0, in which case nil is returned.  If
+ *   SEPARATOR is nil or invalidly encoded, the receiver is returned.
+ *
+ *   If SEPARATOR is `$/` and `$/` has its default value or if SEPARATOR is
+ *   U+000A LINE FEED, the longest suffix consisting of any of
+ *
+ *   * U+000A LINE FEED
+ *   * U+000D CARRIAGE RETURN
+ *   * U+000D CARRIAGE RETURN, U+000A LINE FEED
+ *
+ *   will be removed. If no such suffix exists and the last character is a
+ *   {#newline?}, it will be removed instead.
+ *
+ *   If SEPARATOR is {#empty?}, remove the longest {#newline?} suffix.
+ *
+ *   A SEPARATOR that is longer than the receiver, or that is not a suffix of
+ *   the receiver, leaves the receiver unchanged and the receiver itself is
+ *   returned.
+ *
+ *   @param [U::String, #to_str, nil] separator
+ *   @return [U::String, self, nil]
+ *   @see #chop
+ *   @see #lstrip
+ *   @see #rstrip
+ *   @see #strip */
+VALUE
+rb_u_string_chomp(int argc, VALUE *argv, VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        long length = USTRING_LENGTH(string);
+        if (length == 0)
+                return Qnil;
+        VALUE rs;
+        if (argc == 0) {
+                rs = rb_rs;
+                if (rs == rb_default_rs)
+                        return rb_u_string_chomp_default(self);
+        } else {
+                rb_scan_args(argc, argv, "01", &rs);
+        }
+        if (NIL_P(rs))
+                return self;
+        const struct rb_u_string *separator = RVAL2USTRING_ANY(rs);
+        long separator_length = USTRING_LENGTH(separator);
+        if (separator_length == 0)
+                return rb_u_string_chomp_newlines(self);
+        if (separator_length > length)
+                return self;
+        char last_char = USTRING_STR(separator)[separator_length - 1];
+        if (separator_length == 1 && last_char == '\n')
+                return rb_u_string_chomp_default(self);
+        if (!u_valid(USTRING_STR(separator), separator_length, NULL) ||
+            USTRING_STR(string)[length - 1] != last_char ||
+            (separator_length > 1 &&
+             rb_memcmp(USTRING_STR(separator),
+                       USTRING_END(string) - separator_length,
+                       separator_length) != 0))
+                return self;
+        return rb_u_string_new_c(self, USTRING_STR(string), length - separator_length);
+}

data/ext/u/rb_u_string_chop.c ADDED

@@ -0,0 +1,33 @@
+#include "rb_includes.h"
+/* Returns the receiver, minus its last character, inheriting any taint and
+ * untrust, unless the receiver is {#empty?} or if the last character is
+ * invalidly encoded, in which case the receiver is returned.
+ *
+ * If the last character is U+000A LINE FEED and the second-to-last character
+ * is U+000D CARRIAGE RETURN, both characters are removed.
+ *
+ * @return [U::String]
+ * @see #chomp
+ * @see #lstrip
+ * @see #rstrip
+ * @see #strip */
+VALUE
+rb_u_string_chop(VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        if (USTRING_LENGTH(string) == 0)
+                return self;
+        const char *begin = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        const char *last;
+        /* LAST receives the position where the final character starts; it
+         * doubles as the end of the chopped result. */
+        uint32_t c = u_decode_r(&last, begin, end);
+        /* The CR check needs a byte in front of the LF.  For a receiver that
+         * is just "\n", last == begin and last[-1] would be read out of
+         * bounds. */
+        if (c == '\n' && last > begin && *(last - 1) == '\r')
+                last--;
+        return rb_u_string_new_c(self, begin, last - begin);
+}

data/ext/u/rb_u_string_chr.c ADDED

@@ -0,0 +1,9 @@
+#include "rb_includes.h"
+/* @return [U::String] The substring [0, min({#length}, 1)], inheriting any
+ *   taint and untrust
+ * @note Implementation: delegates to rb_u_string_substr with the arguments
+ *   0 and 1. */
+VALUE
+rb_u_string_chr(VALUE self)
+{
+        return rb_u_string_substr(self, 0, 1);
+}

data/ext/u/rb_u_string_cntrl.c ADDED

@@ -0,0 +1,10 @@
+#include "rb_includes.h"
+/* @overload cntrl?
+ *   @return [Boolean] True if the receiver contains only characters in the
+ *     general category Other, control (Cc)
+ *   @note Implementation: delegates to _rb_u_character_test, passing
+ *     u_char_iscntrl as the test to apply. */
+VALUE
+rb_u_string_cntrl(VALUE self)
+{
+        return _rb_u_character_test(self, u_char_iscntrl);
+}

data/ext/u/rb_u_string_collate.c ADDED

@@ -0,0 +1,46 @@
+#include "rb_includes.h"
+#include <errno.h>
+/* @overload <=>(other, locale = ENV['LC_COLLATE'])
+ *
+ *   Compares the receiver to OTHER according to the linguistically correct
+ *   rules of LOCALE.  The LOCALE must be given as a language, region, and
+ *   encoding, for example, “en_US.UTF-8”.
+ *
+ *   When no LOCALE is given, the environment variables LC_ALL, LC_COLLATE,
+ *   and LANG are consulted in that order and the first one that is set is
+ *   used.
+ *
+ *   This operation is known as “collation” and you can find more information
+ *   about the collation algorithm employed in the
+ *   Unicode Technical Standard #10, see http://unicode.org/reports/tr10/.
+ *
+ *   @param [U::String, #to_str] other
+ *   @param [#to_str] locale
+ *   @raise [Errno::EILSEQ] If a character in the receiver can’t be converted
+ *     into the encoding of the locale
+ *   @return [Fixnum]
+ *   @see #==
+ *   @see #eql? */
+VALUE
+rb_u_string_collate(int argc, VALUE *argv, VALUE self)
+{
+        VALUE rbother, rblocale;
+        const char *locale = NULL;
+        if (rb_scan_args(argc, argv, "11", &rbother, &rblocale) == 2) {
+                locale = StringValuePtr(rblocale);
+        } else {
+                /* Stop at the first variable that is set; locale stays NULL
+                 * when none of them is. */
+                static const char * const variables[] = {
+                        "LC_ALL", "LC_COLLATE", "LANG"
+                };
+                size_t i = 0;
+                while (locale == NULL &&
+                       i < sizeof(variables) / sizeof(variables[0]))
+                        locale = getenv(variables[i++]);
+        }
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        const struct rb_u_string *other = RVAL2USTRING_ANY(rbother);
+        /* errno is the only failure channel used here, so clear it first. */
+        errno = 0;
+        int r = u_collate(USTRING_STR(string), USTRING_LENGTH(string),
+                          USTRING_STR(other), USTRING_LENGTH(other),
+                          locale);
+        if (errno != 0)
+                rb_u_raise_errno(errno, "can’t collate strings");
+        return INT2FIX(r);
+}

data/ext/u/rb_u_string_collation_key.c ADDED

@@ -0,0 +1,18 @@
+#include "rb_includes.h"
+/* @overload collation_key(locale = ENV['LC_COLLATE'])
+ *
+ *   @raise [Errno::EILSEQ] If a character in the receiver can’t be converted
+ *     into the encoding of the locale
+ *   @return [U::String] The locale-dependent collation key of the receiver in
+ *     LOCALE, inheriting any taint and untrust
+ *   @note Use the collation key when comparing U::Strings to each other
+ *     repeatedly, as occurs when, for example, sorting a list of
+ *     U::Strings.
+ *   @note The LOCALE must be given as a language, region, and encoding, for
+ *     example, “en_US.UTF-8”.
+ *   @note Implementation: delegates to _rb_u_string_convert_locale, passing
+ *     u_collation_key as the conversion and "LC_COLLATE" as the locale
+ *     category name. */
+VALUE
+rb_u_string_collation_key(int argc, VALUE *argv, VALUE self)
+{
+        return _rb_u_string_convert_locale(argc, argv, self, u_collation_key, "LC_COLLATE");
+}

data/ext/u/rb_u_string_count.c ADDED

@@ -0,0 +1,38 @@
+#include "rb_includes.h"
+#include "rb_u_string_internal_tr.h"
+/* @overload count(set, *sets)
+ *
+ *   Counts the characters of the receiver that belong to the intersection of
+ *   SET and any additional SETS of characters.
+ *
+ *   The complement of all Unicode characters and a given set of characters may
+ *   be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
+ *   ACCENT).
+ *
+ *   Any sequence of characters _a_-_b_ inside a set will expand to also
+ *   include all characters whose code points lay between those of _a_ and _b_.
+ *
+ *   @param [U::String, #to_str] set
+ *   @param [Array<U::String, #to_str>] sets
+ *   @return [Integer] */
+VALUE
+rb_u_string_count(int argc, VALUE *argv, VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        need_at_least_n_arguments(argc, 1);
+        /* An empty receiver cannot contain any matches; skip building the
+         * lookup table. */
+        if (USTRING_LENGTH(string) == 0)
+                return INT2FIX(0);
+        struct tr_table table;
+        tr_table_initialize_from_strings(&table, argc, argv);
+        const char *p = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        long matches = 0;
+        while (p < end) {
+                /* u_decode moves p past the character it decoded. */
+                if (tr_table_lookup(&table, u_decode(&p, p, end)))
+                        matches++;
+        }
+        return LONG2NUM(matches);
+}

data/ext/u/rb_u_string_defined.c ADDED

@@ -0,0 +1,10 @@
+#include "rb_includes.h"
+/* @overload defined?
+ *   @return [Boolean] True if the receiver contains only characters not in the
+ *     general categories Other, not assigned (Cn) and Other, surrogate (Cs)
+ *   @note Implementation: delegates to _rb_u_character_test, passing
+ *     u_char_isdefined as the test to apply. */
+VALUE
+rb_u_string_defined(VALUE self)
+{
+        return _rb_u_character_test(self, u_char_isdefined);
+}

data/ext/u/rb_u_string_delete.c ADDED

@@ -0,0 +1,62 @@
+#include "rb_includes.h"
+#include "rb_u_string_internal_tr.h"
+/* Walks STRING and keeps every character that TABLE does not contain.
+ *
+ * Returns the total number of bytes kept.  When RESULT is non-NULL the kept
+ * bytes are also copied to it back to back; when RESULT is NULL nothing is
+ * written, which lets the caller measure the output before allocating it. */
+static long
+rb_u_string_delete_loop(const struct rb_u_string *string, struct tr_table *table,
+                        char *result)
+{
+        const char *p = USTRING_STR(string);
+        const char *end = USTRING_END(string);
+        char *out = result;
+        long kept = 0;
+        while (p < end) {
+                /* NEXT receives the position just past the decoded
+                 * character. */
+                const char *next;
+                if (!tr_table_lookup(table, u_decode(&next, p, end))) {
+                        long width = next - p;
+                        if (out != NULL) {
+                                memcpy(out, p, width);
+                                out += width;
+                        }
+                        kept += width;
+                }
+                p = next;
+        }
+        return kept;
+}
+/* @overload delete(set, *sets)
+ *
+ *   Returns the receiver, minus any characters that are included in the
+ *   intersection of SET and any additional SETS of characters, inheriting any
+ *   taint and untrust.
+ *
+ *   The complement of all Unicode characters and a given set of characters may
+ *   be specified by prefixing a non-empty set with ‘`^`’ (U+005E CIRCUMFLEX
+ *   ACCENT).
+ *
+ *   Any sequence of characters _a_-_b_ inside a set will expand to also
+ *   include all characters whose code points lay between those of _a_ and _b_.
+ *
+ *   If no character of the receiver is deleted, the receiver itself is
+ *   returned.  If every character is deleted, the result is an empty string.
+ *
+ *   @param [U::String, #to_str] set
+ *   @param [Array<U::String, #to_str>] sets
+ *   @return [U::String] */
+VALUE
+rb_u_string_delete(int argc, VALUE *argv, VALUE self)
+{
+        const struct rb_u_string *string = RVAL2USTRING(self);
+        need_at_least_n_arguments(argc, 1);
+        long length = USTRING_LENGTH(string);
+        if (length == 0)
+                return self;
+        struct tr_table table;
+        tr_table_initialize_from_strings(&table, argc, argv);
+        /* First pass only measures: COUNT is the number of bytes that
+         * survive the deletion. */
+        long count = rb_u_string_delete_loop(string, &table, NULL);
+        /* Nothing was removed exactly when every byte survived.  COUNT == 0
+         * means the opposite (everything was removed) and must produce an
+         * empty string rather than the unmodified receiver. */
+        if (count == length)
+                return self;
+        /* Second pass copies the survivors into a buffer whose ownership is
+         * handed over to the new string. */
+        char *remaining = ALLOC_N(char, count + 1);
+        rb_u_string_delete_loop(string, &table, remaining);
+        remaining[count] = '\0';
+        return rb_u_string_new_c_own(self, remaining, count);
+}