RubyGems - u - Versions diffs - 0.5.0 → 1.0.0 - Mend

u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (316) hide show

checksums.yaml +7 -0
data/build/ext/u/data/attributes.rb +39 -0
data/build/ext/u/data/bidi-mirroring.rb +27 -0
data/build/ext/u/data/canonical-combining-class.rb +15 -0
data/build/ext/u/data/case-folding.rb +39 -0
data/build/ext/u/data/cased.rb +19 -0
data/build/ext/u/data/compose.rb +304 -0
data/build/ext/u/data/constants.rb +31 -0
data/build/ext/u/data/decompose.rb +85 -0
data/build/ext/u/data/general-category.rb +61 -0
data/build/ext/u/data/grapheme-word-break.rb +15 -0
data/build/ext/u/data/marshalled.rb +5 -0
data/build/ext/u/data/script.rb +91 -0
data/build/ext/u/data/soft-dotted.rb +17 -0
data/build/ext/u/data/title-table.rb +30 -0
data/build/ext/u/data/wide.rb +17 -0
data/build/lib/u/build.rb +8 -0
data/build/lib/u/build/data.rb +16 -0
data/build/lib/u/build/data/bidimirroring.rb +26 -0
data/build/lib/u/build/data/break.rb +14 -0
data/build/lib/u/build/data/casefolding.rb +77 -0
data/build/lib/u/build/data/compositionexclusions.rb +14 -0
data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
data/build/lib/u/build/data/file.rb +88 -0
data/build/lib/u/build/data/linebreak.rb +14 -0
data/build/lib/u/build/data/proplist.rb +18 -0
data/build/lib/u/build/data/scripts.rb +22 -0
data/build/lib/u/build/data/specialcasing.rb +106 -0
data/build/lib/u/build/data/unicode.rb +41 -0
data/build/lib/u/build/data/unicode/entry.rb +27 -0
data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
data/build/lib/u/build/data/unicode/points.rb +32 -0
data/build/lib/u/build/header.rb +11 -0
data/build/lib/u/build/header/table.rb +19 -0
data/build/lib/u/build/header/table/row.rb +64 -0
data/build/lib/u/build/header/tables.rb +6 -0
data/build/lib/u/build/header/tables/intervals.rb +50 -0
data/build/lib/u/build/header/tables/split.rb +20 -0
data/build/lib/u/build/header/tables/split/data.rb +29 -0
data/build/lib/u/build/header/tables/split/part1.rb +28 -0
data/build/lib/u/build/header/tables/split/part2.rb +13 -0
data/build/lib/u/build/header/tables/split/row.rb +34 -0
data/build/lib/u/build/header/tables/split/rows.rb +22 -0
data/build/test/unit/break.rb +45 -0
data/build/test/unit/case.rb +178 -0
data/build/test/unit/foldcase.rb +44 -0
data/build/test/unit/normalize.rb +81 -0
data/ext/u/attributes.c +62 -0
data/ext/u/attributes.h +5 -0
data/ext/u/case.h +41 -0
data/ext/u/data/attributes.h +3070 -0
data/ext/u/data/bidi-mirroring.h +373 -0
data/ext/u/data/canonical-combining-class.h +2157 -0
data/ext/u/data/case-folding.h +171 -0
data/ext/u/data/cased.h +42 -0
data/ext/u/data/compose.h +1714 -0
data/ext/u/data/constants.h +17 -0
data/ext/u/data/decompose.h +9356 -0
data/ext/u/data/general-category.h +28959 -0
data/ext/u/data/grapheme-break.h +13201 -0
data/ext/u/data/line-break.h +26501 -0
data/ext/u/data/normalization-quick-check.h +3002 -0
data/ext/u/data/script.h +2928 -0
data/ext/u/data/soft-dotted.h +55 -0
data/ext/u/data/title-table.h +41 -0
data/ext/u/data/types.h +11117 -0
data/ext/u/data/wide-cjk.h +197 -0
data/ext/u/data/wide.h +59 -0
data/ext/u/data/word-break.h +10001 -0
data/ext/u/depend +281 -0
data/ext/u/extconf.rb +158 -0
data/ext/u/output.h +51 -0
data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
data/ext/u/private.h +58 -0
data/ext/u/rb_includes.h +10 -0
data/ext/u/rb_private.c +98 -0
data/ext/u/rb_private.h +67 -0
data/ext/u/rb_u.c +251 -0
data/ext/u/rb_u_buffer.c +443 -0
data/ext/u/rb_u_buffer.h +24 -0
data/ext/u/rb_u_re.c +43 -0
data/ext/u/rb_u_re.h +15 -0
data/ext/u/rb_u_string.c +478 -0
data/ext/u/rb_u_string.h +173 -0
data/ext/u/rb_u_string_alnum.c +10 -0
data/ext/u/rb_u_string_alpha.c +10 -0
data/ext/u/rb_u_string_aref.c +142 -0
data/ext/u/rb_u_string_ascii_only.c +13 -0
data/ext/u/rb_u_string_assigned.c +10 -0
data/ext/u/rb_u_string_b.c +18 -0
data/ext/u/rb_u_string_bytesize.c +10 -0
data/ext/u/rb_u_string_byteslice.c +103 -0
data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
data/ext/u/rb_u_string_case_ignorable.c +25 -0
data/ext/u/rb_u_string_casecmp.c +61 -0
data/ext/u/rb_u_string_cased.c +17 -0
data/ext/u/rb_u_string_chomp.c +107 -0
data/ext/u/rb_u_string_chop.c +33 -0
data/ext/u/rb_u_string_chr.c +9 -0
data/ext/u/rb_u_string_cntrl.c +10 -0
data/ext/u/rb_u_string_collate.c +46 -0
data/ext/u/rb_u_string_collation_key.c +18 -0
data/ext/u/rb_u_string_count.c +38 -0
data/ext/u/rb_u_string_defined.c +10 -0
data/ext/u/rb_u_string_delete.c +62 -0
data/ext/u/rb_u_string_digit.c +10 -0
data/ext/u/rb_u_string_downcase.c +13 -0
data/ext/u/rb_u_string_dump.c +153 -0
data/ext/u/rb_u_string_each_byte.c +46 -0
data/ext/u/rb_u_string_each_char.c +49 -0
data/ext/u/rb_u_string_each_codepoint.c +45 -0
data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
data/ext/u/rb_u_string_each_line.c +142 -0
data/ext/u/rb_u_string_each_word.c +34 -0
data/ext/u/rb_u_string_empty.c +11 -0
data/ext/u/rb_u_string_end_with.c +31 -0
data/ext/u/rb_u_string_eql.c +30 -0
data/ext/u/rb_u_string_equal.c +33 -0
data/ext/u/rb_u_string_foldcase.c +12 -0
data/ext/u/rb_u_string_folded.c +13 -0
data/ext/u/rb_u_string_format.c +1745 -0
data/ext/u/rb_u_string_general_category.c +109 -0
data/ext/u/rb_u_string_getbyte.c +21 -0
data/ext/u/rb_u_string_graph.c +21 -0
data/ext/u/rb_u_string_grapheme_break.c +61 -0
data/ext/u/rb_u_string_gsub.c +164 -0
data/ext/u/rb_u_string_hash.c +10 -0
data/ext/u/rb_u_string_hex.c +9 -0
data/ext/u/rb_u_string_include.c +10 -0
data/ext/u/rb_u_string_index.c +110 -0
data/ext/u/rb_u_string_inspect.c +189 -0
data/ext/u/rb_u_string_internal_tr.c +148 -0
data/ext/u/rb_u_string_internal_tr.h +29 -0
data/ext/u/rb_u_string_justify.c +169 -0
data/ext/u/rb_u_string_length.c +10 -0
data/ext/u/rb_u_string_line_break.c +115 -0
data/ext/u/rb_u_string_lower.c +13 -0
data/ext/u/rb_u_string_lstrip.c +24 -0
data/ext/u/rb_u_string_match.c +65 -0
data/ext/u/rb_u_string_mirror.c +16 -0
data/ext/u/rb_u_string_newline.c +21 -0
data/ext/u/rb_u_string_normalize.c +70 -0
data/ext/u/rb_u_string_normalized.c +28 -0
data/ext/u/rb_u_string_oct.c +11 -0
data/ext/u/rb_u_string_ord.c +14 -0
data/ext/u/rb_u_string_partition.c +80 -0
data/ext/u/rb_u_string_plus.c +33 -0
data/ext/u/rb_u_string_print.c +10 -0
data/ext/u/rb_u_string_punct.c +10 -0
data/ext/u/rb_u_string_reverse.c +13 -0
data/ext/u/rb_u_string_rindex.c +104 -0
data/ext/u/rb_u_string_rpartition.c +81 -0
data/ext/u/rb_u_string_rstrip.c +29 -0
data/ext/u/rb_u_string_scan.c +109 -0
data/ext/u/rb_u_string_script.c +253 -0
data/ext/u/rb_u_string_soft_dotted.c +13 -0
data/ext/u/rb_u_string_space.c +24 -0
data/ext/u/rb_u_string_split.c +245 -0
data/ext/u/rb_u_string_squeeze.c +75 -0
data/ext/u/rb_u_string_start_with.c +31 -0
data/ext/u/rb_u_string_strip.c +36 -0
data/ext/u/rb_u_string_sub.c +147 -0
data/ext/u/rb_u_string_times.c +35 -0
data/ext/u/rb_u_string_title.c +10 -0
data/ext/u/rb_u_string_titlecase.c +13 -0
data/ext/u/rb_u_string_to_i.c +45 -0
data/ext/u/rb_u_string_to_inum.c +364 -0
data/ext/u/rb_u_string_to_inum.h +1 -0
data/ext/u/rb_u_string_to_str.c +17 -0
data/ext/u/rb_u_string_to_sym.c +12 -0
data/ext/u/rb_u_string_tr.c +290 -0
data/ext/u/rb_u_string_upcase.c +12 -0
data/ext/u/rb_u_string_upper.c +13 -0
data/ext/u/rb_u_string_valid.c +10 -0
data/ext/u/rb_u_string_valid_encoding.c +12 -0
data/ext/u/rb_u_string_wide.c +21 -0
data/ext/u/rb_u_string_wide_cjk.c +21 -0
data/ext/u/rb_u_string_width.c +19 -0
data/ext/u/rb_u_string_word_break.c +63 -0
data/ext/u/rb_u_string_xdigit.c +22 -0
data/ext/u/rb_u_string_zero_width.c +16 -0
data/ext/u/titled.c +55 -0
data/ext/u/titled.h +1 -0
data/ext/u/u.c +23 -0
data/ext/u/u.h +458 -0
data/ext/u/u_char_canonical_combining_class.c +31 -0
data/ext/u/u_char_digit_value.c +21 -0
data/ext/u/u_char_downcase.c +27 -0
data/ext/u/u_char_general_category.c +31 -0
data/ext/u/u_char_grapheme_break.c +28 -0
data/ext/u/u_char_isalnum.c +24 -0
data/ext/u/u_char_isalpha.c +21 -0
data/ext/u/u_char_isassigned.c +16 -0
data/ext/u/u_char_iscased.c +22 -0
data/ext/u/u_char_iscaseignorable.c +29 -0
data/ext/u/u_char_iscntrl.c +17 -0
data/ext/u/u_char_isdefined.c +15 -0
data/ext/u/u_char_isdigit.c +16 -0
data/ext/u/u_char_isgraph.c +22 -0
data/ext/u/u_char_islower.c +16 -0
data/ext/u/u_char_isnewline.c +24 -0
data/ext/u/u_char_isprint.c +21 -0
data/ext/u/u_char_ispunct.c +27 -0
data/ext/u/u_char_issoftdotted.c +18 -0
data/ext/u/u_char_isspace.c +28 -0
data/ext/u/u_char_isupper.c +16 -0
data/ext/u/u_char_isvalid.c +18 -0
data/ext/u/u_char_iswide.c +18 -0
data/ext/u/u_char_iswide_cjk.c +22 -0
data/ext/u/u_char_isxdigit.c +27 -0
data/ext/u/u_char_iszerowidth.c +29 -0
data/ext/u/u_char_line_break.c +29 -0
data/ext/u/u_char_mirror.c +16 -0
data/ext/u/u_char_normalized.c +23 -0
data/ext/u/u_char_script.c +41 -0
data/ext/u/u_char_to_u.c +48 -0
data/ext/u/u_char_upcase.c +24 -0
data/ext/u/u_char_width.c +12 -0
data/ext/u/u_char_word_break.c +28 -0
data/ext/u/u_char_xdigit_value.c +31 -0
data/ext/u/u_collate.c +83 -0
data/ext/u/u_collation_key.c +132 -0
data/ext/u/u_decode.c +156 -0
data/ext/u/u_downcase.c +201 -0
data/ext/u/u_foldcase.c +68 -0
data/ext/u/u_grapheme_clusters.c +57 -0
data/ext/u/u_has_prefix.c +27 -0
data/ext/u/u_index.c +93 -0
data/ext/u/u_is_ascii_only.c +33 -0
data/ext/u/u_locale.c +40 -0
data/ext/u/u_locale.h +14 -0
data/ext/u/u_mirror.c +20 -0
data/ext/u/u_n_bytes.c +16 -0
data/ext/u/u_n_chars.c +43 -0
data/ext/u/u_normalize.c +232 -0
data/ext/u/u_normalized.c +28 -0
data/ext/u/u_offset_to_pointer.c +62 -0
data/ext/u/u_pointer_to_offset.c +23 -0
data/ext/u/u_recode.c +73 -0
data/ext/u/u_reverse.c +21 -0
data/ext/u/u_rindex.c +132 -0
data/ext/u/u_titlecase.c +68 -0
data/ext/u/u_upcase.c +89 -0
data/ext/u/u_width.c +35 -0
data/ext/u/u_words.c +82 -0
data/ext/u/yield.h +27 -0
data/lib/u-1.0.rb +20 -0
data/lib/u-1.0/buffer.rb +10 -0
data/lib/u-1.0/string.rb +9 -0
data/lib/u-1.0/version.rb +287 -0
data/test/unit/case.rb +2080 -0
data/test/unit/foldcase.rb +1136 -0
data/test/unit/graphemebreak.rb +407 -0
data/test/unit/normalize.rb +367545 -0
data/test/unit/u-1.0.rb +10 -0
data/test/unit/u-1.0/buffer.rb +52 -0
data/test/unit/u-1.0/string.rb +1439 -0
data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
data/test/unit/wordbreak.rb +1083 -0
metadata +603 -148
data/README +0 -38
data/Rakefile +0 -64
data/ext/encoding/character/utf-8/break.c +0 -25
data/ext/encoding/character/utf-8/data/break.h +0 -22931
data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
data/ext/encoding/character/utf-8/data/compose.h +0 -1607
data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
data/ext/encoding/character/utf-8/decompose.c +0 -444
data/ext/encoding/character/utf-8/depend +0 -65
data/ext/encoding/character/utf-8/extconf.rb +0 -67
data/ext/encoding/character/utf-8/private.h +0 -51
data/ext/encoding/character/utf-8/properties.c +0 -1056
data/ext/encoding/character/utf-8/rb_includes.h +0 -19
data/ext/encoding/character/utf-8/rb_methods.h +0 -49
data/ext/encoding/character/utf-8/rb_private.h +0 -52
data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
data/ext/encoding/character/utf-8/tables.h +0 -38
data/ext/encoding/character/utf-8/unicode.c +0 -319
data/ext/encoding/character/utf-8/unicode.h +0 -216
data/ext/encoding/character/utf-8/utf.c +0 -1334
data/lib/encoding/character/utf-8.rb +0 -201
data/lib/u.rb +0 -16
data/lib/u/string.rb +0 -185
data/lib/u/version.rb +0 -5
data/test/unit/u/string.rb +0 -91

data/ext/u/u_normalized.c ADDED

@@ -0,0 +1,28 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+/*
+ * Quick-check whether the N bytes of U are normalized according to FORM.
+ *
+ * Returns U_NORMALIZED_NO as soon as a character is found whose non-zero
+ * canonical combining class is lower than that of the character before it,
+ * or for which u_char_normalized() reports U_NORMALIZED_NO.  Otherwise
+ * returns U_NORMALIZED_MAYBE if any character reported U_NORMALIZED_MAYBE
+ * and U_NORMALIZED_YES if none did (which includes the case N == 0).
+ */
+enum u_normalized
+u_normalized(const char *u, size_t n, enum u_normalization_form form)
+{
+        enum u_normalized verdict = U_NORMALIZED_YES;
+        enum u_canonical_combining_class previous = 0;
+        const char *cursor = u;
+        const char *limit = u + n;
+        while (cursor < limit) {
+                uint32_t c = u_decode(&cursor, cursor, limit);
+                enum u_canonical_combining_class current =
+                        u_char_canonical_combining_class(c);
+                /* A non-starter may not follow one of a higher class. */
+                if (current != 0 && current < previous)
+                        return U_NORMALIZED_NO;
+                enum u_normalized status = u_char_normalized(c, form);
+                if (status == U_NORMALIZED_NO)
+                        return U_NORMALIZED_NO;
+                if (status == U_NORMALIZED_MAYBE)
+                        verdict = U_NORMALIZED_MAYBE;
+                previous = current;
+        }
+        return verdict;
+}

data/ext/u/u_offset_to_pointer.c ADDED

@@ -0,0 +1,62 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Convert a character offset into a pointer relative to ‘str’.
+ *
+ * A positive ‘offset’ advances that many characters with u_next(); a
+ * negative one moves backwards that many characters, stepping over
+ * continuation bytes (10xxxxxx) so that the result lands on the first byte
+ * of a character.  An ‘offset’ of 0 returns ‘str’ itself.
+ *
+ * No bounds are checked: the caller must guarantee that the requested number
+ * of characters exists in the chosen direction.
+ */
+char *
+u_offset_to_pointer(const char *str, long offset)
+{
+        const char *cursor = str;
+
+        if (offset > 0) {
+                for (long remaining = offset; remaining > 0; remaining--)
+                        cursor = u_next(cursor);
+                return (char *)cursor;
+        }
+
+        for (long remaining = offset; remaining != 0; ) {
+                const char *from = cursor;
+                /* Jump back one byte per outstanding character … */
+                cursor += remaining;
+                /* … then back up to the start of the character we hit. */
+                while ((*cursor & 0xc0) == 0x80)
+                        cursor--;
+                /* Credit the characters actually covered by this jump. */
+                remaining += u_pointer_to_offset(cursor, from);
+        }
+        return (char *)cursor;
+}
+/*
+ * Bounded variant of u_offset_to_pointer().
+ *
+ * For a positive ‘offset’, ‘n’ is the number of bytes available at and
+ * after ‘str’; for a non-positive ‘offset’ it is the number of bytes
+ * available before ‘str’.  Returns NULL when the requested number of
+ * characters cannot be reached within that window, otherwise a pointer to
+ * the first byte of the target character.
+ */
+char *
+u_offset_to_pointer_n(const char *str, long offset, size_t n)
+{
+        const char *cursor = str;
+
+        if (offset > 0) {
+                const char *limit = str + n;
+                long remaining = offset;
+                while (cursor < limit && remaining > 0) {
+                        remaining--;
+                        cursor = u_next(cursor);
+                }
+                /* Ran out of bytes before consuming every character. */
+                return remaining > 0 ? NULL : (char *)cursor;
+        }
+
+        const char *limit = str - n;
+        for (long remaining = offset; remaining != 0; ) {
+                const char *from = cursor;
+                cursor += remaining;
+                while (cursor >= limit && (*cursor & 0xc0) == 0x80)
+                        cursor--;
+                if (cursor < limit)
+                        return NULL;
+                remaining += u_pointer_to_offset(cursor, from);
+        }
+        return (char *)cursor;
+}

data/ext/u/u_pointer_to_offset.c ADDED

@@ -0,0 +1,23 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Convert the pointer ‘pos’ to a character offset relative to ‘str’.
+ *
+ * The result is the number of u_next() steps needed to get from the lower
+ * of the two pointers to (or past) the higher one; it is negated when ‘pos’
+ * lies before ‘str’.
+ */
+long
+u_pointer_to_offset(const char *str, const char *pos)
+{
+        const char *low = str;
+        const char *high = pos;
+        long sign = 1;
+        if (pos < str) {
+                low = pos;
+                high = str;
+                sign = -1;
+        }
+        long count = 0;
+        while (low < high) {
+                low = u_next(low);
+                count++;
+        }
+        return sign * count;
+}

data/ext/u/u_recode.c ADDED

@@ -0,0 +1,73 @@
+#include "extconf.h"
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "u.h"
+#ifdef HAVE_ICONV
+#  include <iconv.h>
+#  include <limits.h>
+#endif
+/*
+ * Convert the N bytes of UTF-8 in STRING to CODESET with iconv(3), storing
+ * at most M bytes of the converted text in RESULT.
+ *
+ * Returns the number of bytes the conversion produced before it stopped.
+ * This may exceed M: once RESULT is full, further output is sent to a
+ * scratch buffer and discarded, so the return value can be used to size a
+ * second call.  When M is 0, RESULT is never written to.  Returns 0 if
+ * CODESET cannot be opened by iconv_open() (errno is left as set by it).
+ *
+ * An incomplete multibyte sequence at the end of STRING (EINVAL) ends the
+ * conversion quietly; any other iconv() error ends it with errno describing
+ * the failure.
+ *
+ * A terminating NUL is stored after the converted bytes only when the
+ * buffer currently being written still has room for it; it is never written
+ * past the end of RESULT or of the scratch buffer.
+ *
+ * NOTE(review): <iconv.h> is only included when HAVE_ICONV is defined, yet
+ * this function uses iconv unconditionally — confirm the build always
+ * defines HAVE_ICONV.
+ */
+size_t
+u_recode(char *result, size_t m, const char *string, size_t n,
+         const char *codeset)
+{
+        iconv_t cd = iconv_open(codeset, "UTF-8");
+        if (cd == (iconv_t)-1)
+                return 0;
+        bool done = false;
+        bool failed = false;
+        bool final = false;
+        bool too_big = false;
+        /* iconv() takes a non-const input pointer but does not modify it. */
+        char *p = (char *)string;
+        size_t p_left = n;
+        // We need to align the buffer.
+        union { unsigned int align; char buffer[4096]; } b;
+        char *base = m > 0 ? result : b.buffer;
+        char *q = base;
+        size_t q_left = m > 0 ? m : sizeof(b.buffer);
+        size_t written = 0;
+        while (!done && !failed) {
+                /* After all input is consumed, one more call flushes any
+                 * pending shift state. */
+                size_t err = final ?
+                        iconv(cd, NULL, NULL, &q, &q_left) :
+                        iconv(cd, &p, &p_left, &q, &q_left);
+                if (err == (size_t)-1) {
+                        switch (errno) {
+                        case EINVAL:
+                                /* Truncated sequence at end of input. */
+                                done = true;
+                                break;
+                        case E2BIG:
+                                /* Output buffer full: account for what was
+                                 * produced and keep counting into the
+                                 * scratch buffer. */
+                                written += q - base;
+                                if (!too_big) {
+                                        too_big = true;
+                                        base = b.buffer;
+                                }
+                                q = base;
+                                q_left = sizeof(b.buffer);
+                                errno = 0;
+                                break;
+                        default:
+                                failed = true;
+                                break;
+                        }
+                } else {
+                        if (!final)
+                                final = true;
+                        else
+                                done = true;
+                }
+        }
+        /* Only terminate when there is room: with q_left == 0, q points one
+         * past the end of the buffer being written. */
+        if (q_left > 0)
+                *q = '\0';
+        int saved_errno = errno;
+        /* Prefer the conversion error over one raised by iconv_close(). */
+        if (iconv_close(cd) < 0 && failed)
+                errno = saved_errno;
+        return written + (q - base);
+}

data/ext/u/u_reverse.c ADDED

@@ -0,0 +1,21 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+#include "output.h"
+/*
+ * Write the characters of the N bytes of STRING to RESULT in reverse order,
+ * decoding one character at a time from the end with u_decode_r().
+ *
+ * RESULT may be NULL only when M is 0.  Returns whatever output_finalize()
+ * reports for the M-byte output buffer.
+ */
+size_t
+u_reverse(char *result, size_t m, const char *string, size_t n)
+{
+        assert(string != NULL);
+        assert(result != NULL || m == 0);
+        struct output output = OUTPUT_INIT(result, m);
+        const char *cursor = string + n;
+        while (cursor > string) {
+                /* u_decode_r() moves ‘cursor’ back over the decoded character. */
+                uint32_t c = u_decode_r(&cursor, string, cursor);
+                output_char(&output, c);
+        }
+        return output_finalize(&output);
+}

data/ext/u/u_rindex.c ADDED

@@ -0,0 +1,132 @@
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+/*
+ * Retrieve the byte index of the right-most occurrence of NEEDLE in the
+ * NUL-terminated HAYSTACK, or (size_t)-1 if it doesn’t occur.
+ *
+ * An empty NEEDLE matches at the very end, so strlen(HAYSTACK) is returned.
+ */
+static U_PURE size_t
+str_rindex(const char *haystack, const char *needle)
+{
+	assert(haystack != NULL);
+	assert(needle != NULL);
+
+	size_t needle_n = strlen(needle);
+	size_t haystack_n = strlen(haystack);
+
+	if (needle_n == 0)
+		return haystack_n;
+	if (haystack_n < needle_n)
+		return (size_t)-1;
+
+	/* Scan candidate start positions from the last possible one down to
+	 * 0 by index, so no pointer before ‘haystack’ is ever formed. */
+	for (size_t i = haystack_n - needle_n + 1; i-- > 0; ) {
+		if (memcmp(haystack + i, needle, needle_n) == 0)
+			return i;
+	}
+
+	return (size_t)-1;
+}
+/*
+ * Retrieve the byte index of the right-most occurrence of the
+ * NUL-terminated NEEDLE in HAYSTACK, or (size_t)-1 if it doesn’t occur.
+ *
+ * At most HAYSTACK_N bytes of HAYSTACK are examined; the search region also
+ * ends at the first NUL byte, if one appears earlier.  An empty NEEDLE
+ * matches at the end of that region.
+ */
+static U_PURE size_t
+str_rindex_n(const char *haystack, const char *needle, size_t haystack_n)
+{
+	assert(haystack != NULL);
+	assert(needle != NULL);
+
+	size_t needle_n = strlen(needle);
+
+	/* Effective length: up to the first NUL, capped at haystack_n. */
+	const char *nul = memchr(haystack, '\0', haystack_n);
+	size_t length = nul != NULL ? (size_t)(nul - haystack) : haystack_n;
+
+	if (length < needle_n)
+		return (size_t)-1;
+
+	/* Scan candidate start positions from the last possible one down to
+	 * 0 by index, so no pointer outside ‘haystack’ is ever formed. */
+	for (size_t i = length - needle_n + 1; i-- > 0; ) {
+		if (memcmp(haystack + i, needle, needle_n) == 0)
+			return i;
+	}
+
+	return (size_t)-1;
+}
+/* {{{1
+ * Retrieve the byte index of the right-most occurrence of ‘c’ in the
+ * NUL-terminated ‘str’, or (size_t)-1 if it doesn’t exist.
+ */
+size_t
+u_char_rindex(const char *str, uint32_t c)
+{
+	/* NOTE(review): assumes u_char_to_u() stores at most 6 bytes in ‘ch’
+	 * and returns how many it stored — confirm against its definition. */
+	char ch[7];
+	ch[u_char_to_u(c, ch)] = '\0';
+	return str_rindex(str, ch);
+}
+/* {{{1
+ * Retrieve the byte index of the right-most occurrence of ‘c’ in ‘str’, or
+ * (size_t)-1 if it doesn’t exist, going over at most ‘n’ bytes in ‘str’.
+ */
+size_t
+u_char_rindex_n(const char *str, uint32_t c, size_t n)
+{
+	/* NOTE(review): assumes u_char_to_u() stores at most 6 bytes in ‘ch’
+	 * and returns how many it stored — confirm against its definition. */
+	char ch[7];
+	ch[u_char_to_u(c, ch)] = '\0';
+	return str_rindex_n(str, ch, n);
+}
+/* {{{1
+ * Retrieve the byte index of the right-most occurrence of ‘needle’ in
+ * ‘haystack’, or (size_t)-1 if it doesn’t exist.  Both arguments must be
+ * NUL-terminated.
+ */
+size_t
+u_rindex(const char *haystack, const char *needle)
+{
+	return str_rindex(haystack, needle);
+}
+/* {{{1
+ * Retrieve the byte index of the right-most occurrence of the
+ * NUL-terminated ‘needle’ in ‘haystack’, or (size_t)-1 if it doesn’t exist,
+ * going over at most ‘n’ bytes in ‘haystack’.
+ */
+size_t
+u_rindex_n(const char *haystack, const char *needle, size_t n)
+{
+	return str_rindex_n(haystack, needle, n);
+}

data/ext/u/u_titlecase.c ADDED

@@ -0,0 +1,68 @@
+#include <ruby.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+#include "data/constants.h"
+#include "attributes.h"
+#include "titled.h"
+#include "output.h"
+#include "u_locale.h"
+#include "case.h"
+/* Code points compared against in the LOCALE_DUTCH “ij” check of
+ * titlecase_step(). */
+#define LATIN_CAPITAL_LETTER_I ((uint32_t)0x0049)
+#define LATIN_CAPITAL_LETTER_J ((uint32_t)0x004a)
+#define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
+#define LATIN_SMALL_LETTER_J ((uint32_t)0x006a)
+/* State handed to the titlecasing callbacks by u_titlecase(). */
+struct titlecase_closure {
+        /* The input passed to u_titlecase(); forwarded to the case-step
+         * helpers as their ‘string’ argument. */
+        const char *string;
+        /* Locale selecting language-specific rules, e.g. LOCALE_DUTCH. */
+        enum locale locale;
+        /* Destination that receives the titlecased text. */
+        struct output *output;
+};
+/*
+ * Titlecase the segment [P, Q) into CLOSURE->output.
+ *
+ * Everything before the first cased character is copied unchanged.  That
+ * character is passed to _u_upcase_step() with its ‘title’ flag set, and the
+ * remainder of the segment is run through _u_downcase_step().  For
+ * LOCALE_DUTCH, an “i”/“I” immediately followed by “j”/“J” additionally gets
+ * a capital J emitted in place of the second letter.
+ */
+static void
+titlecase_step(const char *p, const char *q, struct titlecase_closure *closure)
+{
+        struct output *out = closure->output;
+        const char *cased = p;
+        const char *rest;
+
+        /* Find the first cased character; ‘rest’ ends up just after it. */
+        while (cased < q && !u_char_iscased(u_decode(&rest, cased, q)))
+                cased = rest;
+        output_string(out, p, cased - p);
+        if (cased == q)
+                return;
+
+        _u_upcase_step(closure->string, cased, q, closure->locale, true, out);
+
+        bool dutch_ij = closure->locale == LOCALE_DUTCH && rest < q &&
+                (*cased == LATIN_CAPITAL_LETTER_I ||
+                 *cased == LATIN_SMALL_LETTER_I) &&
+                (*rest == LATIN_CAPITAL_LETTER_J ||
+                 *rest == LATIN_SMALL_LETTER_J);
+        if (dutch_ij) {
+                output_char(out, LATIN_CAPITAL_LETTER_J);
+                rest++;
+        }
+
+        while (rest < q)
+                rest = _u_downcase_step(closure->string, rest, q,
+                                        closure->locale, out);
+}
+/*
+ * Callback given to u_words() by u_titlecase(): titlecases the N bytes
+ * starting at P by forwarding them to titlecase_step() as the range
+ * [P, P + N).
+ */
+static void
+titlecase_words(const char *p, size_t n, struct titlecase_closure *closure)
+{
+        titlecase_step(p, p + n, closure);
+}
+/*
+ * Titlecase the N bytes of STRING into the M-byte buffer RESULT, using the
+ * rules selected by LOCALE (as interpreted by _u_locale_from_string()).
+ *
+ * The input is handed to u_words(), which invokes titlecase_words() on each
+ * piece it reports.  RESULT may be NULL only when M is 0.  Returns whatever
+ * output_finalize() reports for the output buffer.
+ */
+size_t
+u_titlecase(char *result, size_t m, const char *string, size_t n,
+            const char *locale)
+{
+        assert(string != NULL);
+        assert(result != NULL || m == 0);
+        struct output output = OUTPUT_INIT(result, m);
+        struct titlecase_closure closure = {
+                .string = string,
+                .locale = _u_locale_from_string(locale),
+                .output = &output,
+        };
+        u_words(string, n, (u_substring_fn)titlecase_words, &closure);
+        return output_finalize(&output);
+}

data/ext/u/u_upcase.c ADDED

@@ -0,0 +1,89 @@
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+#include "data/constants.h"
+#include "attributes.h"
+#include "titled.h"
+#include "output.h"
+#include "u_locale.h"
+#include "case.h"
+/* Code points that _u_upcase_step() tests for or emits in its special
+ * cases. */
+#define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
+#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((uint32_t)0x0130)
+#define COMBINING_DOT_ABOVE ((uint32_t)0x0307)
+#define COMBINING_GREEK_YPOGEGRAMMENI ((uint32_t)0x0345)
+#define GREEK_CAPITAL_LETTER_IOTA ((uint32_t)0x0399)
+/*
+ * Report whether CATEGORY is one of the three mark general categories
+ * (non-spacing, spacing combining, enclosing).
+ *
+ * NOTE(review): IS() and OR() are defined outside this file; presumably
+ * they build and test a bitmask of categories — confirm.
+ */
+static inline bool
+ismark(int category)
+{
+        return IS(category,
+                  OR(U_GENERAL_CATEGORY_MARK_NON_SPACING,
+                     OR(U_GENERAL_CATEGORY_MARK_SPACING_COMBINING,
+                        OR(U_GENERAL_CATEGORY_MARK_ENCLOSING, 0))));
+}
+/*
+ * Copy the run of mark characters that starts at Q (and ends before END) to
+ * OUTPUT.  Returns a pointer to the first character that is not a mark, or
+ * the position reached when END is hit.
+ */
+static inline const char *
+output_marks(const char *q, const char *end,
+             struct output *output)
+{
+        const char *mark = q;
+        while (mark < end) {
+                const char *after;
+                uint32_t c = u_decode(&after, mark, end);
+                if (!ismark(u_char_general_category(c)))
+                        return mark;
+                output_char(output, c);
+                mark = after;
+        }
+        return mark;
+}
+/*
+ * Upcase (or, when TITLE is set, titlecase) the single character that
+ * starts at P and ends before END, writing the result to OUTPUT.
+ *
+ * STRING is passed on to is_after() for the Lithuanian check; LOCALE
+ * enables the Lithuanian and Turkic special cases.  Returns a pointer just
+ * past the input that was consumed, which is more than one character when
+ * the COMBINING GREEK YPOGEGRAMMENI branch also consumes the marks that
+ * follow it.
+ */
+const char *
+_u_upcase_step(const char *string, const char *p, const char *end,
+               enum locale locale, bool title, struct output *output)
+{
+        const char *q;
+        uint32_t c = u_decode(&q, p, end);
+        enum u_general_category gc;
+        if (!title && c == COMBINING_GREEK_YPOGEGRAMMENI) {
+                /* Emit the marks following the ypogegrammeni first, then a
+                 * capital iota in its place. */
+                q = output_marks(q, end, output);
+                output_char(output, GREEK_CAPITAL_LETTER_IOTA);
+        } else if (locale == LOCALE_LITHUANIAN &&
+                   c == COMBINING_DOT_ABOVE &&
+                   is_after(string, p, u_char_issoftdotted))
+                /* Lithuanian: output nothing for this combining dot above.
+                 * NOTE(review): is_after() presumably checks for a preceding
+                 * soft-dotted character — confirm. */
+                ;
+        else if (locale == LOCALE_TURKIC && c == LATIN_SMALL_LETTER_I)
+                /* Turkic: small i becomes capital I with dot above. */
+                output_char(output, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+        else if (IS(gc = u_char_general_category(c),
+                    OR(U_GENERAL_CATEGORY_LETTER_LOWERCASE,
+                       OR(U_GENERAL_CATEGORY_LETTER_TITLECASE, 0))))
+                /* ‘gc’ is assigned inside the condition above and reused
+                 * here; case_simple() is defined outside this file. */
+                case_simple(c, gc,
+                            title || gc != U_GENERAL_CATEGORY_LETTER_LOWERCASE,
+                            true,
+                            output);
+        else
+                /* Any other character is copied through unchanged. */
+                output_string(output, p, q - p);
+        return q;
+}
+/*
+ * Upcase the N bytes of STRING into the M-byte buffer RESULT, using the
+ * rules selected by LOCALE (as interpreted by _u_locale_from_string()).
+ *
+ * RESULT may be NULL only when M is 0.  Returns whatever output_finalize()
+ * reports for the output buffer.
+ */
+size_t
+u_upcase(char *result, size_t m, const char *string, size_t n,
+         const char *locale)
+{
+        assert(string != NULL);
+        assert(result != NULL || m == 0);
+        enum locale resolved = _u_locale_from_string(locale);
+        const char *limit = string + n;
+        struct output output = OUTPUT_INIT(result, m);
+        const char *cursor = string;
+        while (cursor < limit)
+                cursor = _u_upcase_step(string, cursor, limit, resolved,
+                                        false, &output);
+        return output_finalize(&output);
+}
+}