RubyGems - u - Versions diffs - 0.5.0 → 1.0.0 - Mend

u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (316) hide show

checksums.yaml +7 -0
data/build/ext/u/data/attributes.rb +39 -0
data/build/ext/u/data/bidi-mirroring.rb +27 -0
data/build/ext/u/data/canonical-combining-class.rb +15 -0
data/build/ext/u/data/case-folding.rb +39 -0
data/build/ext/u/data/cased.rb +19 -0
data/build/ext/u/data/compose.rb +304 -0
data/build/ext/u/data/constants.rb +31 -0
data/build/ext/u/data/decompose.rb +85 -0
data/build/ext/u/data/general-category.rb +61 -0
data/build/ext/u/data/grapheme-word-break.rb +15 -0
data/build/ext/u/data/marshalled.rb +5 -0
data/build/ext/u/data/script.rb +91 -0
data/build/ext/u/data/soft-dotted.rb +17 -0
data/build/ext/u/data/title-table.rb +30 -0
data/build/ext/u/data/wide.rb +17 -0
data/build/lib/u/build.rb +8 -0
data/build/lib/u/build/data.rb +16 -0
data/build/lib/u/build/data/bidimirroring.rb +26 -0
data/build/lib/u/build/data/break.rb +14 -0
data/build/lib/u/build/data/casefolding.rb +77 -0
data/build/lib/u/build/data/compositionexclusions.rb +14 -0
data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
data/build/lib/u/build/data/file.rb +88 -0
data/build/lib/u/build/data/linebreak.rb +14 -0
data/build/lib/u/build/data/proplist.rb +18 -0
data/build/lib/u/build/data/scripts.rb +22 -0
data/build/lib/u/build/data/specialcasing.rb +106 -0
data/build/lib/u/build/data/unicode.rb +41 -0
data/build/lib/u/build/data/unicode/entry.rb +27 -0
data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
data/build/lib/u/build/data/unicode/points.rb +32 -0
data/build/lib/u/build/header.rb +11 -0
data/build/lib/u/build/header/table.rb +19 -0
data/build/lib/u/build/header/table/row.rb +64 -0
data/build/lib/u/build/header/tables.rb +6 -0
data/build/lib/u/build/header/tables/intervals.rb +50 -0
data/build/lib/u/build/header/tables/split.rb +20 -0
data/build/lib/u/build/header/tables/split/data.rb +29 -0
data/build/lib/u/build/header/tables/split/part1.rb +28 -0
data/build/lib/u/build/header/tables/split/part2.rb +13 -0
data/build/lib/u/build/header/tables/split/row.rb +34 -0
data/build/lib/u/build/header/tables/split/rows.rb +22 -0
data/build/test/unit/break.rb +45 -0
data/build/test/unit/case.rb +178 -0
data/build/test/unit/foldcase.rb +44 -0
data/build/test/unit/normalize.rb +81 -0
data/ext/u/attributes.c +62 -0
data/ext/u/attributes.h +5 -0
data/ext/u/case.h +41 -0
data/ext/u/data/attributes.h +3070 -0
data/ext/u/data/bidi-mirroring.h +373 -0
data/ext/u/data/canonical-combining-class.h +2157 -0
data/ext/u/data/case-folding.h +171 -0
data/ext/u/data/cased.h +42 -0
data/ext/u/data/compose.h +1714 -0
data/ext/u/data/constants.h +17 -0
data/ext/u/data/decompose.h +9356 -0
data/ext/u/data/general-category.h +28959 -0
data/ext/u/data/grapheme-break.h +13201 -0
data/ext/u/data/line-break.h +26501 -0
data/ext/u/data/normalization-quick-check.h +3002 -0
data/ext/u/data/script.h +2928 -0
data/ext/u/data/soft-dotted.h +55 -0
data/ext/u/data/title-table.h +41 -0
data/ext/u/data/types.h +11117 -0
data/ext/u/data/wide-cjk.h +197 -0
data/ext/u/data/wide.h +59 -0
data/ext/u/data/word-break.h +10001 -0
data/ext/u/depend +281 -0
data/ext/u/extconf.rb +158 -0
data/ext/u/output.h +51 -0
data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
data/ext/u/private.h +58 -0
data/ext/u/rb_includes.h +10 -0
data/ext/u/rb_private.c +98 -0
data/ext/u/rb_private.h +67 -0
data/ext/u/rb_u.c +251 -0
data/ext/u/rb_u_buffer.c +443 -0
data/ext/u/rb_u_buffer.h +24 -0
data/ext/u/rb_u_re.c +43 -0
data/ext/u/rb_u_re.h +15 -0
data/ext/u/rb_u_string.c +478 -0
data/ext/u/rb_u_string.h +173 -0
data/ext/u/rb_u_string_alnum.c +10 -0
data/ext/u/rb_u_string_alpha.c +10 -0
data/ext/u/rb_u_string_aref.c +142 -0
data/ext/u/rb_u_string_ascii_only.c +13 -0
data/ext/u/rb_u_string_assigned.c +10 -0
data/ext/u/rb_u_string_b.c +18 -0
data/ext/u/rb_u_string_bytesize.c +10 -0
data/ext/u/rb_u_string_byteslice.c +103 -0
data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
data/ext/u/rb_u_string_case_ignorable.c +25 -0
data/ext/u/rb_u_string_casecmp.c +61 -0
data/ext/u/rb_u_string_cased.c +17 -0
data/ext/u/rb_u_string_chomp.c +107 -0
data/ext/u/rb_u_string_chop.c +33 -0
data/ext/u/rb_u_string_chr.c +9 -0
data/ext/u/rb_u_string_cntrl.c +10 -0
data/ext/u/rb_u_string_collate.c +46 -0
data/ext/u/rb_u_string_collation_key.c +18 -0
data/ext/u/rb_u_string_count.c +38 -0
data/ext/u/rb_u_string_defined.c +10 -0
data/ext/u/rb_u_string_delete.c +62 -0
data/ext/u/rb_u_string_digit.c +10 -0
data/ext/u/rb_u_string_downcase.c +13 -0
data/ext/u/rb_u_string_dump.c +153 -0
data/ext/u/rb_u_string_each_byte.c +46 -0
data/ext/u/rb_u_string_each_char.c +49 -0
data/ext/u/rb_u_string_each_codepoint.c +45 -0
data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
data/ext/u/rb_u_string_each_line.c +142 -0
data/ext/u/rb_u_string_each_word.c +34 -0
data/ext/u/rb_u_string_empty.c +11 -0
data/ext/u/rb_u_string_end_with.c +31 -0
data/ext/u/rb_u_string_eql.c +30 -0
data/ext/u/rb_u_string_equal.c +33 -0
data/ext/u/rb_u_string_foldcase.c +12 -0
data/ext/u/rb_u_string_folded.c +13 -0
data/ext/u/rb_u_string_format.c +1745 -0
data/ext/u/rb_u_string_general_category.c +109 -0
data/ext/u/rb_u_string_getbyte.c +21 -0
data/ext/u/rb_u_string_graph.c +21 -0
data/ext/u/rb_u_string_grapheme_break.c +61 -0
data/ext/u/rb_u_string_gsub.c +164 -0
data/ext/u/rb_u_string_hash.c +10 -0
data/ext/u/rb_u_string_hex.c +9 -0
data/ext/u/rb_u_string_include.c +10 -0
data/ext/u/rb_u_string_index.c +110 -0
data/ext/u/rb_u_string_inspect.c +189 -0
data/ext/u/rb_u_string_internal_tr.c +148 -0
data/ext/u/rb_u_string_internal_tr.h +29 -0
data/ext/u/rb_u_string_justify.c +169 -0
data/ext/u/rb_u_string_length.c +10 -0
data/ext/u/rb_u_string_line_break.c +115 -0
data/ext/u/rb_u_string_lower.c +13 -0
data/ext/u/rb_u_string_lstrip.c +24 -0
data/ext/u/rb_u_string_match.c +65 -0
data/ext/u/rb_u_string_mirror.c +16 -0
data/ext/u/rb_u_string_newline.c +21 -0
data/ext/u/rb_u_string_normalize.c +70 -0
data/ext/u/rb_u_string_normalized.c +28 -0
data/ext/u/rb_u_string_oct.c +11 -0
data/ext/u/rb_u_string_ord.c +14 -0
data/ext/u/rb_u_string_partition.c +80 -0
data/ext/u/rb_u_string_plus.c +33 -0
data/ext/u/rb_u_string_print.c +10 -0
data/ext/u/rb_u_string_punct.c +10 -0
data/ext/u/rb_u_string_reverse.c +13 -0
data/ext/u/rb_u_string_rindex.c +104 -0
data/ext/u/rb_u_string_rpartition.c +81 -0
data/ext/u/rb_u_string_rstrip.c +29 -0
data/ext/u/rb_u_string_scan.c +109 -0
data/ext/u/rb_u_string_script.c +253 -0
data/ext/u/rb_u_string_soft_dotted.c +13 -0
data/ext/u/rb_u_string_space.c +24 -0
data/ext/u/rb_u_string_split.c +245 -0
data/ext/u/rb_u_string_squeeze.c +75 -0
data/ext/u/rb_u_string_start_with.c +31 -0
data/ext/u/rb_u_string_strip.c +36 -0
data/ext/u/rb_u_string_sub.c +147 -0
data/ext/u/rb_u_string_times.c +35 -0
data/ext/u/rb_u_string_title.c +10 -0
data/ext/u/rb_u_string_titlecase.c +13 -0
data/ext/u/rb_u_string_to_i.c +45 -0
data/ext/u/rb_u_string_to_inum.c +364 -0
data/ext/u/rb_u_string_to_inum.h +1 -0
data/ext/u/rb_u_string_to_str.c +17 -0
data/ext/u/rb_u_string_to_sym.c +12 -0
data/ext/u/rb_u_string_tr.c +290 -0
data/ext/u/rb_u_string_upcase.c +12 -0
data/ext/u/rb_u_string_upper.c +13 -0
data/ext/u/rb_u_string_valid.c +10 -0
data/ext/u/rb_u_string_valid_encoding.c +12 -0
data/ext/u/rb_u_string_wide.c +21 -0
data/ext/u/rb_u_string_wide_cjk.c +21 -0
data/ext/u/rb_u_string_width.c +19 -0
data/ext/u/rb_u_string_word_break.c +63 -0
data/ext/u/rb_u_string_xdigit.c +22 -0
data/ext/u/rb_u_string_zero_width.c +16 -0
data/ext/u/titled.c +55 -0
data/ext/u/titled.h +1 -0
data/ext/u/u.c +23 -0
data/ext/u/u.h +458 -0
data/ext/u/u_char_canonical_combining_class.c +31 -0
data/ext/u/u_char_digit_value.c +21 -0
data/ext/u/u_char_downcase.c +27 -0
data/ext/u/u_char_general_category.c +31 -0
data/ext/u/u_char_grapheme_break.c +28 -0
data/ext/u/u_char_isalnum.c +24 -0
data/ext/u/u_char_isalpha.c +21 -0
data/ext/u/u_char_isassigned.c +16 -0
data/ext/u/u_char_iscased.c +22 -0
data/ext/u/u_char_iscaseignorable.c +29 -0
data/ext/u/u_char_iscntrl.c +17 -0
data/ext/u/u_char_isdefined.c +15 -0
data/ext/u/u_char_isdigit.c +16 -0
data/ext/u/u_char_isgraph.c +22 -0
data/ext/u/u_char_islower.c +16 -0
data/ext/u/u_char_isnewline.c +24 -0
data/ext/u/u_char_isprint.c +21 -0
data/ext/u/u_char_ispunct.c +27 -0
data/ext/u/u_char_issoftdotted.c +18 -0
data/ext/u/u_char_isspace.c +28 -0
data/ext/u/u_char_isupper.c +16 -0
data/ext/u/u_char_isvalid.c +18 -0
data/ext/u/u_char_iswide.c +18 -0
data/ext/u/u_char_iswide_cjk.c +22 -0
data/ext/u/u_char_isxdigit.c +27 -0
data/ext/u/u_char_iszerowidth.c +29 -0
data/ext/u/u_char_line_break.c +29 -0
data/ext/u/u_char_mirror.c +16 -0
data/ext/u/u_char_normalized.c +23 -0
data/ext/u/u_char_script.c +41 -0
data/ext/u/u_char_to_u.c +48 -0
data/ext/u/u_char_upcase.c +24 -0
data/ext/u/u_char_width.c +12 -0
data/ext/u/u_char_word_break.c +28 -0
data/ext/u/u_char_xdigit_value.c +31 -0
data/ext/u/u_collate.c +83 -0
data/ext/u/u_collation_key.c +132 -0
data/ext/u/u_decode.c +156 -0
data/ext/u/u_downcase.c +201 -0
data/ext/u/u_foldcase.c +68 -0
data/ext/u/u_grapheme_clusters.c +57 -0
data/ext/u/u_has_prefix.c +27 -0
data/ext/u/u_index.c +93 -0
data/ext/u/u_is_ascii_only.c +33 -0
data/ext/u/u_locale.c +40 -0
data/ext/u/u_locale.h +14 -0
data/ext/u/u_mirror.c +20 -0
data/ext/u/u_n_bytes.c +16 -0
data/ext/u/u_n_chars.c +43 -0
data/ext/u/u_normalize.c +232 -0
data/ext/u/u_normalized.c +28 -0
data/ext/u/u_offset_to_pointer.c +62 -0
data/ext/u/u_pointer_to_offset.c +23 -0
data/ext/u/u_recode.c +73 -0
data/ext/u/u_reverse.c +21 -0
data/ext/u/u_rindex.c +132 -0
data/ext/u/u_titlecase.c +68 -0
data/ext/u/u_upcase.c +89 -0
data/ext/u/u_width.c +35 -0
data/ext/u/u_words.c +82 -0
data/ext/u/yield.h +27 -0
data/lib/u-1.0.rb +20 -0
data/lib/u-1.0/buffer.rb +10 -0
data/lib/u-1.0/string.rb +9 -0
data/lib/u-1.0/version.rb +287 -0
data/test/unit/case.rb +2080 -0
data/test/unit/foldcase.rb +1136 -0
data/test/unit/graphemebreak.rb +407 -0
data/test/unit/normalize.rb +367545 -0
data/test/unit/u-1.0.rb +10 -0
data/test/unit/u-1.0/buffer.rb +52 -0
data/test/unit/u-1.0/string.rb +1439 -0
data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
data/test/unit/wordbreak.rb +1083 -0
metadata +603 -148
data/README +0 -38
data/Rakefile +0 -64
data/ext/encoding/character/utf-8/break.c +0 -25
data/ext/encoding/character/utf-8/data/break.h +0 -22931
data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
data/ext/encoding/character/utf-8/data/compose.h +0 -1607
data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
data/ext/encoding/character/utf-8/decompose.c +0 -444
data/ext/encoding/character/utf-8/depend +0 -65
data/ext/encoding/character/utf-8/extconf.rb +0 -67
data/ext/encoding/character/utf-8/private.h +0 -51
data/ext/encoding/character/utf-8/properties.c +0 -1056
data/ext/encoding/character/utf-8/rb_includes.h +0 -19
data/ext/encoding/character/utf-8/rb_methods.h +0 -49
data/ext/encoding/character/utf-8/rb_private.h +0 -52
data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
data/ext/encoding/character/utf-8/tables.h +0 -38
data/ext/encoding/character/utf-8/unicode.c +0 -319
data/ext/encoding/character/utf-8/unicode.h +0 -216
data/ext/encoding/character/utf-8/utf.c +0 -1334
data/lib/encoding/character/utf-8.rb +0 -201
data/lib/u.rb +0 -16
data/lib/u/string.rb +0 -185
data/lib/u/version.rb +0 -5
data/test/unit/u/string.rb +0 -91

data/ext/u/rb_u_string_xdigit.c ADDED

@@ -0,0 +1,22 @@
+#include "rb_includes.h"
+/* @overload xdigit?
+ *
+ *   Returns true if the receiver contains only hexadecimal digits, that is,
+ *   characters in the general category Number, decimal digit (Nd) or lower-
+ *   or uppercase letters between ‘a’ and ‘f’, including their fullwidth
+ *   forms.  Specifically, any character that
+ *
+ *   * Belongs to the general category Number, decimal digit (Nd)
+ *   * Falls in the range U+0041 (LATIN CAPITAL LETTER A) through U+0046 (LATIN CAPITAL LETTER F)
+ *   * Falls in the range U+0061 (LATIN SMALL LETTER A) through U+0066 (LATIN SMALL LETTER F)
+ *   * Falls in the range U+FF21 (FULLWIDTH LATIN CAPITAL LETTER A) through U+FF26 (FULLWIDTH LATIN CAPITAL LETTER F)
+ *   * Falls in the range U+FF41 (FULLWIDTH LATIN SMALL LETTER A) through U+FF46 (FULLWIDTH LATIN SMALL LETTER F)
+ *
+ *   will do.
+ *
+ *   @return [Boolean] */
+VALUE
+rb_u_string_xdigit(VALUE self)
+{
+        /* Run u_char_isxdigit through the shared character-test helper. */
+        return _rb_u_character_test(self, u_char_isxdigit);
+}

data/ext/u/rb_u_string_zero_width.c ADDED

@@ -0,0 +1,16 @@
+#include "rb_includes.h"
+/* @overload zero_width?
+ *
+ *   Returns true if the receiver contains only “zero-width” characters.  A
+ *   zero-width character is defined as a character in the general categories
+ *   Mark, nonspacing (Mn), Mark, enclosing (Me) or Other, format (Cf),
+ *   excluding the character U+00AD (SOFT HYPHEN), or is a Hangul character
+ *   between U+1160 and U+1200 or U+200B (ZERO WIDTH SPACE).
+ *
+ *   @return [Boolean] */
+VALUE
+rb_u_string_zero_width(VALUE self)
+{
+        /* Run u_char_iszerowidth through the shared character-test helper. */
+        return _rb_u_character_test(self, u_char_iszerowidth);
+}

data/ext/u/titled.c ADDED

@@ -0,0 +1,55 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+#include "private.h"
+#include "data/constants.h"
+#include "data/title-table.h"
+#include "titled.h"
+/* {{{1
+ * Report whether ‘c’ is a titlecase letter, that is, whether it has an entry
+ * in the title table.  An example is the slavic digraph Ǳ, which is written ǲ
+ * at the beginning of a word, with only the initial D capitalized.
+ */
+bool
+u_char_istitle(uint32_t c)
+{
+        /* The lookup needs somewhere to store the position; it is unused. */
+        size_t position;
+
+        return unicode_table_lookup(title_table, c, &position);
+}
+/* {{{1
+ * Map ‘c’ to its titlecase form.  A character that appears in any column of
+ * the title table (titlecase, upper or lower) maps to that row’s titlecase
+ * character.  Any other lowercase letter maps to its uppercase form, and
+ * everything else is returned unchanged.
+ */
+uint32_t
+u_char_titlecase(uint32_t c)
+{
+        size_t row = 0;
+
+        while (row < lengthof(title_table)) {
+                if (c == title_table[row].ch ||
+                    c == title_table[row].upper ||
+                    c == title_table[row].lower)
+                        return title_table[row].ch;
+                row++;
+        }
+
+        if (u_char_general_category(c) != U_GENERAL_CATEGORY_LETTER_LOWERCASE)
+                return c;
+        return u_char_upcase(c);
+}
+/* {{{1
+ * Look ‘c’ up in the title table and return the uppercase (‘want_upper’ true)
+ * or lowercase (‘want_upper’ false) character stored in the matching row.  If
+ * the lookup finds no row, ‘c’ is returned unchanged.
+ */
+uint32_t
+_u_titlecase_table_lookup(uint32_t c, bool want_upper)
+{
+        size_t row;
+
+        if (unicode_table_lookup(title_table, c, &row)) {
+                if (want_upper)
+                        return title_table[row].upper;
+                return title_table[row].lower;
+        }
+        return c;
+}

data/ext/u/titled.h ADDED

@@ -0,0 +1 @@
+U_CONST uint32_t _u_titlecase_table_lookup(uint32_t c, bool want_upper);

data/ext/u/u.c ADDED

@@ -0,0 +1,23 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include "u.h"
+/* {{{1
+ * s_u_skip_length_data: This table is used for keeping track of how long a
+ * given UTF-8 character sequence is from the contents of the first byte.  It
+ * has one entry for each of the 256 possible byte values.
+ *
+ * NOTE(review): lead bytes 0xF8 through 0xFD map to lengths 5 and 6, which no
+ * longer occur in valid UTF-8 (RFC 3629 stops at 4 bytes); confirm that
+ * callers validate or bound the skip before trusting it.
+ */
+static const uint8_t s_u_skip_length_data[256] = {
+	/* 0x00–0xBF: ASCII and continuation bytes all advance by one byte. */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	/* 0xC0–0xDF: two-byte lead bytes. */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	/* 0xE0–0xEF: 3, 0xF0–0xF7: 4, 0xF8–0xFB: 5, 0xFC–0xFD: 6, 0xFE–0xFF: 1. */
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
+};
+/* Public view of the table above, exposed as a pointer to char. */
+const char * const u_skip_lengths = (const char *)s_u_skip_length_data;

data/ext/u/u.h ADDED

@@ -0,0 +1,458 @@
+/* Function-attribute helpers.  Each macro expands to the corresponding GCC
+ * attribute when the compiler identifies itself as a GCC release that
+ * supports it, and to nothing otherwise, so declarations can be annotated
+ * unconditionally.
+ *
+ *   U_CONST                 __attribute__((__const__)),   GCC 2.5 and later
+ *   U_PURE                  __attribute__((__pure__)),    GCC 2.96 and later
+ *   U_NON_NULL(parameters)  __attribute__((__nonnull__)), GCC 3.3 and later
+ *
+ * U_NON_NULL takes a parenthesized list of 1-based parameter indexes, for
+ * example U_NON_NULL((1, 3)).
+ */
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
+#  define U_CONST __attribute__((__const__))
+#else
+#  define U_CONST
+#endif
+/* The pure attribute first shipped in GCC 2.96, so include that release. */
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+#  define U_PURE __attribute__((__pure__))
+#else
+#  define U_PURE
+#endif
+/* The nonnull attribute was added in GCC 3.3; older releases do not know it. */
+#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
+#  define U_NON_NULL(parameters) __attribute__((__nonnull__ parameters))
+#else
+#  define U_NON_NULL(parameters)
+#endif
+/* Byte-length bound for a single encoded character — presumably the longest
+ * UTF-8 sequence the library produces; TODO confirm against u_char_to_u(). */
+#define U_CHAR_MAX_BYTE_LENGTH 4
+/* Number of code points in the range U+0000 through U+10FFFF inclusive. */
+#define U_N_CODEPOINTS (0x10ffff + 1)
+/* General category of a character, as returned by u_char_general_category().
+ * The enumerators have no explicit values, so they are numbered from 0 in
+ * declaration order.
+ * NOTE(review): the generated data tables presumably store these numbers;
+ * confirm before reordering or inserting enumerators. */
+enum u_general_category {
+        U_GENERAL_CATEGORY_OTHER_CONTROL,
+        U_GENERAL_CATEGORY_OTHER_FORMAT,
+        U_GENERAL_CATEGORY_OTHER_NOT_ASSIGNED,
+        U_GENERAL_CATEGORY_OTHER_PRIVATE_USE,
+        U_GENERAL_CATEGORY_OTHER_SURROGATE,
+        U_GENERAL_CATEGORY_LETTER_LOWERCASE,
+        U_GENERAL_CATEGORY_LETTER_MODIFIER,
+        U_GENERAL_CATEGORY_LETTER_OTHER,
+        U_GENERAL_CATEGORY_LETTER_TITLECASE,
+        U_GENERAL_CATEGORY_LETTER_UPPERCASE,
+        U_GENERAL_CATEGORY_MARK_SPACING_COMBINING,
+        U_GENERAL_CATEGORY_MARK_ENCLOSING,
+        U_GENERAL_CATEGORY_MARK_NON_SPACING,
+        U_GENERAL_CATEGORY_NUMBER_DECIMAL,
+        U_GENERAL_CATEGORY_NUMBER_LETTER,
+        U_GENERAL_CATEGORY_NUMBER_OTHER,
+        U_GENERAL_CATEGORY_PUNCTUATION_CONNECTOR,
+        U_GENERAL_CATEGORY_PUNCTUATION_DASH,
+        U_GENERAL_CATEGORY_PUNCTUATION_CLOSE,
+        U_GENERAL_CATEGORY_PUNCTUATION_FINAL_QUOTE,
+        U_GENERAL_CATEGORY_PUNCTUATION_INITIAL_QUOTE,
+        U_GENERAL_CATEGORY_PUNCTUATION_OTHER,
+        U_GENERAL_CATEGORY_PUNCTUATION_OPEN,
+        U_GENERAL_CATEGORY_SYMBOL_CURRENCY,
+        U_GENERAL_CATEGORY_SYMBOL_MODIFIER,
+        U_GENERAL_CATEGORY_SYMBOL_MATH,
+        U_GENERAL_CATEGORY_SYMBOL_OTHER,
+        U_GENERAL_CATEGORY_SEPARATOR_LINE,
+        U_GENERAL_CATEGORY_SEPARATOR_PARAGRAPH,
+        U_GENERAL_CATEGORY_SEPARATOR_SPACE,
+};
+/* Per-character queries and conversions.  Every function takes one code
+ * point by value and is declared U_CONST.
+ *
+ * Classification predicates. */
+U_CONST bool u_char_isalnum(uint32_t c);
+U_CONST bool u_char_isalpha(uint32_t c);
+U_CONST bool u_char_isassigned(uint32_t c);
+U_CONST bool u_char_iscased(uint32_t c);
+U_CONST bool u_char_iscaseignorable(uint32_t c);
+U_CONST bool u_char_iscntrl(uint32_t c);
+U_CONST bool u_char_isdefined(uint32_t c);
+U_CONST bool u_char_isdigit(uint32_t c);
+U_CONST bool u_char_isgraph(uint32_t c);
+U_CONST bool u_char_islower(uint32_t c);
+U_CONST bool u_char_isnewline(uint32_t c);
+U_CONST bool u_char_isprint(uint32_t c);
+U_CONST bool u_char_ispunct(uint32_t c);
+U_CONST bool u_char_issoftdotted(uint32_t c);
+U_CONST bool u_char_isspace(uint32_t c);
+U_CONST bool u_char_istitle(uint32_t c);
+U_CONST bool u_char_isupper(uint32_t c);
+U_CONST bool u_char_isvalid(uint32_t c);
+U_CONST bool u_char_iswide(uint32_t c);
+U_CONST bool u_char_iswide_cjk(uint32_t c);
+U_CONST bool u_char_isxdigit(uint32_t c);
+U_CONST bool u_char_iszerowidth(uint32_t c);
+/* Single-character case conversions. */
+U_CONST uint32_t u_char_downcase(uint32_t c);
+U_CONST uint32_t u_char_titlecase(uint32_t c);
+U_CONST uint32_t u_char_upcase(uint32_t c);
+/* Numeric properties.  NOTE(review): the value returned by the *_value
+ * functions for a non-digit is not visible here — check the implementations. */
+U_CONST size_t u_char_width(uint32_t c);
+U_CONST int u_char_digit_value(uint32_t c);
+U_CONST int u_char_xdigit_value(uint32_t c);
+U_CONST enum u_general_category u_char_general_category(uint32_t c);
+/* Canonical combining class of a character.  Unlike the other property enums
+ * in this header, every enumerator is given an explicit value: the CCCn
+ * enumerators equal n, and the named ones carry the numbers written below. */
+enum u_canonical_combining_class {
+        U_CANONICAL_COMBINING_CLASS_NOT_REORDERED = 0,
+        U_CANONICAL_COMBINING_CLASS_OVERLAY = 1,
+        U_CANONICAL_COMBINING_CLASS_NUKTA = 7,
+        U_CANONICAL_COMBINING_CLASS_KANA_VOICING = 8,
+        U_CANONICAL_COMBINING_CLASS_VIRAMA = 9,
+        U_CANONICAL_COMBINING_CLASS_CCC10 = 10,
+        U_CANONICAL_COMBINING_CLASS_CCC11 = 11,
+        U_CANONICAL_COMBINING_CLASS_CCC12 = 12,
+        U_CANONICAL_COMBINING_CLASS_CCC13 = 13,
+        U_CANONICAL_COMBINING_CLASS_CCC14 = 14,
+        U_CANONICAL_COMBINING_CLASS_CCC15 = 15,
+        U_CANONICAL_COMBINING_CLASS_CCC16 = 16,
+        U_CANONICAL_COMBINING_CLASS_CCC17 = 17,
+        U_CANONICAL_COMBINING_CLASS_CCC18 = 18,
+        U_CANONICAL_COMBINING_CLASS_CCC19 = 19,
+        U_CANONICAL_COMBINING_CLASS_CCC20 = 20,
+        U_CANONICAL_COMBINING_CLASS_CCC21 = 21,
+        U_CANONICAL_COMBINING_CLASS_CCC22 = 22,
+        U_CANONICAL_COMBINING_CLASS_CCC23 = 23,
+        U_CANONICAL_COMBINING_CLASS_CCC24 = 24,
+        U_CANONICAL_COMBINING_CLASS_CCC25 = 25,
+        U_CANONICAL_COMBINING_CLASS_CCC26 = 26,
+        U_CANONICAL_COMBINING_CLASS_CCC27 = 27,
+        U_CANONICAL_COMBINING_CLASS_CCC28 = 28,
+        U_CANONICAL_COMBINING_CLASS_CCC29 = 29,
+        U_CANONICAL_COMBINING_CLASS_CCC30 = 30,
+        U_CANONICAL_COMBINING_CLASS_CCC31 = 31,
+        U_CANONICAL_COMBINING_CLASS_CCC32 = 32,
+        U_CANONICAL_COMBINING_CLASS_CCC33 = 33,
+        U_CANONICAL_COMBINING_CLASS_CCC34 = 34,
+        U_CANONICAL_COMBINING_CLASS_CCC35 = 35,
+        U_CANONICAL_COMBINING_CLASS_CCC36 = 36,
+        U_CANONICAL_COMBINING_CLASS_CCC84 = 84,
+        U_CANONICAL_COMBINING_CLASS_CCC91 = 91,
+        U_CANONICAL_COMBINING_CLASS_CCC103 = 103,
+        U_CANONICAL_COMBINING_CLASS_CCC107 = 107,
+        U_CANONICAL_COMBINING_CLASS_CCC118 = 118,
+        U_CANONICAL_COMBINING_CLASS_CCC122 = 122,
+        U_CANONICAL_COMBINING_CLASS_CCC129 = 129,
+        U_CANONICAL_COMBINING_CLASS_CCC130 = 130,
+        U_CANONICAL_COMBINING_CLASS_CCC132 = 132,
+        U_CANONICAL_COMBINING_CLASS_ATTACHED_BELOW_LEFT = 200,
+        U_CANONICAL_COMBINING_CLASS_ATTACHED_BELOW = 202,
+        U_CANONICAL_COMBINING_CLASS_ATTACHED_ABOVE = 214,
+        U_CANONICAL_COMBINING_CLASS_ATTACHED_ABOVE_RIGHT = 216,
+        U_CANONICAL_COMBINING_CLASS_BELOW_LEFT = 218,
+        U_CANONICAL_COMBINING_CLASS_BELOW = 220,
+        U_CANONICAL_COMBINING_CLASS_BELOW_RIGHT = 222,
+        U_CANONICAL_COMBINING_CLASS_LEFT = 224,
+        U_CANONICAL_COMBINING_CLASS_RIGHT = 226,
+        U_CANONICAL_COMBINING_CLASS_ABOVE_LEFT = 228,
+        U_CANONICAL_COMBINING_CLASS_ABOVE = 230,
+        U_CANONICAL_COMBINING_CLASS_ABOVE_RIGHT = 232,
+        U_CANONICAL_COMBINING_CLASS_DOUBLE_BELOW = 233,
+        U_CANONICAL_COMBINING_CLASS_DOUBLE_ABOVE = 234,
+        U_CANONICAL_COMBINING_CLASS_IOTA_SUBSCRIPT = 240,
+};
+/* Returns the canonical combining class of ‘c’. */
+U_CONST enum u_canonical_combining_class
+        u_char_canonical_combining_class(uint32_t c);
+/* Maps ‘c’ to another character — presumably its bidi-mirrored counterpart;
+ * confirm in u_char_mirror.c.
+ * NOTE(review): unlike the neighbouring u_char_* functions this one is not
+ * declared U_CONST — confirm whether that is intentional. */
+uint32_t u_char_mirror(uint32_t c);
+/* Line-break class of a character, as returned by u_char_line_break().  The
+ * enumerators have no explicit values, so they are numbered from 0 in
+ * declaration order.
+ * NOTE(review): the generated data tables presumably store these numbers;
+ * confirm before reordering or inserting enumerators. */
+enum u_line_break {
+        U_LINE_BREAK_MANDATORY,
+        U_LINE_BREAK_CARRIAGE_RETURN,
+        U_LINE_BREAK_LINE_FEED,
+        U_LINE_BREAK_COMBINING_MARK,
+        U_LINE_BREAK_NEXT_LINE,
+        U_LINE_BREAK_SURROGATE,
+        U_LINE_BREAK_WORD_JOINER,
+        U_LINE_BREAK_ZERO_WIDTH_SPACE,
+        U_LINE_BREAK_NON_BREAKING_GLUE,
+        U_LINE_BREAK_SPACE,
+        U_LINE_BREAK_BREAK_OPPORTUNITY_BEFORE_AND_AFTER,
+        U_LINE_BREAK_BREAK_AFTER,
+        U_LINE_BREAK_BREAK_BEFORE,
+        U_LINE_BREAK_HYPHEN,
+        U_LINE_BREAK_CONTINGENT_BREAK_OPPORTUNITY,
+        U_LINE_BREAK_CLOSE_PUNCTUATION,
+        U_LINE_BREAK_CLOSE_PARENTHESIS,
+        U_LINE_BREAK_EXCLAMATION_INTERROGATION,
+        U_LINE_BREAK_INSEPARABLE,
+        U_LINE_BREAK_NONSTARTER,
+        U_LINE_BREAK_OPEN_PUNCTUATION,
+        U_LINE_BREAK_QUOTATION,
+        U_LINE_BREAK_INFIX_NUMERIC_SEPARATOR,
+        U_LINE_BREAK_NUMERIC,
+        U_LINE_BREAK_POSTFIX_NUMERIC,
+        U_LINE_BREAK_PREFIX_NUMERIC,
+        U_LINE_BREAK_SYMBOLS_ALLOWING_BREAK_AFTER,
+        U_LINE_BREAK_AMBIGUOUS,
+        U_LINE_BREAK_ALPHABETIC,
+        U_LINE_BREAK_CONDITIONAL_JAPANESE_STARTER,
+        U_LINE_BREAK_HANGUL_LV_SYLLABLE,
+        U_LINE_BREAK_HANGUL_LVT_SYLLABLE,
+        U_LINE_BREAK_HEBREW_LETTER,
+        U_LINE_BREAK_IDEOGRAPHIC,
+        U_LINE_BREAK_HANGUL_L_JAMO,
+        U_LINE_BREAK_HANGUL_V_JAMO,
+        U_LINE_BREAK_HANGUL_T_JAMO,
+        U_LINE_BREAK_REGIONAL_INDICATOR,
+        U_LINE_BREAK_COMPLEX_CONTEXT_DEPENDENT,
+        U_LINE_BREAK_UNKNOWN,
+};
+/* Returns the line-break class of ‘c’. */
+U_CONST enum u_line_break u_char_line_break(uint32_t c);
+/* Grapheme-cluster-break class of a character, as returned by
+ * u_char_grapheme_break().  The enumerators have no explicit values, so they
+ * are numbered from 0 in declaration order.
+ * NOTE(review): the generated data tables presumably store these numbers;
+ * confirm before reordering or inserting enumerators. */
+enum u_grapheme_break {
+        U_GRAPHEME_BREAK_CONTROL,
+        U_GRAPHEME_BREAK_CR,
+        U_GRAPHEME_BREAK_EXTEND,
+        U_GRAPHEME_BREAK_L,
+        U_GRAPHEME_BREAK_LF,
+        U_GRAPHEME_BREAK_LV,
+        U_GRAPHEME_BREAK_LVT,
+        U_GRAPHEME_BREAK_OTHER,
+        U_GRAPHEME_BREAK_PREPEND,
+        U_GRAPHEME_BREAK_REGIONAL_INDICATOR,
+        U_GRAPHEME_BREAK_SPACINGMARK,
+        U_GRAPHEME_BREAK_T,
+        U_GRAPHEME_BREAK_V,
+};
+/* Returns the grapheme-cluster-break class of ‘c’. */
+U_CONST enum u_grapheme_break u_char_grapheme_break(uint32_t c);
+/* Word-break class of a character, as returned by u_char_word_break().  The
+ * enumerators have no explicit values, so they are numbered from 0 in
+ * declaration order.
+ * NOTE(review): the generated data tables presumably store these numbers;
+ * confirm before reordering or inserting enumerators. */
+enum u_word_break {
+        U_WORD_BREAK_ALETTER,
+        U_WORD_BREAK_CR,
+        U_WORD_BREAK_EXTEND,
+        U_WORD_BREAK_EXTENDNUMLET,
+        U_WORD_BREAK_FORMAT,
+        U_WORD_BREAK_KATAKANA,
+        U_WORD_BREAK_LF,
+        U_WORD_BREAK_MIDLETTER,
+        U_WORD_BREAK_MIDNUM,
+        U_WORD_BREAK_MIDNUMLET,
+        U_WORD_BREAK_NEWLINE,
+        U_WORD_BREAK_NUMERIC,
+        U_WORD_BREAK_OTHER,
+        U_WORD_BREAK_REGIONAL_INDICATOR,
+};
+/* Returns the word-break class of ‘c’. */
+U_CONST enum u_word_break u_char_word_break(uint32_t c);
+/* Script of a character, as returned by u_char_script().  The enumerators
+ * have no explicit values, so they are numbered from 0 in declaration order.
+ * Note that U_SCRIPT_UNKNOWN sits in the middle of the list, not at the end.
+ * NOTE(review): the generated data tables presumably store these numbers, so
+ * new scripts should likely be appended at the end — confirm before
+ * reordering or inserting enumerators. */
+enum u_script {
+        U_SCRIPT_COMMON,
+        U_SCRIPT_INHERITED,
+        U_SCRIPT_ARABIC,
+        U_SCRIPT_ARMENIAN,
+        U_SCRIPT_BENGALI,
+        U_SCRIPT_BOPOMOFO,
+        U_SCRIPT_CHEROKEE,
+        U_SCRIPT_COPTIC,
+        U_SCRIPT_CYRILLIC,
+        U_SCRIPT_DESERET,
+        U_SCRIPT_DEVANAGARI,
+        U_SCRIPT_ETHIOPIC,
+        U_SCRIPT_GEORGIAN,
+        U_SCRIPT_GOTHIC,
+        U_SCRIPT_GREEK,
+        U_SCRIPT_GUJARATI,
+        U_SCRIPT_GURMUKHI,
+        U_SCRIPT_HAN,
+        U_SCRIPT_HANGUL,
+        U_SCRIPT_HEBREW,
+        U_SCRIPT_HIRAGANA,
+        U_SCRIPT_KANNADA,
+        U_SCRIPT_KATAKANA,
+        U_SCRIPT_KHMER,
+        U_SCRIPT_LAO,
+        U_SCRIPT_LATIN,
+        U_SCRIPT_MALAYALAM,
+        U_SCRIPT_MONGOLIAN,
+        U_SCRIPT_MYANMAR,
+        U_SCRIPT_OGHAM,
+        U_SCRIPT_OLD_ITALIC,
+        U_SCRIPT_ORIYA,
+        U_SCRIPT_RUNIC,
+        U_SCRIPT_SINHALA,
+        U_SCRIPT_SYRIAC,
+        U_SCRIPT_TAMIL,
+        U_SCRIPT_TELUGU,
+        U_SCRIPT_THAANA,
+        U_SCRIPT_THAI,
+        U_SCRIPT_TIBETAN,
+        U_SCRIPT_CANADIAN_ABORIGINAL,
+        U_SCRIPT_YI,
+        U_SCRIPT_TAGALOG,
+        U_SCRIPT_HANUNOO,
+        U_SCRIPT_BUHID,
+        U_SCRIPT_TAGBANWA,
+        U_SCRIPT_BRAILLE,
+        U_SCRIPT_CYPRIOT,
+        U_SCRIPT_LIMBU,
+        U_SCRIPT_OSMANYA,
+        U_SCRIPT_SHAVIAN,
+        U_SCRIPT_LINEAR_B,
+        U_SCRIPT_TAI_LE,
+        U_SCRIPT_UGARITIC,
+        U_SCRIPT_NEW_TAI_LUE,
+        U_SCRIPT_BUGINESE,
+        U_SCRIPT_GLAGOLITIC,
+        U_SCRIPT_TIFINAGH,
+        U_SCRIPT_SYLOTI_NAGRI,
+        U_SCRIPT_OLD_PERSIAN,
+        U_SCRIPT_KHAROSHTHI,
+        U_SCRIPT_UNKNOWN,
+        U_SCRIPT_BALINESE,
+        U_SCRIPT_CUNEIFORM,
+        U_SCRIPT_PHOENICIAN,
+        U_SCRIPT_PHAGS_PA,
+        U_SCRIPT_NKO,
+        U_SCRIPT_KAYAH_LI,
+        U_SCRIPT_LEPCHA,
+        U_SCRIPT_REJANG,
+        U_SCRIPT_SUNDANESE,
+        U_SCRIPT_SAURASHTRA,
+        U_SCRIPT_CHAM,
+        U_SCRIPT_OL_CHIKI,
+        U_SCRIPT_VAI,
+        U_SCRIPT_CARIAN,
+        U_SCRIPT_LYCIAN,
+        U_SCRIPT_LYDIAN,
+        U_SCRIPT_AVESTAN,
+        U_SCRIPT_BAMUM,
+        U_SCRIPT_EGYPTIAN_HIEROGLYPHS,
+        U_SCRIPT_IMPERIAL_ARAMAIC,
+        U_SCRIPT_INSCRIPTIONAL_PAHLAVI,
+        U_SCRIPT_INSCRIPTIONAL_PARTHIAN,
+        U_SCRIPT_JAVANESE,
+        U_SCRIPT_KAITHI,
+        U_SCRIPT_LISU,
+        U_SCRIPT_MEETEI_MAYEK,
+        U_SCRIPT_OLD_SOUTH_ARABIAN,
+        U_SCRIPT_OLD_TURKIC,
+        U_SCRIPT_SAMARITAN,
+        U_SCRIPT_TAI_THAM,
+        U_SCRIPT_TAI_VIET,
+        U_SCRIPT_BATAK,
+        U_SCRIPT_BRAHMI,
+        U_SCRIPT_MANDAIC,
+        U_SCRIPT_MEROITIC_HIEROGLYPHS,
+        U_SCRIPT_MEROITIC_CURSIVE,
+        U_SCRIPT_SORA_SOMPENG,
+        U_SCRIPT_CHAKMA,
+        U_SCRIPT_SHARADA,
+        U_SCRIPT_TAKRI,
+        U_SCRIPT_MIAO,
+};
+/* Returns the script of ‘c’. */
+U_CONST enum u_script u_char_script(uint32_t c);
+/* Normalization forms accepted by u_normalize(), u_char_normalized() and
+ * u_normalized().  The names presumably correspond to Unicode NFD, NFC, NFKD
+ * and NFKC respectively — confirm in u_normalize.c. */
+enum u_normalization_form {
+        U_NORMALIZATION_FORM_D,
+        U_NORMALIZATION_FORM_C,
+        U_NORMALIZATION_FORM_KD,
+        U_NORMALIZATION_FORM_KC,
+};
+/* Normalizes the ‘n’-byte input ‘u’ to ‘form’, using ‘result’ and ‘m’ as the
+ * output buffer and — presumably — its capacity in bytes.  Only the input
+ * (parameter 3) is marked non-null.
+ * NOTE(review): the meaning of the size_t return value, and whether ‘result’
+ * may be NULL, are not visible here — check u_normalize.c. */
+size_t u_normalize(char *restrict result, size_t m,
+                   const char *restrict u, size_t n,
+                   enum u_normalization_form form) U_NON_NULL((3));
+/* Three-valued answer to “is this already normalized?”. */
+enum u_normalized {
+        U_NORMALIZED_YES,
+        U_NORMALIZED_NO,
+        U_NORMALIZED_MAYBE,
+};
+/* Normalization status of a single character for ‘form’. */
+enum u_normalized u_char_normalized(uint32_t c, enum u_normalization_form form);
+/* Normalization status of the ‘n’-byte string ‘u’ for ‘form’; ‘u’ must not
+ * be NULL. */
+enum u_normalized u_normalized(const char *u, size_t n,
+                               enum u_normalization_form form) U_NON_NULL((1));
+/* Buffer-to-buffer string transformations.  All share one parameter shape:
+ * an output buffer ‘result’ with size ‘m’, followed by an input string and
+ * its length ‘n’.  In every declaration the input (parameter 3) is marked
+ * non-null while ‘result’ is not.
+ * NOTE(review): presumably ‘m’ and ‘n’ are byte counts and the return value
+ * is the length of the full result, which would allow a sizing call with a
+ * NULL ‘result’ — confirm against the implementations before relying on it.
+ *
+ * Case mappings; ‘locale’ is not marked non-null. */
+size_t u_downcase(char *restrict result, size_t m,
+                  const char *restrict u, size_t n,
+                  const char *restrict locale) U_NON_NULL((3));
+size_t u_foldcase(char *restrict result, size_t m,
+                  const char *restrict string, size_t n,
+                  const char *restrict locale) U_NON_NULL((3));
+size_t u_titlecase(char *restrict result, size_t m,
+                   const char *restrict string, size_t n,
+                   const char *restrict locale) U_NON_NULL((3));
+size_t u_upcase(char *restrict result, size_t m,
+                const char *restrict string, size_t n,
+                const char *restrict locale) U_NON_NULL((3));
+/* String-level counterpart of u_char_mirror(), judging by the name. */
+size_t u_mirror(char *restrict result, size_t m,
+                const char *restrict u, size_t n) U_NON_NULL((3));
+/* Recoding; here ‘codeset’ (parameter 5) must also be non-null. */
+size_t u_recode(char *restrict result, size_t m,
+                const char *restrict string, size_t n,
+                const char *restrict codeset) U_NON_NULL((3, 5));
+/* Lookup table indexed by the value of a byte; u_next() adds the entry to a
+ * pointer to step over one character. */
+extern const char * const u_skip_lengths;
+/* Advances ‘str’ by the skip length recorded for the byte it points at.  The
+ * byte is read as unsigned char so that values above 0x7F index the table
+ * correctly.  ‘str’ is evaluated twice, so it must not have side effects.
+ * The macro does no bounds checking against the end of the string. */
+#define u_next(str) ((str) + u_skip_lengths[*(const unsigned char *)(str)])
+/* Decoding.  u_decode() and u_decode_n() require all their pointer arguments
+ * to be non-null; u_decode_r() carries no such annotation.
+ * NOTE(review): presumably u_decode() reads one character at ‘u’, bounded by
+ * ‘end’, and stores the following position in ‘*q’, while u_decode_r() does
+ * the same backwards, bounded by ‘begin’ — confirm in u_decode.c. */
+uint32_t u_decode(const char **q, const char *u, const char *end)
+        U_NON_NULL((1, 2, 3));
+int u_decode_n(uint32_t *result, const char *u, size_t n)
+        U_NON_NULL((1, 2));
+uint32_t u_decode_r(const char **p, const char *begin, const char *u);
+/* Conversions between character offsets and pointers into ‘str’.  All are
+ * declared U_PURE.  The offset is a signed long.
+ * NOTE(review): whether negative offsets are supported is not visible here. */
+U_PURE char *u_offset_to_pointer(const char *str, long offset) U_NON_NULL((1));
+U_PURE char *u_offset_to_pointer_n(const char *str, long offset, size_t n)
+        U_NON_NULL((1));
+U_PURE long u_pointer_to_offset(const char *str, const char *pos)
+        U_NON_NULL((1, 2));
+/* Collation.  u_collate() compares two strings given with explicit lengths;
+ * both strings must be non-null, ‘locale’ is not so marked.
+ * NOTE(review): u_collate() is declared U_PURE; if the implementation
+ * consults mutable global locale state, confirm the attribute still holds. */
+U_PURE int u_collate(const char *a, size_t a_n, const char *b, size_t b_n,
+                     const char *locale) U_NON_NULL((1, 3));
+/* Collation-key generation, using the same output-buffer/input-string
+ * parameter shape as the case-mapping functions. */
+size_t u_collation_key(char *restrict result, size_t m,
+                       const char *restrict string, size_t n,
+                       const char *restrict locale) U_NON_NULL((3));
+size_t u_normalized_collation_key(char *restrict result, size_t m,
+                                  const char *restrict string, size_t n,
+                                  const char *restrict locale) U_NON_NULL((3));
+/* Searching.  All are declared U_PURE and require their pointer arguments to
+ * be non-null.  The *_n variants take an explicit length ‘n’; the others
+ * presumably expect NUL-terminated strings.
+ * NOTE(review): the results are size_t, so “not found” cannot be a negative
+ * value — check the implementations for the sentinel actually used.
+ *
+ * Forward search for a character or substring. */
+U_PURE size_t u_char_index(const char *str, uint32_t c) U_NON_NULL((1));
+U_PURE size_t u_char_index_n(const char *str, uint32_t c, size_t n)
+        U_NON_NULL((1));
+U_PURE size_t u_index(const char *haystack, const char *needle)
+        U_NON_NULL((1, 2));
+U_PURE size_t u_index_n(const char *haystack, const char *needle, size_t n)
+        U_NON_NULL((1, 2));
+/* Reverse search for a character or substring. */
+U_PURE size_t u_char_rindex(const char *str, uint32_t c) U_NON_NULL((1));
+U_PURE size_t u_char_rindex_n(const char *str, uint32_t c, size_t n)
+        U_NON_NULL((1));
+U_PURE size_t u_rindex(const char *haystack, const char *needle)
+        U_NON_NULL((1, 2));
+U_PURE size_t u_rindex_n(const char *haystack, const char *needle, size_t n)
+        U_NON_NULL((1, 2));
+/* Prefix test. */
+U_PURE bool u_has_prefix(const char *str, const char *prefix)
+        U_NON_NULL((1, 2));
+/* Validation.  ‘u’ must be non-null; ‘end’ is not marked non-null.
+ * NOTE(review): presumably ‘*end’ receives the position where validation
+ * stopped and NULL may be passed to ignore it — confirm in the
+ * implementation. */
+bool u_valid(const char *restrict u, size_t n, const char **restrict end)
+        U_NON_NULL((1));
+/* Whole-string queries.  All are declared U_PURE and require a non-null
+ * string.  The *_n variants take an explicit length; the others presumably
+ * expect a NUL-terminated string. */
+U_PURE bool u_is_ascii_only(const char *string) U_NON_NULL((1));
+U_PURE bool u_is_ascii_only_n(const char *string, size_t n) U_NON_NULL((1));
+U_PURE size_t u_n_chars(const char *str) U_NON_NULL((1));
+U_PURE size_t u_n_chars_n(const char *str, size_t n) U_NON_NULL((1));
+U_PURE size_t u_width(const char *string) U_NON_NULL((1));
+U_PURE size_t u_width_n(const char *string, size_t n) U_NON_NULL((1));
+U_PURE size_t u_n_bytes(const char *str) U_NON_NULL((1));
+/* Reversal, using the same output-buffer/input-string parameter shape as the
+ * case-mapping functions. */
+size_t u_reverse(char *restrict result, size_t m,
+                 const char *restrict string, size_t n) U_NON_NULL((3));
+/* Callback type for the segmentation functions below.  It receives a pointer,
+ * a size and the caller’s ‘closure’ pointer — presumably the start and byte
+ * length of one substring of the input; confirm in u_words.c. */
+typedef void (*u_substring_fn)(const char *, size_t, void *);
+/* Segmentation of the ‘n’-byte ‘string’: ‘fn’ is the callback and ‘closure’
+ * is passed through to it.  Only ‘string’ is marked non-null. */
+void u_words(const char *string, size_t n, u_substring_fn fn, void *closure)
+        U_NON_NULL((1));
+void u_grapheme_clusters(const char *string, size_t n, u_substring_fn fn,
+                         void *closure) U_NON_NULL((1));
+/* Encoding of a single code point into ‘result’.
+ * NOTE(review): the meaning of the int return value, and whether ‘result’
+ * may be NULL, are not visible here — check u_char_to_u.c. */
+int u_char_to_u_n(uint32_t c, char *result, size_t n);
+int u_char_to_u(uint32_t c, char *result);