RubyGems - u - Versions diffs - 0.5.0 → 1.0.0 - Mend

u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (316) hide show

checksums.yaml +7 -0
data/build/ext/u/data/attributes.rb +39 -0
data/build/ext/u/data/bidi-mirroring.rb +27 -0
data/build/ext/u/data/canonical-combining-class.rb +15 -0
data/build/ext/u/data/case-folding.rb +39 -0
data/build/ext/u/data/cased.rb +19 -0
data/build/ext/u/data/compose.rb +304 -0
data/build/ext/u/data/constants.rb +31 -0
data/build/ext/u/data/decompose.rb +85 -0
data/build/ext/u/data/general-category.rb +61 -0
data/build/ext/u/data/grapheme-word-break.rb +15 -0
data/build/ext/u/data/marshalled.rb +5 -0
data/build/ext/u/data/script.rb +91 -0
data/build/ext/u/data/soft-dotted.rb +17 -0
data/build/ext/u/data/title-table.rb +30 -0
data/build/ext/u/data/wide.rb +17 -0
data/build/lib/u/build.rb +8 -0
data/build/lib/u/build/data.rb +16 -0
data/build/lib/u/build/data/bidimirroring.rb +26 -0
data/build/lib/u/build/data/break.rb +14 -0
data/build/lib/u/build/data/casefolding.rb +77 -0
data/build/lib/u/build/data/compositionexclusions.rb +14 -0
data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
data/build/lib/u/build/data/file.rb +88 -0
data/build/lib/u/build/data/linebreak.rb +14 -0
data/build/lib/u/build/data/proplist.rb +18 -0
data/build/lib/u/build/data/scripts.rb +22 -0
data/build/lib/u/build/data/specialcasing.rb +106 -0
data/build/lib/u/build/data/unicode.rb +41 -0
data/build/lib/u/build/data/unicode/entry.rb +27 -0
data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
data/build/lib/u/build/data/unicode/points.rb +32 -0
data/build/lib/u/build/header.rb +11 -0
data/build/lib/u/build/header/table.rb +19 -0
data/build/lib/u/build/header/table/row.rb +64 -0
data/build/lib/u/build/header/tables.rb +6 -0
data/build/lib/u/build/header/tables/intervals.rb +50 -0
data/build/lib/u/build/header/tables/split.rb +20 -0
data/build/lib/u/build/header/tables/split/data.rb +29 -0
data/build/lib/u/build/header/tables/split/part1.rb +28 -0
data/build/lib/u/build/header/tables/split/part2.rb +13 -0
data/build/lib/u/build/header/tables/split/row.rb +34 -0
data/build/lib/u/build/header/tables/split/rows.rb +22 -0
data/build/test/unit/break.rb +45 -0
data/build/test/unit/case.rb +178 -0
data/build/test/unit/foldcase.rb +44 -0
data/build/test/unit/normalize.rb +81 -0
data/ext/u/attributes.c +62 -0
data/ext/u/attributes.h +5 -0
data/ext/u/case.h +41 -0
data/ext/u/data/attributes.h +3070 -0
data/ext/u/data/bidi-mirroring.h +373 -0
data/ext/u/data/canonical-combining-class.h +2157 -0
data/ext/u/data/case-folding.h +171 -0
data/ext/u/data/cased.h +42 -0
data/ext/u/data/compose.h +1714 -0
data/ext/u/data/constants.h +17 -0
data/ext/u/data/decompose.h +9356 -0
data/ext/u/data/general-category.h +28959 -0
data/ext/u/data/grapheme-break.h +13201 -0
data/ext/u/data/line-break.h +26501 -0
data/ext/u/data/normalization-quick-check.h +3002 -0
data/ext/u/data/script.h +2928 -0
data/ext/u/data/soft-dotted.h +55 -0
data/ext/u/data/title-table.h +41 -0
data/ext/u/data/types.h +11117 -0
data/ext/u/data/wide-cjk.h +197 -0
data/ext/u/data/wide.h +59 -0
data/ext/u/data/word-break.h +10001 -0
data/ext/u/depend +281 -0
data/ext/u/extconf.rb +158 -0
data/ext/u/output.h +51 -0
data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
data/ext/u/private.h +58 -0
data/ext/u/rb_includes.h +10 -0
data/ext/u/rb_private.c +98 -0
data/ext/u/rb_private.h +67 -0
data/ext/u/rb_u.c +251 -0
data/ext/u/rb_u_buffer.c +443 -0
data/ext/u/rb_u_buffer.h +24 -0
data/ext/u/rb_u_re.c +43 -0
data/ext/u/rb_u_re.h +15 -0
data/ext/u/rb_u_string.c +478 -0
data/ext/u/rb_u_string.h +173 -0
data/ext/u/rb_u_string_alnum.c +10 -0
data/ext/u/rb_u_string_alpha.c +10 -0
data/ext/u/rb_u_string_aref.c +142 -0
data/ext/u/rb_u_string_ascii_only.c +13 -0
data/ext/u/rb_u_string_assigned.c +10 -0
data/ext/u/rb_u_string_b.c +18 -0
data/ext/u/rb_u_string_bytesize.c +10 -0
data/ext/u/rb_u_string_byteslice.c +103 -0
data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
data/ext/u/rb_u_string_case_ignorable.c +25 -0
data/ext/u/rb_u_string_casecmp.c +61 -0
data/ext/u/rb_u_string_cased.c +17 -0
data/ext/u/rb_u_string_chomp.c +107 -0
data/ext/u/rb_u_string_chop.c +33 -0
data/ext/u/rb_u_string_chr.c +9 -0
data/ext/u/rb_u_string_cntrl.c +10 -0
data/ext/u/rb_u_string_collate.c +46 -0
data/ext/u/rb_u_string_collation_key.c +18 -0
data/ext/u/rb_u_string_count.c +38 -0
data/ext/u/rb_u_string_defined.c +10 -0
data/ext/u/rb_u_string_delete.c +62 -0
data/ext/u/rb_u_string_digit.c +10 -0
data/ext/u/rb_u_string_downcase.c +13 -0
data/ext/u/rb_u_string_dump.c +153 -0
data/ext/u/rb_u_string_each_byte.c +46 -0
data/ext/u/rb_u_string_each_char.c +49 -0
data/ext/u/rb_u_string_each_codepoint.c +45 -0
data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
data/ext/u/rb_u_string_each_line.c +142 -0
data/ext/u/rb_u_string_each_word.c +34 -0
data/ext/u/rb_u_string_empty.c +11 -0
data/ext/u/rb_u_string_end_with.c +31 -0
data/ext/u/rb_u_string_eql.c +30 -0
data/ext/u/rb_u_string_equal.c +33 -0
data/ext/u/rb_u_string_foldcase.c +12 -0
data/ext/u/rb_u_string_folded.c +13 -0
data/ext/u/rb_u_string_format.c +1745 -0
data/ext/u/rb_u_string_general_category.c +109 -0
data/ext/u/rb_u_string_getbyte.c +21 -0
data/ext/u/rb_u_string_graph.c +21 -0
data/ext/u/rb_u_string_grapheme_break.c +61 -0
data/ext/u/rb_u_string_gsub.c +164 -0
data/ext/u/rb_u_string_hash.c +10 -0
data/ext/u/rb_u_string_hex.c +9 -0
data/ext/u/rb_u_string_include.c +10 -0
data/ext/u/rb_u_string_index.c +110 -0
data/ext/u/rb_u_string_inspect.c +189 -0
data/ext/u/rb_u_string_internal_tr.c +148 -0
data/ext/u/rb_u_string_internal_tr.h +29 -0
data/ext/u/rb_u_string_justify.c +169 -0
data/ext/u/rb_u_string_length.c +10 -0
data/ext/u/rb_u_string_line_break.c +115 -0
data/ext/u/rb_u_string_lower.c +13 -0
data/ext/u/rb_u_string_lstrip.c +24 -0
data/ext/u/rb_u_string_match.c +65 -0
data/ext/u/rb_u_string_mirror.c +16 -0
data/ext/u/rb_u_string_newline.c +21 -0
data/ext/u/rb_u_string_normalize.c +70 -0
data/ext/u/rb_u_string_normalized.c +28 -0
data/ext/u/rb_u_string_oct.c +11 -0
data/ext/u/rb_u_string_ord.c +14 -0
data/ext/u/rb_u_string_partition.c +80 -0
data/ext/u/rb_u_string_plus.c +33 -0
data/ext/u/rb_u_string_print.c +10 -0
data/ext/u/rb_u_string_punct.c +10 -0
data/ext/u/rb_u_string_reverse.c +13 -0
data/ext/u/rb_u_string_rindex.c +104 -0
data/ext/u/rb_u_string_rpartition.c +81 -0
data/ext/u/rb_u_string_rstrip.c +29 -0
data/ext/u/rb_u_string_scan.c +109 -0
data/ext/u/rb_u_string_script.c +253 -0
data/ext/u/rb_u_string_soft_dotted.c +13 -0
data/ext/u/rb_u_string_space.c +24 -0
data/ext/u/rb_u_string_split.c +245 -0
data/ext/u/rb_u_string_squeeze.c +75 -0
data/ext/u/rb_u_string_start_with.c +31 -0
data/ext/u/rb_u_string_strip.c +36 -0
data/ext/u/rb_u_string_sub.c +147 -0
data/ext/u/rb_u_string_times.c +35 -0
data/ext/u/rb_u_string_title.c +10 -0
data/ext/u/rb_u_string_titlecase.c +13 -0
data/ext/u/rb_u_string_to_i.c +45 -0
data/ext/u/rb_u_string_to_inum.c +364 -0
data/ext/u/rb_u_string_to_inum.h +1 -0
data/ext/u/rb_u_string_to_str.c +17 -0
data/ext/u/rb_u_string_to_sym.c +12 -0
data/ext/u/rb_u_string_tr.c +290 -0
data/ext/u/rb_u_string_upcase.c +12 -0
data/ext/u/rb_u_string_upper.c +13 -0
data/ext/u/rb_u_string_valid.c +10 -0
data/ext/u/rb_u_string_valid_encoding.c +12 -0
data/ext/u/rb_u_string_wide.c +21 -0
data/ext/u/rb_u_string_wide_cjk.c +21 -0
data/ext/u/rb_u_string_width.c +19 -0
data/ext/u/rb_u_string_word_break.c +63 -0
data/ext/u/rb_u_string_xdigit.c +22 -0
data/ext/u/rb_u_string_zero_width.c +16 -0
data/ext/u/titled.c +55 -0
data/ext/u/titled.h +1 -0
data/ext/u/u.c +23 -0
data/ext/u/u.h +458 -0
data/ext/u/u_char_canonical_combining_class.c +31 -0
data/ext/u/u_char_digit_value.c +21 -0
data/ext/u/u_char_downcase.c +27 -0
data/ext/u/u_char_general_category.c +31 -0
data/ext/u/u_char_grapheme_break.c +28 -0
data/ext/u/u_char_isalnum.c +24 -0
data/ext/u/u_char_isalpha.c +21 -0
data/ext/u/u_char_isassigned.c +16 -0
data/ext/u/u_char_iscased.c +22 -0
data/ext/u/u_char_iscaseignorable.c +29 -0
data/ext/u/u_char_iscntrl.c +17 -0
data/ext/u/u_char_isdefined.c +15 -0
data/ext/u/u_char_isdigit.c +16 -0
data/ext/u/u_char_isgraph.c +22 -0
data/ext/u/u_char_islower.c +16 -0
data/ext/u/u_char_isnewline.c +24 -0
data/ext/u/u_char_isprint.c +21 -0
data/ext/u/u_char_ispunct.c +27 -0
data/ext/u/u_char_issoftdotted.c +18 -0
data/ext/u/u_char_isspace.c +28 -0
data/ext/u/u_char_isupper.c +16 -0
data/ext/u/u_char_isvalid.c +18 -0
data/ext/u/u_char_iswide.c +18 -0
data/ext/u/u_char_iswide_cjk.c +22 -0
data/ext/u/u_char_isxdigit.c +27 -0
data/ext/u/u_char_iszerowidth.c +29 -0
data/ext/u/u_char_line_break.c +29 -0
data/ext/u/u_char_mirror.c +16 -0
data/ext/u/u_char_normalized.c +23 -0
data/ext/u/u_char_script.c +41 -0
data/ext/u/u_char_to_u.c +48 -0
data/ext/u/u_char_upcase.c +24 -0
data/ext/u/u_char_width.c +12 -0
data/ext/u/u_char_word_break.c +28 -0
data/ext/u/u_char_xdigit_value.c +31 -0
data/ext/u/u_collate.c +83 -0
data/ext/u/u_collation_key.c +132 -0
data/ext/u/u_decode.c +156 -0
data/ext/u/u_downcase.c +201 -0
data/ext/u/u_foldcase.c +68 -0
data/ext/u/u_grapheme_clusters.c +57 -0
data/ext/u/u_has_prefix.c +27 -0
data/ext/u/u_index.c +93 -0
data/ext/u/u_is_ascii_only.c +33 -0
data/ext/u/u_locale.c +40 -0
data/ext/u/u_locale.h +14 -0
data/ext/u/u_mirror.c +20 -0
data/ext/u/u_n_bytes.c +16 -0
data/ext/u/u_n_chars.c +43 -0
data/ext/u/u_normalize.c +232 -0
data/ext/u/u_normalized.c +28 -0
data/ext/u/u_offset_to_pointer.c +62 -0
data/ext/u/u_pointer_to_offset.c +23 -0
data/ext/u/u_recode.c +73 -0
data/ext/u/u_reverse.c +21 -0
data/ext/u/u_rindex.c +132 -0
data/ext/u/u_titlecase.c +68 -0
data/ext/u/u_upcase.c +89 -0
data/ext/u/u_width.c +35 -0
data/ext/u/u_words.c +82 -0
data/ext/u/yield.h +27 -0
data/lib/u-1.0.rb +20 -0
data/lib/u-1.0/buffer.rb +10 -0
data/lib/u-1.0/string.rb +9 -0
data/lib/u-1.0/version.rb +287 -0
data/test/unit/case.rb +2080 -0
data/test/unit/foldcase.rb +1136 -0
data/test/unit/graphemebreak.rb +407 -0
data/test/unit/normalize.rb +367545 -0
data/test/unit/u-1.0.rb +10 -0
data/test/unit/u-1.0/buffer.rb +52 -0
data/test/unit/u-1.0/string.rb +1439 -0
data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
data/test/unit/wordbreak.rb +1083 -0
metadata +603 -148
data/README +0 -38
data/Rakefile +0 -64
data/ext/encoding/character/utf-8/break.c +0 -25
data/ext/encoding/character/utf-8/data/break.h +0 -22931
data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
data/ext/encoding/character/utf-8/data/compose.h +0 -1607
data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
data/ext/encoding/character/utf-8/decompose.c +0 -444
data/ext/encoding/character/utf-8/depend +0 -65
data/ext/encoding/character/utf-8/extconf.rb +0 -67
data/ext/encoding/character/utf-8/private.h +0 -51
data/ext/encoding/character/utf-8/properties.c +0 -1056
data/ext/encoding/character/utf-8/rb_includes.h +0 -19
data/ext/encoding/character/utf-8/rb_methods.h +0 -49
data/ext/encoding/character/utf-8/rb_private.h +0 -52
data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
data/ext/encoding/character/utf-8/tables.h +0 -38
data/ext/encoding/character/utf-8/unicode.c +0 -319
data/ext/encoding/character/utf-8/unicode.h +0 -216
data/ext/encoding/character/utf-8/utf.c +0 -1334
data/lib/encoding/character/utf-8.rb +0 -201
data/lib/u.rb +0 -16
data/lib/u/string.rb +0 -185
data/lib/u/version.rb +0 -5
data/test/unit/u/string.rb +0 -91

data/ext/u/u_foldcase.c ADDED

@@ -0,0 +1,68 @@
+#include <ruby.h>
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+#include "private.h"
+#include "data/case-folding.h"
+#include "output.h"
+#include "u_locale.h"
+/* {{{1
+ * Append the case-folded form of ‘c’ to ‘output’: the data of the matching
+ * ‘casefold_table’ entry when the lookup finds one, otherwise the result of
+ * u_char_downcase() on ‘c’.
+ */
+static inline void
+foldcase(uint32_t c, struct output *output)
+{
+        size_t index;
+        bool found = unicode_table_lookup(casefold_table, c, &index);
+        if (found) {
+                output_zstring(output, casefold_table[index].data);
+                return;
+        }
+        output_char(output, u_char_downcase(c));
+}
+/* {{{1
+ * Decode one character from ‘p’ (bounded by ‘end’), append its case-folded
+ * form to ‘output’, and return the position u_decode() reports as following
+ * the decoded character.
+ */
+static inline const char *
+foldcase_step(const char *p, const char *end, struct output *output)
+{
+        const char *next;
+        uint32_t c = u_decode(&next, p, end);
+        foldcase(c, output);
+        return next;
+}
+#define LATIN_CAPITAL_LETTER_I ((uint32_t)0x0049)
+#define LATIN_SMALL_LETTER_DOTLESS_I ((uint32_t)0x0131)
+#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((uint32_t)0x0130)
+#define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
+/* {{{1
+ * Like foldcase_step(), but with the two special cases used for the Turkic
+ * locale: U+0049 folds to U+0131 (dotless i) and U+0130 folds to U+0069.
+ * Every other character goes through foldcase().
+ */
+static inline const char *
+foldcase_step_turkic(const char *p, const char *end, struct output *output)
+{
+        const char *next;
+        uint32_t c = u_decode(&next, p, end);
+        switch (c) {
+        case LATIN_CAPITAL_LETTER_I:
+                output_char(output, LATIN_SMALL_LETTER_DOTLESS_I);
+                break;
+        case LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE:
+                output_char(output, LATIN_SMALL_LETTER_I);
+                break;
+        default:
+                foldcase(c, output);
+                break;
+        }
+        return next;
+}
+/* {{{1
+ * Case-fold the ‘n’ bytes at ‘string’ into ‘result’, which has room for ‘m’
+ * bytes, using the Turkic rules when ‘locale’ resolves to LOCALE_TURKIC and
+ * the default rules otherwise.  ‘result’ may be NULL only when ‘m’ is 0.
+ * Returns whatever output_finalize() reports for the output (presumably the
+ * length of the folded string; confirm against output.h).
+ */
+size_t
+u_foldcase(char *result, size_t m, const char *string, size_t n,
+           const char *locale)
+{
+        assert(string != NULL);
+        assert(result != NULL || m == 0);
+        /* Pick the per-character step once, up front, instead of per loop. */
+        const char *(*step)(const char *, const char *, struct output *) =
+                _u_locale_from_string(locale) == LOCALE_TURKIC ?
+                foldcase_step_turkic : foldcase_step;
+        const char *end = string + n;
+        struct output output = OUTPUT_INIT(result, m);
+        const char *p = string;
+        while (p < end)
+                p = step(p, end, &output);
+        return output_finalize(&output);
+}

data/ext/u/u_grapheme_clusters.c ADDED

@@ -0,0 +1,57 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * ROW builds one row of the grapheme-break state table.  Each argument is the
+ * next state to enter when a character with the corresponding
+ * U_GRAPHEME_BREAK_* property is seen; designated initializers are used so
+ * that the row does not depend on the numeric order of the enumerators.
+ * Note that the macro takes ‘prepend’ before ‘extend’ and ‘spacingmark’.
+ */
+#define ROW(other, cr, lf, control, l, v, lv, lvt, t, regional_indicator, \
+            prepend, extend, spacingmark) \
+        { [U_GRAPHEME_BREAK_OTHER] = other, \
+          [U_GRAPHEME_BREAK_CR] = cr, \
+          [U_GRAPHEME_BREAK_LF] = lf, \
+          [U_GRAPHEME_BREAK_CONTROL] = control, \
+          [U_GRAPHEME_BREAK_L] = l, \
+          [U_GRAPHEME_BREAK_V] = v, \
+          [U_GRAPHEME_BREAK_LV] = lv, \
+          [U_GRAPHEME_BREAK_LVT] = lvt, \
+          [U_GRAPHEME_BREAK_T] = t, \
+          [U_GRAPHEME_BREAK_REGIONAL_INDICATOR] = regional_indicator, \
+          [U_GRAPHEME_BREAK_EXTEND] = extend, \
+          [U_GRAPHEME_BREAK_SPACINGMARK] = spacingmark, \
+          [U_GRAPHEME_BREAK_PREPEND] = prepend }
+/* K(s) is state ‘s’ with bit 4 set.  u_grapheme_clusters() masks the state
+ * with 0xf to select a row and treats a set bit 4 as “do not break before
+ * this character”, i.e., keep extending the current cluster. */
+#define K(s) (s | (1 << 4))
+/* State table: the row is the current state (named in the trailing comment
+ * of each row), the column is the grapheme-break property of the next
+ * character, and the value is the next state, optionally wrapped in K().
+ * NOTE(review): the column count assumes U_GRAPHEME_BREAK_V is the largest
+ * U_GRAPHEME_BREAK_* enumerator used by ROW; confirm against u.h. */
+static const uint8_t gb_dfa[][U_GRAPHEME_BREAK_V + 1] = {
+        ROW(  0 ,  1 ,  2 ,  2 ,  3 ,  4 ,  4 ,  5 ,  5 ,  6 ,  7 ,K(0),K(0)), // Other | Extend | SpacingMark
+        ROW(  0 ,  1 ,K(2),  2 ,  3 ,  4 ,  4 ,  5 ,  5 ,  6 ,  7 ,  0 ,  0 ), // CR
+        ROW(  0 ,  1 ,  2 ,  2 ,  3 ,  4 ,  4 ,  5 ,  5 ,  6 ,  7 ,  0 ,  0 ), // LF | Control
+        ROW(  0 ,  1 ,  2 ,  2 ,K(3),K(4),K(4),K(5),  5 ,  6 ,  7 ,K(0),K(0)), // L
+        ROW(  0 ,  1 ,  2 ,  2 ,  3 ,K(4),  4 ,  5 ,K(5),  6 ,  7 ,K(0),K(0)), // V | LV
+        ROW(  0 ,  1 ,  2 ,  2 ,  3 ,  4 ,  4 ,  5 ,K(5),  6 ,  7 ,K(0),K(0)), // LVT | T
+        ROW(  0 ,  1 ,  2 ,  2 ,  3 ,  4 ,  4 ,  5 ,  5 ,K(6),  7 ,K(0),K(0)), // Regional_Indicator
+        ROW(K(0),  1 ,  2 ,  2 ,K(3),K(4),K(4),K(5),K(5),K(6),K(0),K(0),K(0)), // Prepend
+};
+#undef K
+/* {{{1
+ * Split the ‘n’ bytes at ‘string’ into grapheme clusters and call ‘fn’ with
+ * the start, byte length, and ‘closure’ for each non-empty cluster, in order.
+ * Boundaries are decided by running each character's grapheme-break property
+ * through ‘gb_dfa’, starting from state 2 (the “LF | Control” row).
+ */
+void
+u_grapheme_clusters(const char *string, size_t n, u_substring_fn fn, void *closure)
+{
+        const char *end = string + n;
+        const char *cluster = string;
+        const char *cur = string;
+        uint8_t state = 2;
+        for (const char *next; cur < end; cur = next) {
+                uint32_t c = u_decode(&next, cur, end);
+                state = gb_dfa[state & 0xf][u_char_grapheme_break(c)];
+                /* Bit 4 set means “no break before this character”. */
+                bool extends = (state >> 4) == 1;
+                if (extends)
+                        continue;
+                if (cluster < cur)
+                        fn(cluster, cur - cluster, closure);
+                cluster = cur;
+        }
+        /* Flush the final cluster, if any. */
+        if (cluster < cur)
+                fn(cluster, cur - cluster, closure);
+}

data/ext/u/u_has_prefix.c ADDED

@@ -0,0 +1,27 @@
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Check whether the NUL-terminated string ‘str’ begins with the
+ * NUL-terminated string ‘prefix’, comparing byte by byte.  An empty ‘prefix’
+ * matches any string.
+ */
+bool
+u_has_prefix(const char *str, const char *prefix)
+{
+        assert(str != NULL);
+        assert(prefix != NULL);
+        for (;; str++, prefix++) {
+                /* All of ‘prefix’ has been matched. */
+                if (*prefix == '\0')
+                        return true;
+                /* ‘str’ ran out first, or the bytes differ. */
+                if (*str == '\0' || *str != *prefix)
+                        return false;
+        }
+}

data/ext/u/u_index.c ADDED

@@ -0,0 +1,93 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Retrieve the byte offset of the left-most occurrence of the NUL-terminated
+ * ‘needle’ in ‘haystack’, examining at most ‘haystack_n’ bytes of ‘haystack’
+ * and stopping early at a NUL byte in ‘haystack’.
+ *
+ * Returns 0 for an empty ‘needle’ and (size_t)-1 when ‘needle’ isn't found or
+ * is longer than ‘haystack_n’.
+ */
+static U_PURE size_t
+str_index_n(const char *haystack, const char *needle, size_t haystack_n)
+{
+        assert(haystack != NULL);
+        assert(needle != NULL);
+        size_t needle_n = strlen(needle);
+        if (needle_n == 0)
+                return 0;
+        if (haystack_n < needle_n)
+                return (size_t)-1;
+        /* Last position at which a full ‘needle’ still fits. */
+        const char *end = haystack + haystack_n - needle_n;
+        /* The bounds check must come before the dereference: with a one-byte
+         * needle, ‘end + 1’ is ‘haystack + haystack_n’, which lies outside the
+         * bytes that the caller promised are readable. */
+        for (const char *p = haystack; p <= end && *p != '\0'; p++)
+                if (memcmp(p, needle, needle_n) == 0)
+                        return p - haystack;
+        return (size_t)-1;
+}
+/* {{{1
+ * Retrieve the byte index of the left-most occurrence of ‘c’ in the
+ * NUL-terminated string ‘str’, or (size_t)-1 if it doesn't exist.  ‘c’ is
+ * encoded with u_char_to_u() and then searched for with strstr().
+ */
+size_t
+u_char_index(const char *str, uint32_t c)
+{
+        char encoded[7];
+        char *tail = encoded + u_char_to_u(c, encoded);
+        *tail = '\0';
+        const char *hit = strstr(str, encoded);
+        if (hit == NULL)
+                return (size_t)-1;
+        return (size_t)(hit - str);
+}
+/* {{{1
+ * Retrieve the byte index of the left-most occurrence of ‘c’ in ‘str’, or
+ * (size_t)-1 if it doesn't exist, going over at most ‘n’ bytes in ‘str’.
+ * ‘c’ is encoded with u_char_to_u() and the search is done by str_index_n().
+ */
+size_t
+u_char_index_n(const char *str, uint32_t c, size_t n)
+{
+        char encoded[7];
+        char *tail = encoded + u_char_to_u(c, encoded);
+        *tail = '\0';
+        return str_index_n(str, encoded, n);
+}
+/* {{{1
+ * Retrieve the byte index of the left-most occurrence of ‘needle’ in
+ * ‘haystack’, or (size_t)-1 if it doesn't exist.  Both arguments must be
+ * NUL-terminated strings.  An empty ‘needle’ is found at index 0.
+ */
+size_t
+u_index(const char *haystack, const char *needle)
+{
+        const char *hit = strstr(haystack, needle);
+        /* strstr() returns NULL when there is no match; subtracting
+         * ‘haystack’ from NULL would be undefined and wouldn't yield -1. */
+        if (hit == NULL)
+                return (size_t)-1;
+        return (size_t)(hit - haystack);
+}
+/* {{{1
+ * Retrieve the byte index of the left-most occurrence of ‘needle’ in
+ * ‘haystack’, or (size_t)-1 if it doesn't exist, going over at most ‘n’ bytes
+ * in ‘haystack’.  This is a thin public wrapper around str_index_n(), so an
+ * empty ‘needle’ yields 0 and the search stops at a NUL byte in ‘haystack’.
+ */
+size_t
+u_index_n(const char *haystack, const char *needle, size_t n)
+{
+	return str_index_n(haystack, needle, n);
+}

data/ext/u/u_is_ascii_only.c ADDED

@@ -0,0 +1,33 @@
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Check that every byte of ‘string’ visited under P_WITHIN_STR() is at most
+ * 127.  ‘use_n’ is handed to P_WITHIN_STR() together with the end pointer
+ * ‘string + n’ (presumably choosing between a length-bounded and a
+ * NUL-terminated scan; confirm against private.h).
+ */
+static bool
+u_is_ascii_only_impl(const char *string, size_t n, bool use_n)
+{
+        const char *end = string + n;
+        for (const char *p = string; P_WITHIN_STR(p, end, use_n); p++) {
+                unsigned char byte = *(const unsigned char *)p;
+                if (byte >= 0x80)
+                        return false;
+        }
+        return true;
+}
+/* {{{1
+ * Check whether ‘string’ consists solely of bytes in the range 0–127.  Calls
+ * u_is_ascii_only_impl() with ‘use_n’ set to false and a length of 0, so no
+ * explicit length bounds the scan.
+ */
+bool
+u_is_ascii_only(const char *string)
+{
+        return u_is_ascii_only_impl(string, 0, false);
+}
+/* {{{1
+ * Check whether ‘string’ consists solely of bytes in the range 0–127, with
+ * the length ‘n’ passed on to u_is_ascii_only_impl() (‘use_n’ set to true).
+ */
+bool
+u_is_ascii_only_n(const char *string, size_t n)
+{
+        return u_is_ascii_only_impl(string, n, true);
+}

data/ext/u/u_locale.c ADDED

@@ -0,0 +1,40 @@
+#include <locale.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+#include "u_locale.h"
+#include "private.h"
+/* {{{1
+ * Retrieve the locale type from the environment (LC_CTYPE).  The current
+ * LC_CTYPE name is queried with setlocale(LC_CTYPE, NULL); when that yields
+ * NULL the result is LOCALE_NORMAL, otherwise the name is classified by
+ * _u_locale_from_string().
+ */
+enum locale
+_u_locale(void)
+{
+        const char *name = setlocale(LC_CTYPE, NULL);
+        if (name == NULL)
+                return LOCALE_NORMAL;
+        return _u_locale_from_string(name);
+}
+/* {{{1
+ * Classify the locale name ‘locale’ by its first two characters: “az” and
+ * “tr” give LOCALE_TURKIC, “lt” gives LOCALE_LITHUANIAN, “nl” gives
+ * LOCALE_DUTCH, and anything else (including the empty string) gives
+ * LOCALE_NORMAL.  A NULL ‘locale’ falls back to _u_locale().
+ */
+enum locale
+_u_locale_from_string(const char *locale)
+{
+        if (locale == NULL)
+                return _u_locale();
+        /* ‘locale[1]’ is only read when ‘locale[0]’ isn't the terminator. */
+        switch (locale[0]) {
+        case 'a':
+                return locale[1] == 'z' ? LOCALE_TURKIC : LOCALE_NORMAL;
+        case 't':
+                return locale[1] == 'r' ? LOCALE_TURKIC : LOCALE_NORMAL;
+        case 'l':
+                return locale[1] == 't' ? LOCALE_LITHUANIAN : LOCALE_NORMAL;
+        case 'n':
+                return locale[1] == 'l' ? LOCALE_DUTCH : LOCALE_NORMAL;
+        default:
+                return LOCALE_NORMAL;
+        }
+}

data/ext/u/u_locale.h ADDED

@@ -0,0 +1,14 @@
+/* {{{1
+ * enum locale: This ‹enum› is used for dealing with different locales for
+ * turning strings into uppercase or lowercase.  The values are produced by
+ * _u_locale_from_string() from the first two characters of a locale name.
+ */
+enum locale {
+	/* Any locale without special handling, including the empty name. */
+	LOCALE_NORMAL,
+	/* Locale names beginning with “az” or “tr”. */
+	LOCALE_TURKIC,
+	/* Locale names beginning with “lt”. */
+	LOCALE_LITHUANIAN,
+        /* Locale names beginning with “nl”. */
+        LOCALE_DUTCH,
+};
+/* Locale type of the current LC_CTYPE setting. */
+enum locale _u_locale(void);
+/* Locale type for the name ‘locale’; NULL means “use _u_locale()”. */
+enum locale _u_locale_from_string(const char *locale);

data/ext/u/u_mirror.c ADDED

@@ -0,0 +1,20 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+#include <assert.h>
+#include <string.h>
+#include "output.h"
+/* {{{1
+ * Write to ‘result’ (room for ‘m’ bytes) the ‘n’ bytes at ‘u’ with every
+ * character replaced by the result of u_char_mirror().  ‘result’ may be NULL
+ * only when ‘m’ is 0.  Returns whatever output_finalize() reports for the
+ * output (presumably the length of the mirrored string; confirm against
+ * output.h).
+ */
+size_t
+u_mirror(char *result, size_t m, const char *u, size_t n)
+{
+        assert(u != NULL);
+        assert(result != NULL || m == 0);
+        struct output o = OUTPUT_INIT(result, m);
+        const char *end = u + n;
+        const char *p = u;
+        while (p < end) {
+                uint32_t c = u_decode(&p, p, end);
+                output_char(&o, u_char_mirror(c));
+        }
+        return output_finalize(&o);
+}

data/ext/u/u_n_bytes.c ADDED

@@ -0,0 +1,16 @@
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Retrieve the number of bytes making up the given UTF-8 string, not
+ * counting the terminating NUL.  ‘str’ must be NUL-terminated, as this is
+ * simply strlen().
+ */
+size_t
+u_n_bytes(const char *str)
+{
+	return strlen(str);
+}

data/ext/u/u_n_chars.c ADDED

@@ -0,0 +1,43 @@
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include "u.h"
+#include "private.h"
+/* {{{1
+ * Retrieve the number of UTF-8 encoded Unicode characters in the
+ * NUL-terminated string ‘str’, counted as the number of u_decode() steps
+ * needed to reach the terminating NUL.
+ */
+size_t
+u_n_chars(const char *str)
+{
+        assert(str != NULL);
+        const char *end = str + strlen(str);
+        const char *p = str;
+        size_t count = 0;
+        while (*p != '\0') {
+                u_decode(&p, p, end);
+                count++;
+        }
+        return count;
+}
+/* {{{1
+ * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’, examining
+ * ‘n’ bytes.  ‘str’ may be NULL only when ‘n’ is 0, in which case the result
+ * is 0.
+ */
+size_t
+u_n_chars_n(const char *str, size_t n)
+{
+        assert(str != NULL || n == 0);
+        if (n == 0)
+                return 0;
+        size_t count = 0;
+        const char *end = str + n;
+        for (const char *p = str; p < end; count++)
+                u_decode(&p, p, end);
+        return count;
+}

data/ext/u/u_normalize.c ADDED

@@ -0,0 +1,232 @@
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "u.h"
+#include "data/constants.h"
+#include "data/decompose.h"
+#include "data/compose.h"
+#include "private.h"
+#include <string.h>
+#include "output.h"
+/* {{{1
+ * Constants for the arithmetic decomposition and composition of Hangul
+ * syllables, named as in the Hangul algorithm of the Unicode Standard.
+ */
+enum {
+        /* First precomposed Hangul syllable. */
+        SBase = 0xac00,
+        /* First leading consonant (L). */
+        LBase = 0x1100,
+        /* First vowel (V). */
+        VBase = 0x1161,
+        /* One before the first trailing consonant (T); a T index of 0 means
+         * “no trailing consonant”. */
+        TBase = 0x11a7,
+        /* Number of leading consonants, vowels, and trailing-consonant slots
+         * (the latter including the “none” slot). */
+        LCount = 19,
+        VCount = 21,
+        TCount = 28,
+        /* Number of syllables per leading consonant. */
+        NCount = (VCount * TCount),
+        /* Total number of precomposed syllables. */
+        SCount = (LCount * NCount),
+        /* Last precomposed Hangul syllable. */
+        SLast = (SBase + SCount - 1)
+};
+/* {{{1
+ * Retrieve the compatibility decomposition string of entry ‘i’ in
+ * ‘decomp_table’, falling back to the entry's canonical decomposition when
+ * it has no compatibility offset.
+ */
+static const char *
+compatible(size_t i)
+{
+        uint16_t offset = decomp_table[i].compat_offset;
+        if (offset != UNICODE_NOT_PRESENT_OFFSET)
+                return &decomp_expansion_string[offset];
+        return &decomp_expansion_string[decomp_table[i].canon_offset];
+}
+/* {{{1
+ * Retrieve the canonical decomposition string of entry ‘i’ in
+ * ‘decomp_table’, or NULL when the entry has no canonical offset.
+ */
+static const char *
+canonical(size_t i)
+{
+        uint16_t offset = decomp_table[i].canon_offset;
+        if (offset == UNICODE_NOT_PRESENT_OFFSET)
+                return NULL;
+        return &decomp_expansion_string[offset];
+}
+/* {{{1
+ * Append to ‘o’ the decomposition of every character in [‘u’, ‘end’).
+ * Forms KC and KD use compatible() for table lookups, the other forms use
+ * canonical().  Hangul syllables are decomposed arithmetically; characters
+ * without a table decomposition are copied unchanged.
+ */
+static void
+decompose(const char *u, const char *end, enum u_normalization_form form,
+          struct output *o)
+{
+        bool compat = form == U_NORMALIZATION_FORM_KC ||
+                form == U_NORMALIZATION_FORM_KD;
+        const char *(*expand)(size_t) = compat ? compatible : canonical;
+        const char *p = u;
+        while (p < end) {
+                uint32_t c = u_decode(&p, p, end);
+                if (c < SBase || SLast < c) {
+                        /* Not a Hangul syllable: consult the table. */
+                        size_t i;
+                        const char *d = NULL;
+                        if (unicode_table_lookup(decomp_table, c, &i))
+                                d = expand(i);
+                        if (d != NULL)
+                                output_zstring(o, d);
+                        else
+                                output_char(o, c);
+                        continue;
+                }
+                int SIndex = c - SBase;
+                output_char(o, LBase + SIndex / NCount);
+                output_char(o, VBase + (SIndex % NCount) / TCount);
+                /* A trailing index of 0 means there is no trailing consonant. */
+                uint32_t T = TBase + SIndex % TCount;
+                if (T != TBase)
+                        output_char(o, T);
+        }
+}
+/* {{{1
+ * Move the encoded character occupying [‘p’, ‘q’) backwards within the buffer
+ * starting at ‘begin’, past every immediately preceding character whose
+ * canonical combining class is greater than ‘ccc’.
+ */
+static inline void
+canonical_swap(char *begin, char *p, char *q,
+               enum u_canonical_combining_class ccc)
+{
+        char *r = p;
+        char *s;
+        /* Walk ‘r’ backwards while the character before it has a higher
+         * combining class (u_decode_r() presumably decodes the character
+         * ending at ‘r’ and stores its start in ‘s’; confirm against u.h). */
+        while (begin < r &&
+               u_char_canonical_combining_class(u_decode_r((const char **)&s,
+                                                           begin, r)) > ccc)
+                r = s;
+        /* NOTE(review): assumes ‘q - p’ never exceeds
+         * U_CHAR_MAX_BYTE_LENGTH, i.e., [p, q) is a single character. */
+        char buf[U_CHAR_MAX_BYTE_LENGTH];
+        size_t n = q - p;
+        /* Save the character, shift [r, p) up by its length, then drop the
+         * saved character into the gap opened at ‘r’. */
+        memcpy(buf, p, n);
+        memmove(r + n, r, p - r);
+        memcpy(r, buf, n);
+}
+/* {{{1
+ * Make one pass over [‘begin’, ‘end’), moving each character with a non-zero
+ * canonical combining class that is lower than the class tracked in ‘pcc’
+ * back into place via canonical_swap().  Returns true if any character was
+ * moved, so that the caller can repeat the pass until nothing changes.
+ */
+static inline bool
+canonical_reorder(char *begin, char *end)
+{
+        bool swapped = false;
+        char *p;
+        uint32_t c = u_decode((const char **)&p, begin, end);
+        enum u_canonical_combining_class pcc = u_char_canonical_combining_class(c);
+        while (p < end) {
+                char *q;
+                enum u_canonical_combining_class cc =
+                        u_char_canonical_combining_class(u_decode((const char **)&q, p, end));
+                if (cc != 0 && pcc > cc) {
+                        canonical_swap(begin, p, q, cc);
+                        swapped = true;
+                } else
+                        /* ‘pcc’ is deliberately left alone after a swap: the
+                         * higher-class character now sits just before ‘q’. */
+                        pcc = cc;
+                p = q;
+        }
+        return swapped;
+}
+/* {{{1
+ * Put the ‘n’ bytes at ‘begin’ into canonical order by repeating
+ * canonical_reorder() until a pass makes no change.
+ */
+static void
+canonical_order(char *begin, size_t n)
+{
+        char *end = begin + n;
+        bool changed;
+        do {
+                changed = canonical_reorder(begin, end);
+        } while (changed);
+}
+/* {{{1
+ * Try to compose ‘a’ followed by ‘b’ arithmetically as Hangul: either a
+ * leading consonant plus a vowel, or a syllable without a trailing consonant
+ * plus a trailing consonant.  On success, store the composed character in
+ * ‘result’ and return true; otherwise return false and leave ‘result’ alone.
+ */
+static inline bool
+compose_hangul(uint32_t a, uint32_t b, uint32_t *result)
+{
+        /* L + V → LV syllable. */
+        int l = a - LBase;
+        if (l >= 0 && l < LCount) {
+                int v = b - VBase;
+                if (v >= 0 && v < VCount) {
+                        *result = SBase + (l * VCount + v) * TCount;
+                        return true;
+                }
+        }
+        /* LV + T → LVT syllable. */
+        int s = a - SBase;
+        if (s >= 0 && s < SCount && (s % TCount) == 0) {
+                int t = b - TBase;
+                if (t > 0 && t < TCount) {
+                        *result = a + t;
+                        return true;
+                }
+        }
+        return false;
+}
+/* {{{1
+ * Retrieve the composition index of ‘c’ via the two-level lookup in
+ * ‘compose_table’ (indexed by ‘c >> 8’) and ‘compose_data’ (indexed by the low
+ * 8 bits of ‘c’).  Pages beyond COMPOSE_TABLE_LAST yield 0; a page entry of at
+ * least UNICODE_MAX_TABLE_INDEX yields that entry minus
+ * UNICODE_MAX_TABLE_INDEX for the whole page.
+ */
+static inline uint16_t
+compose_index(uint32_t c)
+{
+        unsigned int page = c >> 8;
+        if (page <= COMPOSE_TABLE_LAST) {
+                int16_t entry = compose_table[page];
+                return entry >= UNICODE_MAX_TABLE_INDEX ?
+                        entry - UNICODE_MAX_TABLE_INDEX :
+                        compose_data[entry][c & 0xff];
+        }
+        return 0;
+}
+/* {{{1
+ * Try to compose ‘a’ followed by ‘b’ into a single character.  On success,
+ * store it in ‘result’ and return true; otherwise return false.
+ *
+ * Hangul is tried first.  After that, the composition indexes of ‘a’ and ‘b’
+ * select one of three tables depending on which COMPOSE_*_START range they
+ * fall into.
+ */
+static inline bool
+compose_2(uint32_t a, uint32_t b, uint32_t *result)
+{
+        if (compose_hangul(a, b, result))
+                return true;
+        uint16_t i = compose_index(a);
+        /* ‘a’ composes with exactly one second character. */
+        if (COMPOSE_FIRST_SINGLE_START <= i && i < COMPOSE_SECOND_START) {
+                if (b != compose_first_single[i - COMPOSE_FIRST_SINGLE_START][0])
+                        return false;
+                *result = compose_first_single[i - COMPOSE_FIRST_SINGLE_START][1];
+                return true;
+        }
+        uint16_t j = compose_index(b);
+        /* ‘b’ composes with exactly one first character. */
+        if (COMPOSE_SECOND_SINGLE_START <= j) {
+                if (a != compose_second_single[j - COMPOSE_SECOND_SINGLE_START][0])
+                        return false;
+                *result = compose_second_single[j - COMPOSE_SECOND_SINGLE_START][1];
+                return true;
+        }
+        /* General case: look the pair up in the two-dimensional array, where
+         * an entry of 0 means that the pair doesn't compose. */
+        if (COMPOSE_FIRST_START <= i && i < COMPOSE_FIRST_SINGLE_START &&
+            COMPOSE_SECOND_START <= j && j < COMPOSE_SECOND_SINGLE_START) {
+                uint32_t r = compose_array[i - COMPOSE_FIRST_START][j - COMPOSE_SECOND_START];
+                if (r != 0) {
+                        *result = r;
+                        return true;
+                }
+        }
+        return false;
+}
+/* {{{1
+ * Compose, in place, the decomposed and canonically ordered ‘n’ bytes at
+ * ‘begin’ and return the new length in bytes.  For forms other than C and KC
+ * nothing is done and ‘n’ is returned.
+ *
+ * ‘s’ points at the character that later characters are combined into and
+ * ‘sc’ is its current value, ‘t’ is the write position, ‘p’/‘q’ delimit the
+ * character being read, and ‘pcc’ is the combining class of the last
+ * character that was written without being combined (-1 right after ‘s’ is
+ * set).
+ */
+static inline size_t
+compose(char *begin, size_t n, enum u_normalization_form form)
+{
+        if (form != U_NORMALIZATION_FORM_C && form != U_NORMALIZATION_FORM_KC)
+                return n;
+        int pcc = -1;
+        char *t;
+        char *s = begin;
+        char *end = begin + n;
+        uint32_t sc = u_decode((const char **)&t, s, end);
+        for (char *p = t, *q; p < end; p = q) {
+                uint32_t c = u_decode((const char **)&q, p, end);
+                int cc = u_char_canonical_combining_class(c);
+                uint32_t sc2;
+                if (pcc < cc && compose_2(sc, c, &sc2)) {
+                        /* Replace the character at ‘s’ with the composed one,
+                         * shifting the bytes already written after it by the
+                         * difference ‘k’ in encoded length (u_next(s) is
+                         * presumably the end of the character at ‘s’ and
+                         * u_char_to_u(sc2, NULL) presumably only reports the
+                         * encoded length; confirm against u.h). */
+                        char *r = u_next(s);
+                        ptrdiff_t k = u_char_to_u(sc2, NULL) - (r - s);
+                        memmove(r + k, r, t - r);
+                        u_char_to_u(sc2, s);
+                        sc = sc2;
+                        t += k;
+                } else if (cc == 0) {
+                        /* A class-0 character that didn't combine becomes the
+                         * new character to combine into. */
+                        pcc = -1;
+                        s = t;
+                        sc = c;
+                        t += u_char_to_u(c, t);
+                } else {
+                        /* A combining character that didn't combine is kept
+                         * and its class is remembered in ‘pcc’. */
+                        pcc = cc;
+                        t += u_char_to_u(c, t);
+                }
+        }
+        return t - begin;
+}
+/* {{{1
+ * Normalize the ‘n’ bytes at ‘u’ to ‘form’, writing to ‘result’, which has
+ * room for ‘m’ bytes.  The input is decomposed into the output first; the
+ * output is then canonically ordered and composed in place, but only when
+ * ‘o.n’ is non-zero and less than ‘o.m’ (presumably: the decomposition is
+ * non-empty and fit into ‘result’; confirm against output.h).  Returns
+ * whatever output_finalize() reports for the output.
+ */
+size_t
+u_normalize(char *result, size_t m, const char *u, size_t n,
+            enum u_normalization_form form)
+{
+        struct output o = OUTPUT_INIT(result, m);
+        decompose(u, u + n, form, &o);
+        bool reorderable = o.m > o.n && o.n > 0;
+        if (!reorderable)
+                return output_finalize(&o);
+        canonical_order(o.result, o.n);
+        o.n = compose(o.result, o.n, form);
+        return output_finalize(&o);
+}