u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
#include "extconf.h"
|
|
2
|
+
#include <assert.h>
|
|
3
|
+
#include <errno.h>
|
|
4
|
+
#define __USE_XOPEN2K8 1
|
|
5
|
+
#include <locale.h>
|
|
6
|
+
#ifdef HAVE_XLOCALE_H
|
|
7
|
+
# include <xlocale.h>
|
|
8
|
+
#endif
|
|
9
|
+
#define __USE_XOPEN2K 1
|
|
10
|
+
#include <langinfo.h>
|
|
11
|
+
#include <stdbool.h>
|
|
12
|
+
#include <stdint.h>
|
|
13
|
+
#include <stdlib.h>
|
|
14
|
+
#include <string.h>
|
|
15
|
+
#ifndef HAVE_STRXFRM_L
|
|
16
|
+
static inline size_t
|
|
17
|
+
strxfrm_l(char *restrict s1, const char *restrict s2, size_t n,
|
|
18
|
+
UNUSED(locale_t loc))
|
|
19
|
+
{
|
|
20
|
+
return strxfrm(s1, s2, n);
|
|
21
|
+
}
|
|
22
|
+
#endif
|
|
23
|
+
#ifndef HAVE_NL_LANGINFO_L
|
|
24
|
+
static inline char *
|
|
25
|
+
nl_langinfo_l(nl_item item, UNUSED(locale_t loc))
|
|
26
|
+
{
|
|
27
|
+
return nl_langinfo(item);
|
|
28
|
+
}
|
|
29
|
+
#endif
|
|
30
|
+
|
|
31
|
+
#include "u.h"
|
|
32
|
+
#include "private.h"
|
|
33
|
+
|
|
34
|
+
static inline const char *
|
|
35
|
+
codeset(locale_t locale)
|
|
36
|
+
{
|
|
37
|
+
return locale == NULL ?
|
|
38
|
+
nl_langinfo(CODESET) :
|
|
39
|
+
nl_langinfo_l(CODESET, locale);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
static inline size_t
|
|
43
|
+
transform(char *result, const char *string, size_t n, locale_t locale)
|
|
44
|
+
{
|
|
45
|
+
return locale == NULL ?
|
|
46
|
+
strxfrm(result, string, n) :
|
|
47
|
+
strxfrm_l(result, string, n, locale);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
static size_t
|
|
51
|
+
ckey(char *result, size_t m, const char *string, size_t n, locale_t locale)
|
|
52
|
+
{
|
|
53
|
+
char saved_sentinel = string[n];
|
|
54
|
+
((char *)string)[n] = '\0';
|
|
55
|
+
size_t l = 0;
|
|
56
|
+
const char *p = string;
|
|
57
|
+
const char *end = string + n + 1;
|
|
58
|
+
while (true) {
|
|
59
|
+
errno = 0;
|
|
60
|
+
size_t k = m > l ?
|
|
61
|
+
transform(result + l, p, m - l, locale) :
|
|
62
|
+
transform(NULL, p, 0, locale);
|
|
63
|
+
if (errno != 0)
|
|
64
|
+
break;
|
|
65
|
+
l += k;
|
|
66
|
+
p += strlen(p) + 1;
|
|
67
|
+
if (p == end)
|
|
68
|
+
break;
|
|
69
|
+
if (m > l)
|
|
70
|
+
result[l] = '\0';
|
|
71
|
+
l++;
|
|
72
|
+
}
|
|
73
|
+
((char *)string)[n] = saved_sentinel;
|
|
74
|
+
return l;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
static size_t
|
|
78
|
+
recode_ckey(char *result, size_t m, const char *string, size_t n,
|
|
79
|
+
locale_t locale, const char *cs)
|
|
80
|
+
{
|
|
81
|
+
char buf[2048];
|
|
82
|
+
errno = 0;
|
|
83
|
+
size_t n_recoded = u_recode(buf, sizeof(buf), string, n, cs);
|
|
84
|
+
if (errno != 0)
|
|
85
|
+
return 0;
|
|
86
|
+
if (n_recoded < sizeof(buf))
|
|
87
|
+
return ckey(result, m, buf, n_recoded, locale);
|
|
88
|
+
char *recoded = malloc(n_recoded + 1);
|
|
89
|
+
if (recoded == NULL)
|
|
90
|
+
return 0;
|
|
91
|
+
u_recode(recoded, n_recoded + 1, string, n, cs);
|
|
92
|
+
size_t n_key = ckey(result, m, recoded, n_recoded, locale);
|
|
93
|
+
free(recoded);
|
|
94
|
+
return n_key;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
size_t
|
|
98
|
+
u_collation_key(char *result, size_t m, const char *string, size_t n,
|
|
99
|
+
const char *locale)
|
|
100
|
+
{
|
|
101
|
+
assert(string != NULL);
|
|
102
|
+
assert(result != NULL || m == 0);
|
|
103
|
+
locale_t l = NULL;
|
|
104
|
+
if (locale != NULL)
|
|
105
|
+
l = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, locale, NULL);
|
|
106
|
+
const char *cs = codeset(l);
|
|
107
|
+
size_t r = strcmp(cs, "UTF-8") != 0 ?
|
|
108
|
+
recode_ckey(result, m, string, n, l, cs) :
|
|
109
|
+
ckey(result, m, string, n, l);
|
|
110
|
+
if (l != NULL)
|
|
111
|
+
freelocale(l);
|
|
112
|
+
return r;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
size_t
|
|
116
|
+
u_normalized_collation_key(char *result, size_t m, const char *string, size_t n,
|
|
117
|
+
const char *locale)
|
|
118
|
+
{
|
|
119
|
+
assert(string != NULL);
|
|
120
|
+
assert(result != NULL || m == 0);
|
|
121
|
+
char buf[2048];
|
|
122
|
+
size_t n_normalized = u_normalize(buf, sizeof(buf), string, n, U_NORMALIZATION_FORM_KC);
|
|
123
|
+
if (n_normalized < sizeof(buf))
|
|
124
|
+
return u_collation_key(result, m, buf, n_normalized, locale);
|
|
125
|
+
char *normalized = malloc(n_normalized + 1);
|
|
126
|
+
if (normalized == NULL)
|
|
127
|
+
return 0;
|
|
128
|
+
u_normalize(normalized, n_normalized + 1, string, n, U_NORMALIZATION_FORM_KC);
|
|
129
|
+
size_t n_key = u_collation_key(result, m, normalized, n_normalized, locale);
|
|
130
|
+
free(normalized);
|
|
131
|
+
return n_key;
|
|
132
|
+
}
|
data/ext/u/u_decode.c
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
#include <assert.h>
|
|
2
|
+
#include <stddef.h>
|
|
3
|
+
#include <stdint.h>
|
|
4
|
+
#include <stdbool.h>
|
|
5
|
+
|
|
6
|
+
#include "u.h"
|
|
7
|
+
#include "private.h"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
// The dfa table and decode function is © 2008–2010 Björn Höhrmann
|
|
11
|
+
// <bjoern@hoehrmann.de>. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
|
12
|
+
// for details.
|
|
13
|
+
|
|
14
|
+
enum {
|
|
15
|
+
ACCEPT = 0,
|
|
16
|
+
REJECT = 12
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
static const uint8_t dfa[] = {
|
|
20
|
+
// The first part of the table maps bytes to character classes to
|
|
21
|
+
// reduce the size of the transition table and create bitmasks.
|
|
22
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
23
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
24
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
25
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
26
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
|
27
|
+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
|
28
|
+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
29
|
+
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
|
30
|
+
|
|
31
|
+
// The second part is a transition table that maps a combination
|
|
32
|
+
// of a state of the automaton and a character class to a state.
|
|
33
|
+
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
|
34
|
+
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
|
35
|
+
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
|
36
|
+
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
|
37
|
+
12,36,12,12,12,12,12,12,12,12,12,12,
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
// This reversal of ‹dfa› is © 2013 Nikolai Weibull.
|
|
41
|
+
static const uint8_t dfa_r[] = {
|
|
42
|
+
// The first part of the table maps bytes to character classes to
|
|
43
|
+
// reduce the size of the transition table and create bitmasks.
|
|
44
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
45
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
46
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
47
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
48
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
|
49
|
+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
|
50
|
+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
51
|
+
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
|
52
|
+
|
|
53
|
+
// The second part is a transition table that maps a combination
|
|
54
|
+
// of a state of the automaton and a character class to a state.
|
|
55
|
+
0,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,12,12,12,12,12,
|
|
56
|
+
12,36, 0,12,12,12,12,48,12,36,12,12, 12,60,12, 0, 0,12,12,72,12,72,12,12,
|
|
57
|
+
12,60,12, 0,12,12,12,72,12,72, 0,12, 12,12,12,12,12, 0, 0,12,12,12,12,12,
|
|
58
|
+
12,12,12,12,12,12, 0,12,12,12,12, 0,
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
static inline uint32_t
|
|
62
|
+
decode(uint32_t *state, uint32_t *c, uint32_t b)
|
|
63
|
+
{
|
|
64
|
+
uint32_t type = dfa[b];
|
|
65
|
+
*c = *state != ACCEPT ? (*c << 6) | (b & 0x3f) : (0xff >> type) & b;
|
|
66
|
+
*state = dfa[256 + *state + type];
|
|
67
|
+
return *state;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
#define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
|
|
71
|
+
|
|
72
|
+
uint32_t
|
|
73
|
+
u_decode(const char **q, const char *u, const char *end)
|
|
74
|
+
{
|
|
75
|
+
assert(u < end);
|
|
76
|
+
uint32_t c, state = ACCEPT;
|
|
77
|
+
const unsigned char *p;
|
|
78
|
+
for (p = (const unsigned char *)u; p < (const unsigned char *)end; p++)
|
|
79
|
+
switch (decode(&state, &c, *p)) {
|
|
80
|
+
case ACCEPT:
|
|
81
|
+
*q = (const char *)p + 1;
|
|
82
|
+
return c;
|
|
83
|
+
case REJECT:
|
|
84
|
+
*q = (const char *)p + 1;
|
|
85
|
+
return REPLACEMENT_CHARACTER;
|
|
86
|
+
}
|
|
87
|
+
*q = (const char *)p;
|
|
88
|
+
return REPLACEMENT_CHARACTER;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Decodes one code point from the N bytes at U with u_decode(), stores it
// in *RESULT and returns the number of bytes that u_decode() consumed.
int
u_decode_n(uint32_t *result, const char *u, size_t n)
{
	const char *next;
	const uint32_t c = u_decode(&next, u, u + n);
	*result = c;
	return (int)(next - u);
}
|
|
98
|
+
|
|
99
|
+
static inline uint32_t
|
|
100
|
+
decode_r(uint32_t *state, uint32_t *c, uint32_t b, int i)
|
|
101
|
+
{
|
|
102
|
+
uint32_t type = dfa_r[b];
|
|
103
|
+
*state = dfa_r[256 + *state + type];
|
|
104
|
+
*c |= (*state != ACCEPT ? b & 0x3f : (0xff >> type) & b) << (6*i);
|
|
105
|
+
return *state;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
uint32_t
|
|
109
|
+
u_decode_r(const char **p, const char *begin, const char *u)
|
|
110
|
+
{
|
|
111
|
+
assert(begin < u);
|
|
112
|
+
uint32_t c = 0, state = ACCEPT;
|
|
113
|
+
int i = 0;
|
|
114
|
+
const unsigned char *q;
|
|
115
|
+
for (q = (const unsigned char *)u - 1; (const unsigned char *)begin <= q; q--, i++)
|
|
116
|
+
switch (decode_r(&state, &c, *q, i)) {
|
|
117
|
+
case ACCEPT:
|
|
118
|
+
*p = (const char *)q;
|
|
119
|
+
return c;
|
|
120
|
+
case REJECT:
|
|
121
|
+
*p = (const char *)q;
|
|
122
|
+
return REPLACEMENT_CHARACTER;
|
|
123
|
+
}
|
|
124
|
+
*p = (const char *)begin;
|
|
125
|
+
return REPLACEMENT_CHARACTER;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
static inline uint32_t
|
|
129
|
+
validate(uint32_t *state, uint32_t b)
|
|
130
|
+
{
|
|
131
|
+
uint32_t type = dfa[b];
|
|
132
|
+
return *state = dfa[256 + *state + type];
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
bool
|
|
136
|
+
u_valid(const char *u, size_t n, const char **end)
|
|
137
|
+
{
|
|
138
|
+
uint32_t state = ACCEPT;
|
|
139
|
+
const unsigned char *p = (const unsigned char *)u;
|
|
140
|
+
const unsigned char *o = p;
|
|
141
|
+
const unsigned char *q = p + n;
|
|
142
|
+
for ( ; p < q; p++)
|
|
143
|
+
switch (validate(&state, *p)) {
|
|
144
|
+
case ACCEPT:
|
|
145
|
+
o = p;
|
|
146
|
+
break;
|
|
147
|
+
case REJECT:
|
|
148
|
+
reject:
|
|
149
|
+
if (end != NULL)
|
|
150
|
+
*end = (const char *)o;
|
|
151
|
+
return false;
|
|
152
|
+
}
|
|
153
|
+
if (p == q)
|
|
154
|
+
return state == ACCEPT;
|
|
155
|
+
goto reject;
|
|
156
|
+
}
|
data/ext/u/u_downcase.c
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
#include <stdbool.h>
|
|
2
|
+
#include <stddef.h>
|
|
3
|
+
#include <stdint.h>
|
|
4
|
+
#include "u.h"
|
|
5
|
+
|
|
6
|
+
#include <assert.h>
|
|
7
|
+
|
|
8
|
+
#include <string.h>
|
|
9
|
+
#include "output.h"
|
|
10
|
+
|
|
11
|
+
#include "data/constants.h"
|
|
12
|
+
#include "attributes.h"
|
|
13
|
+
#include "u_locale.h"
|
|
14
|
+
#include "titled.h"
|
|
15
|
+
#include "case.h"
|
|
16
|
+
#include "private.h"
|
|
17
|
+
|
|
18
|
+
#define LATIN_CAPITAL_LETTER_I ((uint32_t)0x0049)
|
|
19
|
+
#define LATIN_CAPITAL_LETTER_J ((uint32_t)0x004a)
|
|
20
|
+
#define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
|
|
21
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((uint32_t)0x00cc)
|
|
22
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((uint32_t)0x00cd)
|
|
23
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((uint32_t)0x0128)
|
|
24
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((uint32_t)0x012e)
|
|
25
|
+
#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((uint32_t)0x0130)
|
|
26
|
+
#define LATIN_SMALL_LETTER_DOTLESS_I ((uint32_t)0x0131)
|
|
27
|
+
#define COMBINING_GRAVE_ACCENT ((uint32_t)0x0300)
|
|
28
|
+
#define COMBINING_ACUTE_ACCENT ((uint32_t)0x0301)
|
|
29
|
+
#define COMBINING_TILDE ((uint32_t)0x0303)
|
|
30
|
+
#define COMBINING_DOT_ABOVE ((uint32_t)0x0307)
|
|
31
|
+
#define GREEK_CAPITAL_LETTER_SIGMA ((uint32_t)0x03a3)
|
|
32
|
+
#define GREEK_SMALL_LETTER_FINAL_SIGMA ((uint32_t)0x03c2)
|
|
33
|
+
#define GREEK_SMALL_LETTER_SIGMA ((uint32_t)0x03c3)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
// Decides whether the sigma occupying [P, Q) in the text [STRING, END)
// should be lowercased to the final form.
//
// The answer is false if the sigma is first in STRING, or if the first
// character after it that is not case-ignorable is cased.  Otherwise the
// answer is whether the closest character before it that is not
// case-ignorable is cased (false if there is no such character).
static inline bool
is_final_sigma(const char *string, const char *p, const char *q, const char *end)
{
	if (p == string)
		return false;
	// Look ahead past case-ignorable characters.
	while (q < end) {
		const uint32_t next = u_decode(&q, q, end);
		if (!u_char_iscaseignorable(next)) {
			if (u_char_iscased(next))
				return false;
			break;
		}
	}
	// Look behind past case-ignorable characters.
	while (string < p) {
		const uint32_t previous = u_decode_r(&p, string, p);
		if (!u_char_iscaseignorable(previous))
			return u_char_iscased(previous) ? true : false;
	}
	return false;
}
|
|
60
|
+
|
|
61
|
+
static inline bool
|
|
62
|
+
has_more_above(const char *q, const char *end)
|
|
63
|
+
{
|
|
64
|
+
while (q < end) {
|
|
65
|
+
switch (u_char_canonical_combining_class(u_decode(&q, q, end))) {
|
|
66
|
+
case U_CANONICAL_COMBINING_CLASS_ABOVE:
|
|
67
|
+
return true;
|
|
68
|
+
case U_CANONICAL_COMBINING_CLASS_NOT_REORDERED:
|
|
69
|
+
return false;
|
|
70
|
+
default:
|
|
71
|
+
break;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
return false;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
static bool
|
|
78
|
+
downcase_lithuanian_i(uint32_t c, uint32_t combiner, struct output *output)
|
|
79
|
+
{
|
|
80
|
+
output_char(output, c);
|
|
81
|
+
output_char(output, COMBINING_DOT_ABOVE);
|
|
82
|
+
if (combiner != '\0')
|
|
83
|
+
output_char(output, combiner);
|
|
84
|
+
return true;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
static inline bool
|
|
88
|
+
downcase_lithuanian(uint32_t c, const char *q, const char *end,
|
|
89
|
+
struct output *output)
|
|
90
|
+
{
|
|
91
|
+
switch (c) {
|
|
92
|
+
case LATIN_CAPITAL_LETTER_I:
|
|
93
|
+
case LATIN_CAPITAL_LETTER_J:
|
|
94
|
+
case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
|
|
95
|
+
if (!has_more_above(q, end))
|
|
96
|
+
return false;
|
|
97
|
+
return downcase_lithuanian_i(u_char_downcase(c), '\0', output);
|
|
98
|
+
case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
|
|
99
|
+
return downcase_lithuanian_i(LATIN_SMALL_LETTER_I,
|
|
100
|
+
COMBINING_GRAVE_ACCENT, output);
|
|
101
|
+
case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
|
|
102
|
+
return downcase_lithuanian_i(LATIN_SMALL_LETTER_I,
|
|
103
|
+
COMBINING_ACUTE_ACCENT, output);
|
|
104
|
+
case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
|
|
105
|
+
return downcase_lithuanian_i(LATIN_SMALL_LETTER_I,
|
|
106
|
+
COMBINING_TILDE, output);
|
|
107
|
+
default:
|
|
108
|
+
return false;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
static inline bool
|
|
113
|
+
is_before_dot(const char *q, const char *end)
|
|
114
|
+
{
|
|
115
|
+
while (q < end) {
|
|
116
|
+
uint32_t c = u_decode(&q, q, end);
|
|
117
|
+
if (c == COMBINING_DOT_ABOVE)
|
|
118
|
+
return true;
|
|
119
|
+
switch (u_char_canonical_combining_class(c)) {
|
|
120
|
+
case U_CANONICAL_COMBINING_CLASS_ABOVE:
|
|
121
|
+
case U_CANONICAL_COMBINING_CLASS_NOT_REORDERED:
|
|
122
|
+
return false;
|
|
123
|
+
default:
|
|
124
|
+
break;
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
return false;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
static bool
|
|
131
|
+
is_i(uint32_t c)
|
|
132
|
+
{
|
|
133
|
+
return c == LATIN_CAPITAL_LETTER_I;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
static inline bool
|
|
137
|
+
downcase_turkic(uint32_t c,
|
|
138
|
+
const char *string, const char *p, const char *q, const char *end,
|
|
139
|
+
struct output *output)
|
|
140
|
+
{
|
|
141
|
+
switch (c) {
|
|
142
|
+
case LATIN_CAPITAL_LETTER_I:
|
|
143
|
+
output_char(output,
|
|
144
|
+
is_before_dot(q, end) ?
|
|
145
|
+
LATIN_SMALL_LETTER_I :
|
|
146
|
+
LATIN_SMALL_LETTER_DOTLESS_I);
|
|
147
|
+
return true;
|
|
148
|
+
case LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE:
|
|
149
|
+
case_simple(LATIN_CAPITAL_LETTER_I,
|
|
150
|
+
U_GENERAL_CATEGORY_LETTER_UPPERCASE,
|
|
151
|
+
false, false, output);
|
|
152
|
+
return true;
|
|
153
|
+
case COMBINING_DOT_ABOVE:
|
|
154
|
+
if (!is_after(string, p, is_i))
|
|
155
|
+
output_char(output, COMBINING_DOT_ABOVE);
|
|
156
|
+
return true;
|
|
157
|
+
default:
|
|
158
|
+
return false;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
const char *
|
|
163
|
+
_u_downcase_step(const char *string, const char *p, const char *end,
|
|
164
|
+
enum locale locale, struct output *output)
|
|
165
|
+
{
|
|
166
|
+
const char *q;
|
|
167
|
+
uint32_t c = u_decode(&q, p, end);
|
|
168
|
+
enum u_general_category gc;
|
|
169
|
+
if (c == GREEK_CAPITAL_LETTER_SIGMA)
|
|
170
|
+
output_char(output,
|
|
171
|
+
is_final_sigma(string, p, q, end) ?
|
|
172
|
+
GREEK_SMALL_LETTER_FINAL_SIGMA :
|
|
173
|
+
GREEK_SMALL_LETTER_SIGMA);
|
|
174
|
+
else if (locale == LOCALE_LITHUANIAN &&
|
|
175
|
+
downcase_lithuanian(c, q, end, output))
|
|
176
|
+
;
|
|
177
|
+
else if (locale == LOCALE_TURKIC &&
|
|
178
|
+
downcase_turkic(c, string, p, q, end, output))
|
|
179
|
+
;
|
|
180
|
+
else if (IS(gc = u_char_general_category(c),
|
|
181
|
+
OR(U_GENERAL_CATEGORY_LETTER_UPPERCASE,
|
|
182
|
+
OR(U_GENERAL_CATEGORY_LETTER_TITLECASE, 0))))
|
|
183
|
+
case_simple(c, gc, false, false, output);
|
|
184
|
+
else
|
|
185
|
+
output_string(output, p, q - p);
|
|
186
|
+
return q;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
size_t
|
|
190
|
+
u_downcase(char *result, size_t m, const char *u, size_t n,
|
|
191
|
+
const char *locale)
|
|
192
|
+
{
|
|
193
|
+
assert(u != NULL);
|
|
194
|
+
assert(result != NULL || m == 0);
|
|
195
|
+
enum locale l = _u_locale_from_string(locale);
|
|
196
|
+
const char *end = u + n;
|
|
197
|
+
struct output o = OUTPUT_INIT(result, m);
|
|
198
|
+
for (const char *p = u; p < end; )
|
|
199
|
+
p = _u_downcase_step(u, p, end, l, &o);
|
|
200
|
+
return output_finalize(&o);
|
|
201
|
+
}
|