u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
|
@@ -1,38 +1,34 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* contents: Private functions used by the UTF-8 character-encoding library.
|
|
3
|
-
*
|
|
4
|
-
* Copyright © 2007 Nikolai Weibull <now@bitwi.se>
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
1
|
#include <ruby.h>
|
|
8
2
|
#include <stdbool.h>
|
|
9
3
|
#include <stddef.h>
|
|
10
4
|
#include <stdint.h>
|
|
11
5
|
#include <stdlib.h>
|
|
12
6
|
|
|
13
|
-
#include "
|
|
7
|
+
#include "u.h"
|
|
14
8
|
|
|
15
9
|
#include "private.h"
|
|
16
10
|
|
|
17
11
|
/* Lookup C in the sorted TABLE using binary search. TABLE consists of N
|
|
18
12
|
* entries, where each entry is SIZEOF_ENTRY bytes in size and the first
|
|
19
|
-
* component is a
|
|
13
|
+
* component is a uint32_t of size SIZEOF_CHAR. If C is found in TABLE, its
|
|
20
14
|
* index is stored in INDEX and true is returned. Otherwise, false is returned
|
|
21
15
|
* and INDEX is left untouched. */
|
|
22
16
|
bool
|
|
23
|
-
binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char,
|
|
17
|
+
binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, uint32_t c, size_t *index)
|
|
24
18
|
{
|
|
25
|
-
#define ENTRY(index) ((
|
|
19
|
+
#define ENTRY(index) (*(uint32_t *)(void *)((const char *)table + ((index) * sizeof_entry)) & char_mask)
|
|
26
20
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
21
|
+
size_t begin = 0;
|
|
22
|
+
size_t end = n - 1;
|
|
23
|
+
size_t middle;
|
|
30
24
|
|
|
31
25
|
/* This is ugly, but not all tables use unichars as their lookup
|
|
32
26
|
* character. The casefold table, for example, uses uint16_t-sized
|
|
33
27
|
* characters. To only get the interesting part of our table entry
|
|
34
28
|
* we’ll have to mask the retrieved value. */
|
|
35
|
-
|
|
29
|
+
uint32_t char_mask = sizeof_char < sizeof(uint32_t) ?
|
|
30
|
+
((uint32_t)1 << (CHAR_BIT * sizeof_char)) - 1 :
|
|
31
|
+
(uint32_t)-1;
|
|
36
32
|
|
|
37
33
|
/* Drop out early if we know for certain that C can’t be in the
|
|
38
34
|
* decomposition table. */
|
|
@@ -42,7 +38,7 @@ binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, si
|
|
|
42
38
|
while (begin <= end) {
|
|
43
39
|
middle = binary_search_middle_of(begin, end);
|
|
44
40
|
|
|
45
|
-
|
|
41
|
+
uint32_t probe = ENTRY(middle);
|
|
46
42
|
if (c < probe)
|
|
47
43
|
end = middle - 1;
|
|
48
44
|
else if (c > probe)
|
data/ext/u/private.h
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#define IS(category, class) (((unsigned int)1 << (category)) & (class))
|
|
2
|
+
#define OR(class, rest) (((unsigned int)1 << (class)) | (rest))
|
|
3
|
+
|
|
4
|
+
#define P_WITHIN_STR(p, end, use_end) \
|
|
5
|
+
((use_end) ? (p) < (end) : *(p) != '\0')
|
|
6
|
+
|
|
7
|
+
#define lengthof(ary) (sizeof(ary) / sizeof((ary)[0]))
|
|
8
|
+
|
|
9
|
+
#if defined(_WIN32) || defined(__CYGWIN__)
|
|
10
|
+
# ifdef U_COMPILATION
|
|
11
|
+
# define U_EXTERN __declspec(dllexport) extern
|
|
12
|
+
# else
|
|
13
|
+
# define U_EXTERN __declspec(dllimport) extern
|
|
14
|
+
# endif
|
|
15
|
+
#elif __GNUC__ >= 4
|
|
16
|
+
# define U_EXTERN __attribute__((visibility("default"))) extern
|
|
17
|
+
#else
|
|
18
|
+
# define U_EXTERN extern
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#if defined(__GNUC__) && __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
|
|
22
|
+
# define PRINTF(format_index, first_argument_index) \
|
|
23
|
+
__attribute__((format(printf, format_index, first_argument_index)))
|
|
24
|
+
# define UNUSED(u) \
|
|
25
|
+
__attribute__((__unused__)) u
|
|
26
|
+
#else
|
|
27
|
+
# define PRINTF(format, arguments)
|
|
28
|
+
# define UNUSED(u) u
|
|
29
|
+
#endif
|
|
30
|
+
|
|
31
|
+
#if defined(__GNUC__) && __GNUC__ > 2 && defined(__OPTIMIZE__)
|
|
32
|
+
# define BOOLEAN_EXPR(expr) __extension__({ \
|
|
33
|
+
int _boolean_var_; \
|
|
34
|
+
if (expr) \
|
|
35
|
+
_boolean_var_ = 1; \
|
|
36
|
+
else \
|
|
37
|
+
_boolean_var_ = 0; \
|
|
38
|
+
_boolean_var_; \
|
|
39
|
+
})
|
|
40
|
+
# define LIKELY(expr) (__builtin_expect(BOOLEAN_EXPR(expr), 1))
|
|
41
|
+
# define UNLIKELY(expr) (__builtin_expect(BOOLEAN_EXPR(expr), 0))
|
|
42
|
+
#else
|
|
43
|
+
# define LIKELY(expr) (expr)
|
|
44
|
+
# define UNLIKELY(expr) (expr)
|
|
45
|
+
#endif
|
|
46
|
+
|
|
47
|
+
/* Midpoint of the inclusive range [BEGIN, END], for BEGIN <= END.  Computed
 * as an offset from BEGIN so that the intermediate value can neither wrap
 * around nor be truncated when the operands are wider than unsigned int
 * (e.g. size_t).  Note that BEGIN is evaluated twice. */
#define binary_search_middle_of(begin, end) \
        ((begin) + (((end) - (begin)) >> 1))
|
|
49
|
+
|
|
50
|
+
#define unicode_table_lookup(table, c, index) \
|
|
51
|
+
binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
|
|
52
|
+
|
|
53
|
+
bool binary_search_unicode_table(const void *table,
|
|
54
|
+
size_t n,
|
|
55
|
+
size_t sizeof_entry,
|
|
56
|
+
size_t sizeof_char,
|
|
57
|
+
uint32_t c,
|
|
58
|
+
size_t *index);
|
data/ext/u/rb_includes.h
ADDED
data/ext/u/rb_private.c
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
|
3
|
+
# include <ruby/encoding.h>
|
|
4
|
+
#endif
|
|
5
|
+
#include <errno.h>
|
|
6
|
+
|
|
7
|
+
#ifndef HAVE_RB_MEMHASH
|
|
8
|
+
/* Fallback hash of the LENGTH bytes starting at STRING.
 *
 * Each byte is folded in as hash = hash * 65599 + byte, and the result is
 * finally mixed with itself shifted right by five bits.  The accumulation is
 * done in unsigned arithmetic so that wrap-around is well defined; signed
 * overflow would be undefined behaviour.  Values that do not overflow are
 * unchanged. */
int
rb_memhash(const char *string, long length)
{
        const char *end = string + length;
        unsigned int hash = 0;

        for (const char *p = string; p < end; p++)
                hash = hash * 65599u + (unsigned int)*p;

        /* The shift is applied to the signed view of the hash, as before. */
        int folded = (int)hash;

        return (int)(hash + (unsigned int)(folded >> 5));
}
|
|
22
|
+
#endif
|
|
23
|
+
|
|
24
|
+
/* Forward A, A_N, B and B_N unchanged to rb_memsearch and return its result.
 * When ruby/encoding.h is available the UTF-8 encoding is passed along as
 * the additional argument that rb_memsearch then takes. */
long
rb_u_memsearch(const void *a, long a_n, const void *b, long b_n)
{
        long offset;

#ifdef HAVE_RUBY_ENCODING_H
        offset = rb_memsearch(a, a_n, b, b_n, rb_utf8_encoding());
#else
        offset = rb_memsearch(a, a_n, b, b_n);
#endif

        return offset;
}
|
|
33
|
+
|
|
34
|
+
/* Build a Ruby string from the printf-style FORMAT and ARGS.
 *
 * Uses rb_enc_vsprintf with the UTF-8 encoding when ruby/encoding.h is
 * available, otherwise rb_vsprintf if present, and as a last resort a fixed
 * 2048-byte buffer filled by vsnprintf. */
static VALUE PRINTF(1, 0)
format_message(const char *format, va_list args)
{
#ifdef HAVE_RUBY_ENCODING_H
        return rb_enc_vsprintf(rb_utf8_encoding(), format, args);
#else
#  ifdef HAVE_RB_VSPRINTF
        return rb_vsprintf(format, args);
#  else
        char buf[2048];
        int n = vsnprintf(buf, sizeof(buf), format, args);

        /* vsnprintf returns the length the full output would have had, or a
         * negative value on error; only what actually fits in buf may be
         * handed on to rb_str_new. */
        if (n < 0)
                n = 0;
        else if ((size_t)n >= sizeof(buf))
                n = (int)sizeof(buf) - 1;

        return rb_str_new(buf, n);
#  endif
#endif
}
|
|
49
|
+
|
|
50
|
+
void
|
|
51
|
+
rb_u_raise(VALUE exception, const char *format, ...)
|
|
52
|
+
{
|
|
53
|
+
va_list args;
|
|
54
|
+
va_start(args, format);
|
|
55
|
+
VALUE message = format_message(format, args);
|
|
56
|
+
va_end(args);
|
|
57
|
+
rb_exc_raise(rb_exc_new3(exception, message));
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/* Raise the system-call error for errno value NUMBER, with a message built
 * from the printf-style FORMAT and the arguments following it.
 *
 * With ruby/encoding.h the message is a Ruby string passed to
 * rb_syserr_fail_str; otherwise it is formatted into a 2048-byte buffer,
 * errno is set to NUMBER, and rb_sys_fail is used. */
void
rb_u_raise_errno(int number, const char *format, ...)
{
        va_list ap;

        va_start(ap, format);
#ifdef HAVE_RUBY_ENCODING_H
        VALUE text = format_message(format, ap);
        va_end(ap);

        rb_syserr_fail_str(number, text);
#else
        char text[2048];
        vsnprintf(text, sizeof(text), format, ap);
        va_end(ap);

        /* rb_sys_fail takes the error number from errno. */
        errno = number;
        rb_sys_fail(text);
#endif
}
|
|
77
|
+
|
|
78
|
+
VALUE
|
|
79
|
+
rb_u_str_new(const char *string, long length)
|
|
80
|
+
{
|
|
81
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
|
82
|
+
return rb_enc_str_new(string, length, rb_utf8_encoding());
|
|
83
|
+
#else
|
|
84
|
+
return rb_str_new(string, length);
|
|
85
|
+
#endif
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
VALUE
|
|
89
|
+
rb_u_str_buf_new(long length)
|
|
90
|
+
{
|
|
91
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
|
92
|
+
VALUE buffer = rb_str_buf_new(length);
|
|
93
|
+
rb_enc_associate(buffer, rb_utf8_encoding());
|
|
94
|
+
return buffer;
|
|
95
|
+
#else
|
|
96
|
+
return rb_str_buf_new(length);
|
|
97
|
+
#endif
|
|
98
|
+
}
|
data/ext/u/rb_private.h
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
#if __GNUC__ >= 4
|
|
2
|
+
# define RB_U_NULL_TERMINATED(parameter) __attribute__((__sentinel__(parameter)))
|
|
3
|
+
#else
|
|
4
|
+
# define RB_U_NULL_TERMINATED(parameter)
|
|
5
|
+
#endif
|
|
6
|
+
|
|
7
|
+
void need_at_least_n_arguments(int argc, int n);
|
|
8
|
+
|
|
9
|
+
void need_m_to_n_arguments(int argc, int m, int n);
|
|
10
|
+
|
|
11
|
+
void *_rb_u_guarded_alloc(size_t n, ...) RB_U_NULL_TERMINATED(0);
|
|
12
|
+
|
|
13
|
+
int rb_u_char_to_u(uint32_t c, char *result);
|
|
14
|
+
|
|
15
|
+
void rb_u_validate(const char *string, long length);
|
|
16
|
+
|
|
17
|
+
VALUE _rb_u_character_test(VALUE string, bool (*test)(uint32_t));
|
|
18
|
+
|
|
19
|
+
VALUE _rb_u_string_test_locale(int argc, VALUE *argv, VALUE self,
|
|
20
|
+
size_t convert(char *, size_t, const char *, size_t,
|
|
21
|
+
const char *));
|
|
22
|
+
|
|
23
|
+
VALUE _rb_u_string_convert(VALUE self,
|
|
24
|
+
size_t convert(char *, size_t, const char *, size_t));
|
|
25
|
+
VALUE _rb_u_string_convert_locale(int argc, VALUE *argv, VALUE self,
|
|
26
|
+
size_t convert(char *, size_t, const char *,
|
|
27
|
+
size_t, const char *),
|
|
28
|
+
const char *lc_env);
|
|
29
|
+
|
|
30
|
+
VALUE _rb_u_string_property(VALUE self, const char *name,
|
|
31
|
+
int unknown, int property(uint32_t),
|
|
32
|
+
VALUE tosym(int));
|
|
33
|
+
|
|
34
|
+
enum u_normalization_form _rb_u_symbol_to_normalization_form(VALUE symbol);
|
|
35
|
+
|
|
36
|
+
VALUE rb_u_pattern_argument(VALUE pattern, bool quote);
|
|
37
|
+
|
|
38
|
+
long rb_u_string_index_regexp(VALUE self, const char *begin, VALUE regex, bool reverse);
|
|
39
|
+
|
|
40
|
+
#ifndef HAVE_RB_ERRINFO
|
|
41
|
+
# define rb_errinfo() (ruby_errinfo)
|
|
42
|
+
#endif
|
|
43
|
+
|
|
44
|
+
#ifndef HAVE_RB_MEMHASH
|
|
45
|
+
int rb_memhash(const char *string, long length);
|
|
46
|
+
#endif
|
|
47
|
+
|
|
48
|
+
#ifndef RETURN_SIZED_ENUMERATOR
|
|
49
|
+
# define RETURN_SIZED_ENUMERATOR(self, argc, argv, size) \
|
|
50
|
+
RETURN_ENUMERATOR(self, argc, argv)
|
|
51
|
+
#endif
|
|
52
|
+
|
|
53
|
+
#ifndef OBJ_UNTRUSTED
|
|
54
|
+
# define OBJ_UNTRUSTED(o) (false)
|
|
55
|
+
#endif
|
|
56
|
+
|
|
57
|
+
#ifndef OBJ_UNTRUST
|
|
58
|
+
# define OBJ_UNTRUST(o) do { } while (0)
|
|
59
|
+
#endif
|
|
60
|
+
|
|
61
|
+
long rb_u_memsearch(const void *a, long a_n, const void *b, long b_n);
|
|
62
|
+
|
|
63
|
+
NORETURN(void rb_u_raise(VALUE exception, const char *format, ...)) PRINTF(2, 3);
|
|
64
|
+
NORETURN(void rb_u_raise_errno(int number, const char *format, ...)) PRINTF(2, 3);
|
|
65
|
+
|
|
66
|
+
VALUE rb_u_str_new(const char *string, long length);
|
|
67
|
+
VALUE rb_u_str_buf_new(long length);
|
data/ext/u/rb_u.c
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/* -*- coding: utf-8 -*- */
|
|
2
|
+
|
|
3
|
+
#include "extconf.h"
|
|
4
|
+
#include <errno.h>
|
|
5
|
+
#include <ruby.h>
|
|
6
|
+
#include <stdarg.h>
|
|
7
|
+
#include <stdbool.h>
|
|
8
|
+
#include <stddef.h>
|
|
9
|
+
#include <stdint.h>
|
|
10
|
+
#include <stdlib.h>
|
|
11
|
+
#include <limits.h>
|
|
12
|
+
#include "u.h"
|
|
13
|
+
#include "private.h"
|
|
14
|
+
#include "rb_private.h"
|
|
15
|
+
#include "rb_u_buffer.h"
|
|
16
|
+
#include "rb_u_string.h"
|
|
17
|
+
|
|
18
|
+
void
|
|
19
|
+
need_at_least_n_arguments(int argc, int n)
|
|
20
|
+
{
|
|
21
|
+
if (argc < n)
|
|
22
|
+
rb_u_raise(rb_eArgError,
|
|
23
|
+
"wrong number of arguments (%d for at least %d)",
|
|
24
|
+
argc, n);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
void
|
|
28
|
+
need_m_to_n_arguments(int argc, int m, int n)
|
|
29
|
+
{
|
|
30
|
+
if (argc < m || argc > n)
|
|
31
|
+
rb_u_raise(rb_eArgError,
|
|
32
|
+
"wrong number of arguments (%d for %d..%d)",
|
|
33
|
+
argc, m, n);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
struct guarded_alloc_closure {
|
|
37
|
+
void *result;
|
|
38
|
+
size_t n;
|
|
39
|
+
};
|
|
40
|
+
|
|
41
|
+
static VALUE
|
|
42
|
+
guarded_alloc(VALUE data)
|
|
43
|
+
{
|
|
44
|
+
struct guarded_alloc_closure *closure = (struct guarded_alloc_closure *)data;
|
|
45
|
+
closure->result = (void *)ALLOC_N(char, closure->n);
|
|
46
|
+
return Qnil;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
void *
|
|
50
|
+
_rb_u_guarded_alloc(size_t n, ...)
|
|
51
|
+
{
|
|
52
|
+
struct guarded_alloc_closure closure = { NULL, n };
|
|
53
|
+
int error;
|
|
54
|
+
rb_protect(guarded_alloc, (VALUE)&closure, &error);
|
|
55
|
+
if (error == 0)
|
|
56
|
+
return closure.result;
|
|
57
|
+
va_list args;
|
|
58
|
+
va_start(args, n);
|
|
59
|
+
void *previous;
|
|
60
|
+
while ((previous = va_arg(args, void *)) != NULL)
|
|
61
|
+
free(previous);
|
|
62
|
+
va_end(args);
|
|
63
|
+
rb_exc_raise(rb_errinfo());
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
int
|
|
67
|
+
rb_u_char_to_u(uint32_t c, char *result)
|
|
68
|
+
{
|
|
69
|
+
if (!u_char_isvalid(c))
|
|
70
|
+
rb_u_raise(rb_eArgError, "not a Unicode character: %#04x", c);
|
|
71
|
+
|
|
72
|
+
return u_char_to_u(c, result);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
void
|
|
76
|
+
rb_u_validate(const char *string, long length)
|
|
77
|
+
{
|
|
78
|
+
const char *end;
|
|
79
|
+
if (!u_valid(string, length, &end))
|
|
80
|
+
rb_u_raise(rb_eArgError,
|
|
81
|
+
"invalid byte sequence at byte %ld",
|
|
82
|
+
end - string);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
VALUE
|
|
86
|
+
_rb_u_character_test(VALUE self, bool (*test)(uint32_t))
|
|
87
|
+
{
|
|
88
|
+
const struct rb_u_string *s = RVAL2USTRING(self);
|
|
89
|
+
for (const char *p = USTRING_STR(s), *end = USTRING_END(s); p < end; )
|
|
90
|
+
if (!test(u_decode(&p, p, end)))
|
|
91
|
+
return Qfalse;
|
|
92
|
+
return Qtrue;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
VALUE
|
|
96
|
+
_rb_u_string_test_locale(int argc, VALUE *argv, VALUE self,
|
|
97
|
+
size_t convert(char *, size_t, const char *, size_t,
|
|
98
|
+
const char *))
|
|
99
|
+
{
|
|
100
|
+
const char *locale = NULL;
|
|
101
|
+
|
|
102
|
+
VALUE rblocale;
|
|
103
|
+
if (rb_scan_args(argc, argv, "01", &rblocale) == 1)
|
|
104
|
+
locale = StringValuePtr(rblocale);
|
|
105
|
+
|
|
106
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
107
|
+
|
|
108
|
+
size_t nfd_n = u_normalize(NULL, 0,
|
|
109
|
+
USTRING_STR(string), USTRING_LENGTH(string),
|
|
110
|
+
U_NORMALIZATION_FORM_D);
|
|
111
|
+
char *nfd = ALLOC_N(char, nfd_n + 1);
|
|
112
|
+
nfd_n = u_normalize(nfd, nfd_n + 1,
|
|
113
|
+
USTRING_STR(string), USTRING_LENGTH(string),
|
|
114
|
+
U_NORMALIZATION_FORM_D);
|
|
115
|
+
|
|
116
|
+
size_t converted_n = convert(NULL, 0, nfd, nfd_n, locale);
|
|
117
|
+
char *converted = _rb_u_guarded_alloc(converted_n + 1, nfd, NULL);
|
|
118
|
+
convert(converted, converted_n + 1, nfd, nfd_n, locale);
|
|
119
|
+
|
|
120
|
+
VALUE result = converted_n == nfd_n &&
|
|
121
|
+
memcmp(converted, nfd, nfd_n) == 0 ? Qtrue : Qfalse;
|
|
122
|
+
|
|
123
|
+
free(converted);
|
|
124
|
+
free(nfd);
|
|
125
|
+
|
|
126
|
+
return result;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
VALUE
|
|
130
|
+
_rb_u_string_convert(VALUE self,
|
|
131
|
+
size_t convert(char *, size_t, const char *, size_t))
|
|
132
|
+
{
|
|
133
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
134
|
+
|
|
135
|
+
size_t n = convert(NULL, 0, USTRING_STR(string), USTRING_LENGTH(string));
|
|
136
|
+
char *converted = ALLOC_N(char, n + 1);
|
|
137
|
+
convert(converted, n + 1, USTRING_STR(string), USTRING_LENGTH(string));
|
|
138
|
+
|
|
139
|
+
return rb_u_string_new_c_own(self, converted, n);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/* Call CONVERT on the content of STRING with output buffer RESULT of size M
 * and the given LOCALE, and return its result.
 *
 * errno is cleared before the call; if it is non-zero afterwards, RESULT is
 * freed and the corresponding system-call error is raised. */
static size_t
try_convert(char *result, size_t m, const struct rb_u_string *string,
            size_t convert(char *, size_t, const char *, size_t,
                           const char *), const char *locale)
{
        errno = 0;
        size_t n = convert(result, m, USTRING_STR(string), USTRING_LENGTH(string),
                           locale);
        /* Capture errno right away: free() is allowed to modify it, which
         * would make us report the wrong error (or none at all). */
        int error = errno;
        if (error != 0) {
                free(result);
                rb_u_raise_errno(error, "can’t apply conversion");
        }

        return n;
}
|
|
156
|
+
|
|
157
|
+
VALUE
|
|
158
|
+
_rb_u_string_convert_locale(int argc, VALUE *argv, VALUE self,
|
|
159
|
+
size_t convert(char *, size_t, const char *, size_t,
|
|
160
|
+
const char *),
|
|
161
|
+
const char *lc_env)
|
|
162
|
+
{
|
|
163
|
+
const char *locale = NULL;
|
|
164
|
+
|
|
165
|
+
VALUE rblocale;
|
|
166
|
+
if (rb_scan_args(argc, argv, "01", &rblocale) == 1)
|
|
167
|
+
locale = StringValuePtr(rblocale);
|
|
168
|
+
else if (lc_env != NULL) {
|
|
169
|
+
const char * const env[] = { "LC_ALL", lc_env, "LANG", NULL };
|
|
170
|
+
for (const char * const *p = env; *p != NULL; p++)
|
|
171
|
+
if ((locale = getenv(*p)) != NULL)
|
|
172
|
+
break;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
176
|
+
|
|
177
|
+
size_t n = try_convert(NULL, 0, string, convert, locale);
|
|
178
|
+
char *converted = ALLOC_N(char, n + 1);
|
|
179
|
+
size_t m = try_convert(converted, n + 1, string, convert, locale);
|
|
180
|
+
if (m < n) {
|
|
181
|
+
char *t = REALLOC_N(converted, char, m + 1);
|
|
182
|
+
if (t != NULL)
|
|
183
|
+
converted = t;
|
|
184
|
+
n = m;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
return rb_u_string_new_c_own(self, converted, n);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
VALUE
|
|
191
|
+
_rb_u_string_property(VALUE self, const char *name, int unknown,
|
|
192
|
+
int property(uint32_t), VALUE tosym(int))
|
|
193
|
+
{
|
|
194
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
195
|
+
const char *p = USTRING_STR(string);
|
|
196
|
+
const char *end = USTRING_END(string);
|
|
197
|
+
if (p == end)
|
|
198
|
+
return tosym(unknown);
|
|
199
|
+
int first = property(u_decode(&p, p, end));
|
|
200
|
+
while (p < end) {
|
|
201
|
+
int value = property(u_decode(&p, p, end));
|
|
202
|
+
if (value != first)
|
|
203
|
+
rb_u_raise(rb_eArgError,
|
|
204
|
+
"string consists of characters with different %s values: :%s+, :%s",
|
|
205
|
+
name,
|
|
206
|
+
rb_id2name(SYM2ID(tosym(first))),
|
|
207
|
+
rb_id2name(SYM2ID(tosym(value))));
|
|
208
|
+
}
|
|
209
|
+
return tosym(first);
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
#define SYMBOL2MODE(symbol, mode, id) do { \
|
|
213
|
+
static ID id_##symbol; \
|
|
214
|
+
if (id_##symbol == 0) \
|
|
215
|
+
id_##symbol = rb_intern(#symbol); \
|
|
216
|
+
if (id == id_##symbol) \
|
|
217
|
+
return mode; \
|
|
218
|
+
} while (0)
|
|
219
|
+
|
|
220
|
+
enum u_normalization_form
|
|
221
|
+
_rb_u_symbol_to_normalization_form(VALUE symbol)
|
|
222
|
+
{
|
|
223
|
+
if (!SYMBOL_P(symbol)) {
|
|
224
|
+
VALUE inspected = rb_inspect(symbol);
|
|
225
|
+
|
|
226
|
+
rb_u_raise(rb_eTypeError,
|
|
227
|
+
"not a symbol: %s",
|
|
228
|
+
StringValuePtr(inspected));
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
ID id = SYM2ID(symbol);
|
|
232
|
+
|
|
233
|
+
SYMBOL2MODE(nfd, U_NORMALIZATION_FORM_D, id);
|
|
234
|
+
SYMBOL2MODE(nfc, U_NORMALIZATION_FORM_C, id);
|
|
235
|
+
SYMBOL2MODE(nfkd, U_NORMALIZATION_FORM_KD, id);
|
|
236
|
+
SYMBOL2MODE(nfkc, U_NORMALIZATION_FORM_KC, id);
|
|
237
|
+
|
|
238
|
+
rb_u_raise(rb_eArgError,
|
|
239
|
+
"unknown normalization form: :%s",
|
|
240
|
+
rb_id2name(SYM2ID(symbol)));
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
U_EXTERN void Init_u(void);
|
|
244
|
+
void
|
|
245
|
+
Init_u(void)
|
|
246
|
+
{
|
|
247
|
+
VALUE mU = rb_define_module("U");
|
|
248
|
+
|
|
249
|
+
Init_u_buffer(mU);
|
|
250
|
+
Init_u_string(mU);
|
|
251
|
+
}
|