u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
static void
|
|
4
|
+
rb_u_string_inspect_bad_input(const char *p, const char *q, VALUE result)
|
|
5
|
+
{
|
|
6
|
+
while (p < q) {
|
|
7
|
+
char hex[5];
|
|
8
|
+
snprintf(hex, lengthof(hex), "\\x%02X", *p & 0xff);
|
|
9
|
+
rb_str_buf_cat2(result, hex);
|
|
10
|
+
p++;
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
static void
|
|
15
|
+
rb_u_string_inspect_special_char(uint32_t c, VALUE result)
|
|
16
|
+
{
|
|
17
|
+
char str[U_CHAR_MAX_BYTE_LENGTH];
|
|
18
|
+
|
|
19
|
+
rb_str_buf_cat2(result, "\\");
|
|
20
|
+
rb_str_buf_cat(result, str, u_char_to_u(c, str));
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
static void
|
|
24
|
+
rb_u_string_inspect_escaped_char(uint32_t c, VALUE result)
|
|
25
|
+
{
|
|
26
|
+
char str[4 + 8 + 1];
|
|
27
|
+
|
|
28
|
+
if (c < 0x10000)
|
|
29
|
+
snprintf(str, lengthof(str), "\\u%04X", c);
|
|
30
|
+
else
|
|
31
|
+
snprintf(str, lengthof(str), "\\u{%X}", c & 0xffffffff);
|
|
32
|
+
|
|
33
|
+
rb_str_buf_cat2(result, str);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
static void
|
|
37
|
+
rb_u_string_inspect_default(uint32_t c, VALUE result)
|
|
38
|
+
{
|
|
39
|
+
if (!u_char_isprint(c)) {
|
|
40
|
+
rb_u_string_inspect_escaped_char(c, result);
|
|
41
|
+
return;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
char str[U_CHAR_MAX_BYTE_LENGTH];
|
|
45
|
+
rb_str_buf_cat(result, str, u_char_to_u(c, str));
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
|
|
49
|
+
|
|
50
|
+
static const char *
|
|
51
|
+
rb_u_string_inspect_hash_char(const char *q, const char *end,
|
|
52
|
+
VALUE result)
|
|
53
|
+
{
|
|
54
|
+
if (q == end) {
|
|
55
|
+
rb_str_buf_cat2(result, "#");
|
|
56
|
+
return q;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
const char *p = q;
|
|
60
|
+
uint32_t c = u_decode(&q, p, end);
|
|
61
|
+
switch (c) {
|
|
62
|
+
case REPLACEMENT_CHARACTER:
|
|
63
|
+
rb_str_buf_cat2(result, "#");
|
|
64
|
+
if (!u_valid(p, q - p, NULL))
|
|
65
|
+
rb_u_string_inspect_bad_input(p, q, result);
|
|
66
|
+
else
|
|
67
|
+
rb_u_string_inspect_default(c, result);
|
|
68
|
+
return q;
|
|
69
|
+
case '$':
|
|
70
|
+
case '@':
|
|
71
|
+
case '{':
|
|
72
|
+
rb_str_buf_cat2(result, "\\#");
|
|
73
|
+
rb_u_string_inspect_default(c, result);
|
|
74
|
+
return q;
|
|
75
|
+
default:
|
|
76
|
+
rb_str_buf_cat2(result, "#");
|
|
77
|
+
return p;
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/* Returns the receiver in a reader-friendly inspectable format, inheriting
|
|
82
|
+
* any taint and untrust, encoded using UTF-8.
|
|
83
|
+
*
|
|
84
|
+
* The reader-friendly inspectable format looks like “`"…".u`”. Inside the
|
|
85
|
+
* “…”, any {#print?} characters are output as-is, the following special
|
|
86
|
+
* characters are escaped according to the following table:
|
|
87
|
+
*
|
|
88
|
+
* <table>
|
|
89
|
+
* <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
|
|
90
|
+
* <tbody>
|
|
91
|
+
* <tr><td>U+0022 QUOTATION MARK</td><td><code>\"</code></td></tr>
|
|
92
|
+
* <tr><td>U+005C REVERSE SOLIDUS</td><td><code>\\</code></td></tr>
|
|
93
|
+
* <tr><td>U+000A LINE FEED (LF)</td><td><code>\n</code></td></tr>
|
|
94
|
+
* <tr><td>U+000D CARRIAGE RETURN (CR)</td><td><code>\r</code></td></tr>
|
|
95
|
+
* <tr><td>U+0009 CHARACTER TABULATION</td><td><code>\t</code></td></tr>
|
|
96
|
+
* <tr><td>U+000C FORM FEED (FF)</td><td><code>\f</code></td></tr>
|
|
97
|
+
* <tr><td>U+000B LINE TABULATION</td><td><code>\v</code></td></tr>
|
|
98
|
+
* <tr><td>U+0008 BACKSPACE</td><td><code>\b</code></td></tr>
|
|
99
|
+
* <tr><td>U+0007 BELL</td><td><code>\a</code></td></tr>
|
|
100
|
+
* <tr><td>U+001B ESCAPE</td><td><code>\e</code></td></tr>
|
|
101
|
+
* </tbody>
|
|
102
|
+
* </table>
|
|
103
|
+
*
|
|
104
|
+
* the following special sequences are also escaped:
|
|
105
|
+
*
|
|
106
|
+
* <table>
|
|
107
|
+
* <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
|
|
108
|
+
* <tbody>
|
|
109
|
+
* <tr><td><code>#$</code></td><td><code>\#$</code></td></tr>
|
|
110
|
+
* <tr><td><code>#@</code></td><td><code>\#@</code></td></tr>
|
|
111
|
+
* <tr><td><code>#{</code></td><td><code>\#{</code></td></tr>
|
|
112
|
+
* </tbody>
|
|
113
|
+
* </table>
|
|
114
|
+
*
|
|
115
|
+
* Valid UTF-8 byte sequences representing code points < 0x10000 are output as
|
|
116
|
+
* `\u`_n_, where _n_ is the four-digit uppercase hexadecimal representation
|
|
117
|
+
* of the code point.
|
|
118
|
+
*
|
|
119
|
+
* Valid UTF-8 byte sequences representing code points ≥ 0x10000 are output as
|
|
120
|
+
* `\u{`_n_`}`, where _n_ is the uppercase hexadecimal representation of the
|
|
121
|
+
* code point.
|
|
122
|
+
*
|
|
123
|
+
* Any other byte is output as `\x`_n_, where _n_ is the two-digit uppercase
|
|
124
|
+
* hexadecimal representation of the byte’s value.
|
|
125
|
+
*
|
|
126
|
+
* @return [String] */
|
|
127
|
+
VALUE
|
|
128
|
+
rb_u_string_inspect(VALUE self)
|
|
129
|
+
{
|
|
130
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
131
|
+
|
|
132
|
+
VALUE result = rb_u_str_buf_new(0);
|
|
133
|
+
rb_str_buf_cat2(result, "\"");
|
|
134
|
+
const char *p = USTRING_STR(string);
|
|
135
|
+
const char *end = USTRING_END(string);
|
|
136
|
+
while (p < end) {
|
|
137
|
+
const char *q;
|
|
138
|
+
uint32_t c = u_decode(&q, p, end);
|
|
139
|
+
switch (c) {
|
|
140
|
+
case '"':
|
|
141
|
+
case '\\':
|
|
142
|
+
rb_u_string_inspect_special_char(c, result);
|
|
143
|
+
break;
|
|
144
|
+
case '#':
|
|
145
|
+
p = rb_u_string_inspect_hash_char(q, end, result);
|
|
146
|
+
continue;
|
|
147
|
+
case '\n':
|
|
148
|
+
rb_str_buf_cat2(result, "\\n");
|
|
149
|
+
break;
|
|
150
|
+
case '\r':
|
|
151
|
+
rb_str_buf_cat2(result, "\\r");
|
|
152
|
+
break;
|
|
153
|
+
case '\t':
|
|
154
|
+
rb_str_buf_cat2(result, "\\t");
|
|
155
|
+
break;
|
|
156
|
+
case '\f':
|
|
157
|
+
rb_str_buf_cat2(result, "\\f");
|
|
158
|
+
break;
|
|
159
|
+
case '\013':
|
|
160
|
+
rb_str_buf_cat2(result, "\\v");
|
|
161
|
+
break;
|
|
162
|
+
case '\010':
|
|
163
|
+
rb_str_buf_cat2(result, "\\b");
|
|
164
|
+
break;
|
|
165
|
+
case '\007':
|
|
166
|
+
rb_str_buf_cat2(result, "\\a");
|
|
167
|
+
break;
|
|
168
|
+
case '\033':
|
|
169
|
+
rb_str_buf_cat2(result, "\\e");
|
|
170
|
+
break;
|
|
171
|
+
case REPLACEMENT_CHARACTER:
|
|
172
|
+
if (!u_valid(p, q - p, NULL)) {
|
|
173
|
+
rb_u_string_inspect_bad_input(p, q, result);
|
|
174
|
+
break;
|
|
175
|
+
}
|
|
176
|
+
/* fall through */
|
|
177
|
+
default:
|
|
178
|
+
rb_u_string_inspect_default(c, result);
|
|
179
|
+
break;
|
|
180
|
+
}
|
|
181
|
+
p = q;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
rb_str_buf_cat2(result, "\".u");
|
|
185
|
+
|
|
186
|
+
OBJ_INFECT(result, self);
|
|
187
|
+
|
|
188
|
+
return result;
|
|
189
|
+
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "rb_u_string_internal_tr.h"
|
|
3
|
+
|
|
4
|
+
void
|
|
5
|
+
tr_init(struct tr *tr, const char *p, const char *end)
|
|
6
|
+
{
|
|
7
|
+
tr->p = p;
|
|
8
|
+
tr->end = end;
|
|
9
|
+
tr->inside_range = false;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
bool
|
|
13
|
+
tr_should_exclude(struct tr *tr)
|
|
14
|
+
{
|
|
15
|
+
if (tr->p + 1 < tr->end && *tr->p == '^') {
|
|
16
|
+
tr->p++;
|
|
17
|
+
return true;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
return false;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
static enum tr_state
|
|
24
|
+
tr_next_char(struct tr *t)
|
|
25
|
+
{
|
|
26
|
+
if (t->p == t->end)
|
|
27
|
+
return TR_FINISHED;
|
|
28
|
+
|
|
29
|
+
t->now = u_decode(&t->p, t->p, t->end);
|
|
30
|
+
if (t->p == t->end)
|
|
31
|
+
return TR_FOUND;
|
|
32
|
+
if (t->now == '\\') {
|
|
33
|
+
t->now = u_decode(&t->p, t->p, t->end);
|
|
34
|
+
if (t->p == t->end)
|
|
35
|
+
return TR_FOUND;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
const char *next;
|
|
39
|
+
if (u_decode(&next, t->p, t->end) == '-') {
|
|
40
|
+
/* TODO: Make this simpler. Perhaps we don’t need
|
|
41
|
+
* TR_READ_ANOTHER, as we advance it here ourselves. I got to
|
|
42
|
+
* check the offsets here. Perhaps TR_READ_ANOTHER should also
|
|
43
|
+
* have advanced t->p one more step. */
|
|
44
|
+
if (next < t->end) {
|
|
45
|
+
uint32_t max = u_decode(&t->p, next, t->end);
|
|
46
|
+
if (max < t->now) {
|
|
47
|
+
t->p = next;
|
|
48
|
+
return TR_READ_ANOTHER;
|
|
49
|
+
}
|
|
50
|
+
t->inside_range = true;
|
|
51
|
+
t->max = max;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
return TR_FOUND;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
enum tr_state
|
|
59
|
+
tr_next(struct tr *t)
|
|
60
|
+
{
|
|
61
|
+
while (true) {
|
|
62
|
+
if (!t->inside_range) {
|
|
63
|
+
enum tr_state state;
|
|
64
|
+
|
|
65
|
+
if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
|
|
66
|
+
continue;
|
|
67
|
+
|
|
68
|
+
return state;
|
|
69
|
+
} else if (++t->now < t->max) {
|
|
70
|
+
return TR_FOUND;
|
|
71
|
+
} else {
|
|
72
|
+
t->inside_range = false;
|
|
73
|
+
return TR_FOUND;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
static void
|
|
79
|
+
tr_table_set(struct tr_table *table, bool *buffer, uint32_t c, bool value)
|
|
80
|
+
{
|
|
81
|
+
if (c < lengthof(table->continuous)) {
|
|
82
|
+
buffer[c] = value;
|
|
83
|
+
return;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
if (NIL_P(table->sparse))
|
|
87
|
+
table->sparse = rb_hash_new();
|
|
88
|
+
|
|
89
|
+
rb_hash_aset(table->sparse, UINT2NUM(c), value ? Qtrue : Qfalse);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
static void
|
|
93
|
+
tr_table_add(struct tr_table *table, const struct rb_u_string *string)
|
|
94
|
+
{
|
|
95
|
+
struct tr tr;
|
|
96
|
+
tr_init(&tr, USTRING_STR(string), USTRING_END(string));
|
|
97
|
+
|
|
98
|
+
bool exclude = tr_should_exclude(&tr);
|
|
99
|
+
|
|
100
|
+
bool buffer[lengthof(table->continuous)];
|
|
101
|
+
|
|
102
|
+
for (size_t i = 0; i < lengthof(buffer); i++)
|
|
103
|
+
buffer[i] = exclude;
|
|
104
|
+
|
|
105
|
+
while (tr_next(&tr) != TR_FINISHED)
|
|
106
|
+
tr_table_set(table, buffer, tr.now, !exclude);
|
|
107
|
+
|
|
108
|
+
for (size_t i = 0; i < lengthof(table->continuous); i++)
|
|
109
|
+
table->continuous[i] = table->continuous[i] && buffer[i];
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
void
|
|
113
|
+
tr_table_initialize(struct tr_table *table, VALUE rbstring)
|
|
114
|
+
{
|
|
115
|
+
const struct rb_u_string *string = RVAL2USTRING_ANY(rbstring);
|
|
116
|
+
|
|
117
|
+
struct tr tr;
|
|
118
|
+
tr_init(&tr, USTRING_STR(string), USTRING_END(string));
|
|
119
|
+
|
|
120
|
+
table->exclude = tr_should_exclude(&tr);
|
|
121
|
+
|
|
122
|
+
for (size_t i = 0; i < lengthof(table->continuous); i++)
|
|
123
|
+
table->continuous[i] = true;
|
|
124
|
+
|
|
125
|
+
table->sparse = Qnil;
|
|
126
|
+
|
|
127
|
+
tr_table_add(table, string);
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
void
|
|
131
|
+
tr_table_initialize_from_strings(struct tr_table *table, int argc, VALUE *argv)
|
|
132
|
+
{
|
|
133
|
+
tr_table_initialize(table, argv[0]);
|
|
134
|
+
for (int i = 1; i < argc; i++)
|
|
135
|
+
tr_table_add(table, RVAL2USTRING_ANY(argv[i]));
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
bool
|
|
139
|
+
tr_table_lookup(struct tr_table *table, uint32_t c)
|
|
140
|
+
{
|
|
141
|
+
if (c < lengthof(table->continuous))
|
|
142
|
+
return table->continuous[c];
|
|
143
|
+
|
|
144
|
+
VALUE value = NIL_P(table->sparse) ?
|
|
145
|
+
Qnil : rb_hash_lookup(table->sparse, UINT2NUM(c));
|
|
146
|
+
|
|
147
|
+
return NIL_P(value) ? table->exclude : RTEST(value);
|
|
148
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/* Iterator over a tr-style character specification such as `a-z`. */
struct tr {
        bool inside_range;      /* Currently expanding a range up to `max`? */
        uint32_t now;           /* Character most recently produced. */
        uint32_t max;           /* Upper bound of the range being expanded. */
        const char *p;          /* Current read position. */
        const char *end;        /* End of the specification. */
};
|
|
8
|
+
|
|
9
|
+
/* Result of advancing a struct tr. */
enum tr_state {
        TR_FOUND,               /* A character was stored in `now`. */
        TR_READ_ANOTHER,        /* Nothing usable was read; read again. */
        TR_FINISHED             /* The specification is exhausted. */
};
|
|
15
|
+
|
|
16
|
+
struct tr_table {
|
|
17
|
+
bool exclude;
|
|
18
|
+
bool continuous[256];
|
|
19
|
+
VALUE sparse;
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
void tr_init(struct tr *tr, const char *p, const char *end);
|
|
23
|
+
bool tr_should_exclude(struct tr *tr);
|
|
24
|
+
enum tr_state tr_next(struct tr *t);
|
|
25
|
+
void tr_table_initialize(struct tr_table *table, VALUE rbstring);
|
|
26
|
+
void tr_table_initialize_from_strings(struct tr_table *table,
|
|
27
|
+
int argc,
|
|
28
|
+
VALUE *argv);
|
|
29
|
+
bool tr_table_lookup(struct tr_table *table, uint32_t c);
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* Writes N columns’ worth of PADDING to P and returns the position following
 * what was written.
 *
 * The output consists of N / PADDING_WIDTH complete copies of PADDING
 * followed by the shortest prefix of PADDING that covers the remaining
 * N % PADDING_WIDTH columns.  This is the layout that
 * rb_u_string_justified_size() reserves room for, so the number of bytes
 * written matches the size it calculates.
 *
 * @param p Where to write the padding
 * @param padding The padding to write
 * @param padding_width The width of PADDING in columns; must be positive
 * @param n The number of columns to fill
 * @return The position following the last byte written */
static char *
rb_u_string_justify_one_side(char *p, const struct rb_u_string *padding, long padding_width, long n)
{
        const char *padding_str = USTRING_STR(padding);
        long padding_size = USTRING_LENGTH(padding);

        long i = 0;

        /* Write every complete copy, including the last one when N is a
         * multiple of PADDING_WIDTH.  Leaving that copy to the prefix loop
         * below would drop any zero-width characters at the end of PADDING
         * and thus write fewer bytes than were reserved.  Comparing against
         * `n - i` can’t overflow, as i never exceeds n. */
        for ( ; n - i >= padding_width; i += padding_width, p += padding_size)
                memcpy(p, padding_str, padding_size);

        /* Cover the remaining columns with a prefix of PADDING, never
         * decoding beyond its end. */
        const char *q = padding_str;
        const char *end = padding_str + padding_size;
        while (i < n && q < end)
                i += u_char_width(u_decode(&q, q, end));
        memcpy(p, padding_str, q - padding_str);
        p += q - padding_str;

        return p;
}
|
|
23
|
+
|
|
24
|
+
static long
|
|
25
|
+
rounding_size(const struct rb_u_string *padding, long padding_width, long n)
|
|
26
|
+
{
|
|
27
|
+
const char *padding_str = USTRING_STR(padding);
|
|
28
|
+
const char *q = padding_str, *end = padding_str + USTRING_LENGTH(padding);
|
|
29
|
+
long r = n % padding_width;
|
|
30
|
+
long i = 0;
|
|
31
|
+
while (i < r && q < end)
|
|
32
|
+
i += u_char_width(u_decode(&q, q, end));
|
|
33
|
+
// NOTE I think i ≮ r is guaranteed, but I can’t seem to prove it, so
|
|
34
|
+
// leave this in for safety.
|
|
35
|
+
if (i < r)
|
|
36
|
+
rb_u_raise(rb_eArgError,
|
|
37
|
+
"padding isn’t wide enough to complete rounding (%ld < %ld)",
|
|
38
|
+
i, r);
|
|
39
|
+
if (i > r)
|
|
40
|
+
rb_u_raise(rb_eArgError,
|
|
41
|
+
"padding is too wide to complete rounding (%ld > %ld)",
|
|
42
|
+
i, r);
|
|
43
|
+
return q - padding_str;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
static long
|
|
47
|
+
rb_u_string_justified_size(long string_size,
|
|
48
|
+
const struct rb_u_string *padding, long padding_width,
|
|
49
|
+
long left_n, long right_n)
|
|
50
|
+
{
|
|
51
|
+
long size;
|
|
52
|
+
|
|
53
|
+
long left_n_2 = rounding_size(padding, padding_width, left_n);
|
|
54
|
+
long right_n_2 = rounding_size(padding, padding_width, right_n);
|
|
55
|
+
if ((size = left_n / padding_width + right_n / padding_width) >= LONG_MAX / USTRING_LENGTH(padding) ||
|
|
56
|
+
(size *= USTRING_LENGTH(padding)) >= LONG_MAX - left_n_2 - right_n_2 ||
|
|
57
|
+
(size += left_n_2 + right_n_2) >= LONG_MAX - string_size)
|
|
58
|
+
rb_u_raise(rb_eArgError, "argument too big");
|
|
59
|
+
size += string_size;
|
|
60
|
+
|
|
61
|
+
return size;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
static VALUE
|
|
65
|
+
rb_u_string_justify_impl(VALUE self,
|
|
66
|
+
const struct rb_u_string *string, long string_width,
|
|
67
|
+
const struct rb_u_string *padding, long padding_width,
|
|
68
|
+
long width, char jflag)
|
|
69
|
+
{
|
|
70
|
+
long n = width - string_width;
|
|
71
|
+
long left_n = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n / 2);
|
|
72
|
+
long right_n = n - left_n;
|
|
73
|
+
|
|
74
|
+
long string_size = USTRING_LENGTH(string);
|
|
75
|
+
long justified_size = rb_u_string_justified_size(string_size,
|
|
76
|
+
padding, padding_width,
|
|
77
|
+
left_n, right_n);
|
|
78
|
+
char *justified = ALLOC_N(char, justified_size + 1);
|
|
79
|
+
|
|
80
|
+
char *p = rb_u_string_justify_one_side(justified, padding, padding_width, left_n);
|
|
81
|
+
memcpy(p, USTRING_STR(string), string_size);
|
|
82
|
+
p += string_size;
|
|
83
|
+
p = rb_u_string_justify_one_side(p, padding, padding_width, right_n);
|
|
84
|
+
justified[justified_size] = '\0';
|
|
85
|
+
|
|
86
|
+
return rb_u_string_new_c_own(self, justified, justified_size);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
static VALUE
|
|
90
|
+
rb_u_string_justify(int argc, VALUE *argv, VALUE self, char jflag)
|
|
91
|
+
{
|
|
92
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
93
|
+
|
|
94
|
+
VALUE rbwidth, rbpadding;
|
|
95
|
+
const struct rb_u_string *padding = USTRING_LOCAL(Qnil, " ", 1);
|
|
96
|
+
long padding_width = 1;
|
|
97
|
+
if (rb_scan_args(argc, argv, "11", &rbwidth, &rbpadding) == 2) {
|
|
98
|
+
padding = RVAL2USTRING_ANY(rbpadding);
|
|
99
|
+
padding_width = u_width_n(USTRING_STR(padding), USTRING_LENGTH(padding));
|
|
100
|
+
if (padding_width == 0)
|
|
101
|
+
rb_u_raise(rb_eArgError, "zero-width padding");
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
long string_width = u_width_n(USTRING_STR(string), USTRING_LENGTH(string));
|
|
105
|
+
|
|
106
|
+
long width = NUM2LONG(rbwidth);
|
|
107
|
+
if (width < 0 || string_width >= width)
|
|
108
|
+
return self;
|
|
109
|
+
|
|
110
|
+
VALUE result = rb_u_string_justify_impl(self,
|
|
111
|
+
string, string_width,
|
|
112
|
+
padding, padding_width,
|
|
113
|
+
width, jflag);
|
|
114
|
+
if (!NIL_P(rbpadding))
|
|
115
|
+
OBJ_INFECT(result, rbpadding);
|
|
116
|
+
return result;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/* @overload center(width, padding = ' ')
|
|
120
|
+
* @param [#to_int] width
|
|
121
|
+
* @param [U::String, #to_str] padding
|
|
122
|
+
* @raise [ArgumentError] If PADDING{#width} = 0
|
|
123
|
+
* @raise [ArgumentError] If characters inside PADDING that should be used
|
|
124
|
+
* for round-off padding are too wide
|
|
125
|
+
* @return [U::String] The receiver padded as evenly as possible on both
|
|
126
|
+
* sides with PADDING to make it max({#length}, WIDTH) wide, inheriting any
|
|
127
|
+
* taint and untrust from the receiver and also from PADDING if PADDING is
|
|
128
|
+
* used
|
|
129
|
+
* @see #ljust
|
|
130
|
+
* @see #rjust */
|
|
131
|
+
VALUE
|
|
132
|
+
rb_u_string_center(int argc, VALUE *argv, VALUE self)
|
|
133
|
+
{
|
|
134
|
+
return rb_u_string_justify(argc, argv, self, 'c');
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/* @overload ljust(width, padding = ' ')
|
|
138
|
+
* @param [#to_int] width
|
|
139
|
+
* @param [U::String, #to_str] padding
|
|
140
|
+
* @raise [ArgumentError] If PADDING{#width} = 0
|
|
141
|
+
* @raise [ArgumentError] If characters inside PADDING that should be used
|
|
142
|
+
* for round-off padding are too wide
|
|
143
|
+
* @return [U::String] The receiver padded on the right with PADDING to make
|
|
144
|
+
* it max({#length}, WIDTH) wide, inheriting any taint and untrust from
|
|
145
|
+
* the receiver and also from PADDING if PADDING is used
|
|
146
|
+
* @see #center
|
|
147
|
+
* @see #rjust */
|
|
148
|
+
VALUE
|
|
149
|
+
rb_u_string_ljust(int argc, VALUE *argv, VALUE self)
|
|
150
|
+
{
|
|
151
|
+
return rb_u_string_justify(argc, argv, self, 'l');
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/* @overload rjust(width, padding = ' ')
|
|
155
|
+
* @param [#to_int] width
|
|
156
|
+
* @param [U::String, #to_str] padding
|
|
157
|
+
* @raise [ArgumentError] If PADDING{#width} = 0
|
|
158
|
+
* @raise [ArgumentError] If characters inside PADDING that should be used
|
|
159
|
+
* for round-off padding are too wide
|
|
160
|
+
* @return [U::String] The receiver padded on the left with PADDING to make
|
|
161
|
+
* it max({#length}, WIDTH) wide, inheriting any taint and untrust from the
|
|
162
|
+
* receiver and also from PADDING if PADDING is used
|
|
163
|
+
* @see #center
|
|
164
|
+
* @see #ljust */
|
|
165
|
+
VALUE
|
|
166
|
+
rb_u_string_rjust(int argc, VALUE *argv, VALUE self)
|
|
167
|
+
{
|
|
168
|
+
return rb_u_string_justify(argc, argv, self, 'r');
|
|
169
|
+
}
|