u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload digit?
|
|
4
|
+
* @return [Boolean] True if the receiver contains only characters in the
|
|
5
|
+
* general category Number, decimal digit (Nd) */
|
|
6
|
+
VALUE
|
|
7
|
+
rb_u_string_digit(VALUE self)
|
|
8
|
+
{
|
|
9
|
+
return _rb_u_character_test(self, u_char_isdigit);
|
|
10
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload downcase(locale = ENV['LC_CTYPE'])
|
|
4
|
+
* @param [#to_str] locale
|
|
5
|
+
* @return [U::String] The downcasing of the receiver according to the rules
|
|
6
|
+
* of the language of LOCALE, which may be empty to specifically use the
|
|
7
|
+
* default, language-independent, rules, inheriting any taint and
|
|
8
|
+
* untrust */
|
|
9
|
+
VALUE
|
|
10
|
+
rb_u_string_downcase(int argc, VALUE *argv, VALUE self)
|
|
11
|
+
{
|
|
12
|
+
return _rb_u_string_convert_locale(argc, argv, self, u_downcase, NULL);
|
|
13
|
+
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
#include <ctype.h>
|
|
2
|
+
|
|
3
|
+
#include "rb_includes.h"
|
|
4
|
+
#include "rb_u_buffer.h"
|
|
5
|
+
|
|
6
|
+
static inline bool
|
|
7
|
+
rb_u_string_dump_escape(VALUE buffer, unsigned char c)
|
|
8
|
+
{
|
|
9
|
+
const char *escape = NULL;
|
|
10
|
+
|
|
11
|
+
switch (c) {
|
|
12
|
+
case '"': escape = "\\\""; break;
|
|
13
|
+
case '\\': escape = "\\\\"; break;
|
|
14
|
+
case '\n': escape = "\\n"; break;
|
|
15
|
+
case '\r': escape = "\\r"; break;
|
|
16
|
+
case '\t': escape = "\\t"; break;
|
|
17
|
+
case '\f': escape = "\\f"; break;
|
|
18
|
+
case '\013': escape = "\\v"; break;
|
|
19
|
+
case '\010': escape = "\\b"; break;
|
|
20
|
+
case '\007': escape = "\\a"; break;
|
|
21
|
+
case '\033': escape = "\\e"; break;
|
|
22
|
+
default:
|
|
23
|
+
return false;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
rb_u_buffer_append(buffer, escape, 2);
|
|
27
|
+
|
|
28
|
+
return true;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
|
|
32
|
+
|
|
33
|
+
static inline bool
|
|
34
|
+
rb_u_string_dump_hash(VALUE buffer, unsigned char c, const char *p, const char *end)
|
|
35
|
+
{
|
|
36
|
+
if (c != '#' || !IS_EVSTR(p + 1, end))
|
|
37
|
+
return false;
|
|
38
|
+
|
|
39
|
+
rb_u_buffer_append(buffer, "\\#", 2);
|
|
40
|
+
|
|
41
|
+
return true;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
static inline bool
|
|
45
|
+
rb_u_string_dump_ascii_printable(VALUE buffer, unsigned char c)
|
|
46
|
+
{
|
|
47
|
+
if (c > 0x7f || !u_char_isprint(c))
|
|
48
|
+
return false;
|
|
49
|
+
|
|
50
|
+
rb_u_buffer_append_char(buffer, c);
|
|
51
|
+
|
|
52
|
+
return true;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
#define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
|
|
56
|
+
|
|
57
|
+
static inline void
|
|
58
|
+
rb_u_string_dump_hex(VALUE buffer, unsigned char c)
|
|
59
|
+
{
|
|
60
|
+
char escaped[4 + 1];
|
|
61
|
+
int length = snprintf(escaped, sizeof(escaped), "\\x%02X", c);
|
|
62
|
+
rb_u_buffer_append(buffer, escaped, length);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
static inline bool
|
|
66
|
+
rb_u_string_dump_codepoint(VALUE buffer, const char **p, const char *end)
|
|
67
|
+
{
|
|
68
|
+
const char *q;
|
|
69
|
+
uint32_t c = u_decode(&q, *p, end);
|
|
70
|
+
if (c == REPLACEMENT_CHARACTER && !u_valid(*p, q - *p, NULL)) {
|
|
71
|
+
for (const char *r = *p; r < q; r++)
|
|
72
|
+
rb_u_string_dump_hex(buffer, (unsigned char)*r);
|
|
73
|
+
/* -1, since we increase p inside the loop. */
|
|
74
|
+
*p = q - 1;
|
|
75
|
+
return true;
|
|
76
|
+
}
|
|
77
|
+
/* -1, since we increase p inside the loop. */
|
|
78
|
+
*p = q - 1;
|
|
79
|
+
char escaped[3 + sizeof(c) * CHAR_BIT + 2 + 1];
|
|
80
|
+
int length = snprintf(escaped, sizeof(escaped), "\\u{%x}", c);
|
|
81
|
+
rb_u_buffer_append(buffer, escaped, length);
|
|
82
|
+
return true;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/* Returns the receiver in a reader-friendly format, inheriting any taint and
|
|
86
|
+
* untrust.
|
|
87
|
+
*
|
|
88
|
+
* The reader-friendly format looks like “`"…".u`”. Inside the “…”, any
|
|
89
|
+
* {#print?} characters in the ASCII range are output as-is, the following
|
|
90
|
+
* special characters are escaped according to the following table:
|
|
91
|
+
*
|
|
92
|
+
* <table>
|
|
93
|
+
* <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
|
|
94
|
+
* <tbody>
|
|
95
|
+
* <tr><td>U+0022 QUOTATION MARK</td><td><code>\"</code></td></tr>
|
|
96
|
+
* <tr><td>U+005C REVERSE SOLIDUS</td><td><code>\\</code></td></tr>
|
|
97
|
+
* <tr><td>U+000A LINE FEED (LF)</td><td><code>\n</code></td></tr>
|
|
98
|
+
* <tr><td>U+000D CARRIAGE RETURN (CR)</td><td><code>\r</code></td></tr>
|
|
99
|
+
* <tr><td>U+0009 CHARACTER TABULATION</td><td><code>\t</code></td></tr>
|
|
100
|
+
* <tr><td>U+000C FORM FEED (FF)</td><td><code>\f</code></td></tr>
|
|
101
|
+
* <tr><td>U+000B LINE TABULATION</td><td><code>\v</code></td></tr>
|
|
102
|
+
* <tr><td>U+0008 BACKSPACE</td><td><code>\b</code></td></tr>
|
|
103
|
+
* <tr><td>U+0007 BELL</td><td><code>\a</code></td></tr>
|
|
104
|
+
* <tr><td>U+001B ESCAPE</td><td><code>\e</code></td></tr>
|
|
105
|
+
* </tbody>
|
|
106
|
+
* </table>
|
|
107
|
+
*
|
|
108
|
+
* the following special sequences are also escaped:
|
|
109
|
+
*
|
|
110
|
+
* <table>
|
|
111
|
+
* <thead><tr><th>Character</th><th>Dumped Sequence</th></tr></thead>
|
|
112
|
+
* <tbody>
|
|
113
|
+
* <tr><td><code>#$</code></td><td><code>\#$</code></td></tr>
|
|
114
|
+
* <tr><td><code>#@</code></td><td><code>\#@</code></td></tr>
|
|
115
|
+
* <tr><td><code>#{</code></td><td><code>\#{</code></td></tr>
|
|
116
|
+
* </tbody>
|
|
117
|
+
* </table>
|
|
118
|
+
*
|
|
119
|
+
* any valid UTF-8 byte sequences are output as “`\u{`_n_`}`”, where _n_ is the
|
|
120
|
+
* lowercase hexadecimal representation of the code point encoded by the UTF-8
|
|
121
|
+
* sequence, and any other byte is output as “`\x`_n_”, where _n_ is the
|
|
122
|
+
* two-digit uppercase hexadecimal representation of the byte’s value.
|
|
123
|
+
*
|
|
124
|
+
* @return [U::String] */
|
|
125
|
+
VALUE
|
|
126
|
+
rb_u_string_dump(VALUE self)
|
|
127
|
+
{
|
|
128
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
129
|
+
const char *p = USTRING_STR(string);
|
|
130
|
+
const char *end = USTRING_END(string);
|
|
131
|
+
|
|
132
|
+
VALUE buffer = rb_u_buffer_new_sized(7);
|
|
133
|
+
|
|
134
|
+
rb_u_buffer_append(buffer, "\"", 1);
|
|
135
|
+
while (p < end) {
|
|
136
|
+
unsigned char c = *p;
|
|
137
|
+
|
|
138
|
+
if (!rb_u_string_dump_escape(buffer, c) &&
|
|
139
|
+
!rb_u_string_dump_hash(buffer, c, p, end) &&
|
|
140
|
+
!rb_u_string_dump_ascii_printable(buffer, c) &&
|
|
141
|
+
!rb_u_string_dump_codepoint(buffer, &p, end))
|
|
142
|
+
rb_u_string_dump_hex(buffer, c);
|
|
143
|
+
|
|
144
|
+
p++;
|
|
145
|
+
}
|
|
146
|
+
rb_u_buffer_append(buffer, "\".u", 3);
|
|
147
|
+
|
|
148
|
+
VALUE result = rb_u_buffer_to_u_bang(buffer);
|
|
149
|
+
|
|
150
|
+
OBJ_INFECT(result, self);
|
|
151
|
+
|
|
152
|
+
return result;
|
|
153
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "yield.h"
|
|
3
|
+
|
|
4
|
+
static void
|
|
5
|
+
each(VALUE self, struct yield *yield)
|
|
6
|
+
{
|
|
7
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
8
|
+
const char *end = USTRING_END(string);
|
|
9
|
+
for (const char *p = USTRING_STR(string); p < end; p++)
|
|
10
|
+
yield_call(yield, INT2FIX(*p & 0xff));
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
UNUSED(static VALUE
|
|
14
|
+
size(VALUE self, UNUSED(VALUE args)))
|
|
15
|
+
{
|
|
16
|
+
return LONG2NUM(USTRING_LENGTH(RVAL2USTRING(self)));
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/* @overload each_byte{ |byte| … }
|
|
20
|
+
*
|
|
21
|
+
* Enumerates the bytes in the receiver.
|
|
22
|
+
*
|
|
23
|
+
* @yieldparam [Fixnum] byte
|
|
24
|
+
* @return [self]
|
|
25
|
+
*
|
|
26
|
+
* @overload each_byte
|
|
27
|
+
*
|
|
28
|
+
* @return [Enumerator] An Enumerator over the bytes in the receiver
|
|
29
|
+
*/
|
|
30
|
+
VALUE
|
|
31
|
+
rb_u_string_each_byte(VALUE self)
|
|
32
|
+
{
|
|
33
|
+
RETURN_SIZED_ENUMERATOR(self, 0, NULL, size);
|
|
34
|
+
struct yield y = YIELD_INIT;
|
|
35
|
+
each(self, &y);
|
|
36
|
+
return self;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/* @return [Array<Fixnum>] The bytes of the receiver. */
|
|
40
|
+
VALUE
|
|
41
|
+
rb_u_string_bytes(VALUE self)
|
|
42
|
+
{
|
|
43
|
+
struct yield_array y = YIELD_ARRAY_INIT;
|
|
44
|
+
each(self, &y.yield);
|
|
45
|
+
return y.array;
|
|
46
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "yield.h"
|
|
3
|
+
|
|
4
|
+
static void
|
|
5
|
+
each(VALUE self, struct yield *yield)
|
|
6
|
+
{
|
|
7
|
+
const struct rb_u_string *s = RVAL2USTRING(self);
|
|
8
|
+
for (const char *p = USTRING_STR(s), *q, *end = USTRING_END(s); p < end; p = q) {
|
|
9
|
+
u_decode(&q, p, end);
|
|
10
|
+
yield_call(yield, rb_u_string_new_c(self, p, q - p));
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
UNUSED(static VALUE
|
|
15
|
+
size(VALUE self, UNUSED(VALUE args)))
|
|
16
|
+
{
|
|
17
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
18
|
+
return UINT2NUM(u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string)));
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
/* @overload each_char{ |char| … }
|
|
22
|
+
*
|
|
23
|
+
* Enumerates the characters in the receiver, each inheriting any taint and
|
|
24
|
+
* untrust.
|
|
25
|
+
*
|
|
26
|
+
* @yieldparam [U::String] char
|
|
27
|
+
* @return [self]
|
|
28
|
+
*
|
|
29
|
+
* @overload each_char
|
|
30
|
+
*
|
|
31
|
+
* @return [Enumerator] An Enumerator over the characters in the receiver */
|
|
32
|
+
VALUE
|
|
33
|
+
rb_u_string_each_char(VALUE self)
|
|
34
|
+
{
|
|
35
|
+
RETURN_SIZED_ENUMERATOR(self, 0, NULL, size);
|
|
36
|
+
struct yield y = YIELD_INIT;
|
|
37
|
+
each(self, &y);
|
|
38
|
+
return self;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/* @return [Array<U::String>] The characters of the receiver, each inheriting
|
|
42
|
+
* any taint and untrust. */
|
|
43
|
+
VALUE
|
|
44
|
+
rb_u_string_chars(VALUE self)
|
|
45
|
+
{
|
|
46
|
+
struct yield_array y = YIELD_ARRAY_INIT;
|
|
47
|
+
each(self, &y.yield);
|
|
48
|
+
return y.array;
|
|
49
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "yield.h"
|
|
3
|
+
|
|
4
|
+
static void
|
|
5
|
+
each(VALUE self, struct yield *yield)
|
|
6
|
+
{
|
|
7
|
+
const struct rb_u_string *s = RVAL2USTRING(self);
|
|
8
|
+
for (const char *p = USTRING_STR(s), *end = USTRING_END(s); p < end; )
|
|
9
|
+
yield_call(yield, UINT2NUM(u_decode(&p, p, end)));
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
UNUSED(static VALUE
|
|
13
|
+
size(VALUE self, UNUSED(VALUE args)))
|
|
14
|
+
{
|
|
15
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
16
|
+
return UINT2NUM(u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string)));
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/* @overload each_codepoint{ |codepoint| … }
|
|
20
|
+
*
|
|
21
|
+
* Enumerates the code points of the receiver.
|
|
22
|
+
*
|
|
23
|
+
* @yieldparam [Integer] codepoint
|
|
24
|
+
* @return [self]
|
|
25
|
+
*
|
|
26
|
+
* @overload each_codepoint
|
|
27
|
+
* @return [Enumerator] An Enumerator over the code points of the receiver
|
|
28
|
+
*/
|
|
29
|
+
VALUE
|
|
30
|
+
rb_u_string_each_codepoint(VALUE self)
|
|
31
|
+
{
|
|
32
|
+
RETURN_SIZED_ENUMERATOR(self, 0, NULL, size);
|
|
33
|
+
struct yield y = YIELD_INIT;
|
|
34
|
+
each(self, &y);
|
|
35
|
+
return self;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/* @return [Array<Integer>] The code points of the receiver. */
|
|
39
|
+
VALUE
|
|
40
|
+
rb_u_string_codepoints(VALUE self)
|
|
41
|
+
{
|
|
42
|
+
struct yield_array y = YIELD_ARRAY_INIT;
|
|
43
|
+
each(self, &y.yield);
|
|
44
|
+
return y.array;
|
|
45
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
static void
|
|
4
|
+
each(const char *p, size_t n, VALUE *self)
|
|
5
|
+
{
|
|
6
|
+
rb_yield(rb_u_string_new_c(*self, p, n));
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
/* @overload each_grapheme_cluster{ |cluster| … }
|
|
10
|
+
*
|
|
11
|
+
* Enumerates the grapheme clusters in the receiver, each inheriting any
|
|
12
|
+
* taint and untrust.
|
|
13
|
+
*
|
|
14
|
+
* @yieldparam [U::String] cluster
|
|
15
|
+
* @return [self]
|
|
16
|
+
* @see http://www.unicode.org/reports/tr29/
|
|
17
|
+
* Unicode Standard Annex #29: Unicode Text Segmentation
|
|
18
|
+
*
|
|
19
|
+
* @overload each_grapheme_cluster
|
|
20
|
+
*
|
|
21
|
+
* @return [Enumerator] An Enumerator over the grapheme clusters in the
|
|
22
|
+
* receiver
|
|
23
|
+
* @see http://www.unicode.org/reports/tr29/
|
|
24
|
+
* Unicode Standard Annex #29: Unicode Text Segmentation */
|
|
25
|
+
VALUE
|
|
26
|
+
rb_u_string_each_grapheme_cluster(VALUE self)
|
|
27
|
+
{
|
|
28
|
+
RETURN_ENUMERATOR(self, 0, NULL);
|
|
29
|
+
|
|
30
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
31
|
+
const char *p = USTRING_STR(string);
|
|
32
|
+
const char *end = USTRING_END(string);
|
|
33
|
+
size_t length = end - p;
|
|
34
|
+
u_grapheme_clusters(p, length, (u_substring_fn)each, &self);
|
|
35
|
+
return self;
|
|
36
|
+
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "yield.h"
|
|
3
|
+
|
|
4
|
+
static void
|
|
5
|
+
rb_u_string_each_line_default(VALUE self, struct yield *yield)
|
|
6
|
+
{
|
|
7
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
8
|
+
|
|
9
|
+
const char *begin = USTRING_STR(string);
|
|
10
|
+
const char *base = begin;
|
|
11
|
+
const char *p = begin;
|
|
12
|
+
const char *end = USTRING_END(string);
|
|
13
|
+
|
|
14
|
+
while (p < end) {
|
|
15
|
+
p = memchr(p, '\n', end - p);
|
|
16
|
+
if (p == NULL)
|
|
17
|
+
break;
|
|
18
|
+
p++;
|
|
19
|
+
|
|
20
|
+
yield_call(yield, rb_u_string_new_c(self, base, p - base));
|
|
21
|
+
|
|
22
|
+
base = p;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
if (base != end)
|
|
26
|
+
yield_call(yield, rb_u_string_new_c(self, base, end - base));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
static void
|
|
30
|
+
rb_u_string_each_line_separator(VALUE self, const struct rb_u_string *separator,
|
|
31
|
+
struct yield *yield)
|
|
32
|
+
{
|
|
33
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
34
|
+
|
|
35
|
+
long separator_length = USTRING_LENGTH(separator);
|
|
36
|
+
const char *q;
|
|
37
|
+
uint32_t first = separator_length == 0 ?
|
|
38
|
+
'\n' :
|
|
39
|
+
u_decode(&q, USTRING_STR(separator), USTRING_END(separator));
|
|
40
|
+
|
|
41
|
+
const char *begin = USTRING_STR(string);
|
|
42
|
+
const char *base = begin;
|
|
43
|
+
const char *p = begin;
|
|
44
|
+
const char *end = USTRING_END(string);
|
|
45
|
+
|
|
46
|
+
while (p < end) {
|
|
47
|
+
uint32_t c = u_decode(&q, p, end);
|
|
48
|
+
again:
|
|
49
|
+
if (separator_length == 0 && c == first) {
|
|
50
|
+
p = q;
|
|
51
|
+
if (p < end) {
|
|
52
|
+
c = u_decode(&q, p, end);
|
|
53
|
+
if (c != first)
|
|
54
|
+
goto again;
|
|
55
|
+
}
|
|
56
|
+
while (p < end) {
|
|
57
|
+
if (u_decode(&q, p, end) != first)
|
|
58
|
+
break;
|
|
59
|
+
p = q;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
if (c == first &&
|
|
64
|
+
(separator_length < 2 ||
|
|
65
|
+
(end - p >= separator_length &&
|
|
66
|
+
memcmp(USTRING_STR(separator), p, separator_length) == 0))) {
|
|
67
|
+
p += separator_length;
|
|
68
|
+
yield_call(yield, rb_u_string_new_c(self, base, p - base));
|
|
69
|
+
base = p;
|
|
70
|
+
} else
|
|
71
|
+
p = q;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if (base != end)
|
|
75
|
+
yield_call(yield, rb_u_string_new_c(self, base, end - base));
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
static void
|
|
79
|
+
each(int argc, VALUE *argv, VALUE self, struct yield *yield)
|
|
80
|
+
{
|
|
81
|
+
VALUE rs;
|
|
82
|
+
if (argc == 0)
|
|
83
|
+
rs = rb_rs;
|
|
84
|
+
else
|
|
85
|
+
rb_scan_args(argc, argv, "01", &rs);
|
|
86
|
+
if (NIL_P(rs)) {
|
|
87
|
+
yield_call(yield, self);
|
|
88
|
+
return;
|
|
89
|
+
}
|
|
90
|
+
const struct rb_u_string *separator = RVAL2USTRING_ANY(rs);
|
|
91
|
+
if (rs == rb_default_rs)
|
|
92
|
+
rb_u_string_each_line_default(self, yield);
|
|
93
|
+
else
|
|
94
|
+
rb_u_string_each_line_separator(self, separator, yield);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/* @overload each_line(separator = $/){ |lp| … }
|
|
98
|
+
*
|
|
99
|
+
* Enumerates the lines of the receiver, inheriting any taint and untrust.
|
|
100
|
+
*
|
|
101
|
+
* If SEPARATOR is nil, yields self. If SEPARATOR is {#empty?}, separates
|
|
102
|
+
* each line (paragraph) by two or more U+000A LINE FEED characters.
|
|
103
|
+
*
|
|
104
|
+
* @param [U::String, #to_str] separator
|
|
105
|
+
* @yieldparam [U::String, self] lp
|
|
106
|
+
* @return [self]
|
|
107
|
+
*
|
|
108
|
+
* @overload each_line(separator = $/)
|
|
109
|
+
*
|
|
110
|
+
* Returns an Enumerator over the lines of the receiver.
|
|
111
|
+
*
|
|
112
|
+
* If SEPARATOR is nil, self will be yielded. If SEPARATOR is {#empty?},
|
|
113
|
+
* separates each line (paragraph) by two or more U+000A LINE FEED
|
|
114
|
+
* characters.
|
|
115
|
+
*
|
|
116
|
+
* @param [U::String, #to_str] separator
|
|
117
|
+
* @return [Enumerator] */
|
|
118
|
+
VALUE
|
|
119
|
+
rb_u_string_each_line(int argc, VALUE *argv, VALUE self)
|
|
120
|
+
{
|
|
121
|
+
RETURN_ENUMERATOR(self, argc, argv);
|
|
122
|
+
struct yield y = YIELD_INIT;
|
|
123
|
+
each(argc, argv, self, &y);
|
|
124
|
+
return self;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
/* @overload lines(separator = $/)
|
|
128
|
+
*
|
|
129
|
+
* Returns the lines of the receiver, inheriting any taint and untrust.
|
|
130
|
+
*
|
|
131
|
+
* If SEPARATOR is nil, yields self. If SEPARATOR is {#empty?}, separates
|
|
132
|
+
* each line (paragraph) by two or more U+000A LINE FEED characters.
|
|
133
|
+
*
|
|
134
|
+
* @param [U::String, #to_str] separator
|
|
135
|
+
* @return [Array<U::String>] */
|
|
136
|
+
VALUE
|
|
137
|
+
rb_u_string_lines(int argc, VALUE *argv, VALUE self)
|
|
138
|
+
{
|
|
139
|
+
struct yield_array y = YIELD_ARRAY_INIT;
|
|
140
|
+
each(argc, argv, self, &y.yield);
|
|
141
|
+
return y.array;
|
|
142
|
+
}
|