u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload +(other)
|
|
4
|
+
* @param [U::String, #to_str] other
|
|
5
|
+
* @raise [ArgumentError] If {#bytesize} + OTHER{#bytesize} > LONG_MAX
|
|
6
|
+
* @return [U::String] The concatenation of OTHER to the receiver, inheriting
|
|
7
|
+
* any taint on either */
|
|
8
|
+
VALUE
|
|
9
|
+
rb_u_string_plus(VALUE self, VALUE rbother)
|
|
10
|
+
{
|
|
11
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
12
|
+
const struct rb_u_string *other = RVAL2USTRING_ANY(rbother);
|
|
13
|
+
|
|
14
|
+
long string_length = USTRING_LENGTH(string);
|
|
15
|
+
long other_length = USTRING_LENGTH(other);
|
|
16
|
+
|
|
17
|
+
/* TODO: Isn’t this off by one, as we add one to length for the
|
|
18
|
+
* ALLOC_N() call? */
|
|
19
|
+
if (string_length > LONG_MAX - other_length)
|
|
20
|
+
rb_u_raise(rb_eArgError, "length of resulting string would be too big");
|
|
21
|
+
long length = string_length + other_length;
|
|
22
|
+
|
|
23
|
+
char *sum = ALLOC_N(char, length + 1);
|
|
24
|
+
memcpy(sum, USTRING_STR(string), string_length);
|
|
25
|
+
memcpy(sum + string_length, USTRING_STR(other), other_length);
|
|
26
|
+
sum[length] = '\0';
|
|
27
|
+
|
|
28
|
+
VALUE result = rb_u_string_new_uninfected_own(sum, length);
|
|
29
|
+
if (OBJ_TAINTED(self) || OBJ_TAINTED(rbother))
|
|
30
|
+
OBJ_TAINT(result);
|
|
31
|
+
|
|
32
|
+
return result;
|
|
33
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload punct?
|
|
4
|
+
* @return [Boolean] True if the receiver contains only characters in the
|
|
5
|
+
* general categories Punctuation and Symbol */
|
|
6
|
+
VALUE
|
|
7
|
+
rb_u_string_punct(VALUE self)
|
|
8
|
+
{
|
|
9
|
+
return _rb_u_character_test(self, u_char_ispunct);
|
|
10
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @return [U::String] The reversal of the receiver, inheriting any taint and
|
|
4
|
+
* untrust from the receiver
|
|
5
|
+
* @note This doesn’t take into account proper handling of combining marks,
|
|
6
|
+
* direction indicators, and similarly relevant characters, so this method is
|
|
7
|
+
* mostly useful when you know the contents of the string is simple and the
|
|
8
|
+
* result isn’t intended for display. */
|
|
9
|
+
VALUE
|
|
10
|
+
rb_u_string_reverse(VALUE self)
|
|
11
|
+
{
|
|
12
|
+
return _rb_u_string_convert(self, u_reverse);
|
|
13
|
+
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
long
|
|
4
|
+
rb_u_string_rindex(VALUE self, VALUE rbsubstring, long offset)
|
|
5
|
+
{
|
|
6
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
7
|
+
const struct rb_u_string *substring = RVAL2USTRING_ANY(rbsubstring);
|
|
8
|
+
|
|
9
|
+
if (USTRING_LENGTH(string) < USTRING_LENGTH(substring))
|
|
10
|
+
return -1;
|
|
11
|
+
|
|
12
|
+
const char *s = rb_u_string_begin_from_offset(string, offset);
|
|
13
|
+
if (s == NULL)
|
|
14
|
+
return -1;
|
|
15
|
+
|
|
16
|
+
if (USTRING_LENGTH(substring) == 0)
|
|
17
|
+
return offset;
|
|
18
|
+
|
|
19
|
+
const char *begin = USTRING_STR(string);
|
|
20
|
+
const char *t = USTRING_STR(substring);
|
|
21
|
+
long t_length = USTRING_LENGTH(substring);
|
|
22
|
+
while (s >= begin) {
|
|
23
|
+
if (rb_memcmp(s, t, t_length) == 0)
|
|
24
|
+
return u_pointer_to_offset(begin, s);
|
|
25
|
+
s--;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
return -1;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/* @overload rindex(pattern, offset = -1)
|
|
32
|
+
*
|
|
33
|
+
* Returns the maximal index of the receiver where PATTERN matches, equal to
|
|
34
|
+
* or less than _i_, where _i_ = OFFSET if OFFSET ≥ 0, _i_ = {#length} -
|
|
35
|
+
* abs(OFFSET) otherwise, or nil if there is no match.
|
|
36
|
+
*
|
|
37
|
+
* If PATTERN is a Regexp, the Regexp special variables `$&`, `$'`,
|
|
38
|
+
* <code>$\`</code>, `$1`, `$2`, …, `$`_n_ are updated accordingly.
|
|
39
|
+
*
|
|
40
|
+
* If PATTERN responds to `#to_str`, the matching is performed by a byte
|
|
41
|
+
* comparison.
|
|
42
|
+
*
|
|
43
|
+
* @param [Regexp, #to_str] pattern
|
|
44
|
+
* @param [#to_int] offset
|
|
45
|
+
* @return [Integer, nil]
|
|
46
|
+
* @see #index */
|
|
47
|
+
VALUE
|
|
48
|
+
rb_u_string_rindex_m(int argc, VALUE *argv, VALUE self)
|
|
49
|
+
{
|
|
50
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
51
|
+
|
|
52
|
+
VALUE sub, rboffset;
|
|
53
|
+
long offset;
|
|
54
|
+
if (rb_scan_args(argc, argv, "11", &sub, &rboffset) == 2)
|
|
55
|
+
offset = NUM2LONG(rboffset);
|
|
56
|
+
else
|
|
57
|
+
/* TODO: Why not simply use -1? Benchmark which is faster. */
|
|
58
|
+
offset = u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string));
|
|
59
|
+
|
|
60
|
+
const char *begin = rb_u_string_begin_from_offset(string, offset);
|
|
61
|
+
const char *end = USTRING_END(string);
|
|
62
|
+
if (begin == NULL) {
|
|
63
|
+
if (offset <= 0) {
|
|
64
|
+
if (TYPE(sub) == T_REGEXP)
|
|
65
|
+
rb_backref_set(Qnil);
|
|
66
|
+
|
|
67
|
+
return Qnil;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
begin = end;
|
|
71
|
+
/* TODO: this converting back and forward can be optimized away
|
|
72
|
+
* if rb_u_string_index_regexp() and rb_u_string_rindex() were split up
|
|
73
|
+
* into two additional functions, adding
|
|
74
|
+
* rb_u_string_index_regexp_pointer() and rb_u_string_rindex_pointer(),
|
|
75
|
+
* so that one can pass a pointer to start at immediately
|
|
76
|
+
* instead of an offset that gets calculated into a pointer. */
|
|
77
|
+
offset = u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string));
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
switch (TYPE(sub)) {
|
|
81
|
+
case T_REGEXP:
|
|
82
|
+
/* TODO: What’s this first test for, exactly? */
|
|
83
|
+
if (RREGEXP(sub)->ptr == NULL || RREGEXP_SRC_LEN(sub) > 0)
|
|
84
|
+
offset = rb_u_string_index_regexp(self, begin, sub, true);
|
|
85
|
+
break;
|
|
86
|
+
default: {
|
|
87
|
+
VALUE tmp = rb_check_string_type(sub);
|
|
88
|
+
if (NIL_P(tmp))
|
|
89
|
+
rb_u_raise(rb_eTypeError, "type mismatch: %s given",
|
|
90
|
+
rb_obj_classname(sub));
|
|
91
|
+
|
|
92
|
+
sub = tmp;
|
|
93
|
+
}
|
|
94
|
+
/* fall through */
|
|
95
|
+
case T_STRING:
|
|
96
|
+
offset = rb_u_string_rindex(self, sub, offset);
|
|
97
|
+
break;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
if (offset < 0)
|
|
101
|
+
return Qnil;
|
|
102
|
+
|
|
103
|
+
return LONG2NUM(offset);
|
|
104
|
+
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "rb_u_re.h"
|
|
3
|
+
|
|
4
|
+
static VALUE
|
|
5
|
+
rb_u_string_rpartition_failure(VALUE self)
|
|
6
|
+
{
|
|
7
|
+
return rb_ary_new3(3,
|
|
8
|
+
rb_u_string_new_empty(self),
|
|
9
|
+
rb_u_string_new_empty(self),
|
|
10
|
+
self);
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
static VALUE
|
|
14
|
+
rb_u_string_rpartition_success(VALUE self, VALUE rbseparator, long offset)
|
|
15
|
+
{
|
|
16
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
17
|
+
const struct rb_u_string *separator = RVAL2USTRING_ANY(rbseparator);
|
|
18
|
+
|
|
19
|
+
long after = offset + USTRING_LENGTH(separator);
|
|
20
|
+
|
|
21
|
+
return rb_ary_new3(3,
|
|
22
|
+
rb_u_string_new_subsequence(self, 0, offset),
|
|
23
|
+
TYPE(rbseparator) == T_STRING ?
|
|
24
|
+
rb_u_string_new_rb(rbseparator) :
|
|
25
|
+
rbseparator,
|
|
26
|
+
rb_u_string_new_subsequence(self,
|
|
27
|
+
after,
|
|
28
|
+
USTRING_LENGTH(string) - after));
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
static VALUE
|
|
32
|
+
rb_u_string_rpartition_regex(VALUE self, VALUE regex)
|
|
33
|
+
{
|
|
34
|
+
VALUE str = rb_str_to_str(self);
|
|
35
|
+
|
|
36
|
+
long offset = rb_reg_search(regex, str, RSTRING_LEN(str), 1);
|
|
37
|
+
if (offset < 0)
|
|
38
|
+
return rb_u_string_rpartition_failure(self);
|
|
39
|
+
|
|
40
|
+
VALUE separator = rb_u_pattern_match_reference(INT2FIX(0));
|
|
41
|
+
|
|
42
|
+
return rb_u_string_rpartition_success(self, separator, offset);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
static VALUE
|
|
46
|
+
rb_u_string_rpartition_string(VALUE self, VALUE rbseparator)
|
|
47
|
+
{
|
|
48
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
49
|
+
const char *begin = USTRING_STR(string);
|
|
50
|
+
|
|
51
|
+
VALUE validated = rb_u_string_validate_type(rbseparator);
|
|
52
|
+
|
|
53
|
+
long offset = rb_u_string_rindex(self,
|
|
54
|
+
validated,
|
|
55
|
+
u_n_chars_n(begin,
|
|
56
|
+
USTRING_LENGTH(string)));
|
|
57
|
+
if (offset < 0)
|
|
58
|
+
return rb_u_string_rpartition_failure(self);
|
|
59
|
+
|
|
60
|
+
long byte_offset = u_offset_to_pointer(begin, offset) - begin;
|
|
61
|
+
|
|
62
|
+
return rb_u_string_rpartition_success(self, validated, byte_offset);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/* @overload rpartition(separator)
|
|
66
|
+
* @param [Regexp, #to_str] separator
|
|
67
|
+
* @return [Array<U::String>] The receiver split into _s₁_ = {#slice}(0, _i_),
|
|
68
|
+
* _s₂_ = {#slice}(_i_, _n_), _s₃_ = {#slice}(_i_ + _n_, -1), where _i_ = _j_ if _j_ ≠
|
|
69
|
+
* nil, _i_ = 0 otherwise, _j_ = {#rindex}(SEPARATOR), _n_ =
|
|
70
|
+
* SEPARATOR{#length}, where _s₁_ and _s₃_ inherit any taint and untrust
|
|
71
|
+
* from the receiver and _s₂_ inherits any taint and untrust from SEPARATOR
|
|
72
|
+
* and also from the receiver if SEPARATOR is a Regexp
|
|
73
|
+
* @see #partition */
|
|
74
|
+
VALUE
|
|
75
|
+
rb_u_string_rpartition(VALUE self, VALUE separator)
|
|
76
|
+
{
|
|
77
|
+
if (TYPE(separator) == T_REGEXP)
|
|
78
|
+
return rb_u_string_rpartition_regex(self, separator);
|
|
79
|
+
|
|
80
|
+
return rb_u_string_rpartition_string(self, separator);
|
|
81
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @return [U::String] The receiver with its maximum {#space?} suffix removed,
|
|
4
|
+
* inheriting any taint and untrust from the receiver
|
|
5
|
+
* @see #lstrip
|
|
6
|
+
* @see #strip */
|
|
7
|
+
VALUE
|
|
8
|
+
rb_u_string_rstrip(VALUE self)
|
|
9
|
+
{
|
|
10
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
11
|
+
|
|
12
|
+
const char *begin = USTRING_STR(string);
|
|
13
|
+
if (begin == NULL)
|
|
14
|
+
return self;
|
|
15
|
+
|
|
16
|
+
const char *end = USTRING_END(string);
|
|
17
|
+
const char *q = end;
|
|
18
|
+
while (begin < q) {
|
|
19
|
+
const char *p;
|
|
20
|
+
uint32_t c = u_decode_r(&p, begin, q);
|
|
21
|
+
if (c != '\0' && !u_char_isspace(c))
|
|
22
|
+
break;
|
|
23
|
+
q = p;
|
|
24
|
+
}
|
|
25
|
+
if (q == end)
|
|
26
|
+
return self;
|
|
27
|
+
|
|
28
|
+
return rb_u_string_new_c(self, begin, q - begin);
|
|
29
|
+
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "rb_u_re.h"
|
|
3
|
+
|
|
4
|
+
static VALUE
|
|
5
|
+
rb_u_string_scan_once(VALUE string, VALUE pattern, long *start, long *last)
|
|
6
|
+
{
|
|
7
|
+
if (rb_reg_search(pattern, string, *start, false) < 0) {
|
|
8
|
+
if (*last >= 0)
|
|
9
|
+
rb_reg_search(pattern, string, *last, false);
|
|
10
|
+
|
|
11
|
+
return Qnil;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
*last = *start;
|
|
15
|
+
|
|
16
|
+
VALUE match = rb_backref_get();
|
|
17
|
+
struct re_registers *registers = RMATCH_REGS(match);
|
|
18
|
+
if (registers->beg[0] == registers->end[0]) {
|
|
19
|
+
if (RSTRING_LEN(string) > registers->end[0])
|
|
20
|
+
*start = registers->end[0] +
|
|
21
|
+
(u_next(RSTRING_PTR(string) + registers->end[0]) -
|
|
22
|
+
(RSTRING_PTR(string) + registers->end[0]));
|
|
23
|
+
else
|
|
24
|
+
*start = registers->end[0] + 1;
|
|
25
|
+
} else {
|
|
26
|
+
*start = registers->end[0];
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
if (registers->num_regs == 1)
|
|
30
|
+
return rb_u_string_new_rb(rb_reg_nth_match(0, match));
|
|
31
|
+
|
|
32
|
+
VALUE result = rb_ary_new2(registers->num_regs);
|
|
33
|
+
for (int i = 1; i < registers->num_regs; i++)
|
|
34
|
+
rb_ary_push(result, rb_u_string_new_rb(rb_reg_nth_match(i, match)));
|
|
35
|
+
|
|
36
|
+
return result;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
static VALUE
|
|
40
|
+
rb_u_string_scan_block(VALUE self, VALUE string, VALUE pattern)
|
|
41
|
+
{
|
|
42
|
+
VALUE result;
|
|
43
|
+
long start = 0;
|
|
44
|
+
long last = -1;
|
|
45
|
+
|
|
46
|
+
while (!NIL_P(result = rb_u_string_scan_once(string, pattern, &start, &last)))
|
|
47
|
+
rb_yield(result);
|
|
48
|
+
|
|
49
|
+
return self;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
static VALUE
|
|
53
|
+
rb_u_string_scan_array(VALUE string, VALUE pattern)
|
|
54
|
+
{
|
|
55
|
+
VALUE result;
|
|
56
|
+
long start = 0;
|
|
57
|
+
long last = -1;
|
|
58
|
+
|
|
59
|
+
VALUE array = rb_ary_new();
|
|
60
|
+
while (!NIL_P(result = rb_u_string_scan_once(string, pattern, &start, &last)))
|
|
61
|
+
rb_ary_push(array, result);
|
|
62
|
+
|
|
63
|
+
return array;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/* @overload scan(pattern)
|
|
67
|
+
* @param [Regexp] pattern
|
|
68
|
+
* @return [Array<U::String>, Array<Array<U::String>>] All matches – or
|
|
69
|
+
* sub-matches, if they exist – of matches of PATTERN in the receiver, each
|
|
70
|
+
* inheriting any taint and untrust from both the receiver and from PATTERN
|
|
71
|
+
* @note The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`,
|
|
72
|
+
* `$2`, …, `$`_n_ are updated accordingly.
|
|
73
|
+
*
|
|
74
|
+
* @overload scan(pattern)
|
|
75
|
+
* @param [#to_str] pattern
|
|
76
|
+
* @return [Array<U::String>] All matches of PATTERN in the receiver, each
|
|
77
|
+
* inheriting any taint and untrust from the receiver
|
|
78
|
+
*
|
|
79
|
+
* @overload scan(pattern){ |*submatches| … }
|
|
80
|
+
*
|
|
81
|
+
* Enumerates the sub-matches of matches of PATTERN in the receiver, each
|
|
82
|
+
* inheriting any taint and untrust from both the receiver and from PATTERN.
|
|
83
|
+
*
|
|
84
|
+
* @param [Regexp] pattern
|
|
85
|
+
* @yieldparam [Array<U::String>] submatches
|
|
86
|
+
* @return [self]
|
|
87
|
+
* @note The Regexp special variables `$&`, `$'`, <code>$\`</code>, `$1`,
|
|
88
|
+
* `$2`, …, `$`_n_ are updated accordingly.
|
|
89
|
+
*
|
|
90
|
+
* @overload scan(pattern){ |match| … }
|
|
91
|
+
*
|
|
92
|
+
* Enumerates the matches of PATTERN in the receiver, each inheriting any
|
|
93
|
+
* taint and untrust from the receiver.
|
|
94
|
+
*
|
|
95
|
+
* @param [#to_str] pattern
|
|
96
|
+
* @yieldparam [U::String] match
|
|
97
|
+
* @return [self] */
|
|
98
|
+
VALUE
|
|
99
|
+
rb_u_string_scan(VALUE self, VALUE pattern)
|
|
100
|
+
{
|
|
101
|
+
pattern = rb_u_pattern_argument(pattern, true);
|
|
102
|
+
|
|
103
|
+
VALUE string = rb_str_to_str(self);
|
|
104
|
+
|
|
105
|
+
if (rb_block_given_p())
|
|
106
|
+
return rb_u_string_scan_block(self, string, pattern);
|
|
107
|
+
|
|
108
|
+
return rb_u_string_scan_array(string, pattern);
|
|
109
|
+
}
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* Expands to a `case` for U_SCRIPT_<script> that returns the Symbol named
 * <symbol>.  The ID is interned the first time the case is reached and kept
 * in a static local for subsequent calls. */
#define SCRIPT2ID(script, symbol) \
        case U_SCRIPT_##script: { \
                static ID id_##symbol = 0; \
                if (!id_##symbol) \
                        id_##symbol = rb_intern(#symbol); \
                return ID2SYM(id_##symbol); \
        }
|
|
10
|
+
|
|
11
|
+
static VALUE
|
|
12
|
+
script_to_symbol(enum u_script script)
|
|
13
|
+
{
|
|
14
|
+
switch (script) {
|
|
15
|
+
SCRIPT2ID(COMMON, common)
|
|
16
|
+
SCRIPT2ID(INHERITED, inherited)
|
|
17
|
+
SCRIPT2ID(ARABIC, arabic)
|
|
18
|
+
SCRIPT2ID(ARMENIAN, armenian)
|
|
19
|
+
SCRIPT2ID(BENGALI, bengali)
|
|
20
|
+
SCRIPT2ID(BOPOMOFO, bopomofo)
|
|
21
|
+
SCRIPT2ID(CHEROKEE, cherokee)
|
|
22
|
+
SCRIPT2ID(COPTIC, coptic)
|
|
23
|
+
SCRIPT2ID(CYRILLIC, cyrillic)
|
|
24
|
+
SCRIPT2ID(DESERET, deseret)
|
|
25
|
+
SCRIPT2ID(DEVANAGARI, devanagari)
|
|
26
|
+
SCRIPT2ID(ETHIOPIC, ethiopic)
|
|
27
|
+
SCRIPT2ID(GEORGIAN, georgian)
|
|
28
|
+
SCRIPT2ID(GOTHIC, gothic)
|
|
29
|
+
SCRIPT2ID(GREEK, greek)
|
|
30
|
+
SCRIPT2ID(GUJARATI, gujarati)
|
|
31
|
+
SCRIPT2ID(GURMUKHI, gurmukhi)
|
|
32
|
+
SCRIPT2ID(HAN, han)
|
|
33
|
+
SCRIPT2ID(HANGUL, hangul)
|
|
34
|
+
SCRIPT2ID(HEBREW, hebrew)
|
|
35
|
+
SCRIPT2ID(HIRAGANA, hiragana)
|
|
36
|
+
SCRIPT2ID(KANNADA, kannada)
|
|
37
|
+
SCRIPT2ID(KATAKANA, katakana)
|
|
38
|
+
SCRIPT2ID(KHMER, khmer)
|
|
39
|
+
SCRIPT2ID(LAO, lao)
|
|
40
|
+
SCRIPT2ID(LATIN, latin)
|
|
41
|
+
SCRIPT2ID(MALAYALAM, malayalam)
|
|
42
|
+
SCRIPT2ID(MONGOLIAN, mongolian)
|
|
43
|
+
SCRIPT2ID(MYANMAR, myanmar)
|
|
44
|
+
SCRIPT2ID(OGHAM, ogham)
|
|
45
|
+
SCRIPT2ID(OLD_ITALIC, old_italic)
|
|
46
|
+
SCRIPT2ID(ORIYA, oriya)
|
|
47
|
+
SCRIPT2ID(RUNIC, runic)
|
|
48
|
+
SCRIPT2ID(SINHALA, sinhala)
|
|
49
|
+
SCRIPT2ID(SYRIAC, syriac)
|
|
50
|
+
SCRIPT2ID(TAMIL, tamil)
|
|
51
|
+
SCRIPT2ID(TELUGU, telugu)
|
|
52
|
+
SCRIPT2ID(THAANA, thaana)
|
|
53
|
+
SCRIPT2ID(THAI, thai)
|
|
54
|
+
SCRIPT2ID(TIBETAN, tibetan)
|
|
55
|
+
SCRIPT2ID(CANADIAN_ABORIGINAL, canadian_aboriginal)
|
|
56
|
+
SCRIPT2ID(YI, yi)
|
|
57
|
+
SCRIPT2ID(TAGALOG, tagalog)
|
|
58
|
+
SCRIPT2ID(HANUNOO, hanunoo)
|
|
59
|
+
SCRIPT2ID(BUHID, buhid)
|
|
60
|
+
SCRIPT2ID(TAGBANWA, tagbanwa)
|
|
61
|
+
SCRIPT2ID(BRAILLE, braille)
|
|
62
|
+
SCRIPT2ID(CYPRIOT, cypriot)
|
|
63
|
+
SCRIPT2ID(LIMBU, limbu)
|
|
64
|
+
SCRIPT2ID(OSMANYA, osmanya)
|
|
65
|
+
SCRIPT2ID(SHAVIAN, shavian)
|
|
66
|
+
SCRIPT2ID(LINEAR_B, linear_b)
|
|
67
|
+
SCRIPT2ID(TAI_LE, tai_le)
|
|
68
|
+
SCRIPT2ID(UGARITIC, ugaritic)
|
|
69
|
+
SCRIPT2ID(NEW_TAI_LUE, new_tai_lue)
|
|
70
|
+
SCRIPT2ID(BUGINESE, buginese)
|
|
71
|
+
SCRIPT2ID(GLAGOLITIC, glagolitic)
|
|
72
|
+
SCRIPT2ID(TIFINAGH, tifinagh)
|
|
73
|
+
SCRIPT2ID(SYLOTI_NAGRI, syloti_nagri)
|
|
74
|
+
SCRIPT2ID(OLD_PERSIAN, old_persian)
|
|
75
|
+
SCRIPT2ID(KHAROSHTHI, kharoshthi)
|
|
76
|
+
SCRIPT2ID(UNKNOWN, unknown)
|
|
77
|
+
SCRIPT2ID(BALINESE, balinese)
|
|
78
|
+
SCRIPT2ID(CUNEIFORM, cuneiform)
|
|
79
|
+
SCRIPT2ID(PHOENICIAN, phoenician)
|
|
80
|
+
SCRIPT2ID(PHAGS_PA, phags_pa)
|
|
81
|
+
SCRIPT2ID(NKO, nko)
|
|
82
|
+
SCRIPT2ID(KAYAH_LI, kayah_li)
|
|
83
|
+
SCRIPT2ID(LEPCHA, lepcha)
|
|
84
|
+
SCRIPT2ID(REJANG, rejang)
|
|
85
|
+
SCRIPT2ID(SUNDANESE, sundanese)
|
|
86
|
+
SCRIPT2ID(SAURASHTRA, saurashtra)
|
|
87
|
+
SCRIPT2ID(CHAM, cham)
|
|
88
|
+
SCRIPT2ID(OL_CHIKI, ol_chiki)
|
|
89
|
+
SCRIPT2ID(VAI, vai)
|
|
90
|
+
SCRIPT2ID(CARIAN, carian)
|
|
91
|
+
SCRIPT2ID(LYCIAN, lycian)
|
|
92
|
+
SCRIPT2ID(LYDIAN, lydian)
|
|
93
|
+
SCRIPT2ID(AVESTAN, avestan)
|
|
94
|
+
SCRIPT2ID(BAMUM, bamum)
|
|
95
|
+
SCRIPT2ID(EGYPTIAN_HIEROGLYPHS, egyptian_hieroglyphs)
|
|
96
|
+
SCRIPT2ID(IMPERIAL_ARAMAIC, imperial_aramaic)
|
|
97
|
+
SCRIPT2ID(INSCRIPTIONAL_PAHLAVI, inscriptional_pahlavi)
|
|
98
|
+
SCRIPT2ID(INSCRIPTIONAL_PARTHIAN, inscriptional_parthian)
|
|
99
|
+
SCRIPT2ID(JAVANESE, javanese)
|
|
100
|
+
SCRIPT2ID(KAITHI, kaithi)
|
|
101
|
+
SCRIPT2ID(LISU, lisu)
|
|
102
|
+
SCRIPT2ID(MEETEI_MAYEK, meetei_mayek)
|
|
103
|
+
SCRIPT2ID(OLD_SOUTH_ARABIAN, old_south_arabian)
|
|
104
|
+
SCRIPT2ID(OLD_TURKIC, old_turkic)
|
|
105
|
+
SCRIPT2ID(SAMARITAN, samaritan)
|
|
106
|
+
SCRIPT2ID(TAI_THAM, tai_tham)
|
|
107
|
+
SCRIPT2ID(TAI_VIET, tai_viet)
|
|
108
|
+
SCRIPT2ID(BATAK, batak)
|
|
109
|
+
SCRIPT2ID(BRAHMI, brahmi)
|
|
110
|
+
SCRIPT2ID(MANDAIC, mandaic)
|
|
111
|
+
SCRIPT2ID(MEROITIC_HIEROGLYPHS, meroitic_hieroglyphs)
|
|
112
|
+
SCRIPT2ID(MEROITIC_CURSIVE, meroitic_cursive)
|
|
113
|
+
SCRIPT2ID(SORA_SOMPENG, sora_sompeng)
|
|
114
|
+
SCRIPT2ID(CHAKMA, chakma)
|
|
115
|
+
SCRIPT2ID(SHARADA, sharada)
|
|
116
|
+
SCRIPT2ID(TAKRI, takri)
|
|
117
|
+
SCRIPT2ID(MIAO, miao)
|
|
118
|
+
default:
|
|
119
|
+
rb_u_raise(rb_eNotImpError, "unknown script: %d", script);
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/* Returns the script of the characters of the receiver.
|
|
124
|
+
*
|
|
125
|
+
* The script of a character identifies the primary writing system that uses
|
|
126
|
+
* the character.
|
|
127
|
+
*
|
|
128
|
+
* <table>
|
|
129
|
+
* <thead><tr><th>Script</th><th>Description</th></tr></thead>
|
|
130
|
+
* <tbody>
|
|
131
|
+
* <tr><td>:arabic</td><td>Arabic</td></tr>
|
|
132
|
+
* <tr><td>:armenian</td><td>Armenian</td></tr>
|
|
133
|
+
* <tr><td>:avestan</td><td>Avestan</td></tr>
|
|
134
|
+
* <tr><td>:balinese</td><td>Balinese</td></tr>
|
|
135
|
+
* <tr><td>:bamum</td><td>Bamum</td></tr>
|
|
136
|
+
* <tr><td>:batak</td><td>Batak</td></tr>
|
|
137
|
+
* <tr><td>:bengali</td><td>Bengali</td></tr>
|
|
138
|
+
* <tr><td>:bopomofo</td><td>Bopomofo</td></tr>
|
|
139
|
+
* <tr><td>:brahmi</td><td>Brahmi</td></tr>
|
|
140
|
+
* <tr><td>:braille</td><td>Braille</td></tr>
|
|
141
|
+
* <tr><td>:buginese</td><td>Buginese</td></tr>
|
|
142
|
+
* <tr><td>:buhid</td><td>Buhid</td></tr>
|
|
143
|
+
* <tr><td>:canadian_aboriginal</td><td>Canadian Aboriginal</td></tr>
|
|
144
|
+
* <tr><td>:carian</td><td>Carian</td></tr>
|
|
145
|
+
* <tr><td>:chakma</td><td>Chakma</td></tr>
|
|
146
|
+
* <tr><td>:cham</td><td>Cham</td></tr>
|
|
147
|
+
* <tr><td>:cherokee</td><td>Cherokee</td></tr>
|
|
148
|
+
* <tr><td>:common</td><td>For other characters that may be used with multiple scripts</td></tr>
|
|
149
|
+
* <tr><td>:coptic</td><td>Coptic</td></tr>
|
|
150
|
+
* <tr><td>:cuneiform</td><td>Cuneiform</td></tr>
|
|
151
|
+
* <tr><td>:cypriot</td><td>Cypriot</td></tr>
|
|
152
|
+
* <tr><td>:cyrillic</td><td>Cyrillic</td></tr>
|
|
153
|
+
* <tr><td>:deseret</td><td>Deseret</td></tr>
|
|
154
|
+
* <tr><td>:devanagari</td><td>Devanagari</td></tr>
|
|
155
|
+
* <tr><td>:egyptian_hieroglyphs</td><td>Egyptian Hieroglyphs</td></tr>
|
|
156
|
+
* <tr><td>:ethiopic</td><td>Ethiopic</td></tr>
|
|
157
|
+
* <tr><td>:georgian</td><td>Georgian</td></tr>
|
|
158
|
+
* <tr><td>:glagolitic</td><td>Glagolitic</td></tr>
|
|
159
|
+
* <tr><td>:gothic</td><td>Gothic</td></tr>
|
|
160
|
+
* <tr><td>:greek</td><td>Greek</td></tr>
|
|
161
|
+
* <tr><td>:gujarati</td><td>Gujarati</td></tr>
|
|
162
|
+
* <tr><td>:gurmukhi</td><td>Gurmukhi</td></tr>
|
|
163
|
+
* <tr><td>:han</td><td>Han</td></tr>
|
|
164
|
+
* <tr><td>:hangul</td><td>Hangul</td></tr>
|
|
165
|
+
* <tr><td>:hanunoo</td><td>Hanunoo</td></tr>
|
|
166
|
+
* <tr><td>:hebrew</td><td>Hebrew</td></tr>
|
|
167
|
+
* <tr><td>:hiragana</td><td>Hiragana</td></tr>
|
|
168
|
+
* <tr><td>:imperial_aramaic</td><td>Imperial Aramaic</td></tr>
|
|
169
|
+
* <tr><td>:inherited</td><td>For characters that may be used with multiple
|
|
170
|
+
* scripts, and that inherit their script from the preceding characters;
|
|
171
|
+
* these include nonspacing marks, enclosing marks, and the zero-width
|
|
172
|
+
* joiner/non-joiner characters</td></tr>
|
|
173
|
+
* <tr><td>:inscriptional_pahlavi</td><td>Inscriptional Pahlavi</td></tr>
|
|
174
|
+
* <tr><td>:inscriptional_parthian</td><td>Inscriptional Parthian</td></tr>
|
|
175
|
+
* <tr><td>:javanese</td><td>Javanese</td></tr>
|
|
176
|
+
* <tr><td>:kaithi</td><td>Kaithi</td></tr>
|
|
177
|
+
* <tr><td>:kannada</td><td>Kannada</td></tr>
|
|
178
|
+
* <tr><td>:katakana</td><td>Katakana</td></tr>
|
|
179
|
+
* <tr><td>:kayah_li</td><td>Kayah Li</td></tr>
|
|
180
|
+
* <tr><td>:kharoshthi</td><td>Kharoshthi</td></tr>
|
|
181
|
+
* <tr><td>:khmer</td><td>Khmer</td></tr>
|
|
182
|
+
* <tr><td>:lao</td><td>Lao</td></tr>
|
|
183
|
+
* <tr><td>:latin</td><td>Latin</td></tr>
|
|
184
|
+
* <tr><td>:lepcha</td><td>Lepcha</td></tr>
|
|
185
|
+
* <tr><td>:limbu</td><td>Limbu</td></tr>
|
|
186
|
+
* <tr><td>:linear_b</td><td>Linear B</td></tr>
|
|
187
|
+
* <tr><td>:lisu</td><td>Lisu</td></tr>
|
|
188
|
+
* <tr><td>:lycian</td><td>Lycian</td></tr>
|
|
189
|
+
* <tr><td>:lydian</td><td>Lydian</td></tr>
|
|
190
|
+
* <tr><td>:malayalam</td><td>Malayalam</td></tr>
|
|
191
|
+
* <tr><td>:mandaic</td><td>Mandaic</td></tr>
|
|
192
|
+
* <tr><td>:meetei_mayek</td><td>Meetei Mayek</td></tr>
|
|
193
|
+
* <tr><td>:meroitic_hieroglyphs</td><td>Meroitic Hieroglyphs</td></tr>
|
|
194
|
+
* <tr><td>:meroitic_cursive</td><td>Meroitic Cursive</td></tr>
|
|
195
|
+
* <tr><td>:miao</td><td>Miao</td></tr>
|
|
196
|
+
* <tr><td>:mongolian</td><td>Mongolian</td></tr>
|
|
197
|
+
* <tr><td>:myanmar</td><td>Myanmar</td></tr>
|
|
198
|
+
* <tr><td>:new_tai_lue</td><td>New Tai Lue</td></tr>
|
|
199
|
+
* <tr><td>:nko</td><td>N'Ko</td></tr>
|
|
200
|
+
* <tr><td>:ogham</td><td>Ogham</td></tr>
|
|
201
|
+
* <tr><td>:old_italic</td><td>Old Italic</td></tr>
|
|
202
|
+
* <tr><td>:old_persian</td><td>Old Persian</td></tr>
|
|
203
|
+
* <tr><td>:old_south_arabian</td><td>Old South Arabian</td></tr>
|
|
204
|
+
* <tr><td>:old_turkic</td><td>Old Turkic</td></tr>
|
|
205
|
+
* <tr><td>:ol_chiki</td><td>Ol Chiki</td></tr>
|
|
206
|
+
* <tr><td>:oriya</td><td>Oriya</td></tr>
|
|
207
|
+
* <tr><td>:osmanya</td><td>Osmanya</td></tr>
|
|
208
|
+
* <tr><td>:phags_pa</td><td>Phags-pa</td></tr>
|
|
209
|
+
* <tr><td>:phoenician</td><td>Phoenician</td></tr>
|
|
210
|
+
* <tr><td>:rejang</td><td>Rejang</td></tr>
|
|
211
|
+
* <tr><td>:runic</td><td>Runic</td></tr>
|
|
212
|
+
* <tr><td>:samaritan</td><td>Samaritan</td></tr>
|
|
213
|
+
* <tr><td>:saurashtra</td><td>Saurashtra</td></tr>
|
|
214
|
+
* <tr><td>:sharada</td><td>Sharada</td></tr>
|
|
215
|
+
* <tr><td>:shavian</td><td>Shavian</td></tr>
|
|
216
|
+
* <tr><td>:sinhala</td><td>Sinhala</td></tr>
|
|
217
|
+
* <tr><td>:sora_sompeng</td><td>Sora Sompeng</td></tr>
|
|
218
|
+
* <tr><td>:sundanese</td><td>Sundanese</td></tr>
|
|
219
|
+
* <tr><td>:syloti_nagri</td><td>Syloti Nagri</td></tr>
|
|
220
|
+
* <tr><td>:syriac</td><td>Syriac</td></tr>
|
|
221
|
+
* <tr><td>:tagalog</td><td>Tagalog</td></tr>
|
|
222
|
+
* <tr><td>:tagbanwa</td><td>Tagbanwa</td></tr>
|
|
223
|
+
* <tr><td>:tai_le</td><td>Tai Le</td></tr>
|
|
224
|
+
* <tr><td>:tai_tham</td><td>Tai Tham</td></tr>
|
|
225
|
+
* <tr><td>:tai_viet</td><td>Tai Viet</td></tr>
|
|
226
|
+
* <tr><td>:takri</td><td>Takri</td></tr>
|
|
227
|
+
* <tr><td>:tamil</td><td>Tamil</td></tr>
|
|
228
|
+
* <tr><td>:telugu</td><td>Telugu</td></tr>
|
|
229
|
+
* <tr><td>:thaana</td><td>Thaana</td></tr>
|
|
230
|
+
* <tr><td>:thai</td><td>Thai</td></tr>
|
|
231
|
+
* <tr><td>:tibetan</td><td>Tibetan</td></tr>
|
|
232
|
+
* <tr><td>:tifinagh</td><td>Tifinagh</td></tr>
|
|
233
|
+
* <tr><td>:ugaritic</td><td>Ugaritic</td></tr>
|
|
234
|
+
* <tr><td>:unknown</td><td>For not assigned, private-use, non-character, and surrogate code points</td></tr>
|
|
235
|
+
* <tr><td>:vai</td><td>Vai</td></tr>
|
|
236
|
+
* <tr><td>:yi</td><td>Yi</td></tr>
|
|
237
|
+
* </tbody>
|
|
238
|
+
* </table>
|
|
239
|
+
*
|
|
240
|
+
* @raise [ArgumentError] If the receiver contains two characters belonging to
|
|
241
|
+
* different scripts
|
|
242
|
+
* @raise [ArgumentError] If the receiver contains an incomplete UTF-8 sequence
|
|
243
|
+
* @raise [ArgumentError] If the receiver contains an invalid UTF-8 sequence
|
|
244
|
+
* @return [Symbol]
|
|
245
|
+
* @see http://www.unicode.org/reports/tr24/
|
|
246
|
+
* Unicode Standard Annex #24 Unicode Script Property */
|
|
247
|
+
VALUE
|
|
248
|
+
rb_u_string_script(VALUE self)
|
|
249
|
+
{
|
|
250
|
+
return _rb_u_string_property(self, "script", U_SCRIPT_UNKNOWN,
|
|
251
|
+
(int (*)(uint32_t))u_char_script,
|
|
252
|
+
(VALUE (*)(int))script_to_symbol);
|
|
253
|
+
}
|