u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @return [Integer] The number of characters in the receiver */
|
|
4
|
+
VALUE
|
|
5
|
+
rb_u_string_length(VALUE self)
|
|
6
|
+
{
|
|
7
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
8
|
+
|
|
9
|
+
return UINT2NUM(u_n_chars_n(USTRING_STR(string), USTRING_LENGTH(string)));
|
|
10
|
+
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
#define BREAK2ID(value, symbol) \
|
|
4
|
+
case U_LINE_BREAK_##value: { \
|
|
5
|
+
static ID id_##symbol; \
|
|
6
|
+
if (id_##symbol == 0) \
|
|
7
|
+
id_##symbol = rb_intern(#symbol); \
|
|
8
|
+
return ID2SYM(id_##symbol); \
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
static VALUE
|
|
12
|
+
break_to_symbol(enum u_line_break value)
|
|
13
|
+
{
|
|
14
|
+
switch (value) {
|
|
15
|
+
BREAK2ID(MANDATORY, mandatory)
|
|
16
|
+
BREAK2ID(CARRIAGE_RETURN, carriage_return)
|
|
17
|
+
BREAK2ID(LINE_FEED, line_feed)
|
|
18
|
+
BREAK2ID(COMBINING_MARK, combining_mark)
|
|
19
|
+
BREAK2ID(NEXT_LINE, next_line)
|
|
20
|
+
BREAK2ID(SURROGATE, surrogate)
|
|
21
|
+
BREAK2ID(WORD_JOINER, word_joiner)
|
|
22
|
+
BREAK2ID(ZERO_WIDTH_SPACE, zero_width_space)
|
|
23
|
+
BREAK2ID(NON_BREAKING_GLUE, non_breaking_glue)
|
|
24
|
+
BREAK2ID(SPACE, space)
|
|
25
|
+
BREAK2ID(BREAK_OPPORTUNITY_BEFORE_AND_AFTER, break_opportunity_before_and_after)
|
|
26
|
+
BREAK2ID(BREAK_AFTER, break_after)
|
|
27
|
+
BREAK2ID(BREAK_BEFORE, break_before)
|
|
28
|
+
BREAK2ID(HYPHEN, hyphen)
|
|
29
|
+
BREAK2ID(CONTINGENT_BREAK_OPPORTUNITY, contingent_break_opportunity)
|
|
30
|
+
BREAK2ID(CLOSE_PUNCTUATION, close_punctuation)
|
|
31
|
+
BREAK2ID(CLOSE_PARENTHESIS, close_parenthesis)
|
|
32
|
+
BREAK2ID(EXCLAMATION_INTERROGATION, exclamation_interrogation)
|
|
33
|
+
BREAK2ID(INSEPARABLE, inseparable)
|
|
34
|
+
BREAK2ID(NONSTARTER, nonstarter)
|
|
35
|
+
BREAK2ID(OPEN_PUNCTUATION, open_punctuation)
|
|
36
|
+
BREAK2ID(QUOTATION, quotation)
|
|
37
|
+
BREAK2ID(INFIX_NUMERIC_SEPARATOR, infix_numeric_separator)
|
|
38
|
+
BREAK2ID(NUMERIC, numeric)
|
|
39
|
+
BREAK2ID(POSTFIX_NUMERIC, postfix_numeric)
|
|
40
|
+
BREAK2ID(PREFIX_NUMERIC, prefix_numeric)
|
|
41
|
+
BREAK2ID(SYMBOLS_ALLOWING_BREAK_AFTER, symbols_allowing_break_after)
|
|
42
|
+
BREAK2ID(AMBIGUOUS, ambiguous)
|
|
43
|
+
BREAK2ID(ALPHABETIC, alphabetic)
|
|
44
|
+
BREAK2ID(CONDITIONAL_JAPANESE_STARTER, conditional_japanese_starter)
|
|
45
|
+
BREAK2ID(HANGUL_LV_SYLLABLE, hangul_lv_syllable)
|
|
46
|
+
BREAK2ID(HANGUL_LVT_SYLLABLE, hangul_lvt_syllable)
|
|
47
|
+
BREAK2ID(HEBREW_LETTER, hebrew_letter)
|
|
48
|
+
BREAK2ID(IDEOGRAPHIC, ideographic)
|
|
49
|
+
BREAK2ID(HANGUL_L_JAMO, hangul_l_jamo)
|
|
50
|
+
BREAK2ID(HANGUL_V_JAMO, hangul_v_jamo)
|
|
51
|
+
BREAK2ID(HANGUL_T_JAMO, hangul_t_jamo)
|
|
52
|
+
BREAK2ID(REGIONAL_INDICATOR, regional_indicator)
|
|
53
|
+
BREAK2ID(COMPLEX_CONTEXT_DEPENDENT, complex_context_dependent)
|
|
54
|
+
BREAK2ID(UNKNOWN, unknown)
|
|
55
|
+
default:
|
|
56
|
+
rb_u_raise(rb_eNotImpError, "unknown line break: %d", value);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/* Returns the line break property value of the characters of the receiver.
|
|
61
|
+
*
|
|
62
|
+
* The possible break values are
|
|
63
|
+
*
|
|
64
|
+
* * :after
|
|
65
|
+
* * :alphabetic
|
|
66
|
+
* * :ambiguous
|
|
67
|
+
* * :before
|
|
68
|
+
* * :before_and_after
|
|
69
|
+
* * :carriage_return
|
|
70
|
+
* * :close_parenthesis
|
|
71
|
+
* * :close_punctuation
|
|
72
|
+
* * :combining_mark
|
|
73
|
+
* * :complex_context
|
|
74
|
+
* * :conditional_japanese_starter
|
|
75
|
+
* * :contingent
|
|
76
|
+
* * :exclamation
|
|
77
|
+
* * :hangul_l_jamo
|
|
78
|
+
* * :hangul_lv_syllable
|
|
79
|
+
* * :hangul_lvt_syllable
|
|
80
|
+
* * :hangul_t_jamo
|
|
81
|
+
* * :hangul_v_jamo
|
|
82
|
+
* * :hebrew_letter
|
|
83
|
+
* * :hyphen
|
|
84
|
+
* * :ideographic
|
|
85
|
+
* * :infix_separator
|
|
86
|
+
* * :inseparable
|
|
87
|
+
* * :line_feed
|
|
88
|
+
* * :mandatory
|
|
89
|
+
* * :next_line
|
|
90
|
+
* * :non_breaking_glue
|
|
91
|
+
* * :non_starter
|
|
92
|
+
* * :numeric
|
|
93
|
+
* * :open_punctuation
|
|
94
|
+
* * :postfix
|
|
95
|
+
* * :prefix
|
|
96
|
+
* * :quotation
|
|
97
|
+
* * :regional_indicator
|
|
98
|
+
* * :space
|
|
99
|
+
* * :surrogate
|
|
100
|
+
* * :symbol
|
|
101
|
+
* * :unknown
|
|
102
|
+
* * :word_joiner
|
|
103
|
+
* * :zero_width_space
|
|
104
|
+
*
|
|
105
|
+
* @raise [ArgumentError] If the string consists of more than one break type
|
|
106
|
+
* @return [Symbol]
|
|
107
|
+
* @see http://unicode.org/reports/tr14/
|
|
108
|
+
* Unicode Standard Annex #14: Unicode Line Breaking Algorithm */
|
|
109
|
+
VALUE
|
|
110
|
+
rb_u_string_line_break(VALUE self)
|
|
111
|
+
{
|
|
112
|
+
return _rb_u_string_property(self, "line break", U_LINE_BREAK_UNKNOWN,
|
|
113
|
+
(int (*)(uint32_t))u_char_line_break,
|
|
114
|
+
(VALUE (*)(int))break_to_symbol);
|
|
115
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload lower?(locale = ENV[LC_CTYPE])
|
|
4
|
+
* @param [#to_str] locale
|
|
5
|
+
* @return [Boolean] True if the receiver has been downcased according to the
|
|
6
|
+
* rules of the language of LOCALE, which may be empty to specifically use
|
|
7
|
+
* the default, language-independent, rules, that is, if _a_ =
|
|
8
|
+
* _a_{#downcase}(LOCALE), where _a_ = {#normalize}(`:nfd`) */
|
|
9
|
+
VALUE
|
|
10
|
+
rb_u_string_lower(int argc, VALUE *argv, VALUE self)
|
|
11
|
+
{
|
|
12
|
+
return _rb_u_string_test_locale(argc, argv, self, u_downcase);
|
|
13
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @return [U::String] The receiver with its maximum {#space?} prefix removed,
|
|
4
|
+
* inheriting any taint and untrust
|
|
5
|
+
* @see #rstrip
|
|
6
|
+
* @see #strip */
|
|
7
|
+
VALUE
|
|
8
|
+
rb_u_string_lstrip(VALUE self)
|
|
9
|
+
{
|
|
10
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
11
|
+
|
|
12
|
+
const char *begin = USTRING_STR(string);
|
|
13
|
+
if (begin == NULL)
|
|
14
|
+
return self;
|
|
15
|
+
|
|
16
|
+
const char *p = begin, *end = USTRING_END(string);
|
|
17
|
+
for (const char *q; p < end; p = q)
|
|
18
|
+
if (!u_char_isspace(u_decode(&q, p, end)))
|
|
19
|
+
break;
|
|
20
|
+
if (p == begin)
|
|
21
|
+
return self;
|
|
22
|
+
|
|
23
|
+
return rb_u_string_new_c(self, p, end - p);
|
|
24
|
+
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "rb_u_re.h"
|
|
3
|
+
|
|
4
|
+
/* @overload =~(other)
|
|
5
|
+
* @param [Regexp, #=~] other
|
|
6
|
+
* @raise [TypeError] If OTHER is a {U::String} or String
|
|
7
|
+
* @return [Numeric, nil] The result of OTHER`#=~`(self), that is, the index
|
|
8
|
+
* of the first character of the match of OTHER in the receiver, if one
|
|
9
|
+
* exists */
|
|
10
|
+
VALUE
|
|
11
|
+
rb_u_string_match(VALUE self, VALUE other)
|
|
12
|
+
{
|
|
13
|
+
if (RTEST(rb_obj_is_kind_of(other, rb_cUString)))
|
|
14
|
+
rb_u_raise(rb_eTypeError, "type mismatch: U::String given");
|
|
15
|
+
|
|
16
|
+
switch (TYPE(other)) {
|
|
17
|
+
case T_STRING:
|
|
18
|
+
rb_u_raise(rb_eTypeError, "type mismatch: String given");
|
|
19
|
+
break;
|
|
20
|
+
case T_REGEXP: {
|
|
21
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
22
|
+
|
|
23
|
+
long index = rb_reg_search(other, rb_str_to_str(self), 0, 0);
|
|
24
|
+
if (index < 0)
|
|
25
|
+
return Qnil;
|
|
26
|
+
|
|
27
|
+
return LONG2NUM(u_pointer_to_offset(USTRING_STR(string),
|
|
28
|
+
USTRING_STR(string) + index));
|
|
29
|
+
}
|
|
30
|
+
default:
|
|
31
|
+
return rb_funcall(other, rb_intern("=~"), 1, self);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/* @overload match(pattern, index = 0)
|
|
36
|
+
* @param [Regexp, #to_str] pattern
|
|
37
|
+
* @param [#to_int] index
|
|
38
|
+
* @return [MatchData, nil] The result of _r_#match(self, index), that is,
|
|
39
|
+
* the match data of the first match of _r_ in the receiver, inheriting any
|
|
40
|
+
* taint and untrust from both the receiver and from PATTERN, if one
|
|
41
|
+
* exists, where _r_ = PATTERN, if PATTERN is a Regexp, _r_ =
|
|
42
|
+
* Regexp.new(PATTERN) otherwise
|
|
43
|
+
* @overload match(pattern, index = 0){ |matchdata| … }
|
|
44
|
+
* @param [Regexp, #to_str] pattern
|
|
45
|
+
* @param [#to_int] index
|
|
46
|
+
* @yieldparam [MatchData] matchdata
|
|
47
|
+
* @return [Object, nil] The result of calling the given block with the
|
|
48
|
+
* result of _r_#match(self, index), that is, the match data of the first
|
|
49
|
+
* match of _r_ in the receiver, inheriting any taint and untrust from both
|
|
50
|
+
* the recevier and from PATTERN, if one exists, where _r_ = PATTERN, if
|
|
51
|
+
* PATTERN is a Regexp, _r_ = Regexp.new(PATTERN) otherwise */
|
|
52
|
+
VALUE
|
|
53
|
+
rb_u_string_match_m(int argc, VALUE *argv, VALUE self)
|
|
54
|
+
{
|
|
55
|
+
VALUE re;
|
|
56
|
+
if (argc < 0)
|
|
57
|
+
need_m_to_n_arguments(argc, 1, 2);
|
|
58
|
+
re = argv[0];
|
|
59
|
+
argv[0] = self;
|
|
60
|
+
VALUE result = rb_funcall2(rb_u_pattern_argument(re, false),
|
|
61
|
+
rb_intern("match"), argc, argv);
|
|
62
|
+
if (!NIL_P(result) && rb_block_given_p())
|
|
63
|
+
return rb_yield(result);
|
|
64
|
+
return result;
|
|
65
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* Returns the mirroring of the receiver, inheriting any taint and untrust.
|
|
4
|
+
*
|
|
5
|
+
* Mirroring is done by replacing characters in the string with their
|
|
6
|
+
* horizontal mirror image, if any, in text that is laid out from right to
|
|
7
|
+
* left. For example, ‘(’ becomes ‘)’ and ‘)’ becomes ‘(’.
|
|
8
|
+
*
|
|
9
|
+
* @return [U::String]
|
|
10
|
+
* @see http://www.unicode.org/reports/tr9/
|
|
11
|
+
* Unicode Standard Annex #9: Unicode Bidirectional Algorithm */
|
|
12
|
+
VALUE
|
|
13
|
+
rb_u_string_mirror(VALUE self)
|
|
14
|
+
{
|
|
15
|
+
return _rb_u_string_convert(self, u_mirror);
|
|
16
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload newline?
|
|
4
|
+
*
|
|
5
|
+
* Returns true if the receiver contains only “newline” characters. A
|
|
6
|
+
* character is a “newline” character if it is any of the following
|
|
7
|
+
* characters:
|
|
8
|
+
*
|
|
9
|
+
* * U+000A (LINE FEED (LF))
|
|
10
|
+
* * U+000C (FORM FEED (FF))
|
|
11
|
+
* * U+000D (CARRIAGE RETURN (CR))
|
|
12
|
+
* * U+0085 (NEXT LINE)
|
|
13
|
+
* * U+2028 (LINE SEPARATOR)
|
|
14
|
+
* * U+2029 (PARAGRAPH SEPARATOR)
|
|
15
|
+
*
|
|
16
|
+
* @return [Boolean] */
|
|
17
|
+
VALUE
|
|
18
|
+
rb_u_string_newline(VALUE self)
|
|
19
|
+
{
|
|
20
|
+
return _rb_u_character_test(self, u_char_isnewline);
|
|
21
|
+
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload normalize(form = :nfd)
|
|
4
|
+
*
|
|
5
|
+
* Returns the receiver normalized into FORM, inheriting any taint and
|
|
6
|
+
* untrust.
|
|
7
|
+
*
|
|
8
|
+
* Normalization is the process of converting characters and sequences of
|
|
9
|
+
* characters in string into a canonical form. This process includes dealing
|
|
10
|
+
* with whether characters are represented by a composed character or a base
|
|
11
|
+
* character and combining marks, such as accents.
|
|
12
|
+
*
|
|
13
|
+
* The possible normalization forms are
|
|
14
|
+
*
|
|
15
|
+
* <table>
|
|
16
|
+
* <thead>
|
|
17
|
+
* <tr><th>Form</th><th>Description</th></tr>
|
|
18
|
+
* </thead>
|
|
19
|
+
* <tbody>
|
|
20
|
+
* <tr>
|
|
21
|
+
* <td><code>:nfd</code></td>
|
|
22
|
+
* <td>Normalizes characters to their maximally decomposed form,
|
|
23
|
+
* ordering accents and so on according to their combining class</td>
|
|
24
|
+
* </tr>
|
|
25
|
+
* <tr>
|
|
26
|
+
* <td><code>:nfc</code></td>
|
|
27
|
+
* <td>Normalizes according to <code>:nfd</code>, then composes any
|
|
28
|
+
* decomposed characters</td>
|
|
29
|
+
* </tr>
|
|
30
|
+
* <tr>
|
|
31
|
+
* <td><code>:nfkd</code></td>
|
|
32
|
+
* <td>Normalizes according to <code>:nfd</code> and also normalizes
|
|
33
|
+
* “compatibility” characters, such as replacing U+00B3 SUPERSCRIPT
|
|
34
|
+
* THREE with U+0033 DIGIT THREE</td>
|
|
35
|
+
* </tr>
|
|
36
|
+
* <tr>
|
|
37
|
+
* <td><code>:nfkc</code></td>
|
|
38
|
+
* <td>Normalizes according to <code>:nfkd</code>, then composes any
|
|
39
|
+
* decomposed characters</td>
|
|
40
|
+
* </tr>
|
|
41
|
+
* </tbody>
|
|
42
|
+
* </table>
|
|
43
|
+
*
|
|
44
|
+
* @param [#to_sym] form
|
|
45
|
+
* @return [U::String]
|
|
46
|
+
* @see http://unicode.org/reports/tr15/
|
|
47
|
+
* Unicode Standard Annex #15: Unicode Normalization Forms */
|
|
48
|
+
VALUE
|
|
49
|
+
rb_u_string_normalize(int argc, VALUE *argv, VALUE self)
|
|
50
|
+
{
|
|
51
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
52
|
+
|
|
53
|
+
VALUE rbform;
|
|
54
|
+
enum u_normalization_form form = U_NORMALIZATION_FORM_D;
|
|
55
|
+
if (rb_scan_args(argc, argv, "01", &rbform) == 1)
|
|
56
|
+
form = _rb_u_symbol_to_normalization_form(rbform);
|
|
57
|
+
|
|
58
|
+
size_t n = u_normalize(NULL, 0,
|
|
59
|
+
USTRING_STR(string), USTRING_LENGTH(string),
|
|
60
|
+
form);
|
|
61
|
+
char *normalized = ALLOC_N(char, n + 1);
|
|
62
|
+
n = u_normalize(normalized, n + 1,
|
|
63
|
+
USTRING_STR(string), USTRING_LENGTH(string),
|
|
64
|
+
form);
|
|
65
|
+
char *t = REALLOC_N(normalized, char, n + 1);
|
|
66
|
+
if (t != NULL)
|
|
67
|
+
normalized = t;
|
|
68
|
+
|
|
69
|
+
return rb_u_string_new_c_own(self, normalized, n);
|
|
70
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @overload normalize?(mode = :default)
|
|
4
|
+
*
|
|
5
|
+
* Returns true if it can be determined that the receiver is normalized
|
|
6
|
+
* according to MODE.
|
|
7
|
+
*
|
|
8
|
+
* See {#normalize} for a discussion on normalization and a list of the
|
|
9
|
+
* possible normalization modes.
|
|
10
|
+
*
|
|
11
|
+
* @param [#to_sym] mode
|
|
12
|
+
* @return [Boolean]
|
|
13
|
+
* @see http://unicode.org/reports/tr15/
|
|
14
|
+
* Unicode Standard Annex #15: Unicode Normalization Forms */
|
|
15
|
+
VALUE
|
|
16
|
+
rb_u_string_normalized(int argc, VALUE *argv, VALUE self)
|
|
17
|
+
{
|
|
18
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
19
|
+
|
|
20
|
+
VALUE rbform;
|
|
21
|
+
enum u_normalization_form form = U_NORMALIZATION_FORM_D;
|
|
22
|
+
if (rb_scan_args(argc, argv, "01", &rbform) == 1)
|
|
23
|
+
form = _rb_u_symbol_to_normalization_form(rbform);
|
|
24
|
+
|
|
25
|
+
return u_normalized(USTRING_STR(string),
|
|
26
|
+
USTRING_LENGTH(string),
|
|
27
|
+
form) == U_NORMALIZED_YES ? Qtrue : Qfalse;
|
|
28
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "rb_u_string_to_inum.h"
|
|
3
|
+
|
|
4
|
+
/* @return [Integer] The result of {#to_i}(8), but with the added provision
|
|
5
|
+
* that any leading base specification in the receiver will override the
|
|
6
|
+
* suggested octal (8) base, that is, `'0b11'.u`{#oct} = 3, not 9. */
|
|
7
|
+
VALUE
|
|
8
|
+
rb_u_string_oct(VALUE self)
|
|
9
|
+
{
|
|
10
|
+
return rb_u_string_to_inum(self, -8, false);
|
|
11
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
|
|
3
|
+
/* @return [Integer] The code point of the first character of the receiver */
|
|
4
|
+
VALUE
|
|
5
|
+
rb_u_string_ord(VALUE self)
|
|
6
|
+
{
|
|
7
|
+
const struct rb_u_string *s = RVAL2USTRING(self);
|
|
8
|
+
const char *p = USTRING_STR(s);
|
|
9
|
+
const char *end = USTRING_END(s);
|
|
10
|
+
if (p == end)
|
|
11
|
+
rb_u_raise(rb_eArgError, "empty string");
|
|
12
|
+
const char *q;
|
|
13
|
+
return UINT2NUM(u_decode(&q, p, end));
|
|
14
|
+
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#include "rb_includes.h"
|
|
2
|
+
#include "rb_u_re.h"
|
|
3
|
+
|
|
4
|
+
static VALUE
|
|
5
|
+
rb_u_string_partition_failure(VALUE self)
|
|
6
|
+
{
|
|
7
|
+
return rb_ary_new3(3,
|
|
8
|
+
self,
|
|
9
|
+
rb_u_string_new_empty(self),
|
|
10
|
+
rb_u_string_new_empty(self));
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
static VALUE
|
|
14
|
+
rb_u_string_partition_success(VALUE self, VALUE rbseparator, long offset)
|
|
15
|
+
{
|
|
16
|
+
const struct rb_u_string *string = RVAL2USTRING(self);
|
|
17
|
+
const struct rb_u_string *separator = RVAL2USTRING_ANY(rbseparator);
|
|
18
|
+
|
|
19
|
+
long after = offset + USTRING_LENGTH(separator);
|
|
20
|
+
|
|
21
|
+
return rb_ary_new3(3,
|
|
22
|
+
rb_u_string_new_subsequence(self, 0, offset),
|
|
23
|
+
TYPE(rbseparator) == T_STRING ?
|
|
24
|
+
rb_u_string_new_rb(rbseparator) :
|
|
25
|
+
rbseparator,
|
|
26
|
+
rb_u_string_new_subsequence(self,
|
|
27
|
+
after,
|
|
28
|
+
USTRING_LENGTH(string) - after));
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
static VALUE
|
|
32
|
+
rb_u_string_partition_regex(VALUE self, VALUE regex)
|
|
33
|
+
{
|
|
34
|
+
VALUE str = rb_str_to_str(self);
|
|
35
|
+
|
|
36
|
+
long offset = rb_reg_search(regex, str, 0, 0);
|
|
37
|
+
if (offset < 0)
|
|
38
|
+
return rb_u_string_partition_failure(self);
|
|
39
|
+
|
|
40
|
+
VALUE separator = rb_u_pattern_match_reference(INT2FIX(0));
|
|
41
|
+
|
|
42
|
+
if (offset == 0 && RSTRING_LEN(separator) == 0)
|
|
43
|
+
return rb_u_string_partition_failure(self);
|
|
44
|
+
|
|
45
|
+
return rb_u_string_partition_success(self, separator, offset);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
static VALUE
|
|
49
|
+
rb_u_string_partition_string(VALUE self, VALUE rbseparator)
|
|
50
|
+
{
|
|
51
|
+
VALUE validated = rb_u_string_validate_type(rbseparator);
|
|
52
|
+
|
|
53
|
+
long offset = rb_u_string_index(self, validated, 0);
|
|
54
|
+
if (offset < 0)
|
|
55
|
+
return rb_u_string_partition_failure(self);
|
|
56
|
+
|
|
57
|
+
const char *begin = USTRING_STR(RVAL2USTRING(self));
|
|
58
|
+
long byte_offset = u_offset_to_pointer(begin, offset) - begin;
|
|
59
|
+
|
|
60
|
+
return rb_u_string_partition_success(self, validated, byte_offset);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/* @overload partition(separator)
|
|
64
|
+
* @param [Regexp, #to_str] separator
|
|
65
|
+
* @return [Array<U::String>] The receiver split into _s₁_ = {#slice}(0,
|
|
66
|
+
* _i_), _s₂_ = {#slice}(_i_, _n_), _s₃_ = {#slice}(_i_+_n_, -1), where _i_
|
|
67
|
+
* = _j_ if _j_ ≠ nil, _i_ = {#length} otherwise, _j_ =
|
|
68
|
+
* {#index}(SEPARATOR), _n_ = SEPARATOR{#length}, where _s₁_ and _s₃_
|
|
69
|
+
* inherit any taint and untrust from the receiver and _s₂_ inherits any
|
|
70
|
+
* taint and untrust from SEPARATOR and also from the receiver if SEPARATOR
|
|
71
|
+
* is a Regexp
|
|
72
|
+
* @see #rpartition */
|
|
73
|
+
VALUE
|
|
74
|
+
rb_u_string_partition(VALUE self, VALUE separator)
|
|
75
|
+
{
|
|
76
|
+
if (TYPE(separator) == T_REGEXP)
|
|
77
|
+
return rb_u_string_partition_regex(self, separator);
|
|
78
|
+
|
|
79
|
+
return rb_u_string_partition_string(self, separator);
|
|
80
|
+
}
|