u 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build/ext/u/data/attributes.rb +39 -0
- data/build/ext/u/data/bidi-mirroring.rb +27 -0
- data/build/ext/u/data/canonical-combining-class.rb +15 -0
- data/build/ext/u/data/case-folding.rb +39 -0
- data/build/ext/u/data/cased.rb +19 -0
- data/build/ext/u/data/compose.rb +304 -0
- data/build/ext/u/data/constants.rb +31 -0
- data/build/ext/u/data/decompose.rb +85 -0
- data/build/ext/u/data/general-category.rb +61 -0
- data/build/ext/u/data/grapheme-word-break.rb +15 -0
- data/build/ext/u/data/marshalled.rb +5 -0
- data/build/ext/u/data/script.rb +91 -0
- data/build/ext/u/data/soft-dotted.rb +17 -0
- data/build/ext/u/data/title-table.rb +30 -0
- data/build/ext/u/data/wide.rb +17 -0
- data/build/lib/u/build.rb +8 -0
- data/build/lib/u/build/data.rb +16 -0
- data/build/lib/u/build/data/bidimirroring.rb +26 -0
- data/build/lib/u/build/data/break.rb +14 -0
- data/build/lib/u/build/data/casefolding.rb +77 -0
- data/build/lib/u/build/data/compositionexclusions.rb +14 -0
- data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
- data/build/lib/u/build/data/file.rb +88 -0
- data/build/lib/u/build/data/linebreak.rb +14 -0
- data/build/lib/u/build/data/proplist.rb +18 -0
- data/build/lib/u/build/data/scripts.rb +22 -0
- data/build/lib/u/build/data/specialcasing.rb +106 -0
- data/build/lib/u/build/data/unicode.rb +41 -0
- data/build/lib/u/build/data/unicode/entry.rb +27 -0
- data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
- data/build/lib/u/build/data/unicode/points.rb +32 -0
- data/build/lib/u/build/header.rb +11 -0
- data/build/lib/u/build/header/table.rb +19 -0
- data/build/lib/u/build/header/table/row.rb +64 -0
- data/build/lib/u/build/header/tables.rb +6 -0
- data/build/lib/u/build/header/tables/intervals.rb +50 -0
- data/build/lib/u/build/header/tables/split.rb +20 -0
- data/build/lib/u/build/header/tables/split/data.rb +29 -0
- data/build/lib/u/build/header/tables/split/part1.rb +28 -0
- data/build/lib/u/build/header/tables/split/part2.rb +13 -0
- data/build/lib/u/build/header/tables/split/row.rb +34 -0
- data/build/lib/u/build/header/tables/split/rows.rb +22 -0
- data/build/test/unit/break.rb +45 -0
- data/build/test/unit/case.rb +178 -0
- data/build/test/unit/foldcase.rb +44 -0
- data/build/test/unit/normalize.rb +81 -0
- data/ext/u/attributes.c +62 -0
- data/ext/u/attributes.h +5 -0
- data/ext/u/case.h +41 -0
- data/ext/u/data/attributes.h +3070 -0
- data/ext/u/data/bidi-mirroring.h +373 -0
- data/ext/u/data/canonical-combining-class.h +2157 -0
- data/ext/u/data/case-folding.h +171 -0
- data/ext/u/data/cased.h +42 -0
- data/ext/u/data/compose.h +1714 -0
- data/ext/u/data/constants.h +17 -0
- data/ext/u/data/decompose.h +9356 -0
- data/ext/u/data/general-category.h +28959 -0
- data/ext/u/data/grapheme-break.h +13201 -0
- data/ext/u/data/line-break.h +26501 -0
- data/ext/u/data/normalization-quick-check.h +3002 -0
- data/ext/u/data/script.h +2928 -0
- data/ext/u/data/soft-dotted.h +55 -0
- data/ext/u/data/title-table.h +41 -0
- data/ext/u/data/types.h +11117 -0
- data/ext/u/data/wide-cjk.h +197 -0
- data/ext/u/data/wide.h +59 -0
- data/ext/u/data/word-break.h +10001 -0
- data/ext/u/depend +281 -0
- data/ext/u/extconf.rb +158 -0
- data/ext/u/output.h +51 -0
- data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
- data/ext/u/private.h +58 -0
- data/ext/u/rb_includes.h +10 -0
- data/ext/u/rb_private.c +98 -0
- data/ext/u/rb_private.h +67 -0
- data/ext/u/rb_u.c +251 -0
- data/ext/u/rb_u_buffer.c +443 -0
- data/ext/u/rb_u_buffer.h +24 -0
- data/ext/u/rb_u_re.c +43 -0
- data/ext/u/rb_u_re.h +15 -0
- data/ext/u/rb_u_string.c +478 -0
- data/ext/u/rb_u_string.h +173 -0
- data/ext/u/rb_u_string_alnum.c +10 -0
- data/ext/u/rb_u_string_alpha.c +10 -0
- data/ext/u/rb_u_string_aref.c +142 -0
- data/ext/u/rb_u_string_ascii_only.c +13 -0
- data/ext/u/rb_u_string_assigned.c +10 -0
- data/ext/u/rb_u_string_b.c +18 -0
- data/ext/u/rb_u_string_bytesize.c +10 -0
- data/ext/u/rb_u_string_byteslice.c +103 -0
- data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
- data/ext/u/rb_u_string_case_ignorable.c +25 -0
- data/ext/u/rb_u_string_casecmp.c +61 -0
- data/ext/u/rb_u_string_cased.c +17 -0
- data/ext/u/rb_u_string_chomp.c +107 -0
- data/ext/u/rb_u_string_chop.c +33 -0
- data/ext/u/rb_u_string_chr.c +9 -0
- data/ext/u/rb_u_string_cntrl.c +10 -0
- data/ext/u/rb_u_string_collate.c +46 -0
- data/ext/u/rb_u_string_collation_key.c +18 -0
- data/ext/u/rb_u_string_count.c +38 -0
- data/ext/u/rb_u_string_defined.c +10 -0
- data/ext/u/rb_u_string_delete.c +62 -0
- data/ext/u/rb_u_string_digit.c +10 -0
- data/ext/u/rb_u_string_downcase.c +13 -0
- data/ext/u/rb_u_string_dump.c +153 -0
- data/ext/u/rb_u_string_each_byte.c +46 -0
- data/ext/u/rb_u_string_each_char.c +49 -0
- data/ext/u/rb_u_string_each_codepoint.c +45 -0
- data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
- data/ext/u/rb_u_string_each_line.c +142 -0
- data/ext/u/rb_u_string_each_word.c +34 -0
- data/ext/u/rb_u_string_empty.c +11 -0
- data/ext/u/rb_u_string_end_with.c +31 -0
- data/ext/u/rb_u_string_eql.c +30 -0
- data/ext/u/rb_u_string_equal.c +33 -0
- data/ext/u/rb_u_string_foldcase.c +12 -0
- data/ext/u/rb_u_string_folded.c +13 -0
- data/ext/u/rb_u_string_format.c +1745 -0
- data/ext/u/rb_u_string_general_category.c +109 -0
- data/ext/u/rb_u_string_getbyte.c +21 -0
- data/ext/u/rb_u_string_graph.c +21 -0
- data/ext/u/rb_u_string_grapheme_break.c +61 -0
- data/ext/u/rb_u_string_gsub.c +164 -0
- data/ext/u/rb_u_string_hash.c +10 -0
- data/ext/u/rb_u_string_hex.c +9 -0
- data/ext/u/rb_u_string_include.c +10 -0
- data/ext/u/rb_u_string_index.c +110 -0
- data/ext/u/rb_u_string_inspect.c +189 -0
- data/ext/u/rb_u_string_internal_tr.c +148 -0
- data/ext/u/rb_u_string_internal_tr.h +29 -0
- data/ext/u/rb_u_string_justify.c +169 -0
- data/ext/u/rb_u_string_length.c +10 -0
- data/ext/u/rb_u_string_line_break.c +115 -0
- data/ext/u/rb_u_string_lower.c +13 -0
- data/ext/u/rb_u_string_lstrip.c +24 -0
- data/ext/u/rb_u_string_match.c +65 -0
- data/ext/u/rb_u_string_mirror.c +16 -0
- data/ext/u/rb_u_string_newline.c +21 -0
- data/ext/u/rb_u_string_normalize.c +70 -0
- data/ext/u/rb_u_string_normalized.c +28 -0
- data/ext/u/rb_u_string_oct.c +11 -0
- data/ext/u/rb_u_string_ord.c +14 -0
- data/ext/u/rb_u_string_partition.c +80 -0
- data/ext/u/rb_u_string_plus.c +33 -0
- data/ext/u/rb_u_string_print.c +10 -0
- data/ext/u/rb_u_string_punct.c +10 -0
- data/ext/u/rb_u_string_reverse.c +13 -0
- data/ext/u/rb_u_string_rindex.c +104 -0
- data/ext/u/rb_u_string_rpartition.c +81 -0
- data/ext/u/rb_u_string_rstrip.c +29 -0
- data/ext/u/rb_u_string_scan.c +109 -0
- data/ext/u/rb_u_string_script.c +253 -0
- data/ext/u/rb_u_string_soft_dotted.c +13 -0
- data/ext/u/rb_u_string_space.c +24 -0
- data/ext/u/rb_u_string_split.c +245 -0
- data/ext/u/rb_u_string_squeeze.c +75 -0
- data/ext/u/rb_u_string_start_with.c +31 -0
- data/ext/u/rb_u_string_strip.c +36 -0
- data/ext/u/rb_u_string_sub.c +147 -0
- data/ext/u/rb_u_string_times.c +35 -0
- data/ext/u/rb_u_string_title.c +10 -0
- data/ext/u/rb_u_string_titlecase.c +13 -0
- data/ext/u/rb_u_string_to_i.c +45 -0
- data/ext/u/rb_u_string_to_inum.c +364 -0
- data/ext/u/rb_u_string_to_inum.h +1 -0
- data/ext/u/rb_u_string_to_str.c +17 -0
- data/ext/u/rb_u_string_to_sym.c +12 -0
- data/ext/u/rb_u_string_tr.c +290 -0
- data/ext/u/rb_u_string_upcase.c +12 -0
- data/ext/u/rb_u_string_upper.c +13 -0
- data/ext/u/rb_u_string_valid.c +10 -0
- data/ext/u/rb_u_string_valid_encoding.c +12 -0
- data/ext/u/rb_u_string_wide.c +21 -0
- data/ext/u/rb_u_string_wide_cjk.c +21 -0
- data/ext/u/rb_u_string_width.c +19 -0
- data/ext/u/rb_u_string_word_break.c +63 -0
- data/ext/u/rb_u_string_xdigit.c +22 -0
- data/ext/u/rb_u_string_zero_width.c +16 -0
- data/ext/u/titled.c +55 -0
- data/ext/u/titled.h +1 -0
- data/ext/u/u.c +23 -0
- data/ext/u/u.h +458 -0
- data/ext/u/u_char_canonical_combining_class.c +31 -0
- data/ext/u/u_char_digit_value.c +21 -0
- data/ext/u/u_char_downcase.c +27 -0
- data/ext/u/u_char_general_category.c +31 -0
- data/ext/u/u_char_grapheme_break.c +28 -0
- data/ext/u/u_char_isalnum.c +24 -0
- data/ext/u/u_char_isalpha.c +21 -0
- data/ext/u/u_char_isassigned.c +16 -0
- data/ext/u/u_char_iscased.c +22 -0
- data/ext/u/u_char_iscaseignorable.c +29 -0
- data/ext/u/u_char_iscntrl.c +17 -0
- data/ext/u/u_char_isdefined.c +15 -0
- data/ext/u/u_char_isdigit.c +16 -0
- data/ext/u/u_char_isgraph.c +22 -0
- data/ext/u/u_char_islower.c +16 -0
- data/ext/u/u_char_isnewline.c +24 -0
- data/ext/u/u_char_isprint.c +21 -0
- data/ext/u/u_char_ispunct.c +27 -0
- data/ext/u/u_char_issoftdotted.c +18 -0
- data/ext/u/u_char_isspace.c +28 -0
- data/ext/u/u_char_isupper.c +16 -0
- data/ext/u/u_char_isvalid.c +18 -0
- data/ext/u/u_char_iswide.c +18 -0
- data/ext/u/u_char_iswide_cjk.c +22 -0
- data/ext/u/u_char_isxdigit.c +27 -0
- data/ext/u/u_char_iszerowidth.c +29 -0
- data/ext/u/u_char_line_break.c +29 -0
- data/ext/u/u_char_mirror.c +16 -0
- data/ext/u/u_char_normalized.c +23 -0
- data/ext/u/u_char_script.c +41 -0
- data/ext/u/u_char_to_u.c +48 -0
- data/ext/u/u_char_upcase.c +24 -0
- data/ext/u/u_char_width.c +12 -0
- data/ext/u/u_char_word_break.c +28 -0
- data/ext/u/u_char_xdigit_value.c +31 -0
- data/ext/u/u_collate.c +83 -0
- data/ext/u/u_collation_key.c +132 -0
- data/ext/u/u_decode.c +156 -0
- data/ext/u/u_downcase.c +201 -0
- data/ext/u/u_foldcase.c +68 -0
- data/ext/u/u_grapheme_clusters.c +57 -0
- data/ext/u/u_has_prefix.c +27 -0
- data/ext/u/u_index.c +93 -0
- data/ext/u/u_is_ascii_only.c +33 -0
- data/ext/u/u_locale.c +40 -0
- data/ext/u/u_locale.h +14 -0
- data/ext/u/u_mirror.c +20 -0
- data/ext/u/u_n_bytes.c +16 -0
- data/ext/u/u_n_chars.c +43 -0
- data/ext/u/u_normalize.c +232 -0
- data/ext/u/u_normalized.c +28 -0
- data/ext/u/u_offset_to_pointer.c +62 -0
- data/ext/u/u_pointer_to_offset.c +23 -0
- data/ext/u/u_recode.c +73 -0
- data/ext/u/u_reverse.c +21 -0
- data/ext/u/u_rindex.c +132 -0
- data/ext/u/u_titlecase.c +68 -0
- data/ext/u/u_upcase.c +89 -0
- data/ext/u/u_width.c +35 -0
- data/ext/u/u_words.c +82 -0
- data/ext/u/yield.h +27 -0
- data/lib/u-1.0.rb +20 -0
- data/lib/u-1.0/buffer.rb +10 -0
- data/lib/u-1.0/string.rb +9 -0
- data/lib/u-1.0/version.rb +287 -0
- data/test/unit/case.rb +2080 -0
- data/test/unit/foldcase.rb +1136 -0
- data/test/unit/graphemebreak.rb +407 -0
- data/test/unit/normalize.rb +367545 -0
- data/test/unit/u-1.0.rb +10 -0
- data/test/unit/u-1.0/buffer.rb +52 -0
- data/test/unit/u-1.0/string.rb +1439 -0
- data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
- data/test/unit/wordbreak.rb +1083 -0
- metadata +603 -148
- data/README +0 -38
- data/Rakefile +0 -64
- data/ext/encoding/character/utf-8/break.c +0 -25
- data/ext/encoding/character/utf-8/data/break.h +0 -22931
- data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
- data/ext/encoding/character/utf-8/data/compose.h +0 -1607
- data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
- data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
- data/ext/encoding/character/utf-8/decompose.c +0 -444
- data/ext/encoding/character/utf-8/depend +0 -65
- data/ext/encoding/character/utf-8/extconf.rb +0 -67
- data/ext/encoding/character/utf-8/private.h +0 -51
- data/ext/encoding/character/utf-8/properties.c +0 -1056
- data/ext/encoding/character/utf-8/rb_includes.h +0 -19
- data/ext/encoding/character/utf-8/rb_methods.h +0 -49
- data/ext/encoding/character/utf-8/rb_private.h +0 -52
- data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
- data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
- data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
- data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
- data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
- data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
- data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
- data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
- data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
- data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
- data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
- data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
- data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
- data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
- data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
- data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
- data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
- data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
- data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
- data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
- data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
- data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
- data/ext/encoding/character/utf-8/tables.h +0 -38
- data/ext/encoding/character/utf-8/unicode.c +0 -319
- data/ext/encoding/character/utf-8/unicode.h +0 -216
- data/ext/encoding/character/utf-8/utf.c +0 -1334
- data/lib/encoding/character/utf-8.rb +0 -201
- data/lib/u.rb +0 -16
- data/lib/u/string.rb +0 -185
- data/lib/u/version.rb +0 -5
- data/test/unit/u/string.rb +0 -91
data/ext/u/rb_u_buffer.c
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
/* -*- coding: utf-8 -*- */
|
|
2
|
+
|
|
3
|
+
#include <ruby.h>
|
|
4
|
+
#include <stdarg.h>
|
|
5
|
+
#include <stdbool.h>
|
|
6
|
+
#include <stddef.h>
|
|
7
|
+
#include <stdint.h>
|
|
8
|
+
#include <limits.h>
|
|
9
|
+
#include "u.h"
|
|
10
|
+
#include "private.h"
|
|
11
|
+
#include "rb_private.h"
|
|
12
|
+
#include "rb_u_buffer.h"
|
|
13
|
+
#include "rb_u_string.h"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
#define RVAL2RBUBUFFER(object) \
|
|
17
|
+
(Check_Type(object, T_DATA), (struct rb_u_buffer *)DATA_PTR(object))
|
|
18
|
+
|
|
19
|
+
#define UBUFFER2RVAL(buffer) \
|
|
20
|
+
Data_Wrap_Struct(rb_cUBuffer, NULL, rb_u_buffer_free, buffer)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
struct rb_u_buffer {
|
|
24
|
+
char *c;
|
|
25
|
+
long length;
|
|
26
|
+
long allocated;
|
|
27
|
+
long initially_allocated;
|
|
28
|
+
};
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
static VALUE rb_cUBuffer;
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
static void
|
|
35
|
+
rb_u_buffer_free(struct rb_u_buffer *buffer)
|
|
36
|
+
{
|
|
37
|
+
free(buffer->c);
|
|
38
|
+
free(buffer);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/* Returns the first value in the sequence BASE, 2·BASE, 4·BASE, … that is at
 * least MINIMUM, or -1 if MINIMUM is negative or so large that twice its
 * value wouldn’t fit in a long. */
static inline long
nearest_power(long base, long minimum)
{
        /* Compare against LONG_MAX / 2 instead of evaluating minimum * 2:
         * that product overflows, which is undefined for signed types, for
         * exactly the values that this test is meant to reject. */
        if (minimum < 0 || minimum > LONG_MAX / 2)
                return -1;

        /* Doubling a non-positive value never reaches MINIMUM, so start at 1
         * in that case to make sure that the loop terminates. */
        long n = base;
        if (n < 1 && n < minimum)
                n = 1;

        /* No overflow here: n < minimum ≤ LONG_MAX / 2 whenever we double. */
        while (n < minimum)
                n *= 2;

        return n;
}
|
|
53
|
+
|
|
54
|
+
static void
|
|
55
|
+
u_buffer_maybe_expand(struct rb_u_buffer *buffer, long additional)
|
|
56
|
+
{
|
|
57
|
+
if (buffer->length + additional < buffer->allocated)
|
|
58
|
+
return;
|
|
59
|
+
|
|
60
|
+
long allocate = nearest_power(1, buffer->length + additional);
|
|
61
|
+
if (allocate < 0)
|
|
62
|
+
rb_u_raise(rb_eNoMemError,
|
|
63
|
+
"buffer would be too large: %ld + %ld + 1 > %ld",
|
|
64
|
+
buffer->length, additional, LONG_MAX);
|
|
65
|
+
REALLOC_N(buffer->c, char, allocate);
|
|
66
|
+
buffer->allocated = allocate;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
static void
|
|
70
|
+
rb_u_buffer_reset(struct rb_u_buffer *buffer)
|
|
71
|
+
{
|
|
72
|
+
buffer->c = NULL;
|
|
73
|
+
buffer->length = 0;
|
|
74
|
+
buffer->allocated = 0;
|
|
75
|
+
|
|
76
|
+
if (buffer->initially_allocated > 0)
|
|
77
|
+
u_buffer_maybe_expand(buffer, buffer->initially_allocated);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
static VALUE
|
|
81
|
+
rb_u_buffer_create(long size)
|
|
82
|
+
{
|
|
83
|
+
struct rb_u_buffer *buffer = ALLOC(struct rb_u_buffer);
|
|
84
|
+
|
|
85
|
+
buffer->initially_allocated = size;
|
|
86
|
+
|
|
87
|
+
rb_u_buffer_reset(buffer);
|
|
88
|
+
|
|
89
|
+
return UBUFFER2RVAL(buffer);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
static VALUE
|
|
93
|
+
rb_u_buffer_alloc(UNUSED(VALUE klass))
|
|
94
|
+
{
|
|
95
|
+
return rb_u_buffer_create(0);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
VALUE
|
|
99
|
+
rb_u_buffer_new(void)
|
|
100
|
+
{
|
|
101
|
+
return rb_u_buffer_create(0);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
VALUE
|
|
105
|
+
rb_u_buffer_new_sized(long size)
|
|
106
|
+
{
|
|
107
|
+
return rb_u_buffer_create(size);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
VALUE
|
|
111
|
+
rb_u_buffer_append(VALUE self, const char *str, long length)
|
|
112
|
+
{
|
|
113
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
114
|
+
|
|
115
|
+
u_buffer_maybe_expand(buffer, length);
|
|
116
|
+
memcpy(buffer->c + buffer->length, str, length);
|
|
117
|
+
buffer->length += length;
|
|
118
|
+
|
|
119
|
+
return self;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
VALUE
|
|
123
|
+
rb_u_buffer_append_char(VALUE self, uint32_t c)
|
|
124
|
+
{
|
|
125
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
126
|
+
|
|
127
|
+
u_buffer_maybe_expand(buffer, U_CHAR_MAX_BYTE_LENGTH);
|
|
128
|
+
buffer->length += u_char_to_u(c, buffer->c + buffer->length);
|
|
129
|
+
|
|
130
|
+
return self;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
VALUE
|
|
134
|
+
rb_u_buffer_append_char_n(VALUE self, uint32_t c, long n)
|
|
135
|
+
{
|
|
136
|
+
if (n < 1)
|
|
137
|
+
return self;
|
|
138
|
+
|
|
139
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
140
|
+
|
|
141
|
+
if (c < 128) {
|
|
142
|
+
u_buffer_maybe_expand(buffer, n);
|
|
143
|
+
memset(buffer->c + buffer->length, c & 0x7f, n);
|
|
144
|
+
buffer->length += n;
|
|
145
|
+
return self;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
char buf[U_CHAR_MAX_BYTE_LENGTH];
|
|
149
|
+
int length = u_char_to_u(c, buf);
|
|
150
|
+
u_buffer_maybe_expand(buffer, length * n);
|
|
151
|
+
for (int i = 0; i < n; i++)
|
|
152
|
+
memcpy(buffer->c + buffer->length + length * i, buf, length);
|
|
153
|
+
buffer->length += length * n;
|
|
154
|
+
|
|
155
|
+
return self;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
#pragma GCC diagnostic ignored "-Wformat-nonliteral"
|
|
159
|
+
VALUE
|
|
160
|
+
rb_u_buffer_append_printf(VALUE self, size_t needed, const char *format, ...)
|
|
161
|
+
{
|
|
162
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
163
|
+
|
|
164
|
+
u_buffer_maybe_expand(buffer, needed);
|
|
165
|
+
|
|
166
|
+
va_list arguments;
|
|
167
|
+
va_start(arguments, format);
|
|
168
|
+
int length = vsnprintf(buffer->c + buffer->length, needed,
|
|
169
|
+
format, arguments);
|
|
170
|
+
va_end(arguments);
|
|
171
|
+
|
|
172
|
+
if (length < 0)
|
|
173
|
+
rb_sys_fail("system vsnprintf(3) failed");
|
|
174
|
+
|
|
175
|
+
if ((size_t)length >= needed)
|
|
176
|
+
rb_u_raise(rb_eNotImpError,
|
|
177
|
+
"format string buffer calculation is wrong: %s (%zu < %zu)",
|
|
178
|
+
format, needed, (size_t)length);
|
|
179
|
+
|
|
180
|
+
buffer->length += length;
|
|
181
|
+
|
|
182
|
+
return self;
|
|
183
|
+
}
|
|
184
|
+
#pragma GCC diagnostic warning "-Wformat-nonliteral"
|
|
185
|
+
|
|
186
|
+
/* @!visibility public
|
|
187
|
+
* @overload new(size = 128)
|
|
188
|
+
*
|
|
189
|
+
* Sets up a new buffer of SIZE bytes.
|
|
190
|
+
*
|
|
191
|
+
* @param [#to_int] size */
|
|
192
|
+
static VALUE
|
|
193
|
+
rb_u_buffer_initialize(int argc, VALUE *argv, VALUE self)
|
|
194
|
+
{
|
|
195
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
196
|
+
VALUE rbsize;
|
|
197
|
+
|
|
198
|
+
rb_scan_args(argc, argv, "01", &rbsize);
|
|
199
|
+
long size = NIL_P(rbsize) ? 128 : NUM2LONG(rbsize);
|
|
200
|
+
|
|
201
|
+
u_buffer_maybe_expand(buffer, size);
|
|
202
|
+
|
|
203
|
+
return Qnil;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
static VALUE
|
|
207
|
+
rb_u_buffer_initialize_copy(VALUE self, VALUE rboriginal)
|
|
208
|
+
{
|
|
209
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
210
|
+
const struct rb_u_buffer *original = RVAL2RBUBUFFER(rboriginal);
|
|
211
|
+
|
|
212
|
+
if (buffer == original)
|
|
213
|
+
return self;
|
|
214
|
+
|
|
215
|
+
buffer->initially_allocated = original->initially_allocated;
|
|
216
|
+
|
|
217
|
+
rb_u_buffer_append(self, original->c, original->length);
|
|
218
|
+
|
|
219
|
+
OBJ_INFECT(self, rboriginal);
|
|
220
|
+
|
|
221
|
+
return self;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/* @overload <<(*parts)
|
|
225
|
+
*
|
|
226
|
+
* Append each _p_ in PARTS, append _q_ to the receiver, where _q_ =
|
|
227
|
+
* _p_#to_s, if _p_ is a U::Buffer, _q_ = _p_#chr, if _p_ is a Fixnum or
|
|
228
|
+
* Bignum, _q_ = _p_#to_str, if _p_ is a U::String or responds to #to_str.
|
|
229
|
+
*
|
|
230
|
+
* @param [U::Buffer, Fixnum, Bignum, U::String, #to_str] parts
|
|
231
|
+
* @raise [RangeError] If a _p_ is a Fixnum or Bignum and ¬_p_#chr#valid?
|
|
232
|
+
* @return [self] */
|
|
233
|
+
VALUE
|
|
234
|
+
rb_u_buffer_append_m(int argc, VALUE *argv, VALUE self)
|
|
235
|
+
{
|
|
236
|
+
need_at_least_n_arguments(argc, 1);
|
|
237
|
+
|
|
238
|
+
for (int i = 0; i < argc; i++)
|
|
239
|
+
if (RTEST(rb_obj_is_kind_of(argv[i], rb_cUBuffer))) {
|
|
240
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(argv[i]);
|
|
241
|
+
|
|
242
|
+
rb_u_buffer_append(self, buffer->c, buffer->length);
|
|
243
|
+
OBJ_INFECT(self, argv[i]);
|
|
244
|
+
} else if (FIXNUM_P(argv[i]) || TYPE(argv[i]) == T_BIGNUM) {
|
|
245
|
+
uint32_t c = NUM2UINT(argv[i]);
|
|
246
|
+
|
|
247
|
+
/* TODO: This depends on an experimental API. Modify this once the API is
|
|
248
|
+
* stable. */
|
|
249
|
+
#if 0
|
|
250
|
+
if (rb_num_to_uint(argv[i], &c) != 0) {
|
|
251
|
+
if (FIXNUM_P(argv[i]))
|
|
252
|
+
rb_u_raise(rb_eRangeError,
|
|
253
|
+
"%ld out of char range",
|
|
254
|
+
FIX2LONG(argv[i]));
|
|
255
|
+
else
|
|
256
|
+
rb_u_raise(rb_eRangeError,
|
|
257
|
+
"Bignum out of char range");
|
|
258
|
+
}
|
|
259
|
+
#endif
|
|
260
|
+
|
|
261
|
+
if (!u_char_isvalid(c))
|
|
262
|
+
rb_u_raise(rb_eRangeError,
|
|
263
|
+
"invalid Unicode character: %u",
|
|
264
|
+
c);
|
|
265
|
+
|
|
266
|
+
rb_u_buffer_append_char(self, c);
|
|
267
|
+
} else {
|
|
268
|
+
const struct rb_u_string *string = RVAL2USTRING_ANY(argv[i]);
|
|
269
|
+
|
|
270
|
+
rb_u_buffer_append(self,
|
|
271
|
+
USTRING_STR(string),
|
|
272
|
+
USTRING_LENGTH(string));
|
|
273
|
+
OBJ_INFECT(self, argv[i]);
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
return self;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/* @return [U::String] A UTF-8-encoded string of the receiver’s content */
|
|
280
|
+
VALUE
|
|
281
|
+
rb_u_buffer_to_u(VALUE self)
|
|
282
|
+
{
|
|
283
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
284
|
+
|
|
285
|
+
return rb_u_string_new_c(self, buffer->c, buffer->length);
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/* @return [U::String] The UTF-8-encoded string of the receiver’s content after
|
|
289
|
+
* clearing it from the receiver
|
|
290
|
+
* @note This method differs from {#to_u} in that it doesn’t copy the result,
|
|
291
|
+
* so it’s generally faster; call it when you’re done building your
|
|
292
|
+
* {U::String}. */
|
|
293
|
+
VALUE
|
|
294
|
+
rb_u_buffer_to_u_bang(VALUE self)
|
|
295
|
+
{
|
|
296
|
+
struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
297
|
+
|
|
298
|
+
char *c = buffer->c;
|
|
299
|
+
long length = buffer->length;
|
|
300
|
+
rb_u_buffer_reset(buffer);
|
|
301
|
+
|
|
302
|
+
REALLOC_N(c, char, length + 1);
|
|
303
|
+
c[length] = '\0';
|
|
304
|
+
|
|
305
|
+
return rb_u_string_new_c_own(self, c, length);
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
/* @return [String] A UTF-8-encoded string of the receiver’s content */
|
|
309
|
+
VALUE
|
|
310
|
+
rb_u_buffer_to_s(VALUE self)
|
|
311
|
+
{
|
|
312
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
313
|
+
|
|
314
|
+
VALUE result = rb_u_str_new(buffer->c, buffer->length);
|
|
315
|
+
OBJ_INFECT(result, self);
|
|
316
|
+
return result;
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
/* @return [Integer] The number of characters in the receiver */
|
|
320
|
+
VALUE
|
|
321
|
+
rb_u_buffer_length(VALUE self)
|
|
322
|
+
{
|
|
323
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
324
|
+
return UINT2NUM(u_n_chars_n(buffer->c, buffer->length));
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
/* @return [Integer] The number of bytes required to represent the receiver */
|
|
328
|
+
VALUE
|
|
329
|
+
rb_u_buffer_bytesize(VALUE self)
|
|
330
|
+
{
|
|
331
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
332
|
+
return UINT2NUM(buffer->length);
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
/* Returns the width of the receiver. The width is defined as the sum of the
|
|
336
|
+
* number of “cells” on a terminal or similar cell-based display that the
|
|
337
|
+
* characters in the string will require.
|
|
338
|
+
*
|
|
339
|
+
* Characters that are {U::String#wide?} have a width of 2. Characters that
|
|
340
|
+
* are {U::String#zero_width?} have a width of 0. Other characters have a
|
|
341
|
+
* width of 1.
|
|
342
|
+
*
|
|
343
|
+
* @return [Integer]
|
|
344
|
+
* @see http://www.unicode.org/reports/tr11/
|
|
345
|
+
* Unicode Standard Annex #11: East Asian Width */
|
|
346
|
+
VALUE
|
|
347
|
+
rb_u_buffer_width(VALUE self)
|
|
348
|
+
{
|
|
349
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
350
|
+
return UINT2NUM(u_width_n(buffer->c, buffer->length));
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
/* @overload ==(other)
|
|
354
|
+
* @param [U::Buffer] other
|
|
355
|
+
* @return [Boolean] True if the receiver’s class and content equal those of
|
|
356
|
+
* OTHER */
|
|
357
|
+
VALUE
|
|
358
|
+
rb_u_buffer_eql(VALUE self, VALUE rbother)
|
|
359
|
+
{
|
|
360
|
+
if (self == rbother)
|
|
361
|
+
return Qtrue;
|
|
362
|
+
|
|
363
|
+
if (!RTEST(rb_obj_is_kind_of(rbother, rb_cUBuffer)))
|
|
364
|
+
return Qfalse;
|
|
365
|
+
|
|
366
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
367
|
+
const struct rb_u_buffer *other = RVAL2RBUBUFFER(rbother);
|
|
368
|
+
|
|
369
|
+
return buffer->length == other->length &&
|
|
370
|
+
memcmp(buffer->c, other->c, other->length) == 0 ?
|
|
371
|
+
Qtrue : Qfalse;
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
/* @return [Fixnum] The hash value of the receiver’s content */
|
|
375
|
+
VALUE
|
|
376
|
+
rb_u_buffer_hash(VALUE self)
|
|
377
|
+
{
|
|
378
|
+
const struct rb_u_buffer *buffer = RVAL2RBUBUFFER(self);
|
|
379
|
+
|
|
380
|
+
return INT2FIX(rb_memhash(buffer->c, buffer->length));
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
/* Document-class: U::Buffer
|
|
384
|
+
*
|
|
385
|
+
* A buffer for building {U::String}s. Buffers should be used when you want to
|
|
386
|
+
* create U::Strings step-wise, for example, when joining them together, or
|
|
387
|
+
* reading some input. Create a new buffer with {#initialize}, optionally
|
|
388
|
+
* specifying an initial size. Then, {#append} (or {#<<}) content to it. You
|
|
389
|
+
* can also {#append_format}ted content. You can check the {#length}
|
|
390
|
+
* ({#size}), {#bytesize}, and {#width} of the buffer, which can be useful if
|
|
391
|
+
* you want to limit how much content you want to generate. Once all content
|
|
392
|
+
* has been appended, a buffer can be converted to a U::String via {#to_u} or
|
|
393
|
+
* {#to_u!} depending on whether you want to let the buffer keep its content or
|
|
394
|
+
* not. You can also convert it to a String with {#to_s}.
|
|
395
|
+
*
|
|
396
|
+
* @example Benchmarking U::String#+ Versus U::Buffer#append/U::Buffer#to_u!
|
|
397
|
+
* require 'benchmark'
|
|
398
|
+
* require 'u-1.0'
|
|
399
|
+
* Benchmark.bm do |x|
|
|
400
|
+
* x.report do
|
|
401
|
+
* a = ''.u
|
|
402
|
+
* 100000.times do
|
|
403
|
+
* a = a + 'a'
|
|
404
|
+
* end
|
|
405
|
+
* end
|
|
406
|
+
* x.report do
|
|
407
|
+
* b = U::Buffer.new
|
|
408
|
+
* 100000.times do
|
|
409
|
+
* b.append 'a'
|
|
410
|
+
* end
|
|
411
|
+
* a = b.to_u!
|
|
412
|
+
* end
|
|
413
|
+
* end
|
|
414
|
+
* # ⇒
|
|
415
|
+
* # user system total real
|
|
416
|
+
* # 3.560000 0.650000 4.210000 ( 4.726064)
|
|
417
|
+
* # 0.060000 0.000000 0.060000 ( 0.057134) */
|
|
418
|
+
void
|
|
419
|
+
Init_u_buffer(VALUE mU)
|
|
420
|
+
{
|
|
421
|
+
rb_cUBuffer = rb_define_class_under(mU, "Buffer", rb_cData);
|
|
422
|
+
|
|
423
|
+
rb_define_alloc_func(rb_cUBuffer, rb_u_buffer_alloc);
|
|
424
|
+
rb_define_private_method(rb_cUBuffer, "initialize", rb_u_buffer_initialize, -1);
|
|
425
|
+
rb_define_private_method(rb_cUBuffer, "initialize_copy", rb_u_buffer_initialize_copy, 1);
|
|
426
|
+
|
|
427
|
+
rb_define_method(rb_cUBuffer, "append", rb_u_buffer_append_m, -1);
|
|
428
|
+
rb_define_alias(rb_cUBuffer, "<<", "append");
|
|
429
|
+
rb_define_method(rb_cUBuffer, "append_format", rb_u_buffer_append_format_m, -1); /* in ext/u/rb_u_string_format.c */
|
|
430
|
+
|
|
431
|
+
rb_define_method(rb_cUBuffer, "to_u", rb_u_buffer_to_u, 0);
|
|
432
|
+
rb_define_method(rb_cUBuffer, "to_u!", rb_u_buffer_to_u_bang, 0);
|
|
433
|
+
rb_define_method(rb_cUBuffer, "to_s", rb_u_buffer_to_s, 0);
|
|
434
|
+
|
|
435
|
+
rb_define_method(rb_cUBuffer, "length", rb_u_buffer_length, 0);
|
|
436
|
+
rb_define_alias(rb_cUBuffer, "size", "length");
|
|
437
|
+
rb_define_method(rb_cUBuffer, "bytesize", rb_u_buffer_bytesize, 0);
|
|
438
|
+
rb_define_method(rb_cUBuffer, "width", rb_u_buffer_width, 0);
|
|
439
|
+
|
|
440
|
+
rb_define_method(rb_cUBuffer, "==", rb_u_buffer_eql, 1);
|
|
441
|
+
rb_define_alias(rb_cUBuffer, "eql?", "==");
|
|
442
|
+
rb_define_method(rb_cUBuffer, "hash", rb_u_buffer_hash, 0);
|
|
443
|
+
}
|
data/ext/u/rb_u_buffer.h
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/* Interface to U::Buffer.  VALUE, size_t, uint32_t, and the PRINTF attribute
 * macro must have been declared before this header is included. */
#ifndef RB_U_BUFFER_H
#define RB_U_BUFFER_H

/* Creation from C. */
VALUE rb_u_buffer_new(void);
VALUE rb_u_buffer_new_sized(long size);

/* Appending from C.  NEEDED includes room for the terminating null byte of
 * the formatted result. */
VALUE rb_u_buffer_append_printf(VALUE self, size_t needed,
                                const char *format, ...) PRINTF(3, 4);

VALUE rb_u_buffer_append(VALUE self, const char *str, long length);
VALUE rb_u_buffer_append_format(int argc, const VALUE *argv, VALUE self, VALUE format);
VALUE rb_u_buffer_append_format_m(int argc, const VALUE *argv, VALUE self);
VALUE rb_u_buffer_append_char(VALUE self, uint32_t c);
VALUE rb_u_buffer_append_char_n(VALUE self, uint32_t c, long n);

/* Implementations of U::Buffer’s methods.
 * NOTE(review): rb_u_buffer.c doesn’t define rb_u_buffer_inspect — confirm
 * that it’s defined elsewhere or drop the declaration. */
VALUE rb_u_buffer_append_m(int argc, VALUE *argv, VALUE self);
VALUE rb_u_buffer_bytesize(VALUE self);
VALUE rb_u_buffer_eql(VALUE self, VALUE rbother);
VALUE rb_u_buffer_hash(VALUE self);
VALUE rb_u_buffer_inspect(VALUE self);
VALUE rb_u_buffer_length(VALUE self);
VALUE rb_u_buffer_to_s(VALUE self);
VALUE rb_u_buffer_to_u(VALUE self);
VALUE rb_u_buffer_to_u_bang(VALUE self);
VALUE rb_u_buffer_width(VALUE self);

/* Defines U::Buffer under the module MU. */
void Init_u_buffer(VALUE mU);

#endif /* RB_U_BUFFER_H */
|