u 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/README +38 -0
  2. data/Rakefile +64 -0
  3. data/ext/encoding/character/utf-8/break.c +25 -0
  4. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  5. data/ext/encoding/character/utf-8/data/character-tables.h +14358 -0
  6. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  7. data/ext/encoding/character/utf-8/data/decompose.h +10926 -0
  8. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1070 -0
  9. data/ext/encoding/character/utf-8/decompose.c +444 -0
  10. data/ext/encoding/character/utf-8/depend +65 -0
  11. data/ext/encoding/character/utf-8/extconf.rb +67 -0
  12. data/ext/encoding/character/utf-8/private.c +62 -0
  13. data/ext/encoding/character/utf-8/private.h +51 -0
  14. data/ext/encoding/character/utf-8/properties.c +1056 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +19 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_private.h +52 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  19. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  20. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  22. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  23. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  24. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  25. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  26. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  27. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  28. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  29. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  30. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  31. data/ext/encoding/character/utf-8/rb_utf_insert.c +48 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +332 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  35. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  36. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  37. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  38. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  39. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  40. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  41. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  43. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  44. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  45. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  46. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  47. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  48. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  49. data/ext/encoding/character/utf-8/tables.h +38 -0
  50. data/ext/encoding/character/utf-8/unicode.c +319 -0
  51. data/ext/encoding/character/utf-8/unicode.h +216 -0
  52. data/ext/encoding/character/utf-8/utf.c +1334 -0
  53. data/lib/encoding/character/utf-8.rb +201 -0
  54. data/lib/u.rb +16 -0
  55. data/lib/u/string.rb +185 -0
  56. data/lib/u/version.rb +5 -0
  57. data/test/unit/u.rb +5 -0
  58. data/test/unit/u/string.rb +91 -0
  59. metadata +174 -0
@@ -0,0 +1,12 @@
1
+ /*
2
+ * contents: Internal functionality for turning strings into Bignums.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef RB_UTF_INTERNAL_BIGNUM_H
8
+ #define RB_UTF_INTERNAL_BIGNUM_H
9
+
10
/* Turn STR into an integer VALUE (see the "contents" note at the top of this
 * header).  NOTE(review): BASE is presumably the radix and VERIFY presumably
 * selects strict parsing; neither is visible from this header -- confirm
 * against the definition in rb_utf_internal_bignum.c. */
+ VALUE rb_utf_to_inum(VALUE str, int base, bool verify) HIDDEN;
11
+
12
+ #endif /* RB_UTF_INTERNAL_BIGNUM_H */
@@ -0,0 +1,142 @@
1
+ /*
2
+ * contents: Translation (#tr) related functions.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_tr.h"
9
+
10
+ void
11
+ tr_init(struct tr *tr, char *p, char *p_end)
12
+ {
13
+ tr->p = p;
14
+ tr->p_end = p_end;
15
+ tr->inside_range = false;
16
+ }
17
+
18
+ bool
19
+ tr_should_exclude(struct tr *tr)
20
+ {
21
+ if (tr->p + 1 < tr->p_end && *tr->p == '^') {
22
+ tr->p++;
23
+ return true;
24
+ }
25
+
26
+ return false;
27
+ }
28
+
29
+ static enum tr_state
30
+ tr_next_char(struct tr *t)
31
+ {
32
+ if (t->p == t->p_end)
33
+ return TR_FINISHED;
34
+
35
+ if (_utf_char_validated(t->p, t->p_end) == '\\') {
36
+ char *next = utf_find_next(t->p, t->p_end);
37
+
38
+ if (next == NULL) {
39
+ t->now = '\\';
40
+ t->p = t->p_end;
41
+ return TR_FOUND;
42
+ }
43
+
44
+ t->p = next;
45
+ }
46
+
47
+ t->now = _utf_char_validated(t->p, t->p_end);
48
+
49
+ char *next = utf_find_next(t->p, t->p_end);
50
+ if (next == NULL) {
51
+ t->p = t->p_end;
52
+ return TR_FOUND;
53
+ }
54
+ t->p = next;
55
+
56
+ if (_utf_char_validated(t->p, t->p_end) == '-') {
57
+ next = utf_find_next(t->p, t->p_end);
58
+
59
+ if (next != NULL) {
60
+ unichar max = utf_char(next);
61
+
62
+ if (max < t->now) {
63
+ t->p = next;
64
+ return TR_READ_ANOTHER;
65
+ }
66
+
67
+ t->inside_range = true;
68
+ t->max = max;
69
+ }
70
+ }
71
+
72
+ return TR_FOUND;
73
+ }
74
+
75
+ enum tr_state
76
+ tr_next(struct tr *t)
77
+ {
78
+ while (true) {
79
+ if (!t->inside_range) {
80
+ enum tr_state state;
81
+
82
+ if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
83
+ continue;
84
+
85
+ return state;
86
+ } else if (++t->now < t->max) {
87
+ return TR_FOUND;
88
+ } else {
89
+ t->inside_range = false;
90
+ return TR_FOUND;
91
+ }
92
+ }
93
+ }
94
+
95
+ static void
96
+ tr_table_set(unsigned int *table, unichar c, unsigned int value)
97
+ {
98
+ unsigned int offset = c / WORD_BIT;
99
+ unsigned int bit = c % WORD_BIT;
100
+
101
+ table[offset] = (table[offset] & ~(1U << bit)) | ((value & 1U) << bit);
102
+ }
103
+
104
+ void
105
+ tr_setup_table(VALUE str, unsigned int *table, bool initialize)
106
+ {
107
+ unsigned int buf[TR_TABLE_SIZE];
108
+
109
+ struct tr tr;
110
+ tr_init(&tr, RSTRING(str)->ptr, RSTRING(str)->ptr + RSTRING(str)->len);
111
+
112
+ bool exclude = tr_should_exclude(&tr);
113
+
114
+ if (initialize)
115
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
116
+ table[i] = ~0U;
117
+
118
+ unsigned int buf_initializer = exclude ? ~0U : 0U;
119
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
120
+ buf[i] = buf_initializer;
121
+
122
+ unsigned int buf_setter = !exclude;
123
+ while (tr_next(&tr) != TR_FINISHED)
124
+ tr_table_set(buf, tr.now, buf_setter);
125
+
126
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
127
+ table[i] &= buf[i];
128
+ }
129
+
130
+ void
131
+ tr_setup_table_from_strings(unsigned int *table, int argc, VALUE *argv)
132
+ {
133
+ bool initialize = true;
134
+ for (int i = 0; i < argc; i++) {
135
+ VALUE s = argv[i];
136
+
137
+ StringValue(s);
138
+ tr_setup_table(s, table, initialize);
139
+ initialize = false;
140
+ }
141
+ }
142
+
@@ -0,0 +1,41 @@
1
+ /*
2
+ * contents: Translation (#tr) related functions
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef TR_H
8
+ #define TR_H
9
+
10
/* Number of bits in an int; only defined here if nothing else provides it. */
+ #ifndef WORD_BIT
11
+ # define WORD_BIT (sizeof(int) * CHAR_BIT)
12
+ #endif
13
+
/* Number of words in a table that holds one bit per UNICODE_N_CODEPOINTS. */
14
+ #define TR_TABLE_SIZE ((int)(UNICODE_N_CODEPOINTS / WORD_BIT))
15
+
/* Non-zero if the bit numbered OFFSET is set in TABLE.  OFFSET is evaluated
 * twice, so it must not have side effects. */
16
+ #define tr_table_lookup(table, offset) \
17
+ ((table)[(offset) / WORD_BIT] & (1U << (offset) % WORD_BIT))
18
+
19
+ struct tr { /* Iteration state over a #tr-style character specification. */
20
+ bool inside_range; /* true while tr_next() is stepping through a range */
21
+ unichar now; /* character most recently produced */
22
+ unichar max; /* upper bound of the range being expanded */
23
+ char *p; /* current read position in the specification */
24
+ char *p_end; /* end of the specification; compared against p */
25
+ };
26
+
27
/* Result of reading from a struct tr. */
+ enum tr_state
28
+ {
29
+ TR_FOUND, /* a character was produced; it is in the now field */
30
+ TR_READ_ANOTHER, /* nothing produced this time; the caller reads again */
31
+ TR_FINISHED /* the specification is exhausted */
32
+ };
33
+
34
+ void tr_init(struct tr *tr, char *p, char *p_end) HIDDEN; /* start iterating over [p, p_end) */
35
+ bool tr_should_exclude(struct tr *tr) HIDDEN; /* consume a leading '^'; true if one was there */
36
+ enum tr_state tr_next(struct tr *t) HIDDEN; /* produce the next character, expanding ranges */
37
+ void tr_setup_table(VALUE str, unsigned int *table, bool initialize) HIDDEN; /* AND table with the set described by str */
38
+ void tr_setup_table_from_strings(unsigned int *table, int argc,
39
+ VALUE *argv) HIDDEN; /* table becomes the intersection of all argv sets */
40
+
41
+ #endif /* TR_H */
@@ -0,0 +1,96 @@
1
+ /*
2
+ * contents: UTF8.center, UTF8.ljust, and UTF8.rjust.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
/*
 * Write N characters of padding at P and return the position just past what
 * was written.  The padding is the string F (F_LEN characters, F_SIZE bytes)
 * repeated; whole copies are written while more than F_LEN characters remain,
 * and the rest is copied from the start of F one character at a time.
 */
static char *
rb_utf_justify_one_side(char *p, const char *f, long f_len, long f_size, long n)
{
        long written = 0;

        while (written + f_len < n) {
                memcpy(p, f, f_size);
                p += f_size;
                written += f_len;
        }

        for (const char *src = f; written < n; written++) {
                const char *src_end = utf_next(src);
                long size = src_end - src;

                memcpy(p, src, size);
                p += size;
                src = src_end;
        }

        return p;
}
27
+
28
+ static VALUE
29
+ rb_utf_justify(int argc, VALUE *argv, char jflag)
30
+ {
31
+ VALUE str, w, pad;
32
+ const char *f = " ";
33
+ long f_len = 1;
34
+ long f_size = 1;
35
+ bool infect_from_pad = false;
36
+
37
+ if (rb_scan_args(argc, argv, "21", &str, &w, &pad) == 3) {
38
+ StringValue(pad);
39
+ f = RSTRING(pad)->ptr;
40
+ f_len = utf_length_n(f, RSTRING(pad)->len);
41
+ if (f_len == 0)
42
+ rb_raise(rb_eArgError, "zero width padding");
43
+ f_size = RSTRING(pad)->len;
44
+ infect_from_pad = true;
45
+ }
46
+
47
+ long len = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
48
+
49
+ long width = NUM2LONG(w);
50
+ if (width < 0 || len >= width)
51
+ return rb_utf_dup(str);
52
+
53
+ VALUE res = rb_utf_new5(str, 0, RSTRING(str)->len + (width - len) * f_size);
54
+ char *p = RSTRING(res)->ptr;
55
+
56
+ long n_remaining = width - len;
57
+ if (jflag != 'l') {
58
+ long n = n_remaining;
59
+ if (jflag == 'c')
60
+ n /= 2;
61
+ n_remaining -= n;
62
+
63
+ p = rb_utf_justify_one_side(p, f, f_len, f_size, n);
64
+ }
65
+
66
+ memcpy(p, RSTRING(str)->ptr, RSTRING(str)->len);
67
+ p += RSTRING(str)->len;
68
+
69
+ if (jflag != 'r')
70
+ p = rb_utf_justify_one_side(p, f, f_len, f_size, n_remaining);
71
+
72
+ OBJ_INFECT(res, str);
73
+
74
+ if (infect_from_pad)
75
+ OBJ_INFECT(res, pad);
76
+
77
+ return res;
78
+ }
79
+
80
+ VALUE
81
+ rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self))
82
+ {
83
+ return rb_utf_justify(argc, argv, 'c');
84
+ }
85
+
86
+ VALUE
87
+ rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self))
88
+ {
89
+ return rb_utf_justify(argc, argv, 'l');
90
+ }
91
+
92
+ VALUE
93
+ rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self))
94
+ {
95
+ return rb_utf_justify(argc, argv, 'r');
96
+ }
@@ -0,0 +1,14 @@
1
+ /*
2
+ * contents: UTF8.length module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_length(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ StringValue(str);
13
+ return UINT2NUM(utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len));
14
+ }
@@ -0,0 +1,41 @@
1
+ /*
2
+ * contents: UTF8.lstrip module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ StringValue(str);
13
+ char *s = RSTRING(str)->ptr;
14
+ if (s == NULL || RSTRING(str)->len == 0)
15
+ return Qnil;
16
+
17
+ char *end = s + RSTRING(str)->len;
18
+
19
+ /* Remove spaces at head. */
20
+ while (s < end && unichar_isspace(_utf_char_validated(s, end)))
21
+ s = utf_next(s);
22
+
23
+ /* If there weren’t any spaces at head, return Qnil. */
24
+ if (s == RSTRING(str)->ptr)
25
+ return Qnil;
26
+
27
+ rb_str_modify(str);
28
+ RSTRING(str)->len = end - s;
29
+ memmove(RSTRING(str)->ptr, s, RSTRING(str)->len);
30
+ RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
31
+
32
+ return str;
33
+ }
34
+
35
+ VALUE
36
+ rb_utf_lstrip(VALUE self, VALUE str)
37
+ {
38
+ str = rb_utf_dup(str);
39
+ rb_utf_lstrip_bang(self, str);
40
+ return str;
41
+ }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * contents: Encoding::Character::UTF8.normalize module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ #define SYMBOL2MODE(symbol, mode, id) do { \
10
+ static ID id_##symbol; \
11
+ if (id_##symbol == 0) \
12
+ id_##symbol = rb_intern(#symbol); \
13
+ if (id == id_##symbol) \
14
+ return mode; \
15
+ } while (0)
16
+
17
+ static NormalizeMode
18
+ symbol_to_mode(VALUE symbol)
19
+ {
20
+ if (!SYMBOL_P(symbol))
21
+ rb_raise(rb_eTypeError, "not a symbol");
22
+
23
+ ID id = SYM2ID(symbol);
24
+
25
+ SYMBOL2MODE(default, NORMALIZE_DEFAULT, id);
26
+ SYMBOL2MODE(nfd, NORMALIZE_NFD, id);
27
+ SYMBOL2MODE(default_compose, NORMALIZE_DEFAULT_COMPOSE, id);
28
+ SYMBOL2MODE(nfc, NORMALIZE_NFC, id);
29
+ SYMBOL2MODE(all, NORMALIZE_ALL, id);
30
+ SYMBOL2MODE(nfkd, NORMALIZE_NFKD, id);
31
+ SYMBOL2MODE(all_compose, NORMALIZE_ALL_COMPOSE, id);
32
+ SYMBOL2MODE(nfkc, NORMALIZE_NFKC, id);
33
+
34
+ rb_raise(rb_eArgError, "unknown symbol");
35
+ }
36
+
37
+ VALUE
38
+ rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self))
39
+ {
40
+ VALUE str, rbmode;
41
+
42
+ NormalizeMode mode = NORMALIZE_DEFAULT;
43
+ if (rb_scan_args(argc, argv, "11", &str, &rbmode) == 2)
44
+ mode = symbol_to_mode(rbmode);
45
+
46
+ StringValue(str);
47
+
48
+ return rb_utf_alloc_using(utf_normalize_n(RSTRING(str)->ptr,
49
+ mode,
50
+ RSTRING(str)->len));
51
+ }
@@ -0,0 +1,14 @@
1
+ /*
2
+ * contents: UTF8.oct module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_bignum.h"
9
+
10
+ VALUE
11
+ rb_utf_oct(UNUSED(VALUE self), VALUE str)
12
+ {
13
+ return rb_utf_to_inum(str, -8, false);
14
+ }
@@ -0,0 +1,13 @@
1
+ /*
2
+ * contents: UTF8.reverse module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_reverse(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ return rb_utf_alloc_using(utf_reverse(StringValuePtr(str)));
13
+ }
@@ -0,0 +1,88 @@
1
+ /*
2
+ * contents: UTF8.rindex module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ static long
10
+ rb_utf_rindex(VALUE str, VALUE sub, long offset)
11
+ {
12
+ if (RSTRING(str)->len < RSTRING(sub)->len)
13
+ return -1;
14
+
15
+ char *s, *end;
16
+ rb_utf_begin_from_offset_validated(str, offset, &s, &end);
17
+
18
+ if (RSTRING(sub)->len == 0)
19
+ return utf_pointer_to_offset(RSTRING(str)->ptr, s);
20
+
21
+ char *s_begin = RSTRING(str)->ptr;
22
+ char *t = RSTRING(sub)->ptr;
23
+ long len = RSTRING(sub)->len;
24
+ while (s >= s_begin) {
25
+ if (rb_memcmp(s, t, len) == 0)
26
+ return utf_pointer_to_offset(s_begin, s);
27
+ s--;
28
+ }
29
+
30
+ return -1;
31
+ }
32
+
33
/*
 * UTF8.rindex(str, sub[, offset]): search STR backwards for SUB, which may be
 * a Regexp, a String, or anything rb_check_string_type() can convert.
 *
 * Returns the offset of the match as a Ruby number, or nil when there is no
 * match.  Raises TypeError when SUB is neither a Regexp nor convertible to a
 * String.
 */
+ VALUE
34
+ rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self))
35
+ {
36
+ VALUE str, sub, rboffset;
37
+
38
+ rb_scan_args(argc, argv, "21", &str, &sub, &rboffset);
39
+
40
+ StringValue(str);
41
+
/* Without an explicit offset, the byte length of STR is used as the starting
 * offset. */
42
+ long offset = (argc == 3) ? NUM2LONG(rboffset) : RSTRING(str)->len;
43
+
44
+ char *begin, *end;
45
+ rb_utf_begin_from_offset(str, offset, &begin, &end);
/* NOTE(review): a NULL begin presumably means OFFSET lies outside STR --
 * confirm against rb_utf_begin_from_offset(). */
46
+ if (begin == NULL) {
47
+ if (offset <= 0) {
48
+ if (TYPE(sub) == T_REGEXP)
49
+ rb_backref_set(Qnil);
50
+
51
+ return Qnil;
52
+ }
53
+
/* Positive offset: start from END and use the character count of STR as the
 * offset instead. */
54
+ begin = end;
55
+ /* TODO: this converting back and forward can be optimized away
56
+ * if rb_utf_index_regexp() and rb_utf_rindex() were split up
57
+ * into two additional functions, adding
58
+ * rb_utf_index_regexp_pointer() and rb_utf_rindex_pointer(),
59
+ * so that one can pass a pointer to start at immediately
60
+ * instead of an offset that gets calculated into a pointer. */
61
+ offset = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
62
+ }
63
+
64
+ switch (TYPE(sub)) {
65
+ case T_REGEXP:
/* When RREGEXP(sub)->len is zero the search is skipped and OFFSET is
 * returned unchanged. */
66
+ if (RREGEXP(sub)->len > 0)
67
+ offset = rb_utf_index_regexp(str, begin, end, sub,
68
+ offset, true);
69
+ break;
70
+ default: {
/* Not a Regexp or String: try an implicit conversion to String first. */
71
+ VALUE tmp = rb_check_string_type(sub);
72
+ if (NIL_P(tmp))
73
+ rb_raise(rb_eTypeError, "type mismatch: %s given",
74
+ rb_obj_classname(sub));
75
+
76
+ sub = tmp;
77
+ }
78
+ /* fall through */
79
+ case T_STRING:
80
+ offset = rb_utf_rindex(str, sub, offset);
81
+ break;
82
+ }
83
+
/* A negative offset from the search means "not found". */
84
+ if (offset < 0)
85
+ return Qnil;
86
+
87
+ return LONG2NUM(offset);
88
+ }