character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,12 @@
1
+ /*
2
+ * contents: Internal functionality for turning strings into Bignums.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef RB_UTF_INTERNAL_BIGNUM_H
8
+ #define RB_UTF_INTERNAL_BIGNUM_H
9
+
10
+ VALUE rb_utf_to_inum(VALUE str, int base, bool verify) HIDDEN;
11
+
12
+ #endif /* RB_UTF_INTERNAL_BIGNUM_H */
@@ -0,0 +1,142 @@
1
+ /*
2
+ * contents: Translation (#tr) related functions.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_tr.h"
9
+
10
+ void
11
+ tr_init(struct tr *tr, char *p, char *p_end)
12
+ {
13
+ tr->p = p;
14
+ tr->p_end = p_end;
15
+ tr->inside_range = false;
16
+ }
17
+
18
+ bool
19
+ tr_should_exclude(struct tr *tr)
20
+ {
21
+ if (tr->p + 1 < tr->p_end && *tr->p == '^') {
22
+ tr->p++;
23
+ return true;
24
+ }
25
+
26
+ return false;
27
+ }
28
+
29
+ static enum tr_state
30
+ tr_next_char(struct tr *t)
31
+ {
32
+ if (t->p == t->p_end)
33
+ return TR_FINISHED;
34
+
35
+ if (_utf_char_validated(t->p, t->p_end) == '\\') {
36
+ char *next = utf_find_next(t->p, t->p_end);
37
+
38
+ if (next == NULL) {
39
+ t->now = '\\';
40
+ t->p = t->p_end;
41
+ return TR_FOUND;
42
+ }
43
+
44
+ t->p = next;
45
+ }
46
+
47
+ t->now = _utf_char_validated(t->p, t->p_end);
48
+
49
+ char *next = utf_find_next(t->p, t->p_end);
50
+ if (next == NULL) {
51
+ t->p = t->p_end;
52
+ return TR_FOUND;
53
+ }
54
+ t->p = next;
55
+
56
+ if (_utf_char_validated(t->p, t->p_end) == '-') {
57
+ next = utf_find_next(t->p, t->p_end);
58
+
59
+ if (next != NULL) {
60
+ unichar max = utf_char(next);
61
+
62
+ if (max < t->now) {
63
+ t->p = next;
64
+ return TR_READ_ANOTHER;
65
+ }
66
+
67
+ t->inside_range = true;
68
+ t->max = max;
69
+ }
70
+ }
71
+
72
+ return TR_FOUND;
73
+ }
74
+
75
+ enum tr_state
76
+ tr_next(struct tr *t)
77
+ {
78
+ while (true) {
79
+ if (!t->inside_range) {
80
+ enum tr_state state;
81
+
82
+ if ((state = tr_next_char(t)) == TR_READ_ANOTHER)
83
+ continue;
84
+
85
+ return state;
86
+ } else if (++t->now < t->max) {
87
+ return TR_FOUND;
88
+ } else {
89
+ t->inside_range = false;
90
+ return TR_FOUND;
91
+ }
92
+ }
93
+ }
94
+
95
+ static void
96
+ tr_table_set(unsigned int *table, unichar c, unsigned int value)
97
+ {
98
+ unsigned int offset = c / WORD_BIT;
99
+ unsigned int bit = c % WORD_BIT;
100
+
101
+ table[offset] = (table[offset] & ~(1U << bit)) | ((value & 1U) << bit);
102
+ }
103
+
104
+ void
105
+ tr_setup_table(VALUE str, unsigned int *table, bool initialize)
106
+ {
107
+ unsigned int buf[TR_TABLE_SIZE];
108
+
109
+ struct tr tr;
110
+ tr_init(&tr, RSTRING(str)->ptr, RSTRING(str)->ptr + RSTRING(str)->len);
111
+
112
+ bool exclude = tr_should_exclude(&tr);
113
+
114
+ if (initialize)
115
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
116
+ table[i] = ~0U;
117
+
118
+ unsigned int buf_initializer = exclude ? ~0U : 0U;
119
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
120
+ buf[i] = buf_initializer;
121
+
122
+ unsigned int buf_setter = !exclude;
123
+ while (tr_next(&tr) != TR_FINISHED)
124
+ tr_table_set(buf, tr.now, buf_setter);
125
+
126
+ for (int i = 0; i < TR_TABLE_SIZE; i++)
127
+ table[i] &= buf[i];
128
+ }
129
+
130
+ void
131
+ tr_setup_table_from_strings(unsigned int *table, int argc, VALUE *argv)
132
+ {
133
+ bool initialize = true;
134
+ for (int i = 0; i < argc; i++) {
135
+ VALUE s = argv[i];
136
+
137
+ StringValue(s);
138
+ tr_setup_table(s, table, initialize);
139
+ initialize = false;
140
+ }
141
+ }
142
+
@@ -0,0 +1,41 @@
1
+ /*
2
+ * contents: Translation (#tr) related functions
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef TR_H
8
+ #define TR_H
9
+
10
+ #ifndef WORD_BIT
11
+ # define WORD_BIT (sizeof(int) * CHAR_BIT)
12
+ #endif
13
+
14
+ #define TR_TABLE_SIZE ((int)(UNICODE_N_CODEPOINTS / WORD_BIT))
15
+
16
+ #define tr_table_lookup(table, offset) \
17
+ ((table)[(offset) / WORD_BIT] & (1U << (offset) % WORD_BIT))
18
+
19
+ struct tr { /* Iteration state over a #tr-style character specification. */
20
+ bool inside_range; /* true while tr_next() is stepping through a range */
21
+ unichar now; /* character most recently produced by tr_next() */
22
+ unichar max; /* upper end of the range being stepped through */
23
+ char *p; /* current read position in the specification */
24
+ char *p_end; /* end of the specification; tr_next_char() stops when p reaches it */
25
+ };
26
+
27
+ enum tr_state /* Result of asking a struct tr for its next character. */
28
+ {
29
+ TR_FOUND, /* a character is available in the `now' field */
30
+ TR_READ_ANOTHER, /* tr_next_char() met a descending range; tr_next() reads again */
31
+ TR_FINISHED /* the specification has been read to its end */
32
+ };
33
+
34
+ void tr_init(struct tr *tr, char *p, char *p_end) HIDDEN;
35
+ bool tr_should_exclude(struct tr *tr) HIDDEN;
36
+ enum tr_state tr_next(struct tr *t) HIDDEN;
37
+ void tr_setup_table(VALUE str, unsigned int *table, bool initialize) HIDDEN;
38
+ void tr_setup_table_from_strings(unsigned int *table, int argc,
39
+ VALUE *argv) HIDDEN;
40
+
41
+ #endif /* TR_H */
@@ -0,0 +1,96 @@
1
+ /*
2
+ * contents: UTF8.center, UTF8.ljust, and UTF8.rjust.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
/*
 * Write N characters of padding at P and return the position just past them.
 *
 * The padding is the string F (F_LEN characters, F_SIZE bytes) repeated; as
 * long as more than F_LEN characters are still needed a whole copy of F is
 * written, and the remainder is filled character by character from the start
 * of F.
 */
static char *
rb_utf_justify_one_side(char *p, const char *f, long f_len, long f_size, long n)
{
        long done = 0;

        while (done + f_len < n) {
                memcpy(p, f, f_size);
                p += f_size;
                done += f_len;
        }

        for (const char *src = f; done < n; done++) {
                const char *src_end = utf_next(src);
                long n_bytes = src_end - src;

                memcpy(p, src, n_bytes);
                p += n_bytes;
                src = src_end;
        }

        return p;
}
27
+
28
+ static VALUE
29
+ rb_utf_justify(int argc, VALUE *argv, char jflag)
30
+ {
31
+ VALUE str, w, pad;
32
+ const char *f = " ";
33
+ long f_len = 1;
34
+ long f_size = 1;
35
+ bool infect_from_pad = false;
36
+
37
+ if (rb_scan_args(argc, argv, "21", &str, &w, &pad) == 3) {
38
+ StringValue(pad);
39
+ f = RSTRING(pad)->ptr;
40
+ f_len = utf_length_n(f, RSTRING(pad)->len);
41
+ if (f_len == 0)
42
+ rb_raise(rb_eArgError, "zero width padding");
43
+ f_size = RSTRING(pad)->len;
44
+ infect_from_pad = true;
45
+ }
46
+
47
+ long len = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
48
+
49
+ long width = NUM2LONG(w);
50
+ if (width < 0 || len >= width)
51
+ return rb_utf_dup(str);
52
+
53
+ VALUE res = rb_utf_new5(str, 0, RSTRING(str)->len + (width - len) * f_size);
54
+ char *p = RSTRING(res)->ptr;
55
+
56
+ long n_remaining = width - len;
57
+ if (jflag != 'l') {
58
+ long n = n_remaining;
59
+ if (jflag == 'c')
60
+ n /= 2;
61
+ n_remaining -= n;
62
+
63
+ p = rb_utf_justify_one_side(p, f, f_len, f_size, n);
64
+ }
65
+
66
+ memcpy(p, RSTRING(str)->ptr, RSTRING(str)->len);
67
+ p += RSTRING(str)->len;
68
+
69
+ if (jflag != 'r')
70
+ p = rb_utf_justify_one_side(p, f, f_len, f_size, n_remaining);
71
+
72
+ OBJ_INFECT(res, str);
73
+
74
+ if (infect_from_pad)
75
+ OBJ_INFECT(res, pad);
76
+
77
+ return res;
78
+ }
79
+
80
+ VALUE
81
+ rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self))
82
+ {
83
+ return rb_utf_justify(argc, argv, 'c');
84
+ }
85
+
86
+ VALUE
87
+ rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self))
88
+ {
89
+ return rb_utf_justify(argc, argv, 'l');
90
+ }
91
+
92
+ VALUE
93
+ rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self))
94
+ {
95
+ return rb_utf_justify(argc, argv, 'r');
96
+ }
@@ -0,0 +1,14 @@
1
+ /*
2
+ * contents: UTF8.length module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_length(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ StringValue(str);
13
+ return UINT2NUM(utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len));
14
+ }
@@ -0,0 +1,41 @@
1
+ /*
2
+ * contents: UTF8.lstrip module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ StringValue(str);
13
+ char *s = RSTRING(str)->ptr;
14
+ if (s == NULL || RSTRING(str)->len == 0)
15
+ return Qnil;
16
+
17
+ char *end = s + RSTRING(str)->len;
18
+
19
+ /* Remove spaces at head. */
20
+ while (s < end && unichar_isspace(_utf_char_validated(s, end)))
21
+ s = utf_next(s);
22
+
23
+ /* If there weren’t any spaces at head, return Qnil. */
24
+ if (s == RSTRING(str)->ptr)
25
+ return Qnil;
26
+
27
+ rb_str_modify(str);
28
+ RSTRING(str)->len = end - s;
29
+ memmove(RSTRING(str)->ptr, s, RSTRING(str)->len);
30
+ RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
31
+
32
+ return str;
33
+ }
34
+
35
+ VALUE
36
+ rb_utf_lstrip(VALUE self, VALUE str)
37
+ {
38
+ str = rb_utf_dup(str);
39
+ rb_utf_lstrip_bang(self, str);
40
+ return str;
41
+ }
@@ -0,0 +1,51 @@
1
+ /*
2
+ * contents: Encoding::Character::UTF8.normalize module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ #define SYMBOL2MODE(symbol, mode, id) do { \
10
+ static ID id_##symbol; \
11
+ if (id_##symbol == 0) \
12
+ id_##symbol = rb_intern(#symbol); \
13
+ if (id == id_##symbol) \
14
+ return mode; \
15
+ } while (0)
16
+
17
+ static NormalizeMode
18
+ symbol_to_mode(VALUE symbol)
19
+ {
20
+ if (!SYMBOL_P(symbol))
21
+ rb_raise(rb_eTypeError, "not a symbol");
22
+
23
+ ID id = SYM2ID(symbol);
24
+
25
+ SYMBOL2MODE(default, NORMALIZE_DEFAULT, id);
26
+ SYMBOL2MODE(nfd, NORMALIZE_NFD, id);
27
+ SYMBOL2MODE(default_compose, NORMALIZE_DEFAULT_COMPOSE, id);
28
+ SYMBOL2MODE(nfc, NORMALIZE_NFC, id);
29
+ SYMBOL2MODE(all, NORMALIZE_ALL, id);
30
+ SYMBOL2MODE(nfkd, NORMALIZE_NFKD, id);
31
+ SYMBOL2MODE(all_compose, NORMALIZE_ALL_COMPOSE, id);
32
+ SYMBOL2MODE(nfkc, NORMALIZE_NFKC, id);
33
+
34
+ rb_raise(rb_eArgError, "unknown symbol");
35
+ }
36
+
37
+ VALUE
38
+ rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self))
39
+ {
40
+ VALUE str, rbmode;
41
+
42
+ NormalizeMode mode = NORMALIZE_DEFAULT;
43
+ if (rb_scan_args(argc, argv, "11", &str, &rbmode) == 2)
44
+ mode = symbol_to_mode(rbmode);
45
+
46
+ StringValue(str);
47
+
48
+ return rb_utf_alloc_using(utf_normalize_n(RSTRING(str)->ptr,
49
+ mode,
50
+ RSTRING(str)->len));
51
+ }
@@ -0,0 +1,14 @@
1
+ /*
2
+ * contents: UTF8.oct module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include "rb_utf_internal_bignum.h"
9
+
10
+ VALUE
11
+ rb_utf_oct(UNUSED(VALUE self), VALUE str)
12
+ {
13
+ return rb_utf_to_inum(str, -8, false);
14
+ }
@@ -0,0 +1,13 @@
1
+ /*
2
+ * contents: UTF8.reverse module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_reverse(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ return rb_utf_alloc_using(utf_reverse(StringValuePtr(str)));
13
+ }
@@ -0,0 +1,88 @@
1
+ /*
2
+ * contents: UTF8.rindex module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ static long
10
+ rb_utf_rindex(VALUE str, VALUE sub, long offset)
11
+ {
12
+ if (RSTRING(str)->len < RSTRING(sub)->len)
13
+ return -1;
14
+
15
+ char *s, *end;
16
+ rb_utf_begin_from_offset_validated(str, offset, &s, &end);
17
+
18
+ if (RSTRING(sub)->len == 0)
19
+ return utf_pointer_to_offset(RSTRING(str)->ptr, s);
20
+
21
+ char *s_begin = RSTRING(str)->ptr;
22
+ char *t = RSTRING(sub)->ptr;
23
+ long len = RSTRING(sub)->len;
24
+ while (s >= s_begin) {
25
+ if (rb_memcmp(s, t, len) == 0)
26
+ return utf_pointer_to_offset(s_begin, s);
27
+ s--;
28
+ }
29
+
30
+ return -1;
31
+ }
32
+
33
+ VALUE
34
+ rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self))
35
+ {
36
+ VALUE str, sub, rboffset;
37
+
38
+ rb_scan_args(argc, argv, "21", &str, &sub, &rboffset);
39
+
40
+ StringValue(str);
41
+
42
+ long offset = (argc == 3) ? NUM2LONG(rboffset) : RSTRING(str)->len;
43
+
44
+ char *begin, *end;
45
+ rb_utf_begin_from_offset(str, offset, &begin, &end);
46
+ if (begin == NULL) {
47
+ if (offset <= 0) {
48
+ if (TYPE(sub) == T_REGEXP)
49
+ rb_backref_set(Qnil);
50
+
51
+ return Qnil;
52
+ }
53
+
54
+ begin = end;
55
+ /* TODO: this converting back and forward can be optimized away
56
+ * if rb_utf_index_regexp() and rb_utf_rindex() were split up
57
+ * into two additional functions, adding
58
+ * rb_utf_index_regexp_pointer() and rb_utf_rindex_pointer(),
59
+ * so that one can pass a pointer to start at immediately
60
+ * instead of an offset that gets calculated into a pointer. */
61
+ offset = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
62
+ }
63
+
64
+ switch (TYPE(sub)) {
65
+ case T_REGEXP:
66
+ if (RREGEXP(sub)->len > 0)
67
+ offset = rb_utf_index_regexp(str, begin, end, sub,
68
+ offset, true);
69
+ break;
70
+ default: {
71
+ VALUE tmp = rb_check_string_type(sub);
72
+ if (NIL_P(tmp))
73
+ rb_raise(rb_eTypeError, "type mismatch: %s given",
74
+ rb_obj_classname(sub));
75
+
76
+ sub = tmp;
77
+ }
78
+ /* fall through */
79
+ case T_STRING:
80
+ offset = rb_utf_rindex(str, sub, offset);
81
+ break;
82
+ }
83
+
84
+ if (offset < 0)
85
+ return Qnil;
86
+
87
+ return LONG2NUM(offset);
88
+ }