character-encodings 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. data/README +26 -0
  2. data/Rakefile +157 -0
  3. data/ext/encoding/character/unicode/codepoint.c +48 -0
  4. data/ext/encoding/character/utf-8/break.c +38 -0
  5. data/ext/encoding/character/utf-8/data/break.h +22931 -0
  6. data/ext/encoding/character/utf-8/data/character-tables.h +14356 -0
  7. data/ext/encoding/character/utf-8/data/compose.h +1607 -0
  8. data/ext/encoding/character/utf-8/data/decompose.h +10925 -0
  9. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +1065 -0
  10. data/ext/encoding/character/utf-8/decompose.c +476 -0
  11. data/ext/encoding/character/utf-8/depend +64 -0
  12. data/ext/encoding/character/utf-8/extconf.rb +47 -0
  13. data/ext/encoding/character/utf-8/private.h +68 -0
  14. data/ext/encoding/character/utf-8/properties.c +1061 -0
  15. data/ext/encoding/character/utf-8/rb_includes.h +18 -0
  16. data/ext/encoding/character/utf-8/rb_methods.h +49 -0
  17. data/ext/encoding/character/utf-8/rb_utf_aref.c +111 -0
  18. data/ext/encoding/character/utf-8/rb_utf_aset.c +105 -0
  19. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +24 -0
  20. data/ext/encoding/character/utf-8/rb_utf_chomp.c +114 -0
  21. data/ext/encoding/character/utf-8/rb_utf_chop.c +44 -0
  22. data/ext/encoding/character/utf-8/rb_utf_collate.c +13 -0
  23. data/ext/encoding/character/utf-8/rb_utf_count.c +30 -0
  24. data/ext/encoding/character/utf-8/rb_utf_delete.c +60 -0
  25. data/ext/encoding/character/utf-8/rb_utf_downcase.c +13 -0
  26. data/ext/encoding/character/utf-8/rb_utf_each_char.c +27 -0
  27. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +13 -0
  28. data/ext/encoding/character/utf-8/rb_utf_hex.c +14 -0
  29. data/ext/encoding/character/utf-8/rb_utf_index.c +50 -0
  30. data/ext/encoding/character/utf-8/rb_utf_insert.c +43 -0
  31. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +331 -0
  32. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +12 -0
  33. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +142 -0
  34. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +41 -0
  35. data/ext/encoding/character/utf-8/rb_utf_justify.c +96 -0
  36. data/ext/encoding/character/utf-8/rb_utf_length.c +14 -0
  37. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +41 -0
  38. data/ext/encoding/character/utf-8/rb_utf_normalize.c +51 -0
  39. data/ext/encoding/character/utf-8/rb_utf_oct.c +14 -0
  40. data/ext/encoding/character/utf-8/rb_utf_reverse.c +13 -0
  41. data/ext/encoding/character/utf-8/rb_utf_rindex.c +88 -0
  42. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +51 -0
  43. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +70 -0
  44. data/ext/encoding/character/utf-8/rb_utf_strip.c +27 -0
  45. data/ext/encoding/character/utf-8/rb_utf_to_i.c +25 -0
  46. data/ext/encoding/character/utf-8/rb_utf_tr.c +250 -0
  47. data/ext/encoding/character/utf-8/rb_utf_upcase.c +13 -0
  48. data/ext/encoding/character/utf-8/unicode.c +319 -0
  49. data/ext/encoding/character/utf-8/unicode.h +208 -0
  50. data/ext/encoding/character/utf-8/utf.c +1332 -0
  51. data/lib/encoding/character/utf-8.rb +201 -0
  52. data/specifications/aref.rb +45 -0
  53. data/specifications/count.rb +29 -0
  54. data/specifications/delete.rb +25 -0
  55. data/specifications/each_char.rb +28 -0
  56. data/specifications/index.rb +35 -0
  57. data/specifications/insert.rb +67 -0
  58. data/specifications/length.rb +45 -0
  59. data/specifications/rindex.rb +52 -0
  60. data/specifications/squeeze.rb +25 -0
  61. data/specifications/to_i.rb +54 -0
  62. data/specifications/tr.rb +39 -0
  63. data/tests/foldcase.rb +28 -0
  64. data/tests/normalize.rb +101 -0
  65. data/tests/unicodedatatestbase.rb +45 -0
  66. metadata +112 -0
@@ -0,0 +1,18 @@
1
+ /*
2
+ * contents: Standard includes for method definitions.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef RB_INCLUDES_H
8
+ #define RB_INCLUDES_H
9
+
10
+ #include <ruby.h>
11
+ #include <stdbool.h>
12
+ #include <stddef.h>
13
+ #include <stdint.h>
14
+ #include "unicode.h"
15
+ #include "private.h"
16
+ #include "rb_methods.h"
17
+
18
+ #endif /* RB_INCLUDES_H */
@@ -0,0 +1,49 @@
1
+ /*
2
+ * contents: Method declarations.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #ifndef RB_METHODS_H
8
+ #define RB_METHODS_H
9
+
10
+ VALUE rb_utf_collate(UNUSED(VALUE self), VALUE str, VALUE other) HIDDEN;
11
+ VALUE rb_utf_downcase(UNUSED(VALUE self), VALUE str) HIDDEN;
12
+ VALUE rb_utf_length(UNUSED(VALUE self), VALUE str) HIDDEN;
13
+ VALUE rb_utf_reverse(UNUSED(VALUE self), VALUE str) HIDDEN;
14
+ VALUE rb_utf_upcase(UNUSED(VALUE self), VALUE str) HIDDEN;
15
+ VALUE rb_utf_aref_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
16
+ VALUE rb_utf_aset_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
17
+ VALUE rb_utf_casecmp(UNUSED(VALUE self), VALUE str1, VALUE str2) HIDDEN;
18
+ VALUE rb_utf_center(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
19
+ VALUE rb_utf_ljust(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
20
+ VALUE rb_utf_rjust(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
21
+ VALUE rb_utf_chomp_bang(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
22
+ VALUE rb_utf_chomp(int argc, VALUE *argv, VALUE self) HIDDEN;
23
+ VALUE rb_utf_chop_bang(UNUSED(VALUE self), VALUE str) HIDDEN;
24
+ VALUE rb_utf_chop(VALUE self, VALUE str) HIDDEN;
25
+ VALUE rb_utf_count(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
26
+ VALUE rb_utf_delete_bang(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
27
+ VALUE rb_utf_delete(int argc, VALUE *argv, VALUE self) HIDDEN;
28
+ VALUE rb_utf_each_char(UNUSED(VALUE self), VALUE str) HIDDEN;
29
+ VALUE rb_utf_index_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
30
+ VALUE rb_utf_insert(UNUSED(VALUE self), VALUE str, VALUE index,
31
+ VALUE other) HIDDEN;
32
+ VALUE rb_utf_lstrip_bang(UNUSED(VALUE self), VALUE str) HIDDEN;
33
+ VALUE rb_utf_lstrip(VALUE self, VALUE str) HIDDEN;
34
+ VALUE rb_utf_rindex_m(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
35
+ VALUE rb_utf_rstrip_bang(UNUSED(VALUE self), VALUE str) HIDDEN;
36
+ VALUE rb_utf_rstrip(VALUE self, VALUE str) HIDDEN;
37
+ VALUE rb_utf_squeeze_bang(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
38
+ VALUE rb_utf_squeeze(int argc, VALUE *argv, VALUE self) HIDDEN;
39
+ VALUE rb_utf_strip_bang(VALUE self, VALUE str) HIDDEN;
40
+ VALUE rb_utf_strip(VALUE self, VALUE str) HIDDEN;
41
+ VALUE rb_utf_to_i(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
42
+ VALUE rb_utf_hex(UNUSED(VALUE self), VALUE str) HIDDEN;
43
+ VALUE rb_utf_oct(UNUSED(VALUE self), VALUE str) HIDDEN;
44
+ VALUE rb_utf_tr(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to) HIDDEN;
45
+ VALUE rb_utf_tr_s(UNUSED(VALUE self), VALUE str, VALUE from, VALUE to) HIDDEN;
46
+ VALUE rb_utf_foldcase(UNUSED(VALUE self), VALUE str) HIDDEN;
47
+ VALUE rb_utf_normalize(int argc, VALUE *argv, UNUSED(VALUE self)) HIDDEN;
48
+
49
+ #endif /* RB_METHODS_H */
@@ -0,0 +1,111 @@
1
+ /*
2
+ * contents: UTF8.aref module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include <re.h>
9
+
10
+ static VALUE
11
+ rb_utf_substr(VALUE str, long offset, long len)
12
+ {
13
+ if (len < 0)
14
+ return Qnil;
15
+
16
+ char *begin, *limit;
17
+ if (!rb_utf_begin_from_offset(str, offset, &begin, &limit))
18
+ return Qnil;
19
+ char *end = _utf_offset_to_pointer_failable(begin, len, limit);
20
+ if (end == NULL)
21
+ end = limit;
22
+
23
+ VALUE substr = (begin == end) ?
24
+ rb_utf_new5(str, NULL, 0) :
25
+ rb_utf_new5(str, begin, end - begin);
26
+
27
+ OBJ_INFECT(substr, str);
28
+
29
+ return substr;
30
+ }
31
+
32
+ static VALUE
33
+ rb_utf_substr_and_infect(VALUE str, long offset, long len, VALUE source)
34
+ {
35
+ VALUE substr = rb_utf_substr(str, offset, len);
36
+ OBJ_INFECT(substr, source);
37
+ return substr;
38
+ }
39
+
40
+ /* XXX: Stolen straight from string.c. */
41
+ static VALUE
42
+ rb_str_subpat(VALUE str, VALUE re, int nth)
43
+ {
44
+ if (rb_reg_search(re, str, 0, 0) >= 0)
45
+ return rb_reg_nth_match(nth, rb_backref_get());
46
+
47
+ return Qnil;
48
+ }
49
+
50
+ static VALUE
51
+ rb_utf_aref_num(VALUE str, long offset)
52
+ {
53
+ char *begin, *limit;
54
+ if (!rb_utf_begin_from_offset(str, offset, &begin, &limit))
55
+ return Qnil;
56
+
57
+ char *end = rb_utf_next_validated(begin, limit);
58
+
59
+ return rb_utf_new(begin, end - begin);
60
+ }
61
+
62
+ static VALUE
63
+ rb_utf_aref_default(VALUE str, VALUE index)
64
+ {
65
+ long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
66
+
67
+ long begin, len;
68
+ switch (rb_range_beg_len(index, &begin, &len, n_chars, 0)) {
69
+ case Qfalse:
70
+ return rb_utf_aref_num(str, NUM2LONG(index));
71
+ case Qnil:
72
+ return Qnil;
73
+ default:
74
+ return rb_utf_substr_and_infect(str, begin, len, index);
75
+ }
76
+ }
77
+
78
+ static VALUE
79
+ rb_utf_aref(VALUE str, VALUE index)
80
+ {
81
+ switch (TYPE(index)) {
82
+ case T_FIXNUM:
83
+ return rb_utf_aref_num(str, FIX2LONG(index));
84
+ case T_REGEXP:
85
+ return rb_str_subpat(str, index, 0);
86
+ case T_STRING:
87
+ if (rb_utf_index(str, index, 0) != -1)
88
+ return rb_utf_dup(index);
89
+ return Qnil;
90
+ default:
91
+ return rb_utf_aref_default(str, index);
92
+ }
93
+ }
94
+
95
+ VALUE
96
+ rb_utf_aref_m(int argc, VALUE *argv, UNUSED(VALUE self))
97
+ {
98
+ StringValue(argv[0]);
99
+
100
+ if (argc > 3 || argc < 2)
101
+ rb_raise(rb_eArgError,
102
+ "wrong number of arguments (%d for 2)", argc);
103
+
104
+ if (argc == 2)
105
+ return rb_utf_aref(argv[0], argv[1]);
106
+
107
+ if (TYPE(argv[1]) == T_REGEXP)
108
+ return rb_str_subpat(argv[0], argv[1], NUM2INT(argv[2]));
109
+
110
+ return rb_utf_substr(argv[0], NUM2INT(argv[1]), NUM2INT(argv[2]));
111
+ }
@@ -0,0 +1,105 @@
1
+ /*
2
+ * contents: UTF8.aset module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+ #include <re.h>
9
+
10
+ /* XXX: Stolen straight from string.c. */
11
+ #define BEG(no) regs->beg[no]
12
+ #define END(no) regs->end[no]
13
+
14
+ static VALUE
15
+ rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
16
+ {
17
+ VALUE match;
18
+ long start, end, len;
19
+
20
+ if (rb_reg_search(re, str, 0, 0) < 0) {
21
+ rb_raise(rb_eIndexError, "regexp not matched");
22
+ }
23
+ match = rb_backref_get();
24
+ if (nth >= RMATCH(match)->regs->num_regs) {
25
+ out_of_range:
26
+ rb_raise(rb_eIndexError, "index %d out of regexp", nth);
27
+ }
28
+ if (nth < 0) {
29
+ if (-nth >= RMATCH(match)->regs->num_regs) {
30
+ goto out_of_range;
31
+ }
32
+ nth += RMATCH(match)->regs->num_regs;
33
+ }
34
+
35
+ start = RMATCH(match)->BEG(nth);
36
+ if (start == -1) {
37
+ rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
38
+ }
39
+ end = RMATCH(match)->END(nth);
40
+ len = end - start;
41
+ rb_str_update(str, start, len, val);
42
+
43
+ return val;
44
+ }
45
+
46
+ static VALUE
47
+ rb_utf_aset_num(VALUE str, long offset, VALUE replacement)
48
+ {
49
+ return rb_utf_update(str, offset, 1, replacement);
50
+ }
51
+
52
+ static VALUE
53
+ rb_utf_aset_default(VALUE str, VALUE index, VALUE replacement)
54
+ {
55
+ long n_chars = utf_length_n(RSTRING(str)->ptr, RSTRING(str)->len);
56
+
57
+ long begin, len;
58
+ if (rb_range_beg_len(index, &begin, &len, n_chars, 2))
59
+ return rb_utf_update(str, begin, len, replacement);
60
+
61
+ return rb_utf_aset_num(str, NUM2LONG(index), replacement);
62
+ }
63
+
64
+ static VALUE
65
+ rb_utf_aset(VALUE str, VALUE index, VALUE replacement)
66
+ {
67
+ switch (TYPE(index)) {
68
+ case T_FIXNUM:
69
+ return rb_utf_aset_num(str, FIX2LONG(index), replacement);
70
+ case T_BIGNUM:
71
+ return rb_utf_aset_num(str, NUM2LONG(index), replacement);
72
+ case T_REGEXP:
73
+ return rb_str_subpat_set(str, index, 0, replacement);
74
+ case T_STRING: {
75
+ long begin = rb_utf_index(str, index, 0);
76
+ if (begin < 0)
77
+ rb_raise(rb_eIndexError, "string not matched");
78
+ return rb_utf_update(str,
79
+ begin,
80
+ utf_length_n(RSTRING(index)->ptr,
81
+ RSTRING(index)->len),
82
+ replacement);
83
+ }
84
+ default:
85
+ return rb_utf_aset_default(str, index, replacement);
86
+ }
87
+ }
88
+
89
+ VALUE
90
+ rb_utf_aset_m(int argc, VALUE *argv, UNUSED(VALUE self))
91
+ {
92
+ if (argc > 4 || argc < 3)
93
+ rb_raise(rb_eArgError,
94
+ "wrong number of arguments (%d for 3)", argc);
95
+
96
+ StringValue(argv[0]);
97
+
98
+ if (argc == 3)
99
+ return rb_utf_aset(argv[0], argv[1], argv[2]);
100
+
101
+ if (TYPE(argv[1]) == T_REGEXP)
102
+ return rb_str_subpat_set(argv[0], argv[1], NUM2INT(argv[2]), argv[3]);
103
+
104
+ return rb_utf_update(argv[0], NUM2LONG(argv[1]), NUM2LONG(argv[2]), argv[3]);
105
+ }
@@ -0,0 +1,24 @@
1
+ /*
2
+ * contents: UTF8.casecmp module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_casecmp(UNUSED(VALUE self), VALUE str1, VALUE str2)
11
+ {
12
+ StringValue(str1);
13
+ StringValue(str2);
14
+
15
+ char *folded1 = utf_foldcase(RSTRING(str1)->ptr);
16
+ char *folded2 = utf_foldcase(RSTRING(str2)->ptr);
17
+
18
+ int result = utf_collate(folded1, folded2);
19
+
20
+ free(folded2);
21
+ free(folded1);
22
+
23
+ return INT2FIX(result);
24
+ }
@@ -0,0 +1,114 @@
1
+ /*
2
+ * contents: UTF8.chomp module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ static VALUE
10
+ rb_utf_chomp_default(VALUE str)
11
+ {
12
+ rb_str_modify(str);
13
+
14
+ const char *end = RSTRING(str)->ptr + RSTRING(str)->len;
15
+
16
+ char *last = utf_find_prev(RSTRING(str)->ptr, end);
17
+ if (last == NULL)
18
+ return Qnil;
19
+
20
+ if (_utf_char_validated(last, end) == '\n') {
21
+ char *last_but_one = utf_find_prev(RSTRING(str)->ptr, last);
22
+
23
+ if (last_but_one != NULL && utf_char(last_but_one) == '\r')
24
+ last = last_but_one;
25
+ } else if (!unichar_isnewline(utf_char(last))) {
26
+ return Qnil;
27
+ }
28
+
29
+ RSTRING(str)->len -= (RSTRING(str)->ptr + RSTRING(str)->len) - last;
30
+ *last = '\0';
31
+
32
+ return str;
33
+ }
34
+
35
+ static VALUE
36
+ rb_utf_chomp_newlines(VALUE str)
37
+ {
38
+ char *begin = RSTRING(str)->ptr;
39
+ char *end = begin + RSTRING(str)->len;
40
+
41
+ char *last = end;
42
+ while (last > begin) {
43
+ char *last_but_one = utf_find_prev(begin, last);
44
+ if (last == NULL || !unichar_isnewline(utf_char(last_but_one)))
45
+ break;
46
+ last = last_but_one;
47
+ }
48
+
49
+ if (last == end)
50
+ return Qnil;
51
+
52
+ rb_str_modify(str);
53
+ RSTRING(str)->len -= end - last;
54
+ *last = '\0';
55
+
56
+ return str;
57
+ }
58
+
59
+ VALUE
60
+ rb_utf_chomp_bang(int argc, VALUE *argv, UNUSED(VALUE self))
61
+ {
62
+ VALUE str, rs;
63
+
64
+ rb_scan_args(argc, argv, "11", &str, &rs);
65
+
66
+ if (RSTRING(str)->len == 0)
67
+ return Qnil;
68
+
69
+ if (argc == 1) {
70
+ rs = rb_rs;
71
+ if (rs == rb_default_rs)
72
+ rb_utf_chomp_default(str);
73
+ }
74
+
75
+ if (NIL_P(rs))
76
+ return Qnil;
77
+
78
+ StringValue(rs);
79
+
80
+ long rs_len = RSTRING(rs)->len;
81
+ if (rs_len == 0)
82
+ return rb_utf_chomp_newlines(str);
83
+
84
+ long len = RSTRING(str)->len;
85
+ if (rs_len > len)
86
+ return Qnil;
87
+
88
+ char last_char = RSTRING(rs)->ptr[rs_len - 1];
89
+ if (rs_len == 1 && last_char == '\n')
90
+ rb_utf_chomp_default(str);
91
+
92
+ char *p = RSTRING(str)->ptr;
93
+
94
+ if (p[len - 1] != last_char ||
95
+ (rs_len > 1 &&
96
+ rb_memcmp(RSTRING(rs)->ptr, p + len - rs_len, rs_len) != 0))
97
+ return Qnil;
98
+
99
+ rb_str_modify(str);
100
+ RSTRING(str)->len -= rs_len;
101
+ RSTRING(str)->ptr[RSTRING(str)->len] = '\0';
102
+
103
+ return str;
104
+ }
105
+
106
+ VALUE
107
+ rb_utf_chomp(int argc, VALUE *argv, VALUE self)
108
+ {
109
+ StringValue(argv[0]);
110
+ argv[0] = rb_utf_dup(argv[0]);
111
+ rb_utf_chomp_bang(argc, argv, self);
112
+ return argv[0];
113
+ }
114
+
@@ -0,0 +1,44 @@
1
+ /*
2
+ * contents: UTF8.chop module function.
3
+ *
4
+ * Copyright © 2006 Nikolai Weibull <now@bitwi.se>
5
+ */
6
+
7
+ #include "rb_includes.h"
8
+
9
+ VALUE
10
+ rb_utf_chop_bang(UNUSED(VALUE self), VALUE str)
11
+ {
12
+ StringValue(str);
13
+
14
+ if (RSTRING(str)->len == 0)
15
+ return Qnil;
16
+
17
+ rb_str_modify(str);
18
+
19
+ const char *end = RSTRING(str)->ptr + RSTRING(str)->len;
20
+
21
+ char *last = rb_utf_prev_validated(RSTRING(str)->ptr, end);
22
+
23
+ if (_utf_char_validated(last, end) == '\n') {
24
+ char *last_but_one = utf_find_prev(RSTRING(str)->ptr, last);
25
+
26
+ if (last_but_one != NULL && utf_char(last_but_one) == '\r')
27
+ last = last_but_one;
28
+ } else if (!unichar_isnewline(utf_char(last))) {
29
+ return Qnil;
30
+ }
31
+
32
+ RSTRING(str)->len -= (RSTRING(str)->ptr + RSTRING(str)->len) - last;
33
+ *last = '\0';
34
+
35
+ return str;
36
+ }
37
+
38
+ VALUE
39
+ rb_utf_chop(VALUE self, VALUE str)
40
+ {
41
+ str = rb_utf_dup(str);
42
+ rb_utf_chop_bang(self, str);
43
+ return str;
44
+ }