RubyGems - character-encodings - Versions diffs - 0.2.0 → 0.3.0 - Mend

character-encodings 0.2.0 → 0.3.0

Files changed (19) hide show

data/Rakefile +2 -2
data/ext/encoding/character/utf-8/break.c +6 -19
data/ext/encoding/character/utf-8/data/character-tables.h +2 -0
data/ext/encoding/character/utf-8/data/decompose.h +1 -0
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +5 -0
data/ext/encoding/character/utf-8/decompose.c +77 -109
data/ext/encoding/character/utf-8/depend +1 -0
data/ext/encoding/character/utf-8/extconf.rb +2 -0
data/ext/encoding/character/utf-8/private.c +62 -0
data/ext/encoding/character/utf-8/private.h +18 -38
data/ext/encoding/character/utf-8/properties.c +90 -95
data/ext/encoding/character/utf-8/rb_includes.h +1 -0
data/ext/encoding/character/utf-8/rb_private.h +52 -0
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +10 -9
data/ext/encoding/character/utf-8/tables.h +38 -0
data/ext/encoding/character/utf-8/unicode.c +1 -1
data/ext/encoding/character/utf-8/utf.c +5 -3
data/tests/case.rb +102 -0
metadata +12 -8

data/Rakefile CHANGED Viewed

@@ -10,7 +10,7 @@ require 'rake/testtask'
 require 'spec/rake/spectask'
 PackageName = 'character-encodings'
-PackageVersion = '0.2.0'
+PackageVersion = '0.3.0'
 desc 'Default task'
 task :default => [:extensions]
@@ -92,7 +92,7 @@ Spec::Rake::SpecTask.new do |t|
 end
 Tests = [
-  ['tests/foldcase.rb'],
+  ['tests/foldcase.rb', 'tests/case.rb'],
   ['tests/normalize.rb']
 ]

data/ext/encoding/character/utf-8/break.c CHANGED Viewed

@@ -9,30 +9,17 @@
 #include <stdint.h>
 #include "unicode.h"
 #include "data/break.h"
+#include "private.h"
 /* Figure out what break type the Unicode character ‘c’ possesses, if any.
  * This information is used for finding word and line boundaries, which is
  * useful when displaying Unicode text on screen. */
-static UnicodeBreakType
-break_type(const int16_t table[], unsigned int page, unichar c)
-{
-        int16_t break_property = table[page];
-        return (break_property >= UNICODE_MAX_TABLE_INDEX) ?
-                break_property - UNICODE_MAX_TABLE_INDEX :
-                break_property_data[break_property][c & 0xff];
-}
 UnicodeBreakType
 unichar_break_type(unichar c)
 {
-	if (c <= UNICODE_LAST_CHAR_PART1)
-                return break_type(break_property_table_part1, c >> 8, c);
-        if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
-                return break_type(break_property_table_part2,
-                                  (c - 0xe0000) >> 8, c);
-        return UNICODE_BREAK_UNKNOWN;
+        return SPLIT_UNICODE_TABLE_LOOKUP(break_property_data,
+                                          break_property_table_part1,
+                                          break_property_table_part2,
+                                          c,
+                                          UNICODE_BREAK_UNKNOWN);
 }

data/ext/encoding/character/utf-8/data/character-tables.h CHANGED Viewed

@@ -13,6 +13,8 @@
 #define UNICODE_LAST_PAGE_PART1 762
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000

data/ext/encoding/character/utf-8/data/decompose.h CHANGED Viewed

@@ -10,6 +10,7 @@
 #define UNICODE_LAST_CHAR_PART1 0x2faff
 #define UNICODE_LAST_PAGE_PART1 762
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_NOT_PRESENT_OFFSET 65535

data/ext/encoding/character/utf-8/data/generate-unicode-data.rb CHANGED Viewed

@@ -534,6 +534,8 @@ private
 #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
 EOF
       print_table(data, 0, @last_char_part1_i, data.last, 1,
@@ -694,8 +696,11 @@ EOF
 #define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
 #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
 #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
 EOF
       print_table(data, 0, @last_char_part1_i, data.last, 1,

data/ext/encoding/character/utf-8/decompose.c CHANGED Viewed

@@ -8,35 +8,16 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include "unicode.h"
-#include "private.h"
 #include "data/decompose.h"
 #include "data/compose.h"
+#include "private.h"
-/* {{{1
- * Macros for accessing the combining class property tables for a given
- * character.
- *
- * TODO: Turn these macros into full-fledged functions, as this is rather silly
- * when we have ‹inline› in C99.
- */
-#define CC_PART1(page, char) \
-        ((combining_class_table_part1[page] >= UNICODE_MAX_TABLE_INDEX) \
-         ? (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX) \
-         : (cclass_data[combining_class_table_part1[page]][char]))
-#define CC_PART2(page, char) \
-        ((combining_class_table_part2[page] >= UNICODE_MAX_TABLE_INDEX) \
-         ? (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX) \
-         : (cclass_data[combining_class_table_part2[page]][char]))
-#define COMBINING_CLASS(char) \
-        (((char) <= UNICODE_LAST_CHAR_PART1) \
-         ? CC_PART1((char) >> 8, (char) & 0xff) \
-         : (((char) >= 0xe0000 && (char) <= UNICODE_LAST_CHAR) \
-            ? CC_PART2(((char) - 0xe0000) >> 8, (char) & 0xff) \
-            : 0))
+#define COMBINING_CLASS(c)      \
+        SPLIT_UNICODE_TABLE_LOOKUP(cclass_data, combining_class_table_part1, combining_class_table_part2, (c), 0)
 /* {{{1
@@ -115,12 +96,11 @@ unicode_canonical_ordering(unichar *str, size_t len)
 /* {{{1
  * Decompose the character ‘s’ according to the rules outlined in
  * http://www.unicode.org/unicode/reports/tr15/#Hangul.  ‘r’ should be ‹NULL›
- * or of sufficient length to store the decomposition of ‘s’.  The number of
- * characters stored (or would be if it were non-‹NULL›) in ‘r’ is put in
- * ‘r_len’.
+ * or of sufficient length to store the decomposition of ‘s’.  Returns the
+ * number of characters stored (or that would be stored if it were non-‹NULL›) in ‘r’.
  */
-static void
-decompose_hangul(unichar s, unichar *r, size_t *r_len)
+static size_t
+decompose_hangul(unichar s, unichar *r)
 {
         int SIndex = s - SBase;
@@ -128,8 +108,7 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
         if (SIndex < 0 || SIndex >= SCount) {
                 if (r != NULL)
                         r[0] = s;
-                *r_len = 1;
-                return;
+                return 1;
         }
         unichar L = LBase + SIndex / NCount;
@@ -141,13 +120,13 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
                 r[1] = V;
         }
-        if (T != TBase) {
-                if (r != NULL)
-                        r[2] = T;
-                *r_len = 3;
-        } else {
-                *r_len = 2;
-        }
+        if (T == TBase)
+                return 2;
+        if (r != NULL)
+                r[2] = T;
+        return 3;
 }
@@ -179,26 +158,27 @@ get_decomposition(int index, bool compat)
 static const char *
 find_decomposition(unichar c, bool compat)
 {
-        int begin = 0;
-        int end = lengthof(decomp_table);
+        int index;
-        if (c < decomp_table[begin].ch || c > decomp_table[end - 1].ch)
+        if (!unicode_table_lookup(decomp_table, c, &index))
                 return NULL;
-        while (true) {
-                int middle = (begin + end) / 2;
+        return get_decomposition(index, compat);
+}
-                if (c == decomp_table[middle].ch)
-                        return get_decomposition(middle, compat);
-                else if (middle == begin)
-                        break;
-                else if (c > decomp_table[middle].ch)
-                        begin = middle;
-                else
-                        end = middle;
-        }
-        return NULL;
+/* {{{1
+ * Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
+ * ‘chars’.  Return the number of unichars in ‘chars’.
+ */
+static size_t
+decomposition_to_wc(const char *decomposition, unichar *chars)
+{
+        size_t i = 0;
+        for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
+                chars[i++] = utf_char(p);
+        return i;
 }
@@ -215,17 +195,13 @@ unicode_canonical_decomposition(unichar c, size_t *len)
         /* Hangul syllable */
         if (c >= SBase && c <= SLast) {
-                decompose_hangul(c, NULL, len);
+                *len = decompose_hangul(c, NULL);
                 r = ALLOC_N(unichar, *len);
-                decompose_hangul(c, r, len);
+                decompose_hangul(c, r);
         } else if ((decomp = find_decomposition(c, false)) != NULL) {
                 *len = utf_length(decomp);
                 r = ALLOC_N(unichar, *len);
-                int i;
-                const char *p;
-                for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
-                        r[i] = utf_char(p);
+                decomposition_to_wc(decomp, r);
         } else {
                 r = ALLOC(unichar);
                 *r = c;
@@ -281,23 +257,19 @@ compose_index(unichar c)
         if (page > COMPOSE_TABLE_LAST)
                 return 0;
-        /* TODO: why is this signed, exactly? */
-        int16_t compose_offset = compose_table[page];
-        return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
-                compose_offset - UNICODE_MAX_TABLE_INDEX :
-                compose_data[compose_offset][c & 0xff];
+        return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
 }
 static bool
 lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
                unichar *result)
 {
-        if (c == table[index][0]) {
-                *result = table[index][1];
-                return true;
-        }
+        if (c != table[index][0])
+                return false;
-        return false;
+        *result = table[index][1];
+        return true;
 }
 static bool
@@ -307,21 +279,18 @@ combine(unichar a, unichar b, unichar *result)
                 return true;
         uint16_t index_a = compose_index(a);
-        if (index_a >= COMPOSE_FIRST_SINGLE_START &&
-            index_a < COMPOSE_SECOND_START) {
+        if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
                 return lookup_compose(compose_first_single,
                                       index_a - COMPOSE_FIRST_SINGLE_START,
                                       b,
                                       result);
-        }
         uint16_t index_b = compose_index(b);
-        if (index_b >= COMPOSE_SECOND_SINGLE_START) {
+        if (index_b >= COMPOSE_SECOND_SINGLE_START)
                 return lookup_compose(compose_second_single,
                                       index_b - COMPOSE_SECOND_SINGLE_START,
                                       a,
                                       result);
-        }
         if (index_a >= COMPOSE_FIRST_START &&
             index_a < COMPOSE_FIRST_SINGLE_START &&
@@ -356,12 +325,8 @@ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
                 return 1;
         }
-        if (buf != NULL) {
-                int i;
-                for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
-                        buf[i] = utf_char(decomp);
-                return i;
-        }
+        if (buf != NULL)
+                return decomposition_to_wc(decomp, buf);
         return utf_length(decomp);
 }
@@ -378,14 +343,10 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
                 size_t prev_n = n;
                 unichar *base = (buf != NULL) ? buf + n : NULL;
-                if (c >= SBase && c <= SLast) {
-                        size_t len;
-                        decompose_hangul(c, base, &len);
-                        n += len;
-                } else {
+                if (c >= SBase && c <= SLast)
+                        n += decompose_hangul(c, base);
+                else
                         n += normalize_wc_decompose_one(c, mode, base);
-                }
                 if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
                         unicode_canonical_ordering(buf + prev_start,
@@ -403,44 +364,51 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
         *buf_len = n;
 }
-unichar *
-_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
+static unichar *
+normalize_wc_compose(unichar *buf, size_t len)
 {
-        size_t n;
-        normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
-        unichar *buf = ALLOC_N(unichar, n + 1);
-        normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
-        /* Just return if we don’t want composition. */
-        if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
-                return buf;
+        int new_len = len;
         size_t prev_start = 0;
         int prev_cc = 0;
-        for (size_t i = 0; i < n; i++) {
+        for (size_t i = 0; i < len; i++) {
                 int cc = COMBINING_CLASS(buf[i]);
+                size_t j = i - (len - new_len);
-                if (i > 0 && (prev_cc == 0 || prev_cc < cc) &&
+                if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
                     combine(buf[prev_start], buf[i], &buf[prev_start])) {
-                        for (size_t j = i + 1; j < n; j++)
-                                buf[j - 1] = buf[j];
-                        n--;
-                        i--;
-                        prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
+                        new_len--;
+                        prev_cc = (j + 1 == prev_start) ?
+                                  0 : COMBINING_CLASS(buf[j - 1]);
                 } else {
                         if (cc == 0)
-                                prev_start = i;
+                                prev_start = j;
+                        buf[j] = buf[i];
                         prev_cc = cc;
                 }
         }
-        buf[n] = NUL;
+        buf[new_len] = NUL;
         return buf;
 }
+unichar *
+_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
+{
+        size_t n;
+        normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
+        unichar *buf = ALLOC_N(unichar, n + 1);
+        normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
+        /* Just return if we don’t want composition. */
+        if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
+                return buf;
+        return normalize_wc_compose(buf, n);
+}
 /* {{{1
  * Normalize (compose/decompose) characters in ‘str’ so that strings that

data/ext/encoding/character/utf-8/depend CHANGED Viewed

@@ -1,6 +1,7 @@
 break.o: break.c unicode.h data/break.h
 decompose.o: decompose.c unicode.h private.h data/decompose.h \
   data/compose.h
+private.o: private.c private.h
 properties.o: properties.c unicode.h private.h data/character-tables.h
 rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
   rb_methods.h

data/ext/encoding/character/utf-8/extconf.rb CHANGED Viewed

@@ -12,6 +12,7 @@ def try_compiler_option(opt, &b)
 end
 try_compiler_option('-std=c99')
+try_compiler_option('-finline-functions')
 try_compiler_option('-Wall')
 try_compiler_option('-Wextra')
 try_compiler_option('-Wwrite-strings')
@@ -23,6 +24,7 @@ try_compiler_option('-Wundef')
 try_compiler_option('-Wpointer-arith')
 try_compiler_option('-Wcast-align')
 try_compiler_option('-Werror')
+try_compiler_option('-Winline')
 # XXX: sadly, -Wshadow is a bit too strict.  It will, for example, whine about
 # local variables called “index” on FreeBSD.
 # try_compiler_option('-Wshadow')

data/ext/encoding/character/utf-8/private.c ADDED Viewed

@@ -0,0 +1,62 @@
+/*
+ * contents: Private functions used by the UTF-8 character-encoding library.
+ *
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
+ */
+#include <ruby.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "unicode.h"
+#include "private.h"
+/* Lookup C in the sorted TABLE using binary search.  TABLE consists of N
+ * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
+ * component is a unichar of size SIZEOF_CHAR.  If C is found in TABLE, its
+ * index is stored in INDEX and true is returned.  Otherwise, false is returned
+ * and INDEX is left untouched. */
+bool
+binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
+{
+#define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
+	int begin = 0;
+        int end = n - 1;
+        int middle;
+        /* This is ugly, but not all tables use unichars as their lookup
+         * character.  The casefold table, for example, uses uint16_t-sized
+         * characters.  To only get the interesting part of our table entry
+         * we’ll have to mask the retrieved value. */
+        int char_mask = (1 << (8 * sizeof_char)) - 1;
+        /* Drop out early if we know for certain that C can’t be in TABLE,
+         * i.e., it lies outside the range spanned by the first and last entries. */
+        if (c < ENTRY(0) || c > ENTRY(end))
+                return false;
+        while (begin <= end) {
+                middle = binary_search_middle_of(begin, end);
+                unichar probe = ENTRY(middle);
+                if (c < probe)
+                        end = middle - 1;
+                else if (c > probe)
+                        begin = middle + 1;
+                else
+                        break;
+        }
+        if (begin > end)
+                return false;
+        *index = middle;
+        return true;
+#undef ENTRY
+}

data/ext/encoding/character/utf-8/private.h CHANGED Viewed

@@ -21,48 +21,28 @@
 #  define HIDDEN(u)
 #endif
-unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
-                           NormalizeMode mode) HIDDEN;
-inline int _unichar_combining_class(unichar c) HIDDEN;
-void need_at_least_n_arguments(int argc, int n) HIDDEN;
-unichar _utf_char_validated(char const *const str,
-                            char const *const str_end) HIDDEN;
-char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
-                                            const char *limit, bool noisy) HIDDEN;
-char *_utf_offset_to_pointer_validated(const char *str, long offset,
-                                       const char *end) HIDDEN;
-char *_utf_offset_to_pointer_failable(const char *str, long offset,
-                                      const char *end) HIDDEN;
-VALUE rb_utf_new(const char *str, long len) HIDDEN;
-VALUE rb_utf_new2(const char *str) HIDDEN;
+#define binary_search_middle_of(begin, end)     \
+        (((unsigned)((begin) + (end))) >> 1)
-VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
+#define unicode_table_lookup(table, c, index)    \
+        binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
-VALUE rb_utf_alloc_using(char *str) HIDDEN;
+bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
-VALUE rb_utf_dup(VALUE str) HIDDEN;
+#define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c)  \
+        ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
+         ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
+         : (data[part[page]][(c) & 0xff]))
-long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
+#define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback)    \
+        (((c) <= UNICODE_LAST_CHAR_PART1) \
+         ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
+         : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
+            ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
+            : (fallback)))
-bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
-                              char **limit) HIDDEN;
-void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
-                                        char **limit) HIDDEN;
-char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
-VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
-char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
-long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
-                         long offset, bool reverse) HIDDEN;
+unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
+                           NormalizeMode mode) HIDDEN;
+int _unichar_combining_class(unichar c) HIDDEN;
 #endif /* PRIVATE_H */

data/ext/encoding/character/utf-8/properties.c CHANGED Viewed

@@ -63,8 +63,8 @@ s_type(unichar c)
 	if (c <= UNICODE_LAST_CHAR_PART1) {
 		page = c >> 8;
 		table = type_table_part1;
-	} else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
-		page = (c - 0xe0000) >> 8;
+	} else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
+		page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
 		table = type_table_part2;
 	} else {
 		return UNICODE_UNASSIGNED;
@@ -364,8 +364,8 @@ special_case_table_lookup(unichar c)
         unichar tv = ATTTABLE(c >> 8, c & 0xff);
         if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
-                return utf_char(special_case_table +
-                                tv - UNICODE_SPECIAL_CASE_TABLE_START);
+                tv = utf_char(special_case_table +
+                              tv - UNICODE_SPECIAL_CASE_TABLE_START);
         if (tv == '\0')
                 return c;
@@ -429,7 +429,7 @@ unichar_totitle(unichar c)
 			return title_table[i][0];
         if (s_type(c) == UNICODE_LOWERCASE_LETTER)
-                return ATTTABLE(c >> 8, c & 0xff);
+                return unichar_toupper(c);
         return c;
 }
@@ -585,8 +585,7 @@ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
 	if (*was_i) {
                 size_t len = remove_all_combining_dot_above(c, buf);
-		return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
-                                          true);
+		return len + output_marks(p, OFFSET_IF(buf, len), true);
 	}
 	if (!s_ismark(type))
@@ -761,9 +760,48 @@ real_do_tolower(unichar c, int type, char *buf)
 /* {{{1
  * The real implementation of downcase.
- *
- * TODO: this needs a cleanup.
  */
+static size_t
+tolower_turkic_i(const char **p, char *buf)
+{
+        unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
+        if (utf_char(*p) == COMBINING_DOT_ABOVE) {
+                /* TODO: don’t we need to make sure we don’t go beyond the end
+                 * of ‘p’? */
+                *p = utf_next(*p);
+                i = LATIN_SMALL_LETTER_I;
+        }
+        return unichar_to_utf(i, buf);
+}
+static size_t
+tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
+{
+        size_t len = unichar_to_utf(base, buf);
+        len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
+        if (combiner != '\0')
+                len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
+        return len;
+}
+static size_t
+tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
+{
+        unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
+        /* SIGMA maps differently depending on whether it is final or not.  The
+         * following simplified test would fail in the case of combining marks
+         * following the sigma, but I don't think that occurs in real text.
+         * The test here matches that in ICU. */
+        if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
+                sigma = GREEK_SMALL_LETTER_SIGMA;
+        return unichar_to_utf(sigma, buf);
+}
 static size_t
 real_tolower_one(const char **p, const char *prev, char *buf,
                  LocaleType locale_type, const char *end, bool use_end)
@@ -771,70 +809,45 @@ real_tolower_one(const char **p, const char *prev, char *buf,
         unichar c = utf_char(prev);
         int type = s_type(c);
-        if (locale_type == LOCALE_TURKIC && c == 'I') {
-                if (utf_char(*p) == COMBINING_DOT_ABOVE) {
-                        /* TODO: don’t we need to make sure we don’t go beyond the end
-                         * of ‘p’? */
-                        *p = utf_next(*p);
-                        return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
-                }
+        if (locale_type == LOCALE_TURKIC && c == 'I')
+                return tolower_turkic_i(p, buf);
-                return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
-        }
+        /* Introduce an explicit dot above when lowercasing capital I’s
+         * and J’s whenever there are more accents above.
+         * [SpecialCasing.txt] */
+        if (locale_type == LOCALE_LITHUANIAN) {
+                unichar base = LATIN_SMALL_LETTER_I;
+                unichar combiner = '\0';
-        if (locale_type == LOCALE_LITHUANIAN &&
-            (c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
-             c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
-             c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
-                /* Introduce an explicit dot above the lowercasing capital I's
-                 * and J's whenever there are more accents above.
-                 * [SpecialCasing.txt] */
-                size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
-                len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
                 switch (c) {
                 case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
-                        len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
-                                              OFFSET_IF(buf, len));
+                        combiner = COMBINING_GRAVE_ACCENT;
                         break;
                 case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
-                        len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
-                                              OFFSET_IF(buf, len));
+                        combiner = COMBINING_ACUTE_ACCENT;
                         break;
                 case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
-                        len += unichar_to_utf(COMBINING_TILDE,
-                                              OFFSET_IF(buf, len));
+                        combiner = COMBINING_TILDE;
                         break;
-                }
+                case 'I':
+                case 'J':
+                case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
+                        if (!has_more_above(*p))
+                                goto no_lithuanian_i_casing;
-                return len;
-        }
+                        base = unichar_tolower(c);
+                        break;
+                default:
+                        goto no_lithuanian_i_casing;
+                }
-        if (locale_type == LOCALE_LITHUANIAN &&
-            (c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
-            has_more_above(*p)) {
-                size_t len = unichar_to_utf(unichar_tolower(c), buf);
-                return len + unichar_to_utf(COMBINING_DOT_ABOVE,
-                                            OFFSET_IF(buf, len));
+                return tolower_lithuianian_i(buf, base, combiner);
         }
-        if (c == GREEK_CAPITAL_LETTER_SIGMA) {
-                unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
-                if ((!use_end || *p < end) && **p != '\0') {
-                        unichar next_c = utf_char(*p);
-                        int next_type = s_type(next_c);
+no_lithuanian_i_casing:
-                        /* SIGMA maps differently depending on whether it is
-                         * final or not.  The following simplified test would
-                         * fail in the case of combining marks following the
-                         * sigma, but I don't think that occurs in real text.
-                         * The test here matches that in ICU. */
-                        if (s_isalpha(next_type))
-                                tv = GREEK_SMALL_LETTER_SIGMA;
-                }
-                return unichar_to_utf(tv, buf);
-        }
+        if (c == GREEK_CAPITAL_LETTER_SIGMA)
+                return tolower_sigma(p, buf, end, use_end);
         if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
                         OR(UNICODE_TITLECASE_LETTER, 0))))
@@ -879,7 +892,7 @@ utf_downcase_impl(const char *str, size_t max, bool use_max)
 	size_t len = real_tolower(str, max, use_max, NULL, locale_type);
 	char *result = ALLOC_N(char, len + 1);
 	real_tolower(str, max, use_max, result, locale_type);
-	result[len] = NUL;
+	result[len] = '\0';
 	return result;
 }
@@ -915,28 +928,19 @@ utf_downcase_n(const char *str, size_t len)
 static bool
 casefold_table_lookup(unichar c, char *folded, size_t *len)
 {
-        int begin = 0;
-        int end = lengthof(casefold_table);
+        int index;
-        if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
+        if (!unicode_table_lookup(casefold_table, c, &index))
                 return false;
-        while (true) {
-                int mid = (begin + end) / 2;
-                if (c == casefold_table[mid].ch) {
-                        if (folded != NULL)
-                                strcpy(folded, casefold_table[mid].data);
-                        *len += utf_byte_length(casefold_table[mid].data);
-                        return true;
-                } else if (mid == begin) {
-                        return false;
-                } else if (c > casefold_table[mid].ch) {
-                        begin = mid;
-                } else {
-                        end = mid;
-                }
-        }
+        char const *folded_c = casefold_table[index].data;
+        if (folded != NULL)
+                strcpy(folded, folded_c);
+        *len += utf_byte_length(folded_c);
+        return true;
 }
 static char *
@@ -1037,24 +1041,15 @@ utf_width_n(const char *str, size_t len)
 bool
 unichar_mirror(unichar c, unichar *mirrored)
 {
-	int begin = 0;
-	int end = lengthof(bidi_mirroring_table);
+        int index;
-	while (true) {
-		int mid = (begin + end) / 2;
+        if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
+                return false;
-		if (c == bidi_mirroring_table[mid].ch) {
-			if (mirrored != NULL)
-				*mirrored = bidi_mirroring_table[mid].mirrored_ch;
-			return true;
-		} else if (mid == begin) {
-			return false;
-		} else if (c > bidi_mirroring_table[mid].ch) {
-			begin = mid;
-		} else {
-			end = mid;
-		}
-	}
+        if (mirrored != NULL)
+                *mirrored = bidi_mirroring_table[index].mirrored_ch;
+        return true;
 }

data/ext/encoding/character/utf-8/rb_includes.h CHANGED Viewed

@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include "unicode.h"
 #include "private.h"
+#include "rb_private.h"
 #include "rb_methods.h"
 #endif /* RB_INCLUDES_H */

data/ext/encoding/character/utf-8/rb_private.h ADDED Viewed

@@ -0,0 +1,52 @@
+/*
+ * contents: Private Ruby-related functions.
+ *
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
+ */
+#ifndef RB_PRIVATE_H
+#define RB_PRIVATE_H
+void need_at_least_n_arguments(int argc, int n) HIDDEN;
+unichar _utf_char_validated(char const *const str,
+                            char const *const str_end) HIDDEN;
+char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
+                                            const char *limit, bool noisy) HIDDEN;
+char *_utf_offset_to_pointer_validated(const char *str, long offset,
+                                       const char *end) HIDDEN;
+char *_utf_offset_to_pointer_failable(const char *str, long offset,
+                                      const char *end) HIDDEN;
+VALUE rb_utf_new(const char *str, long len) HIDDEN;
+VALUE rb_utf_new2(const char *str) HIDDEN;
+VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
+VALUE rb_utf_alloc_using(char *str) HIDDEN;
+VALUE rb_utf_dup(VALUE str) HIDDEN;
+long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
+bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
+                              char **limit) HIDDEN;
+void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
+                                        char **limit) HIDDEN;
+char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
+VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
+char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
+long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
+                         long offset, bool reverse) HIDDEN;
+#endif /* RB_PRIVATE_H */

data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c CHANGED Viewed

@@ -113,7 +113,8 @@ rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
         if (*non_digit != 0)
                 rb_raise(rb_eArgError,
-                         "unexpected ‘%lc’ found at position %ld", c, s - str);
+                         "unexpected ‘%lc’ found at position %ld",
+                         c, utf_pointer_to_offset(str, s));
         *non_digit = c;
@@ -135,7 +136,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
                         return false;
                 rb_raise(rb_eArgError,
                          "non-digit character ‘%lc’ found at position %ld",
-                         c, s - str);
+                         c, utf_pointer_to_offset(str, s));
         }
         if (value >= base) {
@@ -144,7 +145,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
                 rb_raise(rb_eArgError,
                          "value (%d) greater than base (%d) at position %ld",
-                         value, base, s - str);
+                         value, base, utf_pointer_to_offset(str, s));
         }
         *digit_value = value;
@@ -181,7 +182,7 @@ rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
                 if (*s != '\0')
                         rb_raise(rb_eArgError,
                                  "trailing garbage found at position %ld",
-                                 s - str);
+                                 utf_pointer_to_offset(str, s));
         }
         if (POSFIXABLE(value)) {
@@ -221,7 +222,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
                 if (verify)
                         rb_raise(rb_eArgError,
                                  "extra sign ‘%c’ found at position %ld",
-                                 *s, s - str);
+                                 *s, utf_pointer_to_offset(str, s));
                 return INT2FIX(0);
         }
@@ -245,7 +246,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
         if (verify && *str == '_')
                 rb_raise(rb_eArgError,
                          "leading digit-separator ‘_’ found at position %ld",
-                         s - str);
+                         utf_pointer_to_offset(str, s));
         bit_length = bit_length / BITSPERDIG + 1;
@@ -269,7 +270,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
                 bool more_to_shift = true;
                 while (more_to_shift) {
-                        BDIGIT_DBL num = c;
+                        BDIGIT_DBL num = digit_value;
                         for (int i = 0; i < big_len; i++) {
                                 num += (BDIGIT_DBL)zds[i] * base;
@@ -294,12 +295,12 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
         if (str + 1 < s && s[-1] == '_')
                 rb_raise(rb_eArgError,
                          "trailing digit-separator ‘_’ found at position %ld",
-                         s - str);
+                         utf_pointer_to_offset(str, s));
         if (*s != '\0')
                 rb_raise(rb_eArgError,
                          "trailing garbage found at position %ld",
-                         s - str);
+                         utf_pointer_to_offset(str, s));
         return rb_big_norm(z);
 }

data/ext/encoding/character/utf-8/tables.h ADDED Viewed

@@ -0,0 +1,38 @@
+/*
+ * contents: Functions for dealing with Unicode tables.
+ *
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
+ */
+#ifndef TABLES_H
+#define TABLES_H
+/*
+static inline int
+split_unicode_table_lookup_page(const uint8_t data[][256], int16_t page, unichar c)
+{
+        return (page >= UNICODE_MAX_TABLE_INDEX) ?
+                page - UNICODE_MAX_TABLE_INDEX :
+                data[page][c & 0xff];
+}
+static inline int
+split_unicode_table_lookup(const uint8_t data[][256], const int16_t part1[], const int16_t part2[], unichar c, int fallback)
+{
+	if (c <= UNICODE_LAST_CHAR_PART1)
+                return split_unicode_table_lookup_page(data,
+                                                       part1[c >> 8],
+                                                       c);
+        if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
+                return split_unicode_table_lookup_page(data,
+                                                       part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8],
+                                                       c);
+        return fallback;
+}
+*/
+#endif /* TABLES_H */

data/ext/encoding/character/utf-8/unicode.c CHANGED Viewed

@@ -13,6 +13,7 @@
 #include <limits.h>
 #include "unicode.h"
 #include "private.h"
+#include "rb_private.h"
 #include "rb_methods.h"
 static VALUE mUTF8Methods;
@@ -85,7 +86,6 @@ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
                                                 saved_offset);
                                 else
                                         return NULL;
-                                                break;
                         }
                         offset += utf_pointer_to_offset(p, base);

data/ext/encoding/character/utf-8/utf.c CHANGED Viewed

@@ -190,6 +190,9 @@ utf_char(const char *str)
 unichar
 utf_char_n(const char *str, size_t max)
 {
+        if (max == 0)
+                return UTF_INCOMPLETE_INPUT_UNICHAR;
 	size_t len;
 	unichar c = (unsigned char)*str;
@@ -454,7 +457,7 @@ utf_collate(const char *a, const char *b)
 	unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
 	unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
-	setlocale(LC_COLLATE, "");
 	int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
 	free(a_norm);
@@ -518,7 +521,6 @@ utf_collate_key_impl(const char *str, size_t len, bool use_len)
 	assert(str != NULL);
 	unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
-	setlocale(LC_COLLATE, "");
 	size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
 	wchar_t result_wc[xfrm_len + 1];
 	wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
@@ -863,7 +865,7 @@ utf_reverse_impl(const char *str, size_t len, bool use_len)
 	char *result = ALLOC_N(char, len + 1);
 	char *r = result + len;
 	const char *p = str;
-	while (*p != NUL) {
+        while (r > result) {
 		uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
 		r -= skip;
 		for (char *m = r; skip > 0; skip--)

data/tests/case.rb ADDED Viewed

@@ -0,0 +1,102 @@
+# contents: Tests for String#upcase and String#downcase.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'tests/unicodedatatestbase'
+require 'encoding/character/utf-8'
+class TC_StringCase < Test::Unit::TestCase
+  include UnicodeDataTestBase
+  Code, Name, Category, _, _, _, _, _, _, _, _, _, Upper, Lower, Title = (0..14).to_a
+  CasingCode, CasingLower, CasingTitle, CasingUpper, CasingCondition = (0..4).to_a
+  def test_upcase_and_downcase
+    # TODO: Do it like this.  First read in SpecialCasing.txt and set up lookup
+    # tables for all the characters that need special casing.  Then, iterate
+    # over UnicodeData and simply check that the correct casings are performed,
+    # looking up data in the tables for special casing if no simple casing
+    # information is available (and skipping when appropriate - such as when
+    # there is some condition defined for the special casing).
+    special = Struct.new(:conditions, :upper, :lower, :title).new({}, [], [], [])
+    open_data_file('SpecialCasing.txt') do |file|
+      i = 0
+      file.each_line do |line|
+        i += 1
+        next if line =~ /^(#|\s*$)/
+        fields = line.sub(/\s*#.*$/, "").split('; ')
+        unless fields.size == 4 or fields.size == 5
+          raise "#{line}: Wrong number of fields; #{field.size} instead of 4 or 5."
+        end
+        code = fields[CasingCode].hex
+        special.conditions[code] = fields[CasingCondition] if fields.size == 5
+        special.upper[code] = utfify(fields[CasingUpper])
+        special.lower[code] = utfify(fields[CasingLower])
+        special.title[code] = utfify(fields[CasingTitle])
+      end
+    end
+    open_data_file('UnicodeData.txt') do |file|
+      i = 0
+      prev_code = -1
+      file.each_line do |line|
+        i += 1
+        next if line =~ /^(#|\s*$)/
+        fields = line.split(';')
+        raise "#{line}: Wrong number of fields; #{field.size} instead of 15." unless fields.size == 15
+        code = fields[Code].hex
+        if code > prev_code + 1 and fields[Name] =~ /Last>$/ and fields[Category] =~ /^L[lut]$/
+          prev_code.upto(code - 1){ |c| test_one c, fields, special }
+        end
+        test_one code, fields, special
+        prev_code = code
+      end
+    end
+    puts @i
+  end
+private
+  def utfify(codepoints)
+    return codepoints if codepoints == ""
+    codepoints.split(' ').map{ |cp| cp.hex }.pack('U*')
+  end
+  def utfone(codepoint)
+    u([codepoint].pack('U*'))
+  end
+  def test_one(code, fields, special)
+    @i ||= 0
+    @i += 1
+    case fields[Category]
+    when 'Ll'
+      test_upcase(code, fields, special)
+    when 'Lu'
+      test_downcase(code, fields, special)
+    when 'Lt'
+      test_upcase(code, fields, special)
+      test_downcase(code, fields, special)
+    end
+  end
+  def test_upcase(code, fields, special)
+    if special.upper[code]
+      if not special.conditions[code]
+        assert_equal(special.upper[code], utfone(code).upcase)
+      end
+    elsif not fields[Upper].empty?
+      assert_equal(utfify(fields[Upper]), utfone(code).upcase)
+    end
+  end
+  def test_downcase(code, fields, special)
+    if special.lower[code]
+      if not special.conditions[code]
+        assert_equal(special.lower[code], utfone(code).downcase)
+      end
+    elsif not fields[Lower].empty?
+      assert_equal(utfify(fields[Lower]), utfone(code).downcase)
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.0
+rubygems_version: 0.9.4
 specification_version: 1
 name: character-encodings
 version: !ruby/object:Gem::Version
-  version: 0.2.0
-date: 2006-07-27 00:00:00 +02:00
+  version: 0.3.0
+date: 2007-11-22 00:00:00 +01:00
 summary: A pluggable character-encoding library
 require_paths:
 - lib
@@ -48,6 +48,7 @@ files:
 - ext/encoding/character/unicode/codepoint.c
 - ext/encoding/character/utf-8/break.c
 - ext/encoding/character/utf-8/decompose.c
+- ext/encoding/character/utf-8/private.c
 - ext/encoding/character/utf-8/properties.c
 - ext/encoding/character/utf-8/rb_utf_aref.c
 - ext/encoding/character/utf-8/rb_utf_aset.c
@@ -80,19 +81,22 @@ files:
 - ext/encoding/character/utf-8/unicode.c
 - ext/encoding/character/utf-8/utf.c
 - ext/encoding/character/utf-8/rb_utf_internal_bignum.c
+- ext/encoding/character/utf-8/data/break.h
+- ext/encoding/character/utf-8/data/character-tables.h
+- ext/encoding/character/utf-8/data/compose.h
+- ext/encoding/character/utf-8/data/decompose.h
 - ext/encoding/character/utf-8/private.h
 - ext/encoding/character/utf-8/rb_includes.h
 - ext/encoding/character/utf-8/rb_methods.h
+- ext/encoding/character/utf-8/rb_private.h
 - ext/encoding/character/utf-8/rb_utf_internal_tr.h
+- ext/encoding/character/utf-8/tables.h
 - ext/encoding/character/utf-8/unicode.h
 - ext/encoding/character/utf-8/rb_utf_internal_bignum.h
-- ext/encoding/character/utf-8/data/break.h
-- ext/encoding/character/utf-8/data/character-tables.h
-- ext/encoding/character/utf-8/data/compose.h
-- ext/encoding/character/utf-8/data/decompose.h
-- ext/encoding/character/utf-8/extconf.rb
 - ext/encoding/character/utf-8/data/generate-unicode-data.rb
+- ext/encoding/character/utf-8/extconf.rb
 - ext/encoding/character/utf-8/depend
+- tests/case.rb
 - tests/foldcase.rb
 - tests/normalize.rb
 - tests/unicodedatatestbase.rb