RubyGems - character-encodings - Versions diffs - 0.2.0 → 0.3.0 - Mend

character-encodings 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/Rakefile +2 -2
data/ext/encoding/character/utf-8/break.c +6 -19
data/ext/encoding/character/utf-8/data/character-tables.h +2 -0
data/ext/encoding/character/utf-8/data/decompose.h +1 -0
data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +5 -0
data/ext/encoding/character/utf-8/decompose.c +77 -109
data/ext/encoding/character/utf-8/depend +1 -0
data/ext/encoding/character/utf-8/extconf.rb +2 -0
data/ext/encoding/character/utf-8/private.c +62 -0
data/ext/encoding/character/utf-8/private.h +18 -38
data/ext/encoding/character/utf-8/properties.c +90 -95
data/ext/encoding/character/utf-8/rb_includes.h +1 -0
data/ext/encoding/character/utf-8/rb_private.h +52 -0
data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +10 -9
data/ext/encoding/character/utf-8/tables.h +38 -0
data/ext/encoding/character/utf-8/unicode.c +1 -1
data/ext/encoding/character/utf-8/utf.c +5 -3
data/tests/case.rb +102 -0
metadata +12 -8

data/Rakefile CHANGED Viewed

@@ -10,7 +10,7 @@ require 'rake/testtask'
 require 'spec/rake/spectask'
 PackageName = 'character-encodings'
-PackageVersion = '0.2.0'
+PackageVersion = '0.3.0'
 desc 'Default task'
 task :default => [:extensions]
@@ -92,7 +92,7 @@ Spec::Rake::SpecTask.new do |t|
 end
 Tests = [
-  ['tests/foldcase.rb'],
+  ['tests/foldcase.rb', 'tests/case.rb'],
   ['tests/normalize.rb']
 ]

data/ext/encoding/character/utf-8/break.c CHANGED Viewed

@@ -9,30 +9,17 @@
 #include <stdint.h>
 #include "unicode.h"
 #include "data/break.h"
+#include "private.h"
 /* Figure out what break type the Unicode character ‘c’ possesses, if any.
  * This information is used for finding word and line boundaries, which is
  * useful when displaying Unicode text on screen. */
-static UnicodeBreakType
-break_type(const int16_t table[], unsigned int page, unichar c)
-{
-        int16_t break_property = table[page];
-        return (break_property >= UNICODE_MAX_TABLE_INDEX) ?
-                break_property - UNICODE_MAX_TABLE_INDEX :
-                break_property_data[break_property][c & 0xff];
-}
 UnicodeBreakType
 unichar_break_type(unichar c)
 {
-	if (c <= UNICODE_LAST_CHAR_PART1)
-                return break_type(break_property_table_part1, c >> 8, c);
-        if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
-                return break_type(break_property_table_part2,
-                                  (c - 0xe0000) >> 8, c);
-        return UNICODE_BREAK_UNKNOWN;
+        return SPLIT_UNICODE_TABLE_LOOKUP(break_property_data,
+                                          break_property_table_part1,
+                                          break_property_table_part2,
+                                          c,
+                                          UNICODE_BREAK_UNKNOWN);
 }

data/ext/encoding/character/utf-8/data/character-tables.h CHANGED Viewed

@@ -13,6 +13,8 @@
 #define UNICODE_LAST_PAGE_PART1 762
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000

data/ext/encoding/character/utf-8/data/decompose.h CHANGED Viewed

@@ -10,6 +10,7 @@
 #define UNICODE_LAST_CHAR_PART1 0x2faff
 #define UNICODE_LAST_PAGE_PART1 762
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_NOT_PRESENT_OFFSET 65535

data/ext/encoding/character/utf-8/data/generate-unicode-data.rb CHANGED Viewed

@@ -534,6 +534,8 @@ private
 #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_SPECIAL_CASE_TABLE_START 0x1000000
 EOF
       print_table(data, 0, @last_char_part1_i, data.last, 1,
@@ -694,8 +696,11 @@ EOF
 #define UNICODE_MAX_TABLE_INDEX (0x110000 / 256)
 #define UNICODE_LAST_CHAR_PART1 #{@last_char_part1_x}
 #define UNICODE_LAST_PAGE_PART1 #{data.pages_before_e0000 - 1}
+#define UNICODE_FIRST_CHAR_PART2 0xe0000
 #define UNICODE_NOT_PRESENT_OFFSET #{NOT_PRESENT_OFFSET}
 EOF
       print_table(data, 0, @last_char_part1_i, data.last, 1,

data/ext/encoding/character/utf-8/decompose.c CHANGED Viewed

@@ -8,35 +8,16 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include "unicode.h"
-#include "private.h"
 #include "data/decompose.h"
 #include "data/compose.h"
+#include "private.h"
-/* {{{1
- * Macros for accessing the combining class property tables for a given
- * character.
- *
- * TODO: Turn these macros into full-fledged functions, as this is rather silly
- * when we have ‹inline› in C99.
- */
-#define CC_PART1(page, char) \
-        ((combining_class_table_part1[page] >= UNICODE_MAX_TABLE_INDEX) \
-         ? (combining_class_table_part1[page] - UNICODE_MAX_TABLE_INDEX) \
-         : (cclass_data[combining_class_table_part1[page]][char]))
-#define CC_PART2(page, char) \
-        ((combining_class_table_part2[page] >= UNICODE_MAX_TABLE_INDEX) \
-         ? (combining_class_table_part2[page] - UNICODE_MAX_TABLE_INDEX) \
-         : (cclass_data[combining_class_table_part2[page]][char]))
-#define COMBINING_CLASS(char) \
-        (((char) <= UNICODE_LAST_CHAR_PART1) \
-         ? CC_PART1((char) >> 8, (char) & 0xff) \
-         : (((char) >= 0xe0000 && (char) <= UNICODE_LAST_CHAR) \
-            ? CC_PART2(((char) - 0xe0000) >> 8, (char) & 0xff) \
-            : 0))
+#define COMBINING_CLASS(c)      \
+        SPLIT_UNICODE_TABLE_LOOKUP(cclass_data, combining_class_table_part1, combining_class_table_part2, (c), 0)
 /* {{{1
@@ -115,12 +96,11 @@ unicode_canonical_ordering(unichar *str, size_t len)
 /* {{{1
  * Decompose the character ‘s’ according to the rules outlined in
  * http://www.unicode.org/unicode/reports/tr15/#Hangul.  ‘r’ should be ‹NULL›
- * or of sufficient length to store the decomposition of ‘s’.  The number of
- * characters stored (or would be if it were non-‹NULL›) in ‘r’ is put in
- * ‘r_len’.
+ * or of sufficient length to store the decomposition of ‘s’.  Returns the
+ * number of characters stored (or would be if it were non-NULL) in R.
  */
-static void
-decompose_hangul(unichar s, unichar *r, size_t *r_len)
+static size_t
+decompose_hangul(unichar s, unichar *r)
 {
         int SIndex = s - SBase;
@@ -128,8 +108,7 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
         if (SIndex < 0 || SIndex >= SCount) {
                 if (r != NULL)
                         r[0] = s;
-                *r_len = 1;
-                return;
+                return 1;
         }
         unichar L = LBase + SIndex / NCount;
@@ -141,13 +120,13 @@ decompose_hangul(unichar s, unichar *r, size_t *r_len)
                 r[1] = V;
         }
-        if (T != TBase) {
-                if (r != NULL)
-                        r[2] = T;
-                *r_len = 3;
-        } else {
-                *r_len = 2;
-        }
+        if (T == TBase)
+                return 2;
+        if (r != NULL)
+                r[2] = T;
+        return 3;
 }
@@ -179,26 +158,27 @@ get_decomposition(int index, bool compat)
 static const char *
 find_decomposition(unichar c, bool compat)
 {
-        int begin = 0;
-        int end = lengthof(decomp_table);
+        int index;
-        if (c < decomp_table[begin].ch || c > decomp_table[end - 1].ch)
+        if (!unicode_table_lookup(decomp_table, c, &index))
                 return NULL;
-        while (true) {
-                int middle = (begin + end) / 2;
+        return get_decomposition(index, compat);
+}
-                if (c == decomp_table[middle].ch)
-                        return get_decomposition(middle, compat);
-                else if (middle == begin)
-                        break;
-                else if (c > decomp_table[middle].ch)
-                        begin = middle;
-                else
-                        end = middle;
-        }
-        return NULL;
+/* {{{1
+ * Copy over the UTF-8 decomposition in ‘decomposition’ to the unichar buffer
+ * ‘chars’.  Return the number of unichars in ‘chars’.
+ */
+static size_t
+decomposition_to_wc(const char *decomposition, unichar *chars)
+{
+        size_t i = 0;
+        for (const char *p = decomposition; *p != '\0'; p = utf_next(p))
+                chars[i++] = utf_char(p);
+        return i;
 }
@@ -215,17 +195,13 @@ unicode_canonical_decomposition(unichar c, size_t *len)
         /* Hangul syllable */
         if (c >= SBase && c <= SLast) {
-                decompose_hangul(c, NULL, len);
+                *len = decompose_hangul(c, NULL);
                 r = ALLOC_N(unichar, *len);
-                decompose_hangul(c, r, len);
+                decompose_hangul(c, r);
         } else if ((decomp = find_decomposition(c, false)) != NULL) {
                 *len = utf_length(decomp);
                 r = ALLOC_N(unichar, *len);
-                int i;
-                const char *p;
-                for (p = decomp, i = 0; *p != NUL; p = utf_next(p), i++)
-                        r[i] = utf_char(p);
+                decomposition_to_wc(decomp, r);
         } else {
                 r = ALLOC(unichar);
                 *r = c;
@@ -281,23 +257,19 @@ compose_index(unichar c)
         if (page > COMPOSE_TABLE_LAST)
                 return 0;
-        /* TODO: why is this signed, exactly? */
-        int16_t compose_offset = compose_table[page];
-        return (compose_offset >= UNICODE_MAX_TABLE_INDEX) ?
-                compose_offset - UNICODE_MAX_TABLE_INDEX :
-                compose_data[compose_offset][c & 0xff];
+        return SPLIT_UNICODE_TABLE_LOOKUP_PAGE(compose_data, compose_table, page, c);
 }
 static bool
 lookup_compose(const uint16_t table[][2], uint16_t index, unichar c,
                unichar *result)
 {
-        if (c == table[index][0]) {
-                *result = table[index][1];
-                return true;
-        }
+        if (c != table[index][0])
+                return false;
-        return false;
+        *result = table[index][1];
+        return true;
 }
 static bool
@@ -307,21 +279,18 @@ combine(unichar a, unichar b, unichar *result)
                 return true;
         uint16_t index_a = compose_index(a);
-        if (index_a >= COMPOSE_FIRST_SINGLE_START &&
-            index_a < COMPOSE_SECOND_START) {
+        if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
                 return lookup_compose(compose_first_single,
                                       index_a - COMPOSE_FIRST_SINGLE_START,
                                       b,
                                       result);
-        }
         uint16_t index_b = compose_index(b);
-        if (index_b >= COMPOSE_SECOND_SINGLE_START) {
+        if (index_b >= COMPOSE_SECOND_SINGLE_START)
                 return lookup_compose(compose_second_single,
                                       index_b - COMPOSE_SECOND_SINGLE_START,
                                       a,
                                       result);
-        }
         if (index_a >= COMPOSE_FIRST_START &&
             index_a < COMPOSE_FIRST_SINGLE_START &&
@@ -356,12 +325,8 @@ normalize_wc_decompose_one(unichar c, NormalizeMode mode, unichar *buf)
                 return 1;
         }
-        if (buf != NULL) {
-                int i;
-                for (i = 0; *decomp != NUL; decomp = utf_next(decomp), i++)
-                        buf[i] = utf_char(decomp);
-                return i;
-        }
+        if (buf != NULL)
+                return decomposition_to_wc(decomp, buf);
         return utf_length(decomp);
 }
@@ -378,14 +343,10 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
                 size_t prev_n = n;
                 unichar *base = (buf != NULL) ? buf + n : NULL;
-                if (c >= SBase && c <= SLast) {
-                        size_t len;
-                        decompose_hangul(c, base, &len);
-                        n += len;
-                } else {
+                if (c >= SBase && c <= SLast)
+                        n += decompose_hangul(c, base);
+                else
                         n += normalize_wc_decompose_one(c, mode, base);
-                }
                 if (buf != NULL && n > 0 && COMBINING_CLASS(buf[prev_n]) == 0) {
                         unicode_canonical_ordering(buf + prev_start,
@@ -403,44 +364,51 @@ normalize_wc_decompose(const char *str, size_t max_len, bool use_len,
         *buf_len = n;
 }
-unichar *
-_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
+static unichar *
+normalize_wc_compose(unichar *buf, size_t len)
 {
-        size_t n;
-        normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
-        unichar *buf = ALLOC_N(unichar, n + 1);
-        normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
-        /* Just return if we don’t want composition. */
-        if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
-                return buf;
+        int new_len = len;
         size_t prev_start = 0;
         int prev_cc = 0;
-        for (size_t i = 0; i < n; i++) {
+        for (size_t i = 0; i < len; i++) {
                 int cc = COMBINING_CLASS(buf[i]);
+                size_t j = i - (len - new_len);
-                if (i > 0 && (prev_cc == 0 || prev_cc < cc) &&
+                if (j > 0 && (prev_cc == 0 || prev_cc < cc) &&
                     combine(buf[prev_start], buf[i], &buf[prev_start])) {
-                        for (size_t j = i + 1; j < n; j++)
-                                buf[j - 1] = buf[j];
-                        n--;
-                        i--;
-                        prev_cc = (i == prev_start) ? 0 : COMBINING_CLASS(buf[i - 1]);
+                        new_len--;
+                        prev_cc = (j + 1 == prev_start) ?
+                                  0 : COMBINING_CLASS(buf[j - 1]);
                 } else {
                         if (cc == 0)
-                                prev_start = i;
+                                prev_start = j;
+                        buf[j] = buf[i];
                         prev_cc = cc;
                 }
         }
-        buf[n] = NUL;
+        buf[new_len] = NUL;
         return buf;
 }
+unichar *
+_utf_normalize_wc(const char *str, size_t max_len, bool use_len, NormalizeMode mode)
+{
+        size_t n;
+        normalize_wc_decompose(str, max_len, use_len, mode, NULL, &n);
+        unichar *buf = ALLOC_N(unichar, n + 1);
+        normalize_wc_decompose(str, max_len, use_len, mode, buf, &n);
+        /* Just return if we don’t want composition. */
+        if (!(mode == NORMALIZE_NFC || mode == NORMALIZE_NFKC))
+                return buf;
+        return normalize_wc_compose(buf, n);
+}
 /* {{{1
  * Normalize (compose/decompose) characters in ‘str˚ so that strings that

data/ext/encoding/character/utf-8/depend CHANGED Viewed

@@ -1,6 +1,7 @@
 break.o: break.c unicode.h data/break.h
 decompose.o: decompose.c unicode.h private.h data/decompose.h \
   data/compose.h
+private.o: private.c private.h
 properties.o: properties.c unicode.h private.h data/character-tables.h
 rb_utf_aref.o: rb_utf_aref.c rb_includes.h unicode.h private.h \
   rb_methods.h

data/ext/encoding/character/utf-8/extconf.rb CHANGED Viewed

@@ -12,6 +12,7 @@ def try_compiler_option(opt, &b)
 end
 try_compiler_option('-std=c99')
+try_compiler_option('-finline-functions')
 try_compiler_option('-Wall')
 try_compiler_option('-Wextra')
 try_compiler_option('-Wwrite-strings')
@@ -23,6 +24,7 @@ try_compiler_option('-Wundef')
 try_compiler_option('-Wpointer-arith')
 try_compiler_option('-Wcast-align')
 try_compiler_option('-Werror')
+try_compiler_option('-Winline')
 # XXX: sadly, -Wshadow is a bit too strict.  It will, for example, whine about
 # local variables called “index” on FreeBSD.
 # try_compiler_option('-Wshadow')

data/ext/encoding/character/utf-8/private.c ADDED Viewed

@@ -0,0 +1,62 @@
+/*
+ * contents: Private functions used by the UTF-8 character-encoding library.
+ *
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
+ */
+#include <ruby.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "unicode.h"
+#include "private.h"
+/* Lookup C in the sorted TABLE using binary search.  TABLE consists of N
+ * entries, where each entry is SIZEOF_ENTRY bytes in size and the first
+ * component is a unichar of size SIZEOF_CHAR.  If C is found in TABLE, its
+ * index is stored in INDEX and true is returned.  Otherwise, false is returned
+ * and INDEX is left untouched. */
+bool
+binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index)
+{
+#define ENTRY(index) ((unichar)(*(unichar *)((const char *)table + ((index) * sizeof_entry))) & char_mask)
+	int begin = 0;
+        int end = n - 1;
+        int middle;
+        /* This is ugly, but not all tables use unichars as their lookup
+         * character.  The casefold table, for example, uses uint16_t-sized
+         * characters.  To only get the interesting part of our table entry
+         * we’ll have to mask the retrieved value. */
+        int char_mask = (1 << (8 * sizeof_char)) - 1;
+        /* Drop out early if we know for certain that C can’t be in the
+         * decomposition table. */
+        if (c < ENTRY(0) || c > ENTRY(end))
+                return false;
+        while (begin <= end) {
+                middle = binary_search_middle_of(begin, end);
+                unichar probe = ENTRY(middle);
+                if (c < probe)
+                        end = middle - 1;
+                else if (c > probe)
+                        begin = middle + 1;
+                else
+                        break;
+        }
+        if (begin > end)
+                return false;
+        *index = middle;
+        return true;
+#undef ENTRY
+}

data/ext/encoding/character/utf-8/private.h CHANGED Viewed

@@ -21,48 +21,28 @@
 #  define HIDDEN(u)
 #endif
-unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
-                           NormalizeMode mode) HIDDEN;
-inline int _unichar_combining_class(unichar c) HIDDEN;
-void need_at_least_n_arguments(int argc, int n) HIDDEN;
-unichar _utf_char_validated(char const *const str,
-                            char const *const str_end) HIDDEN;
-char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
-                                            const char *limit, bool noisy) HIDDEN;
-char *_utf_offset_to_pointer_validated(const char *str, long offset,
-                                       const char *end) HIDDEN;
-char *_utf_offset_to_pointer_failable(const char *str, long offset,
-                                      const char *end) HIDDEN;
-VALUE rb_utf_new(const char *str, long len) HIDDEN;
-VALUE rb_utf_new2(const char *str) HIDDEN;
+#define binary_search_middle_of(begin, end)     \
+        (((unsigned)((begin) + (end))) >> 1)
-VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
+#define unicode_table_lookup(table, c, index)    \
+        binary_search_unicode_table(table, lengthof(table), sizeof((table)[0]), sizeof((table)[0].ch), c, index)
-VALUE rb_utf_alloc_using(char *str) HIDDEN;
+bool binary_search_unicode_table(const void *table, size_t n, size_t sizeof_entry, size_t sizeof_char, unichar c, int *index) HIDDEN;
-VALUE rb_utf_dup(VALUE str) HIDDEN;
+#define SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part, page, c)  \
+        ((part[page] >= UNICODE_MAX_TABLE_INDEX) \
+         ? (part[page] - UNICODE_MAX_TABLE_INDEX) \
+         : (data[part[page]][(c) & 0xff]))
-long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
+#define SPLIT_UNICODE_TABLE_LOOKUP(data, part1, part2, c, fallback)    \
+        (((c) <= UNICODE_LAST_CHAR_PART1) \
+         ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part1, (c) >> 8, c) \
+         : (((c) >= UNICODE_FIRST_CHAR_PART2 && (c) <= UNICODE_LAST_CHAR) \
+            ? SPLIT_UNICODE_TABLE_LOOKUP_PAGE(data, part2, ((c) - UNICODE_FIRST_CHAR_PART2) >> 8, c) \
+            : (fallback)))
-bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
-                              char **limit) HIDDEN;
-void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
-                                        char **limit) HIDDEN;
-char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
-VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
-char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
-long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
-                         long offset, bool reverse) HIDDEN;
+unichar *_utf_normalize_wc(const char *str, size_t max_len, bool use_len,
+                           NormalizeMode mode) HIDDEN;
+int _unichar_combining_class(unichar c) HIDDEN;
 #endif /* PRIVATE_H */

data/ext/encoding/character/utf-8/properties.c CHANGED Viewed

@@ -63,8 +63,8 @@ s_type(unichar c)
 	if (c <= UNICODE_LAST_CHAR_PART1) {
 		page = c >> 8;
 		table = type_table_part1;
-	} else if (c >= 0xe0000 && c <= UNICODE_LAST_CHAR) {
-		page = (c - 0xe0000) >> 8;
+	} else if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR) {
+		page = (c - UNICODE_FIRST_CHAR_PART2) >> 8;
 		table = type_table_part2;
 	} else {
 		return UNICODE_UNASSIGNED;
@@ -364,8 +364,8 @@ special_case_table_lookup(unichar c)
         unichar tv = ATTTABLE(c >> 8, c & 0xff);
         if (tv >= UNICODE_SPECIAL_CASE_TABLE_START)
-                return utf_char(special_case_table +
-                                tv - UNICODE_SPECIAL_CASE_TABLE_START);
+                tv = utf_char(special_case_table +
+                              tv - UNICODE_SPECIAL_CASE_TABLE_START);
         if (tv == '\0')
                 return c;
@@ -429,7 +429,7 @@ unichar_totitle(unichar c)
 			return title_table[i][0];
         if (s_type(c) == UNICODE_LOWERCASE_LETTER)
-                return ATTTABLE(c >> 8, c & 0xff);
+                return unichar_toupper(c);
         return c;
 }
@@ -585,8 +585,7 @@ real_toupper_lithuanian(const char **p, unichar c, int type, char *buf,
 	if (*was_i) {
                 size_t len = remove_all_combining_dot_above(c, buf);
-		return len + output_marks(p, (buf != NULL) ? buf + len : NULL,
-                                          true);
+		return len + output_marks(p, OFFSET_IF(buf, len), true);
 	}
 	if (!s_ismark(type))
@@ -761,9 +760,48 @@ real_do_tolower(unichar c, int type, char *buf)
 /* {{{1
  * The real implementation of downcase.
- *
- * TODO: this needs a cleanup.
  */
+static size_t
+tolower_turkic_i(const char **p, char *buf)
+{
+        unichar i = LATIN_SMALL_LETTER_DOTLESS_I;
+        if (utf_char(*p) == COMBINING_DOT_ABOVE) {
+                /* TODO: don’t we need to make sure we don’t go beyond the end
+                 * of ‘p’? */
+                *p = utf_next(*p);
+                i = LATIN_SMALL_LETTER_I;
+        }
+        return unichar_to_utf(i, buf);
+}
+static size_t
+tolower_lithuianian_i(char *buf, unichar base, unichar combiner)
+{
+        size_t len = unichar_to_utf(base, buf);
+        len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
+        if (combiner != '\0')
+                len += unichar_to_utf(combiner, OFFSET_IF(buf, len));
+        return len;
+}
+static size_t
+tolower_sigma(const char **p, char *buf, const char *end, bool use_end)
+{
+        unichar sigma = GREEK_SMALL_LETTER_FINAL_SIGMA;
+        /* SIGMA maps differently depending on whether it is final or not.  The
+         * following simplified test would fail in the case of combining marks
+         * following the sigma, but I don't think that occurs in real text.
+         * The test here matches that in ICU. */
+        if ((!use_end || *p < end) && **p != '\0' && s_isalpha(s_type(utf_char(*p))))
+                sigma = GREEK_SMALL_LETTER_SIGMA;
+        return unichar_to_utf(sigma, buf);
+}
 static size_t
 real_tolower_one(const char **p, const char *prev, char *buf,
                  LocaleType locale_type, const char *end, bool use_end)
@@ -771,70 +809,45 @@ real_tolower_one(const char **p, const char *prev, char *buf,
         unichar c = utf_char(prev);
         int type = s_type(c);
-        if (locale_type == LOCALE_TURKIC && c == 'I') {
-                if (utf_char(*p) == COMBINING_DOT_ABOVE) {
-                        /* TODO: don’t we need to make sure we don’t go beyond the end
-                         * of ‘p’? */
-                        *p = utf_next(*p);
-                        return unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
-                }
+        if (locale_type == LOCALE_TURKIC && c == 'I')
+                return tolower_turkic_i(p, buf);
-                return unichar_to_utf(LATIN_SMALL_LETTER_DOTLESS_I, buf);
-        }
+        /* Introduce an explicit dot above the lowercasing capital I’s
+         * and J’s whenever there are more accents above.
+         * [SpecialCasing.txt] */
+        if (locale_type == LOCALE_LITHUANIAN) {
+                unichar base = LATIN_SMALL_LETTER_I;
+                unichar combiner = '\0';
-        if (locale_type == LOCALE_LITHUANIAN &&
-            (c == LATIN_CAPITAL_LETTER_I_WITH_GRAVE ||
-             c == LATIN_CAPITAL_LETTER_I_WITH_ACUTE ||
-             c == LATIN_CAPITAL_LETTER_I_WITH_TILDE)) {
-                /* Introduce an explicit dot above the lowercasing capital I's
-                 * and J's whenever there are more accents above.
-                 * [SpecialCasing.txt] */
-                size_t len = unichar_to_utf(LATIN_SMALL_LETTER_I, buf);
-                len += unichar_to_utf(COMBINING_DOT_ABOVE, OFFSET_IF(buf, len));
                 switch (c) {
                 case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
-                        len += unichar_to_utf(COMBINING_GRAVE_ACCENT,
-                                              OFFSET_IF(buf, len));
+                        combiner = COMBINING_GRAVE_ACCENT;
                         break;
                 case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
-                        len += unichar_to_utf(COMBINING_ACUTE_ACCENT,
-                                              OFFSET_IF(buf, len));
+                        combiner = COMBINING_ACUTE_ACCENT;
                         break;
                 case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
-                        len += unichar_to_utf(COMBINING_TILDE,
-                                              OFFSET_IF(buf, len));
+                        combiner = COMBINING_TILDE;
                         break;
-                }
+                case 'I':
+                case 'J':
+                case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
+                        if (!has_more_above(*p))
+                                goto no_lithuanian_i_casing;
-                return len;
-        }
+                        base = unichar_tolower(c);
+                        break;
+                default:
+                        goto no_lithuanian_i_casing;
+                }
-        if (locale_type == LOCALE_LITHUANIAN &&
-            (c == 'I' || c == 'J' || c == LATIN_CAPITAL_LETTER_I_WITH_OGONEK) &&
-            has_more_above(*p)) {
-                size_t len = unichar_to_utf(unichar_tolower(c), buf);
-                return len + unichar_to_utf(COMBINING_DOT_ABOVE,
-                                            OFFSET_IF(buf, len));
+                return tolower_lithuianian_i(buf, base, combiner);
         }
-        if (c == GREEK_CAPITAL_LETTER_SIGMA) {
-                unichar tv = GREEK_SMALL_LETTER_FINAL_SIGMA;
-                if ((!use_end || *p < end) && **p != '\0') {
-                        unichar next_c = utf_char(*p);
-                        int next_type = s_type(next_c);
+no_lithuanian_i_casing:
-                        /* SIGMA maps differently depending on whether it is
-                         * final or not.  The following simplified test would
-                         * fail in the case of combining marks following the
-                         * sigma, but I don't think that occurs in real text.
-                         * The test here matches that in ICU. */
-                        if (s_isalpha(next_type))
-                                tv = GREEK_SMALL_LETTER_SIGMA;
-                }
-                return unichar_to_utf(tv, buf);
-        }
+        if (c == GREEK_CAPITAL_LETTER_SIGMA)
+                return tolower_sigma(p, buf, end, use_end);
         if (IS(type, OR(UNICODE_UPPERCASE_LETTER,
                         OR(UNICODE_TITLECASE_LETTER, 0))))
@@ -879,7 +892,7 @@ utf_downcase_impl(const char *str, size_t max, bool use_max)
 	size_t len = real_tolower(str, max, use_max, NULL, locale_type);
 	char *result = ALLOC_N(char, len + 1);
 	real_tolower(str, max, use_max, result, locale_type);
-	result[len] = NUL;
+	result[len] = '\0';
 	return result;
 }
@@ -915,28 +928,19 @@ utf_downcase_n(const char *str, size_t len)
 static bool
 casefold_table_lookup(unichar c, char *folded, size_t *len)
 {
-        int begin = 0;
-        int end = lengthof(casefold_table);
+        int index;
-        if (c < casefold_table[begin].ch || c > casefold_table[end - 1].ch)
+        if (!unicode_table_lookup(casefold_table, c, &index))
                 return false;
-        while (true) {
-                int mid = (begin + end) / 2;
-                if (c == casefold_table[mid].ch) {
-                        if (folded != NULL)
-                                strcpy(folded, casefold_table[mid].data);
-                        *len += utf_byte_length(casefold_table[mid].data);
-                        return true;
-                } else if (mid == begin) {
-                        return false;
-                } else if (c > casefold_table[mid].ch) {
-                        begin = mid;
-                } else {
-                        end = mid;
-                }
-        }
+        char const *folded_c = casefold_table[index].data;
+        if (folded != NULL)
+                strcpy(folded, folded_c);
+        *len += utf_byte_length(folded_c);
+        return true;
 }
 static char *
@@ -1037,24 +1041,15 @@ utf_width_n(const char *str, size_t len)
 bool
 unichar_mirror(unichar c, unichar *mirrored)
 {
-	int begin = 0;
-	int end = lengthof(bidi_mirroring_table);
+        int index;
-	while (true) {
-		int mid = (begin + end) / 2;
+        if (!unicode_table_lookup(bidi_mirroring_table, c, &index))
+                return false;
-		if (c == bidi_mirroring_table[mid].ch) {
-			if (mirrored != NULL)
-				*mirrored = bidi_mirroring_table[mid].mirrored_ch;
-			return true;
-		} else if (mid == begin) {
-			return false;
-		} else if (c > bidi_mirroring_table[mid].ch) {
-			begin = mid;
-		} else {
-			end = mid;
-		}
-	}
+        if (mirrored != NULL)
+                *mirrored = bidi_mirroring_table[index].mirrored_ch;
+        return true;
 }

data/ext/encoding/character/utf-8/rb_includes.h CHANGED Viewed

@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include "unicode.h"
 #include "private.h"
+#include "rb_private.h"
 #include "rb_methods.h"
 #endif /* RB_INCLUDES_H */

data/ext/encoding/character/utf-8/rb_private.h ADDED Viewed

@@ -0,0 +1,52 @@
+/*
+ * contents: Private Ruby-related functions.
+ *
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
+ */
+#ifndef RB_PRIVATE_H
+#define RB_PRIVATE_H
+void need_at_least_n_arguments(int argc, int n) HIDDEN;
+unichar _utf_char_validated(char const *const str,
+                            char const *const str_end) HIDDEN;
+char *_utf_offset_to_pointer_validated_impl(const char *str, long offset,
+                                            const char *limit, bool noisy) HIDDEN;
+char *_utf_offset_to_pointer_validated(const char *str, long offset,
+                                       const char *end) HIDDEN;
+char *_utf_offset_to_pointer_failable(const char *str, long offset,
+                                      const char *end) HIDDEN;
+VALUE rb_utf_new(const char *str, long len) HIDDEN;
+VALUE rb_utf_new2(const char *str) HIDDEN;
+VALUE rb_utf_new5(VALUE obj, const char *str, long len) HIDDEN;
+VALUE rb_utf_alloc_using(char *str) HIDDEN;
+VALUE rb_utf_dup(VALUE str) HIDDEN;
+long rb_utf_index(VALUE str, VALUE sub, long offset) HIDDEN;
+bool rb_utf_begin_from_offset(VALUE str, long offset, char **begin,
+                              char **limit) HIDDEN;
+void rb_utf_begin_from_offset_validated(VALUE str, long offset, char **begin,
+                                        char **limit) HIDDEN;
+char *rb_utf_prev_validated(const char *begin, const char *p) HIDDEN;
+VALUE rb_utf_update(VALUE str, long offset, long len, VALUE replacement) HIDDEN;
+char *rb_utf_next_validated(const char *p, const char *end) HIDDEN;
+long rb_utf_index_regexp(VALUE str, const char *s, const char *end, VALUE sub,
+                         long offset, bool reverse) HIDDEN;
+#endif /* RB_PRIVATE_H */

data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c CHANGED Viewed

@@ -113,7 +113,8 @@ rb_utf_to_inum_num_separator(const char *str, const char *s, bool verify,
         if (*non_digit != 0)
                 rb_raise(rb_eArgError,
-                         "unexpected ‘%lc’ found at position %ld", c, s - str);
+                         "unexpected ‘%lc’ found at position %ld",
+                         c, utf_pointer_to_offset(str, s));
         *non_digit = c;
@@ -135,7 +136,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
                         return false;
                 rb_raise(rb_eArgError,
                          "non-digit character ‘%lc’ found at position %ld",
-                         c, s - str);
+                         c, utf_pointer_to_offset(str, s));
         }
         if (value >= base) {
@@ -144,7 +145,7 @@ rb_utf_to_inum_digit_value(const char *str, const char *s, unichar c,
                 rb_raise(rb_eArgError,
                          "value (%d) greater than base (%d) at position %ld",
-                         value, base, s - str);
+                         value, base, utf_pointer_to_offset(str, s));
         }
         *digit_value = value;
@@ -181,7 +182,7 @@ rb_utf_to_inum_as_fix(const char *str, const char *s, int sign, int base,
                 if (*s != '\0')
                         rb_raise(rb_eArgError,
                                  "trailing garbage found at position %ld",
-                                 s - str);
+                                 utf_pointer_to_offset(str, s));
         }
         if (POSFIXABLE(value)) {
@@ -221,7 +222,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
                 if (verify)
                         rb_raise(rb_eArgError,
                                  "extra sign ‘%c’ found at position %ld",
-                                 *s, s - str);
+                                 *s, utf_pointer_to_offset(str, s));
                 return INT2FIX(0);
         }
@@ -245,7 +246,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
         if (verify && *str == '_')
                 rb_raise(rb_eArgError,
                          "leading digit-separator ‘_’ found at position %ld",
-                         s - str);
+                         utf_pointer_to_offset(str, s));
         bit_length = bit_length / BITSPERDIG + 1;
@@ -269,7 +270,7 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
                 bool more_to_shift = true;
                 while (more_to_shift) {
-                        BDIGIT_DBL num = c;
+                        BDIGIT_DBL num = digit_value;
                         for (int i = 0; i < big_len; i++) {
                                 num += (BDIGIT_DBL)zds[i] * base;
@@ -294,12 +295,12 @@ rb_cutf_to_inum(const char * const str, int base, bool verify)
         if (str + 1 < s && s[-1] == '_')
                 rb_raise(rb_eArgError,
                          "trailing digit-separator ‘_’ found at position %ld",
-                         s - str);
+                         utf_pointer_to_offset(str, s));
         if (*s != '\0')
                 rb_raise(rb_eArgError,
                          "trailing garbage found at position %ld",
-                         s - str);
+                         utf_pointer_to_offset(str, s));
         return rb_big_norm(z);
 }

data/ext/encoding/character/utf-8/tables.h ADDED Viewed

@@ -0,0 +1,38 @@
+/*
+ * contents: Functions for dealing with Unicode tables.
+ *
+ * Copyright © 2007 Nikolai Weibull <now@bitwi.se>
+ */
+#ifndef TABLES_H
+#define TABLES_H
+/*
+static inline int
+split_unicode_table_lookup_page(const uint8_t data[][256], int16_t page, unichar c)
+{
+        return (page >= UNICODE_MAX_TABLE_INDEX) ?
+                page - UNICODE_MAX_TABLE_INDEX :
+                data[page][c & 0xff];
+}
+static inline int
+split_unicode_table_lookup(const uint8_t data[][256], const int16_t part1[], const int16_t part2[], unichar c, int fallback)
+{
+	if (c <= UNICODE_LAST_CHAR_PART1)
+                return split_unicode_table_lookup_page(data,
+                                                       part1[c >> 8],
+                                                       c);
+        if (c >= UNICODE_FIRST_CHAR_PART2 && c <= UNICODE_LAST_CHAR)
+                return split_unicode_table_lookup_page(data,
+                                                       part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8],
+                                                       c);
+        return fallback;
+}
+*/
+#endif /* TABLES_H */

data/ext/encoding/character/utf-8/unicode.c CHANGED Viewed

@@ -13,6 +13,7 @@
 #include <limits.h>
 #include "unicode.h"
 #include "private.h"
+#include "rb_private.h"
 #include "rb_methods.h"
 static VALUE mUTF8Methods;
@@ -85,7 +86,6 @@ _utf_offset_to_pointer_validated_impl(const char *str, long offset,
                                                 saved_offset);
                                 else
                                         return NULL;
-                                                break;
                         }
                         offset += utf_pointer_to_offset(p, base);

data/ext/encoding/character/utf-8/utf.c CHANGED Viewed

@@ -190,6 +190,9 @@ utf_char(const char *str)
 unichar
 utf_char_n(const char *str, size_t max)
 {
+        if (max == 0)
+                return UTF_INCOMPLETE_INPUT_UNICHAR;
 	size_t len;
 	unichar c = (unsigned char)*str;
@@ -454,7 +457,7 @@ utf_collate(const char *a, const char *b)
 	unichar *a_norm = _utf_normalize_wc(a, 0, false, NORMALIZE_ALL_COMPOSE);
 	unichar *b_norm = _utf_normalize_wc(b, 0, false, NORMALIZE_ALL_COMPOSE);
-	setlocale(LC_COLLATE, "");
 	int result = wcscoll((wchar_t *)a_norm, (wchar_t *)b_norm);
 	free(a_norm);
@@ -518,7 +521,6 @@ utf_collate_key_impl(const char *str, size_t len, bool use_len)
 	assert(str != NULL);
 	unichar *str_norm = _utf_normalize_wc(str, len, use_len, NORMALIZE_ALL_COMPOSE);
-	setlocale(LC_COLLATE, "");
 	size_t xfrm_len = wcsxfrm(NULL, (wchar_t *)str_norm, 0);
 	wchar_t result_wc[xfrm_len + 1];
 	wcsxfrm(result_wc, (wchar_t *)str_norm, xfrm_len + 1);
@@ -863,7 +865,7 @@ utf_reverse_impl(const char *str, size_t len, bool use_len)
 	char *result = ALLOC_N(char, len + 1);
 	char *r = result + len;
 	const char *p = str;
-	while (*p != NUL) {
+        while (r > result) {
 		uint8_t skip = s_utf_skip_lengths[*(unsigned char *)p];
 		r -= skip;
 		for (char *m = r; skip > 0; skip--)

data/tests/case.rb ADDED Viewed

@@ -0,0 +1,102 @@
+# contents: Tests for String#upcase and String#downcase.
+#
+# Copyright © 2006 Nikolai Weibull <now@bitwi.se>
+require 'tests/unicodedatatestbase'
+require 'encoding/character/utf-8'
+class TC_StringCase < Test::Unit::TestCase
+  include UnicodeDataTestBase
+  Code, Name, Category, _, _, _, _, _, _, _, _, _, Upper, Lower, Title = (0..14).to_a
+  CasingCode, CasingLower, CasingTitle, CasingUpper, CasingCondition = (0..4).to_a
+  def test_upcase_and_downcase
+    # TODO: Do it like this.  First read in SpecialCasing.txt and set up lookup
+    # tables for all the characters that need special casing.  Then, iterate
+    # over UnicodeData and simply check that the correct casings are performed,
+    # looking up data in the tables for special casing if no simple casing
+    # information is available (and skipping when appropriate - such as when
+    # there is some condition defined for the special casing).
+    special = Struct.new(:conditions, :upper, :lower, :title).new({}, [], [], [])
+    open_data_file('SpecialCasing.txt') do |file|
+      i = 0
+      file.each_line do |line|
+        i += 1
+        next if line =~ /^(#|\s*$)/
+        fields = line.sub(/\s*#.*$/, "").split('; ')
+        unless fields.size == 4 or fields.size == 5
+          raise "#{line}: Wrong number of fields; #{field.size} instead of 4 or 5."
+        end
+        code = fields[CasingCode].hex
+        special.conditions[code] = fields[CasingCondition] if fields.size == 5
+        special.upper[code] = utfify(fields[CasingUpper])
+        special.lower[code] = utfify(fields[CasingLower])
+        special.title[code] = utfify(fields[CasingTitle])
+      end
+    end
+    open_data_file('UnicodeData.txt') do |file|
+      i = 0
+      prev_code = -1
+      file.each_line do |line|
+        i += 1
+        next if line =~ /^(#|\s*$)/
+        fields = line.split(';')
+        raise "#{line}: Wrong number of fields; #{field.size} instead of 15." unless fields.size == 15
+        code = fields[Code].hex
+        if code > prev_code + 1 and fields[Name] =~ /Last>$/ and fields[Category] =~ /^L[lut]$/
+          prev_code.upto(code - 1){ |c| test_one c, fields, special }
+        end
+        test_one code, fields, special
+        prev_code = code
+      end
+    end
+    puts @i
+  end
+private
+  def utfify(codepoints)
+    return codepoints if codepoints == ""
+    codepoints.split(' ').map{ |cp| cp.hex }.pack('U*')
+  end
+  def utfone(codepoint)
+    u([codepoint].pack('U*'))
+  end
+  def test_one(code, fields, special)
+    @i ||= 0
+    @i += 1
+    case fields[Category]
+    when 'Ll'
+      test_upcase(code, fields, special)
+    when 'Lu'
+      test_downcase(code, fields, special)
+    when 'Lt'
+      test_upcase(code, fields, special)
+      test_downcase(code, fields, special)
+    end
+  end
+  def test_upcase(code, fields, special)
+    if special.upper[code]
+      if not special.conditions[code]
+        assert_equal(special.upper[code], utfone(code).upcase)
+      end
+    elsif not fields[Upper].empty?
+      assert_equal(utfify(fields[Upper]), utfone(code).upcase)
+    end
+  end
+  def test_downcase(code, fields, special)
+    if special.lower[code]
+      if not special.conditions[code]
+        assert_equal(special.lower[code], utfone(code).downcase)
+      end
+    elsif not fields[Lower].empty?
+      assert_equal(utfify(fields[Lower]), utfone(code).downcase)
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.0
+rubygems_version: 0.9.4
 specification_version: 1
 name: character-encodings
 version: !ruby/object:Gem::Version
-  version: 0.2.0
-date: 2006-07-27 00:00:00 +02:00
+  version: 0.3.0
+date: 2007-11-22 00:00:00 +01:00
 summary: A pluggable character-encoding library
 require_paths:
 - lib
@@ -48,6 +48,7 @@ files:
 - ext/encoding/character/unicode/codepoint.c
 - ext/encoding/character/utf-8/break.c
 - ext/encoding/character/utf-8/decompose.c
+- ext/encoding/character/utf-8/private.c
 - ext/encoding/character/utf-8/properties.c
 - ext/encoding/character/utf-8/rb_utf_aref.c
 - ext/encoding/character/utf-8/rb_utf_aset.c
@@ -80,19 +81,22 @@ files:
 - ext/encoding/character/utf-8/unicode.c
 - ext/encoding/character/utf-8/utf.c
 - ext/encoding/character/utf-8/rb_utf_internal_bignum.c
+- ext/encoding/character/utf-8/data/break.h
+- ext/encoding/character/utf-8/data/character-tables.h
+- ext/encoding/character/utf-8/data/compose.h
+- ext/encoding/character/utf-8/data/decompose.h
 - ext/encoding/character/utf-8/private.h
 - ext/encoding/character/utf-8/rb_includes.h
 - ext/encoding/character/utf-8/rb_methods.h
+- ext/encoding/character/utf-8/rb_private.h
 - ext/encoding/character/utf-8/rb_utf_internal_tr.h
+- ext/encoding/character/utf-8/tables.h
 - ext/encoding/character/utf-8/unicode.h
 - ext/encoding/character/utf-8/rb_utf_internal_bignum.h
-- ext/encoding/character/utf-8/data/break.h
-- ext/encoding/character/utf-8/data/character-tables.h
-- ext/encoding/character/utf-8/data/compose.h
-- ext/encoding/character/utf-8/data/decompose.h
-- ext/encoding/character/utf-8/extconf.rb
 - ext/encoding/character/utf-8/data/generate-unicode-data.rb
+- ext/encoding/character/utf-8/extconf.rb
 - ext/encoding/character/utf-8/depend
+- tests/case.rb
 - tests/foldcase.rb
 - tests/normalize.rb
 - tests/unicodedatatestbase.rb