ox 1.6.4 → 1.6.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of ox might be problematic. Click here for more details.

data/README.md CHANGED
@@ -34,10 +34,9 @@ A fast XML parser and Object marshaller as a Ruby gem.
34
34
 
35
35
  ## <a name="release">Release Notes</a>
36
36
 
37
- ### Release 1.6.4
37
+ ### Release 1.6.5
38
38
 
39
- - Special character handling has been improved. Both hex and base 10 numeric values are allowed up to a 64 bit number
40
- for really long UTF-8 characters.
39
+ - Special character handling now supports UCS-2 and UCS-4 Unicode characters as well as UTF-8 characters.
41
40
 
42
41
  ## <a name="description">Description</a>
43
42
 
@@ -128,6 +128,12 @@ static VALUE with_instruct_sym;
128
128
  static VALUE with_xml_sym;
129
129
  static VALUE xsd_date_sym;
130
130
 
131
+ #if HAS_ENCODING_SUPPORT
132
+ rb_encoding *ox_utf8_encoding = 0;
133
+ #else
134
+ void *ox_utf8_encoding = 0;
135
+ #endif
136
+
131
137
  struct _Options ox_default_options = {
132
138
  { '\0' }, /* encoding */
133
139
  2, /* indent */
@@ -773,6 +779,9 @@ void Init_ox() {
773
779
 
774
780
  rb_define_module_function(Ox, "cache_test", cache_test, 0);
775
781
  rb_define_module_function(Ox, "cache8_test", cache8_test, 0);
782
+ #if HAS_ENCODING_SUPPORT
783
+ ox_utf8_encoding = rb_enc_find("UTF-8");
784
+ #endif
776
785
  }
777
786
 
778
787
  void
@@ -258,6 +258,12 @@ extern ID ox_tv_nsec_id;
258
258
  extern ID ox_tv_usec_id;
259
259
  extern ID ox_value_id;
260
260
 
261
+ #if HAS_ENCODING_SUPPORT
262
+ extern rb_encoding *ox_utf8_encoding;
263
+ #else
264
+ extern void *ox_utf8_encoding;
265
+ #endif
266
+
261
267
  extern VALUE ox_date_class;
262
268
  extern VALUE ox_empty_string;
263
269
  extern VALUE ox_encoding_sym;
@@ -47,10 +47,10 @@ static char* read_name_token(PInfo pi);
47
47
  static char* read_quoted_value(PInfo pi);
48
48
  static char* read_hex_uint64(char *b, uint64_t *up);
49
49
  static char* read_10_uint64(char *b, uint64_t *up);
50
- static char* uint64_to_chars(char *text, uint64_t u);
50
+ static char* ucs_to_utf8_chars(char *text, uint64_t u);
51
51
  static char* read_coded_chars(PInfo pi, char *text);
52
52
  static void next_non_white(PInfo pi);
53
- static int collapse_special(char *str);
53
+ static int collapse_special(PInfo pi, char *str);
54
54
 
55
55
  /* This XML parser is a single pass, destructive, callback parser. It is a
56
56
  * single pass parse since it only makes one pass over the characters in the
@@ -364,7 +364,7 @@ read_element(PInfo pi) {
364
364
  next_non_white(pi);
365
365
  ap->value = read_quoted_value(pi);
366
366
  if (0 != strchr(ap->value, '&')) {
367
- if (0 != collapse_special((char*)ap->value)) {
367
+ if (0 != collapse_special(pi, (char*)ap->value)) {
368
368
  raise_error("invalid format, special character does not end with a semicolon", pi->str, pi->s);
369
369
  }
370
370
  }
@@ -701,12 +701,57 @@ read_10_uint64(char *b, uint64_t *up) {
701
701
  return b;
702
702
  }
703
703
 
704
+ /*
705
+ u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
706
+ u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
707
+ u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
708
+ u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
709
+ */
704
710
  static char*
705
- uint64_to_chars(char *text, uint64_t u) {
711
+ ucs_to_utf8_chars(char *text, uint64_t u) {
706
712
  int reading = 0;
707
713
  int i;
708
714
  unsigned char c;
709
715
 
716
+ if (u <= 0x000000000000007FULL) {
717
+ /* 0xxxxxxx */
718
+ *text++ = (char)u;
719
+ } else if (u <= 0x00000000000007FFULL) {
720
+ /* 110yyyyy 10xxxxxx */
721
+ *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
722
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
723
+ } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
724
+ /* 1110zzzz 10yyyyyy 10xxxxxx */
725
+ *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
726
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
727
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
728
+ } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
729
+ /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
730
+ *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
731
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
732
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
733
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
734
+ } else {
735
+ /* assume it is UTF-8 encoded directly and not UCS */
736
+ for (i = 56; 0 <= i; i -= 8) {
737
+ c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
738
+ if (reading) {
739
+ *text++ = (char)c;
740
+ } else if ('\0' != c) {
741
+ *text++ = (char)c;
742
+ reading = 1;
743
+ }
744
+ }
745
+ }
746
+ return text;
747
+ }
748
+
749
+ #if 0
750
+ static char*
751
+ uint64_to_chars(char *text, uint64_t u) {
752
+ int reading = 0;
753
+ int i;
754
+ unsigned char c;
710
755
 
711
756
  for (i = 56; 0 <= i; i -= 8) {
712
757
  c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
@@ -719,6 +764,7 @@ uint64_to_chars(char *text, uint64_t u) {
719
764
  }
720
765
  return text;
721
766
  }
767
+ #endif
722
768
 
723
769
  static char*
724
770
  read_coded_chars(PInfo pi, char *text) {
@@ -749,7 +795,17 @@ read_coded_chars(PInfo pi, char *text) {
749
795
  *text++ = *pi->s;
750
796
  } else {
751
797
  pi->s = s;
752
- text = uint64_to_chars(text, u);
798
+ if (u <= 0x000000000000007FULL) {
799
+ *text++ = (char)u;
800
+ } else if (ox_utf8_encoding == pi->encoding) {
801
+ text = ucs_to_utf8_chars(text, u);
802
+ } else if (0 == pi->encoding) {
803
+ pi->encoding = ox_utf8_encoding;
804
+ text = ucs_to_utf8_chars(text, u);
805
+ } else {
806
+ /*raise_error("Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
807
+ raise_error("Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
808
+ }
753
809
  }
754
810
  } else if (0 == strcasecmp(buf, "nbsp;")) {
755
811
  pi->s = s;
@@ -776,7 +832,7 @@ read_coded_chars(PInfo pi, char *text) {
776
832
  }
777
833
 
778
834
  static int
779
- collapse_special(char *str) {
835
+ collapse_special(PInfo pi, char *str) {
780
836
  char *s = str;
781
837
  char *b = str;
782
838
 
@@ -799,7 +855,18 @@ collapse_special(char *str) {
799
855
  if (0 == end) {
800
856
  return EDOM;
801
857
  }
802
- b = uint64_to_chars(b, u);
858
+ if (u <= 0x000000000000007FULL) {
859
+ *b++ = (char)u;
860
+ } else if (ox_utf8_encoding == pi->encoding) {
861
+ b = ucs_to_utf8_chars(b, u);
862
+ /* TBD support UTF-16 */
863
+ } else if (0 == pi->encoding) {
864
+ pi->encoding = ox_utf8_encoding;
865
+ b = ucs_to_utf8_chars(b, u);
866
+ } else {
867
+ /* raise_error("Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
868
+ raise_error("Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
869
+ }
803
870
  s = end + 1;
804
871
  } else {
805
872
  if (0 == strncasecmp(s, "lt;", 3)) {
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Ox
3
3
  # Current version of the module.
4
- VERSION = '1.6.4'
4
+ VERSION = '1.6.5'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ox
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.4
4
+ version: 1.6.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-24 00:00:00.000000000 Z
12
+ date: 2012-10-25 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: ! "A fast XML parser and object serializer that uses only standard C
15
15
  lib.\n \nOptimized XML (Ox), as the name implies was written to provide