jsoncons 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/jsoncons/extconf.rb +43 -0
- data/ext/jsoncons/jsoncons.cpp +161 -0
- data/ext/jsoncons/jsoncons.h +10 -0
- data/jsoncons.gemspec +44 -0
- data/lib/jsoncons/jsoncons/examples/input/address-book.json +13 -0
- data/lib/jsoncons/jsoncons/examples/input/books.json +28 -0
- data/lib/jsoncons/jsoncons/examples/input/countries.json +7 -0
- data/lib/jsoncons/jsoncons/examples/input/employees.json +30 -0
- data/lib/jsoncons/jsoncons/examples/input/jsonschema/name.json +15 -0
- data/lib/jsoncons/jsoncons/examples/input/multiple-json-objects.json +3 -0
- data/lib/jsoncons/jsoncons/examples/input/sales.csv +6 -0
- data/lib/jsoncons/jsoncons/examples/input/store.json +28 -0
- data/lib/jsoncons/jsoncons/examples/input/tasks.csv +6 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/allocator_holder.hpp +38 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/basic_json.hpp +5905 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/bigint.hpp +1611 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/byte_string.hpp +820 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/config/binary_config.hpp +226 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/config/compiler_support.hpp +375 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/config/jsoncons_config.hpp +309 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/config/version.hpp +40 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/conv_error.hpp +218 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/decode_json.hpp +209 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/decode_traits.hpp +651 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/endian.hpp +44 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/grisu3.hpp +312 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/optional.hpp +483 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/parse_number.hpp +1133 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/span.hpp +188 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/string_view.hpp +537 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/string_wrapper.hpp +370 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/detail/write_number.hpp +567 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/encode_json.hpp +315 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/encode_traits.hpp +378 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json.hpp +18 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_array.hpp +324 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_content_handler.hpp +12 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_cursor.hpp +448 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_decoder.hpp +420 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_encoder.hpp +1587 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_error.hpp +156 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_exception.hpp +241 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_filter.hpp +653 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_fwd.hpp +23 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_object.hpp +1772 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_options.hpp +862 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_parser.hpp +2900 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_reader.hpp +731 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_traits_macros.hpp +1072 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_traits_macros_deprecated.hpp +144 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_type.hpp +206 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_type_traits.hpp +1830 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_visitor.hpp +1560 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/json_visitor2.hpp +2079 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/pretty_print.hpp +89 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/ser_context.hpp +62 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/sink.hpp +289 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/source.hpp +777 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/source_adaptor.hpp +148 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/staj2_cursor.hpp +1189 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/staj_cursor.hpp +1254 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/staj_iterator.hpp +449 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/tag_type.hpp +245 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/text_source_adaptor.hpp +144 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/traits_extension.hpp +884 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/typed_array_view.hpp +250 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/unicode_traits.hpp +1330 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/uri.hpp +635 -0
- data/lib/jsoncons/jsoncons/include/jsoncons/value_converter.hpp +340 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson.hpp +23 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_cursor.hpp +320 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_decimal128.hpp +865 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_encoder.hpp +585 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_error.hpp +103 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_oid.hpp +245 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_options.hpp +75 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_parser.hpp +645 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_reader.hpp +92 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/bson_type.hpp +44 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/decode_bson.hpp +201 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/bson/encode_bson.hpp +144 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor.hpp +26 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_cursor.hpp +351 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_cursor2.hpp +265 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_detail.hpp +93 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_encoder.hpp +1766 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_error.hpp +105 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_options.hpp +113 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_parser.hpp +1942 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/cbor_reader.hpp +116 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/decode_cbor.hpp +203 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/cbor/encode_cbor.hpp +151 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv.hpp +17 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_cursor.hpp +358 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_encoder.hpp +954 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_error.hpp +85 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_options.hpp +973 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_parser.hpp +2099 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_reader.hpp +348 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/csv_serializer.hpp +12 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/decode_csv.hpp +208 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/csv/encode_csv.hpp +122 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jmespath/jmespath.hpp +5215 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jmespath/jmespath_error.hpp +215 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpatch/jsonpatch.hpp +579 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpatch/jsonpatch_error.hpp +121 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/expression.hpp +3329 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/flatten.hpp +432 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/json_location.hpp +445 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/json_query.hpp +115 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/jsonpath.hpp +13 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/jsonpath_error.hpp +240 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/jsonpath_expression.hpp +2612 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpath/jsonpath_selector.hpp +1322 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpointer/jsonpointer.hpp +1577 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonpointer/jsonpointer_error.hpp +119 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/format_validator.hpp +968 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/json_validator.hpp +120 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/jsonschema.hpp +13 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/jsonschema_error.hpp +105 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/jsonschema_version.hpp +18 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/keyword_validator.hpp +1745 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/keyword_validator_factory.hpp +556 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/schema_draft7.hpp +198 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/schema_location.hpp +200 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/schema_version.hpp +35 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/jsonschema/subschema.hpp +144 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/mergepatch/mergepatch.hpp +103 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/decode_msgpack.hpp +202 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/encode_msgpack.hpp +142 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack.hpp +24 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_cursor.hpp +343 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_cursor2.hpp +259 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_encoder.hpp +753 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_error.hpp +94 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_options.hpp +74 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_parser.hpp +748 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_reader.hpp +116 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/msgpack/msgpack_type.hpp +63 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/decode_ubjson.hpp +201 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/encode_ubjson.hpp +142 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson.hpp +23 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_cursor.hpp +307 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_encoder.hpp +502 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_error.hpp +100 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_options.hpp +87 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_parser.hpp +880 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_reader.hpp +92 -0
- data/lib/jsoncons/jsoncons/include/jsoncons_ext/ubjson/ubjson_type.hpp +43 -0
- data/lib/jsoncons/version.rb +5 -0
- data/lib/jsoncons.rb +33 -0
- data/test/jsoncons_test.rb +108 -0
- data/test/test_helper.rb +7 -0
- metadata +268 -0
|
@@ -0,0 +1,1330 @@
|
|
|
1
|
+
// Copyright 2016 Daniel Parker
|
|
2
|
+
// Distributed under the Boost license, Version 1.0.
|
|
3
|
+
// (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
4
|
+
|
|
5
|
+
// See https://github.com/danielaparker/unicode_traits for latest version
|
|
6
|
+
|
|
7
|
+
/*
|
|
8
|
+
* Includes code derived from Unicode, Inc decomposition code in ConvertUTF.h and ConvertUTF.c
|
|
9
|
+
* http://www.unicode.org/
|
|
10
|
+
*
|
|
11
|
+
* "Unicode, Inc. hereby grants the right to freely use the information
|
|
12
|
+
* supplied in this file in the creation of products supporting the
|
|
13
|
+
* Unicode Standard."
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
#ifndef JSONCONS_UNICODE_TRAITS_HPP
|
|
17
|
+
#define JSONCONS_UNICODE_TRAITS_HPP
|
|
18
|
+
|
|
19
|
+
#include <cstring>
|
|
20
|
+
#include <string>
|
|
21
|
+
#include <iterator>
|
|
22
|
+
#include <type_traits>
|
|
23
|
+
#include <system_error>
|
|
24
|
+
#include <limits>
|
|
25
|
+
#include <jsoncons/config/compiler_support.hpp>
|
|
26
|
+
#include <jsoncons/traits_extension.hpp>
|
|
27
|
+
|
|
28
|
+
namespace jsoncons { namespace unicode_traits {
|
|
29
|
+
|
|
30
|
+
// Character encodings that the detection helpers in this header can report.
enum class encoding_kind {undetected,utf8,utf16le,utf16be,utf32le,utf32be};

// Returns the lower-case name of an encoding.
// Every value without a dedicated name, encoding_kind::undetected included,
// is reported as "undetected".
inline
std::string to_string(encoding_kind encoding)
{
    if (encoding == encoding_kind::utf8)
    {
        return "utf8";
    }
    if (encoding == encoding_kind::utf16le)
    {
        return "utf16le";
    }
    if (encoding == encoding_kind::utf16be)
    {
        return "utf16be";
    }
    if (encoding == encoding_kind::utf32le)
    {
        return "utf32le";
    }
    if (encoding == encoding_kind::utf32be)
    {
        return "utf32be";
    }
    return "undetected";
}
|
|
51
|
+
|
|
52
|
+
template <class Byte>
|
|
53
|
+
struct detect_encoding_result
|
|
54
|
+
{
|
|
55
|
+
const Byte* ptr;
|
|
56
|
+
encoding_kind encoding;
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
template <class CharT>
|
|
60
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value,detect_encoding_result<CharT>>::type
|
|
61
|
+
detect_encoding_from_bom(const CharT* data, std::size_t length)
|
|
62
|
+
{
|
|
63
|
+
const uint8_t bom_utf8[] = {0xef,0xbb,0xbf};
|
|
64
|
+
const uint8_t bom_utf16le[] = {0xff,0xfe};
|
|
65
|
+
const uint8_t bom_utf16be[] = {0xfe,0xff};
|
|
66
|
+
const uint8_t bom_utf32le[] = {0xff,0xfe,0x00,0x00};
|
|
67
|
+
const uint8_t bom_utf32be[] = {0x00,0x00,0xfe,0xff};
|
|
68
|
+
|
|
69
|
+
if (length >= 4 && !memcmp(data,bom_utf32le,4))
|
|
70
|
+
{
|
|
71
|
+
return detect_encoding_result<CharT>{data+4,encoding_kind::utf32le};
|
|
72
|
+
}
|
|
73
|
+
else if (length >= 4 && !memcmp(data,bom_utf32be,4))
|
|
74
|
+
{
|
|
75
|
+
return detect_encoding_result<CharT>{data+4,encoding_kind::utf32be};
|
|
76
|
+
}
|
|
77
|
+
else if (length >= 2 && !memcmp(data,bom_utf16le,2))
|
|
78
|
+
{
|
|
79
|
+
return detect_encoding_result<CharT>{data+2,encoding_kind::utf16le};
|
|
80
|
+
}
|
|
81
|
+
else if (length >= 2 && !memcmp(data,bom_utf16be,2))
|
|
82
|
+
{
|
|
83
|
+
return detect_encoding_result<CharT>{data+2,encoding_kind::utf16be};
|
|
84
|
+
}
|
|
85
|
+
else if (length >= 3 && !memcmp(data,bom_utf8,3))
|
|
86
|
+
{
|
|
87
|
+
return detect_encoding_result<CharT>{data+3,encoding_kind::utf8};
|
|
88
|
+
}
|
|
89
|
+
else
|
|
90
|
+
{
|
|
91
|
+
return detect_encoding_result<CharT>{data,encoding_kind::undetected};
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
template <class CharT>
|
|
96
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value || traits_extension::is_char32<CharT>::value,detect_encoding_result<CharT>>::type
|
|
97
|
+
detect_encoding_from_bom(const CharT* data, std::size_t)
|
|
98
|
+
{
|
|
99
|
+
return detect_encoding_result<CharT>{data,encoding_kind::undetected};
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
template <class CharT>
|
|
103
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value,detect_encoding_result<CharT>>::type
|
|
104
|
+
detect_json_encoding(const CharT* data, std::size_t length)
|
|
105
|
+
{
|
|
106
|
+
detect_encoding_result<CharT> r = detect_encoding_from_bom(data,length);
|
|
107
|
+
if (r.encoding != encoding_kind::undetected)
|
|
108
|
+
{
|
|
109
|
+
return r;
|
|
110
|
+
}
|
|
111
|
+
else if (length < 4)
|
|
112
|
+
{
|
|
113
|
+
return detect_encoding_result<CharT>{data,encoding_kind::utf8};
|
|
114
|
+
}
|
|
115
|
+
else if (*data == 0 && *(data+1) == 0 && *(data+2) == 0)
|
|
116
|
+
{
|
|
117
|
+
return detect_encoding_result<CharT>{data,encoding_kind::utf32be};
|
|
118
|
+
}
|
|
119
|
+
else if (*data == 0 && *(data+2) == 0)
|
|
120
|
+
{
|
|
121
|
+
return detect_encoding_result<CharT>{data,encoding_kind::utf16be};
|
|
122
|
+
}
|
|
123
|
+
else if (*(data+1) == 0 && *(data+2) == 0 && *(data+3) == 0)
|
|
124
|
+
{
|
|
125
|
+
return detect_encoding_result<CharT>{data,encoding_kind::utf32le};
|
|
126
|
+
}
|
|
127
|
+
else if (*(data+1) == 0 && *(data+3) == 0)
|
|
128
|
+
{
|
|
129
|
+
return detect_encoding_result<CharT>{data,encoding_kind::utf16le};
|
|
130
|
+
}
|
|
131
|
+
else
|
|
132
|
+
{
|
|
133
|
+
return detect_encoding_result<CharT>{data,encoding_kind::utf8};
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
template <class CharT>
|
|
138
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value || traits_extension::is_char32<CharT>::value,detect_encoding_result<CharT>>::type
|
|
139
|
+
detect_json_encoding(const CharT* data, std::size_t)
|
|
140
|
+
{
|
|
141
|
+
return detect_encoding_result<CharT>{data,encoding_kind::undetected};
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/*
|
|
145
|
+
* Magic values subtracted from a buffer value during UTF8 conversion.
|
|
146
|
+
* This table contains as many values as there might be trailing bytes
|
|
147
|
+
* in a UTF-8 sequence. Source: ConvertUTF.c
|
|
148
|
+
*/
|
|
149
|
+
const uint32_t offsets_from_utf8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
|
|
150
|
+
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
|
|
151
|
+
|
|
152
|
+
/*
|
|
153
|
+
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
|
|
154
|
+
* into the first byte, depending on how many bytes follow. There are
|
|
155
|
+
* as many entries in this table as there are UTF-8 sequence types.
|
|
156
|
+
* (I.e., one byte sequence, two byte... etc.). Remember that sequences
|
|
157
|
+
* for *legal* UTF-8 will be 4 or fewer bytes total. Source: ConvertUTF.c
|
|
158
|
+
*/
|
|
159
|
+
const uint8_t first_byte_mark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
|
160
|
+
|
|
161
|
+
/*
|
|
162
|
+
* Index into the table below with the first byte of a UTF-8 sequence to
|
|
163
|
+
* get the number of trailing bytes that are supposed to follow it.
|
|
164
|
+
* Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
|
|
165
|
+
* left as-is for anyone who may want to do such conversion, which was
|
|
166
|
+
* allowed in earlier algorithms. Source: ConvertUTF.c
|
|
167
|
+
*/
|
|
168
|
+
const uint8_t trailing_bytes_for_utf8[256] = {
|
|
169
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
170
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
171
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
172
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
173
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
174
|
+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
175
|
+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
176
|
+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
// Some fundamental constants. Source: ConvertUTF.h
|
|
180
|
+
const uint32_t replacement_char = 0x0000FFFD;
|
|
181
|
+
const uint32_t max_bmp = 0x0000FFFF;
|
|
182
|
+
const uint32_t max_utf16 = 0x0010FFFF;
|
|
183
|
+
const uint32_t max_utf32 = 0x7FFFFFFF;
|
|
184
|
+
const uint32_t max_legal_utf32 = 0x0010FFFF;
|
|
185
|
+
|
|
186
|
+
const int half_shift = 10; // used for shifting by 10 bits
|
|
187
|
+
const uint32_t half_base = 0x0010000UL;
|
|
188
|
+
const uint32_t half_mask = 0x3FFUL;
|
|
189
|
+
|
|
190
|
+
const uint16_t sur_high_start = 0xD800;
|
|
191
|
+
const uint16_t sur_high_end = 0xDBFF;
|
|
192
|
+
const uint16_t sur_low_start = 0xDC00;
|
|
193
|
+
const uint16_t sur_low_end = 0xDFFF;
|
|
194
|
+
|
|
195
|
+
// Reports whether ch has the bit pattern 10xxxxxx, i.e. its two most
// significant bits are 1 and 0.
// Declared inline without static so the header function has a single
// definition across translation units, matching the surrogate predicates
// declared alongside it.
inline
bool is_continuation_byte(unsigned char ch) noexcept
{
    return (ch & 0xC0) == 0x80;
}
|
|
200
|
+
|
|
201
|
+
inline
|
|
202
|
+
bool is_high_surrogate(uint32_t ch) noexcept
|
|
203
|
+
{
|
|
204
|
+
return (ch >= sur_high_start && ch <= sur_high_end);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
inline
|
|
208
|
+
bool is_low_surrogate(uint32_t ch) noexcept
|
|
209
|
+
{
|
|
210
|
+
return (ch >= sur_low_start && ch <= sur_low_end);
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
inline
|
|
214
|
+
bool is_surrogate(uint32_t ch) noexcept
|
|
215
|
+
{
|
|
216
|
+
return (ch >= sur_high_start && ch <= sur_low_end);
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// Policy flag accepted by the conversion functions in this header.
enum class conv_flags
{
    strict = 0,   // the default used by the to_codepoint overloads
    lenient = 1
};
|
|
224
|
+
|
|
225
|
+
// conv_errc
|
|
226
|
+
|
|
227
|
+
// Error conditions reported by the conversion functions.
// A value-initialised conv_errc equals success.
enum class conv_errc
{
    success = 0,
    over_long_utf8_sequence = 1,     // over long utf8 sequence
    expected_continuation_byte = 2,  // expected continuation byte
    unpaired_high_surrogate = 3,     // unpaired high surrogate UTF-16
    illegal_surrogate_value = 4,     // UTF-16 surrogate values are illegal in UTF-32
    source_exhausted = 5,            // partial character in source, but hit end
    source_illegal = 6               // source sequence is illegal/malformed
};
|
|
237
|
+
|
|
238
|
+
class Unicode_traits_error_category_impl_
|
|
239
|
+
: public std::error_category
|
|
240
|
+
{
|
|
241
|
+
public:
|
|
242
|
+
virtual const char* name() const noexcept
|
|
243
|
+
{
|
|
244
|
+
return "unicode_traits conversion error";
|
|
245
|
+
}
|
|
246
|
+
virtual std::string message(int ev) const
|
|
247
|
+
{
|
|
248
|
+
switch (static_cast<conv_errc>(ev))
|
|
249
|
+
{
|
|
250
|
+
case conv_errc::over_long_utf8_sequence:
|
|
251
|
+
return "Over long utf8 sequence";
|
|
252
|
+
case conv_errc::expected_continuation_byte:
|
|
253
|
+
return "Expected continuation byte";
|
|
254
|
+
case conv_errc::unpaired_high_surrogate:
|
|
255
|
+
return "Unpaired high surrogate UTF-16";
|
|
256
|
+
case conv_errc::illegal_surrogate_value:
|
|
257
|
+
return "UTF-16 surrogate values are illegal in UTF-32";
|
|
258
|
+
case conv_errc::source_exhausted:
|
|
259
|
+
return "Partial character in source, but hit end";
|
|
260
|
+
case conv_errc::source_illegal:
|
|
261
|
+
return "Source sequence is illegal/malformed";
|
|
262
|
+
default:
|
|
263
|
+
return "";
|
|
264
|
+
break;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
};
|
|
268
|
+
|
|
269
|
+
inline
|
|
270
|
+
const std::error_category& unicode_traits_error_category()
|
|
271
|
+
{
|
|
272
|
+
static Unicode_traits_error_category_impl_ instance;
|
|
273
|
+
return instance;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
inline
|
|
277
|
+
std::error_code make_error_code(conv_errc result)
|
|
278
|
+
{
|
|
279
|
+
return std::error_code(static_cast<int>(result),unicode_traits_error_category());
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
} // unicode_traits
|
|
283
|
+
} // jsoncons
|
|
284
|
+
|
|
285
|
+
namespace std {
|
|
286
|
+
template<>
|
|
287
|
+
struct is_error_code_enum<jsoncons::unicode_traits::conv_errc> : public true_type
|
|
288
|
+
{
|
|
289
|
+
};
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
namespace jsoncons { namespace unicode_traits {
|
|
293
|
+
|
|
294
|
+
// utf8
|
|
295
|
+
|
|
296
|
+
template <class CharT>
|
|
297
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value, conv_errc>::type
|
|
298
|
+
is_legal_utf8(const CharT* first, std::size_t length)
|
|
299
|
+
{
|
|
300
|
+
uint8_t a;
|
|
301
|
+
const CharT* srcptr = first+length;
|
|
302
|
+
switch (length) {
|
|
303
|
+
default:
|
|
304
|
+
return conv_errc::over_long_utf8_sequence;
|
|
305
|
+
case 4:
|
|
306
|
+
if (((a = (*--srcptr))& 0xC0) != 0x80)
|
|
307
|
+
return conv_errc::expected_continuation_byte;
|
|
308
|
+
JSONCONS_FALLTHROUGH;
|
|
309
|
+
case 3:
|
|
310
|
+
if (((a = (*--srcptr))& 0xC0) != 0x80)
|
|
311
|
+
return conv_errc::expected_continuation_byte;
|
|
312
|
+
JSONCONS_FALLTHROUGH;
|
|
313
|
+
case 2:
|
|
314
|
+
if (((a = (*--srcptr))& 0xC0) != 0x80)
|
|
315
|
+
return conv_errc::expected_continuation_byte;
|
|
316
|
+
|
|
317
|
+
switch (static_cast<uint8_t>(*first))
|
|
318
|
+
{
|
|
319
|
+
// no fall-through in this inner switch
|
|
320
|
+
case 0xE0: if (a < 0xA0) return conv_errc::source_illegal; break;
|
|
321
|
+
case 0xED: if (a > 0x9F) return conv_errc::source_illegal; break;
|
|
322
|
+
case 0xF0: if (a < 0x90) return conv_errc::source_illegal; break;
|
|
323
|
+
case 0xF4: if (a > 0x8F) return conv_errc::source_illegal; break;
|
|
324
|
+
default: if (a < 0x80) return conv_errc::source_illegal;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
JSONCONS_FALLTHROUGH;
|
|
328
|
+
case 1:
|
|
329
|
+
if (static_cast<uint8_t>(*first) >= 0x80 && static_cast<uint8_t>(*first) < 0xC2)
|
|
330
|
+
return conv_errc::source_illegal;
|
|
331
|
+
break;
|
|
332
|
+
}
|
|
333
|
+
if (static_cast<uint8_t>(*first) > 0xF4)
|
|
334
|
+
return conv_errc::source_illegal;
|
|
335
|
+
|
|
336
|
+
return conv_errc();
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
// Helper behind void_t. Routing the alias through a class template makes every
// argument take part in substitution, so an ill-formed argument is reliably a
// SFINAE failure. The direct form `using void_t = void` is subject to CWG 1558,
// under which compilers may ignore unused alias arguments.
template <class... Ts>
struct void_t_impl
{
    using type = void;
};

// Maps any list of well-formed types to void.
template <class... Ts> using void_t = typename void_t_impl<Ts...>::type;

// Primary template: I is not an output iterator accepting values of type E.
template <class, class, class = void>
struct is_output_iterator : std::false_type {};

// Selected when std::iterator_traits<I> names an iterator_category and an
// rvalue of type E can be assigned through a dereferenced I.
template <class I, class E>
struct is_output_iterator<I, E, void_t<
    typename std::iterator_traits<I>::iterator_category,
    decltype(*std::declval<I>() = std::declval<E>())>> : std::true_type {};
|
|
348
|
+
|
|
349
|
+
// is_same_size fixes issue with vs2013
|
|
350
|
+
|
|
351
|
+
// primary template
|
|
352
|
+
// Primary template: used when either type is void; value is false.
template<class T1, class T2, class Enable = void>
struct is_same_size : std::false_type
{
};

// specialization for non void types: value is true exactly when
// sizeof(T1) == sizeof(T2).
// Derives from std::integral_constant instead of declaring its own static
// member, so `value` can be odr-used (e.g. bound to a reference) without
// needing a namespace-scope definition before C++17.
template<class T1, class T2>
struct is_same_size<T1, T2, typename std::enable_if<!std::is_void<T1>::value && !std::is_void<T2>::value>::type>
    : std::integral_constant<bool, (sizeof(T1) == sizeof(T2))>
{
};
|
|
363
|
+
|
|
364
|
+
// convert
|
|
365
|
+
|
|
366
|
+
template <class CharT>
|
|
367
|
+
struct convert_result
|
|
368
|
+
{
|
|
369
|
+
const CharT* ptr;
|
|
370
|
+
conv_errc ec;
|
|
371
|
+
};
|
|
372
|
+
|
|
373
|
+
// to_codepoint
|
|
374
|
+
|
|
375
|
+
template <class CharT,class CodepointT>
|
|
376
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value && traits_extension::is_char32<CodepointT>::value,
|
|
377
|
+
convert_result<CharT>>::type
|
|
378
|
+
to_codepoint(const CharT* first, const CharT* last,
|
|
379
|
+
CodepointT& ch,
|
|
380
|
+
conv_flags flags = conv_flags::strict) noexcept
|
|
381
|
+
{
|
|
382
|
+
ch = 0;
|
|
383
|
+
if (first >= last)
|
|
384
|
+
{
|
|
385
|
+
return convert_result<CharT>{first, conv_errc::source_exhausted};
|
|
386
|
+
}
|
|
387
|
+
conv_errc result = conv_errc();
|
|
388
|
+
|
|
389
|
+
unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast<uint8_t>(*first)];
|
|
390
|
+
if (extra_bytes_to_read >= last - first)
|
|
391
|
+
{
|
|
392
|
+
result = conv_errc::source_exhausted;
|
|
393
|
+
return convert_result<CharT>{first, result};
|
|
394
|
+
}
|
|
395
|
+
// Do this check whether lenient or strict
|
|
396
|
+
if ((result=is_legal_utf8(first, extra_bytes_to_read+1)) != conv_errc())
|
|
397
|
+
{
|
|
398
|
+
return convert_result<CharT>{first, result};
|
|
399
|
+
}
|
|
400
|
+
// The cases all fall through. See "Note A" below.
|
|
401
|
+
switch (extra_bytes_to_read)
|
|
402
|
+
{
|
|
403
|
+
case 5:
|
|
404
|
+
ch += static_cast<uint8_t>(*first++);
|
|
405
|
+
ch <<= 6;
|
|
406
|
+
JSONCONS_FALLTHROUGH;
|
|
407
|
+
case 4:
|
|
408
|
+
ch += static_cast<uint8_t>(*first++);
|
|
409
|
+
ch <<= 6;
|
|
410
|
+
JSONCONS_FALLTHROUGH;
|
|
411
|
+
case 3:
|
|
412
|
+
ch += static_cast<uint8_t>(*first++);
|
|
413
|
+
ch <<= 6;
|
|
414
|
+
JSONCONS_FALLTHROUGH;
|
|
415
|
+
case 2:
|
|
416
|
+
ch += static_cast<uint8_t>(*first++);
|
|
417
|
+
ch <<= 6;
|
|
418
|
+
JSONCONS_FALLTHROUGH;
|
|
419
|
+
case 1:
|
|
420
|
+
ch += static_cast<uint8_t>(*first++);
|
|
421
|
+
ch <<= 6;
|
|
422
|
+
JSONCONS_FALLTHROUGH;
|
|
423
|
+
case 0:
|
|
424
|
+
ch += static_cast<uint8_t>(*first++);
|
|
425
|
+
break;
|
|
426
|
+
}
|
|
427
|
+
ch -= offsets_from_utf8[extra_bytes_to_read];
|
|
428
|
+
|
|
429
|
+
if (ch <= max_legal_utf32) {
|
|
430
|
+
/*
|
|
431
|
+
* UTF-16 surrogate values are illegal in UTF-32, and anything
|
|
432
|
+
* over Plane 17 (> 0x10FFFF) is illegal.
|
|
433
|
+
*/
|
|
434
|
+
if (is_surrogate(ch) )
|
|
435
|
+
{
|
|
436
|
+
if (flags == conv_flags::strict)
|
|
437
|
+
{
|
|
438
|
+
first -= (extra_bytes_to_read+1); // return to the illegal value itself
|
|
439
|
+
result = conv_errc::source_illegal;
|
|
440
|
+
return convert_result<CharT>{first, result};
|
|
441
|
+
}
|
|
442
|
+
else
|
|
443
|
+
{
|
|
444
|
+
ch = replacement_char;
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
else // i.e., ch > max_legal_utf32
|
|
449
|
+
{
|
|
450
|
+
result = conv_errc::source_illegal;
|
|
451
|
+
ch = replacement_char;
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
return convert_result<CharT>{first,result} ;
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
template <class CharT,class CodepointT>
|
|
458
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value && traits_extension::is_char32<CodepointT>::value,
|
|
459
|
+
convert_result<CharT>>::type
|
|
460
|
+
to_codepoint(const CharT* first, const CharT* last,
|
|
461
|
+
CodepointT& ch,
|
|
462
|
+
conv_flags flags = conv_flags::strict) noexcept
|
|
463
|
+
{
|
|
464
|
+
ch = 0;
|
|
465
|
+
if (first >= last)
|
|
466
|
+
{
|
|
467
|
+
return convert_result<CharT>{first, conv_errc::source_exhausted};
|
|
468
|
+
}
|
|
469
|
+
conv_errc result = conv_errc();
|
|
470
|
+
|
|
471
|
+
ch = *first++;
|
|
472
|
+
// If we have a surrogate pair, convert to UTF32 first.
|
|
473
|
+
if (is_high_surrogate(ch))
|
|
474
|
+
{
|
|
475
|
+
// If the 16 bits following the high surrogate are in the first buffer...
|
|
476
|
+
if (first < last)
|
|
477
|
+
{
|
|
478
|
+
uint32_t ch2 = *first;
|
|
479
|
+
// If ptr's a low surrogate, convert to UTF32.
|
|
480
|
+
if (ch2 >= sur_low_start && ch2 <= sur_low_end )
|
|
481
|
+
{
|
|
482
|
+
ch = ((ch - sur_high_start) << half_shift)
|
|
483
|
+
+ (ch2 - sur_low_start) + half_base;
|
|
484
|
+
++first;
|
|
485
|
+
}
|
|
486
|
+
else if (flags == conv_flags::strict) // ptr's an unpaired high surrogate
|
|
487
|
+
{
|
|
488
|
+
--first; /* return to the illegal value itself */
|
|
489
|
+
result = conv_errc::source_illegal;
|
|
490
|
+
return convert_result<CharT>{first, result};
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
else
|
|
494
|
+
{ /* We don't have the 16 bits following the high surrogate. */
|
|
495
|
+
--first; /* return to the high surrogate */
|
|
496
|
+
result = conv_errc::source_exhausted;
|
|
497
|
+
return convert_result<CharT>{first, result};
|
|
498
|
+
}
|
|
499
|
+
} else if (flags == conv_flags::strict) {
|
|
500
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
501
|
+
if (is_low_surrogate(ch) )
|
|
502
|
+
{
|
|
503
|
+
--first; /* return to the illegal value itself */
|
|
504
|
+
result = conv_errc::source_illegal;
|
|
505
|
+
return convert_result<CharT>{first, result};
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
return convert_result<CharT>{first,result} ;
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
template <class CharT,class CodepointT>
|
|
513
|
+
typename std::enable_if<traits_extension::is_char32<CharT>::value && traits_extension::is_char32<CodepointT>::value,
|
|
514
|
+
convert_result<CharT>>::type
|
|
515
|
+
to_codepoint(const CharT* first, const CharT* last,
|
|
516
|
+
CodepointT& ch,
|
|
517
|
+
conv_flags flags = conv_flags::strict) noexcept
|
|
518
|
+
{
|
|
519
|
+
ch = 0;
|
|
520
|
+
if (first >= last)
|
|
521
|
+
{
|
|
522
|
+
return convert_result<CharT>{first, conv_errc::source_exhausted};
|
|
523
|
+
}
|
|
524
|
+
conv_errc result = conv_errc();
|
|
525
|
+
|
|
526
|
+
ch = *first++;
|
|
527
|
+
if (flags == conv_flags::strict )
|
|
528
|
+
{
|
|
529
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
530
|
+
if (is_surrogate(ch))
|
|
531
|
+
{
|
|
532
|
+
--first; /* return to the illegal value itself */
|
|
533
|
+
result = conv_errc::illegal_surrogate_value;
|
|
534
|
+
return convert_result<CharT>{first,result} ;
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
if (!(ch <= max_legal_utf32))
|
|
538
|
+
{
|
|
539
|
+
ch = replacement_char;
|
|
540
|
+
result = conv_errc::source_illegal;
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
return convert_result<CharT>{first,result} ;
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
// convert
|
|
547
|
+
|
|
548
|
+
template <class CharT,class Container>
|
|
549
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value
|
|
550
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
551
|
+
&& traits_extension::is_char8<typename Container::value_type>::value,
|
|
552
|
+
convert_result<CharT>>::type
|
|
553
|
+
convert(const CharT* data, std::size_t length, Container& target, conv_flags flags=conv_flags::strict)
|
|
554
|
+
{
|
|
555
|
+
(void)flags;
|
|
556
|
+
|
|
557
|
+
conv_errc result = conv_errc();
|
|
558
|
+
const CharT* last = data + length;
|
|
559
|
+
while (data != last)
|
|
560
|
+
{
|
|
561
|
+
std::size_t len = trailing_bytes_for_utf8[static_cast<uint8_t>(*data)] + 1;
|
|
562
|
+
if (len > (std::size_t)(last - data))
|
|
563
|
+
{
|
|
564
|
+
return convert_result<CharT>{data, conv_errc::source_exhausted};
|
|
565
|
+
}
|
|
566
|
+
if ((result=is_legal_utf8(data, len)) != conv_errc())
|
|
567
|
+
{
|
|
568
|
+
return convert_result<CharT>{data,result};
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
switch (len) {
|
|
572
|
+
case 4: target.push_back(static_cast<uint8_t>(*data++));
|
|
573
|
+
JSONCONS_FALLTHROUGH;
|
|
574
|
+
case 3: target.push_back(static_cast<uint8_t>(*data++));
|
|
575
|
+
JSONCONS_FALLTHROUGH;
|
|
576
|
+
case 2: target.push_back(static_cast<uint8_t>(*data++));
|
|
577
|
+
JSONCONS_FALLTHROUGH;
|
|
578
|
+
case 1: target.push_back(static_cast<uint8_t>(*data++));
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
return convert_result<CharT>{data,result} ;
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
template <class CharT,class Container>
|
|
585
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value
|
|
586
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
587
|
+
&& traits_extension::is_char16<typename Container::value_type>::value,
|
|
588
|
+
convert_result<CharT>>::type
|
|
589
|
+
convert(const CharT* data, std::size_t length,
|
|
590
|
+
Container& target,
|
|
591
|
+
conv_flags flags = conv_flags::strict)
|
|
592
|
+
{
|
|
593
|
+
conv_errc result = conv_errc();
|
|
594
|
+
|
|
595
|
+
const CharT* last = data + length;
|
|
596
|
+
while (data != last)
|
|
597
|
+
{
|
|
598
|
+
unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast<uint8_t>(*data)];
|
|
599
|
+
if (extra_bytes_to_read >= last - data)
|
|
600
|
+
{
|
|
601
|
+
result = conv_errc::source_exhausted;
|
|
602
|
+
break;
|
|
603
|
+
}
|
|
604
|
+
/* Do this check whether lenient or strict */
|
|
605
|
+
if ((result=is_legal_utf8(data, extra_bytes_to_read+1)) != conv_errc())
|
|
606
|
+
{
|
|
607
|
+
break;
|
|
608
|
+
}
|
|
609
|
+
/*
|
|
610
|
+
* The cases all fall through. See "Note A" below.
|
|
611
|
+
*/
|
|
612
|
+
uint32_t ch = 0;
|
|
613
|
+
switch (extra_bytes_to_read) {
|
|
614
|
+
case 5: ch += static_cast<uint8_t>(*data++); ch <<= 6; /* remember, illegal UTF-8 */
|
|
615
|
+
JSONCONS_FALLTHROUGH;
|
|
616
|
+
case 4: ch += static_cast<uint8_t>(*data++); ch <<= 6; /* remember, illegal UTF-8 */
|
|
617
|
+
JSONCONS_FALLTHROUGH;
|
|
618
|
+
case 3: ch += static_cast<uint8_t>(*data++); ch <<= 6;
|
|
619
|
+
JSONCONS_FALLTHROUGH;
|
|
620
|
+
case 2: ch += static_cast<uint8_t>(*data++); ch <<= 6;
|
|
621
|
+
JSONCONS_FALLTHROUGH;
|
|
622
|
+
case 1: ch += static_cast<uint8_t>(*data++); ch <<= 6;
|
|
623
|
+
JSONCONS_FALLTHROUGH;
|
|
624
|
+
case 0: ch += static_cast<uint8_t>(*data++);
|
|
625
|
+
break;
|
|
626
|
+
}
|
|
627
|
+
ch -= offsets_from_utf8[extra_bytes_to_read];
|
|
628
|
+
|
|
629
|
+
if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */
|
|
630
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
631
|
+
if (is_surrogate(ch) )
|
|
632
|
+
{
|
|
633
|
+
if (flags == conv_flags::strict) {
|
|
634
|
+
data -= (extra_bytes_to_read+1); /* return to the illegal value itself */
|
|
635
|
+
result = conv_errc::source_illegal;
|
|
636
|
+
break;
|
|
637
|
+
} else {
|
|
638
|
+
target.push_back(replacement_char);
|
|
639
|
+
}
|
|
640
|
+
} else {
|
|
641
|
+
target.push_back((uint16_t)ch); /* normal case */
|
|
642
|
+
}
|
|
643
|
+
} else if (ch > max_utf16) {
|
|
644
|
+
if (flags == conv_flags::strict) {
|
|
645
|
+
result = conv_errc::source_illegal;
|
|
646
|
+
data -= (extra_bytes_to_read+1); /* return to the start */
|
|
647
|
+
break; /* Bail out; shouldn't continue */
|
|
648
|
+
} else {
|
|
649
|
+
target.push_back(replacement_char);
|
|
650
|
+
}
|
|
651
|
+
} else {
|
|
652
|
+
/* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
653
|
+
ch -= half_base;
|
|
654
|
+
target.push_back((uint16_t)((ch >> half_shift) + sur_high_start));
|
|
655
|
+
target.push_back((uint16_t)((ch & half_mask) + sur_low_start));
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
return convert_result<CharT>{data,result} ;
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
template <class CharT,class Container>
|
|
662
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value
|
|
663
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
664
|
+
&& traits_extension::is_char32<typename Container::value_type>::value,
|
|
665
|
+
convert_result<CharT>>::type
|
|
666
|
+
convert(const CharT* data, std::size_t length,
|
|
667
|
+
Container& target,
|
|
668
|
+
conv_flags flags = conv_flags::strict)
|
|
669
|
+
{
|
|
670
|
+
conv_errc result = conv_errc();
|
|
671
|
+
|
|
672
|
+
const CharT* last = data + length;
|
|
673
|
+
while (data < last)
|
|
674
|
+
{
|
|
675
|
+
uint32_t ch = 0;
|
|
676
|
+
unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast<uint8_t>(*data)];
|
|
677
|
+
if (extra_bytes_to_read >= last - data)
|
|
678
|
+
{
|
|
679
|
+
result = conv_errc::source_exhausted;
|
|
680
|
+
break;
|
|
681
|
+
}
|
|
682
|
+
/* Do this check whether lenient or strict */
|
|
683
|
+
if ((result=is_legal_utf8(data, extra_bytes_to_read+1)) != conv_errc())
|
|
684
|
+
{
|
|
685
|
+
break;
|
|
686
|
+
}
|
|
687
|
+
/*
|
|
688
|
+
* The cases all fall through. See "Note A" below.
|
|
689
|
+
*/
|
|
690
|
+
switch (extra_bytes_to_read)
|
|
691
|
+
{
|
|
692
|
+
case 5:
|
|
693
|
+
ch += static_cast<uint8_t>(*data++);
|
|
694
|
+
ch <<= 6;
|
|
695
|
+
JSONCONS_FALLTHROUGH;
|
|
696
|
+
case 4:
|
|
697
|
+
ch += static_cast<uint8_t>(*data++);
|
|
698
|
+
ch <<= 6;
|
|
699
|
+
JSONCONS_FALLTHROUGH;
|
|
700
|
+
case 3:
|
|
701
|
+
ch += static_cast<uint8_t>(*data++);
|
|
702
|
+
ch <<= 6;
|
|
703
|
+
JSONCONS_FALLTHROUGH;
|
|
704
|
+
case 2:
|
|
705
|
+
ch += static_cast<uint8_t>(*data++);
|
|
706
|
+
ch <<= 6;
|
|
707
|
+
JSONCONS_FALLTHROUGH;
|
|
708
|
+
case 1:
|
|
709
|
+
ch += static_cast<uint8_t>(*data++);
|
|
710
|
+
ch <<= 6;
|
|
711
|
+
JSONCONS_FALLTHROUGH;
|
|
712
|
+
case 0:
|
|
713
|
+
ch += static_cast<uint8_t>(*data++);
|
|
714
|
+
break;
|
|
715
|
+
}
|
|
716
|
+
ch -= offsets_from_utf8[extra_bytes_to_read];
|
|
717
|
+
|
|
718
|
+
if (ch <= max_legal_utf32) {
|
|
719
|
+
/*
|
|
720
|
+
* UTF-16 surrogate values are illegal in UTF-32, and anything
|
|
721
|
+
* over Plane 17 (> 0x10FFFF) is illegal.
|
|
722
|
+
*/
|
|
723
|
+
if (is_surrogate(ch) )
|
|
724
|
+
{
|
|
725
|
+
if (flags == conv_flags::strict) {
|
|
726
|
+
data -= (extra_bytes_to_read+1); /* return to the illegal value itself */
|
|
727
|
+
result = conv_errc::source_illegal;
|
|
728
|
+
break;
|
|
729
|
+
} else {
|
|
730
|
+
target.push_back(replacement_char);
|
|
731
|
+
}
|
|
732
|
+
} else {
|
|
733
|
+
target.push_back(ch);
|
|
734
|
+
}
|
|
735
|
+
} else { /* i.e., ch > max_legal_utf32 */
|
|
736
|
+
result = conv_errc::source_illegal;
|
|
737
|
+
target.push_back(replacement_char);
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
return convert_result<CharT>{data,result} ;
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
// utf16
|
|
744
|
+
|
|
745
|
+
template <class CharT,class Container>
|
|
746
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value
|
|
747
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
748
|
+
&& traits_extension::is_char8<typename Container::value_type>::value,
|
|
749
|
+
convert_result<CharT>>::type
|
|
750
|
+
convert(const CharT* data, std::size_t length,
|
|
751
|
+
Container& target,
|
|
752
|
+
conv_flags flags = conv_flags::strict) {
|
|
753
|
+
conv_errc result = conv_errc();
|
|
754
|
+
|
|
755
|
+
const CharT* last = data + length;
|
|
756
|
+
while (data < last) {
|
|
757
|
+
unsigned short bytes_to_write = 0;
|
|
758
|
+
const uint32_t byteMask = 0xBF;
|
|
759
|
+
const uint32_t byteMark = 0x80;
|
|
760
|
+
uint32_t ch = *data++;
|
|
761
|
+
/* If we have a surrogate pair, convert to uint32_t data. */
|
|
762
|
+
if (is_high_surrogate(ch))
|
|
763
|
+
{
|
|
764
|
+
/* If the 16 bits following the high surrogate are in the data buffer... */
|
|
765
|
+
if (data < last) {
|
|
766
|
+
uint32_t ch2 = *data;
|
|
767
|
+
/* If ptr's a low surrogate, convert to uint32_t. */
|
|
768
|
+
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
769
|
+
ch = ((ch - sur_high_start) << half_shift)
|
|
770
|
+
+ (ch2 - sur_low_start) + half_base;
|
|
771
|
+
++data;
|
|
772
|
+
} else if (flags == conv_flags::strict) { /* ptr's an unpaired high surrogate */
|
|
773
|
+
--data; /* return to the illegal value itself */
|
|
774
|
+
result = conv_errc::unpaired_high_surrogate;
|
|
775
|
+
break;
|
|
776
|
+
}
|
|
777
|
+
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
778
|
+
--data; /* return to the high surrogate */
|
|
779
|
+
result = conv_errc::source_exhausted;
|
|
780
|
+
break;
|
|
781
|
+
}
|
|
782
|
+
} else if (flags == conv_flags::strict) {
|
|
783
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
784
|
+
if (is_low_surrogate(ch))
|
|
785
|
+
{
|
|
786
|
+
--data; /* return to the illegal value itself */
|
|
787
|
+
result = conv_errc::source_illegal;
|
|
788
|
+
break;
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
/* Figure out how many bytes the result will require */
|
|
792
|
+
if (ch < (uint32_t)0x80) {
|
|
793
|
+
bytes_to_write = 1;
|
|
794
|
+
} else if (ch < (uint32_t)0x800) {
|
|
795
|
+
bytes_to_write = 2;
|
|
796
|
+
} else if (ch < (uint32_t)0x10000) {
|
|
797
|
+
bytes_to_write = 3;
|
|
798
|
+
} else if (ch < (uint32_t)0x110000) {
|
|
799
|
+
bytes_to_write = 4;
|
|
800
|
+
} else {
|
|
801
|
+
bytes_to_write = 3;
|
|
802
|
+
ch = replacement_char;
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
uint8_t byte1 = 0;
|
|
806
|
+
uint8_t byte2 = 0;
|
|
807
|
+
uint8_t byte3 = 0;
|
|
808
|
+
uint8_t byte4 = 0;
|
|
809
|
+
|
|
810
|
+
switch (bytes_to_write) { // note: everything falls through
|
|
811
|
+
case 4: byte4 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
812
|
+
JSONCONS_FALLTHROUGH;
|
|
813
|
+
case 3: byte3 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
814
|
+
JSONCONS_FALLTHROUGH;
|
|
815
|
+
case 2: byte2 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
816
|
+
JSONCONS_FALLTHROUGH;
|
|
817
|
+
case 1: byte1 = (uint8_t)(ch | first_byte_mark[bytes_to_write]);
|
|
818
|
+
break;
|
|
819
|
+
}
|
|
820
|
+
switch (bytes_to_write)
|
|
821
|
+
{
|
|
822
|
+
case 4:
|
|
823
|
+
target.push_back(byte1);
|
|
824
|
+
target.push_back(byte2);
|
|
825
|
+
target.push_back(byte3);
|
|
826
|
+
target.push_back(byte4);
|
|
827
|
+
break;
|
|
828
|
+
case 3:
|
|
829
|
+
target.push_back(byte1);
|
|
830
|
+
target.push_back(byte2);
|
|
831
|
+
target.push_back(byte3);
|
|
832
|
+
break;
|
|
833
|
+
case 2:
|
|
834
|
+
target.push_back(byte1);
|
|
835
|
+
target.push_back(byte2);
|
|
836
|
+
break;
|
|
837
|
+
case 1:
|
|
838
|
+
target.push_back(byte1);
|
|
839
|
+
break;
|
|
840
|
+
}
|
|
841
|
+
}
|
|
842
|
+
return convert_result<CharT>{data,result} ;
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
template <class CharT,class Container>
|
|
846
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value
|
|
847
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
848
|
+
&& traits_extension::is_char16<typename Container::value_type>::value,
|
|
849
|
+
convert_result<CharT>>::type
|
|
850
|
+
convert(const CharT* data, std::size_t length,
|
|
851
|
+
Container& target,
|
|
852
|
+
conv_flags flags = conv_flags::strict)
|
|
853
|
+
{
|
|
854
|
+
conv_errc result = conv_errc();
|
|
855
|
+
|
|
856
|
+
const CharT* last = data + length;
|
|
857
|
+
while (data != last)
|
|
858
|
+
{
|
|
859
|
+
uint32_t ch = *data++;
|
|
860
|
+
/* If we have a surrogate pair, convert to uint32_t data. */
|
|
861
|
+
if (is_high_surrogate(ch))
|
|
862
|
+
{
|
|
863
|
+
/* If the 16 bits following the high surrogate are in the data buffer... */
|
|
864
|
+
if (data < last) {
|
|
865
|
+
uint32_t ch2 = *data;
|
|
866
|
+
/* If ptr's a low surrogate, */
|
|
867
|
+
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
868
|
+
target.push_back((uint16_t)ch);
|
|
869
|
+
target.push_back((uint16_t)ch2);
|
|
870
|
+
++data;
|
|
871
|
+
} else if (flags == conv_flags::strict) { /* ptr's an unpaired high surrogate */
|
|
872
|
+
--data; /* return to the illegal value itself */
|
|
873
|
+
result = conv_errc::unpaired_high_surrogate;
|
|
874
|
+
break;
|
|
875
|
+
}
|
|
876
|
+
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
877
|
+
--data; /* return to the high surrogate */
|
|
878
|
+
result = conv_errc::source_exhausted;
|
|
879
|
+
break;
|
|
880
|
+
}
|
|
881
|
+
} else if (is_low_surrogate(ch))
|
|
882
|
+
{
|
|
883
|
+
// illegal leading low surrogate
|
|
884
|
+
if (flags == conv_flags::strict) {
|
|
885
|
+
--data; /* return to the illegal value itself */
|
|
886
|
+
result = conv_errc::source_illegal;
|
|
887
|
+
break;
|
|
888
|
+
}
|
|
889
|
+
else
|
|
890
|
+
{
|
|
891
|
+
target.push_back((uint16_t)ch);
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
else
|
|
895
|
+
{
|
|
896
|
+
target.push_back((uint16_t)ch);
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
return convert_result<CharT>{data,result} ;
|
|
900
|
+
}
|
|
901
|
+
|
|
902
|
+
template <class CharT,class Container>
|
|
903
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value
|
|
904
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
905
|
+
&& traits_extension::is_char32<typename Container::value_type>::value,
|
|
906
|
+
convert_result<CharT>>::type
|
|
907
|
+
convert(const CharT* data, std::size_t length,
|
|
908
|
+
Container& target,
|
|
909
|
+
conv_flags flags = conv_flags::strict)
|
|
910
|
+
{
|
|
911
|
+
conv_errc result = conv_errc();
|
|
912
|
+
|
|
913
|
+
const CharT* last = data + length;
|
|
914
|
+
while (data != last)
|
|
915
|
+
{
|
|
916
|
+
uint32_t ch = *data++;
|
|
917
|
+
/* If we have a surrogate pair, convert to UTF32 data. */
|
|
918
|
+
if (is_high_surrogate(ch))
|
|
919
|
+
{
|
|
920
|
+
/* If the 16 bits following the high surrogate are in the data buffer... */
|
|
921
|
+
if (data < last) {
|
|
922
|
+
uint32_t ch2 = *data;
|
|
923
|
+
/* If ptr's a low surrogate, convert to UTF32. */
|
|
924
|
+
if (ch2 >= sur_low_start && ch2 <= sur_low_end )
|
|
925
|
+
{
|
|
926
|
+
ch = ((ch - sur_high_start) << half_shift)
|
|
927
|
+
+ (ch2 - sur_low_start) + half_base;
|
|
928
|
+
++data;
|
|
929
|
+
} else if (flags == conv_flags::strict) { /* ptr's an unpaired high surrogate */
|
|
930
|
+
--data; /* return to the illegal value itself */
|
|
931
|
+
result = conv_errc::source_illegal;
|
|
932
|
+
break;
|
|
933
|
+
}
|
|
934
|
+
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
935
|
+
--data; /* return to the high surrogate */
|
|
936
|
+
result = conv_errc::source_exhausted;
|
|
937
|
+
break;
|
|
938
|
+
}
|
|
939
|
+
} else if (flags == conv_flags::strict) {
|
|
940
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
941
|
+
if (is_low_surrogate(ch) )
|
|
942
|
+
{
|
|
943
|
+
--data; /* return to the illegal value itself */
|
|
944
|
+
result = conv_errc::source_illegal;
|
|
945
|
+
break;
|
|
946
|
+
}
|
|
947
|
+
}
|
|
948
|
+
target.push_back(ch);
|
|
949
|
+
}
|
|
950
|
+
return convert_result<CharT>{data,result} ;
|
|
951
|
+
}
|
|
952
|
+
|
|
953
|
+
// utf32
|
|
954
|
+
|
|
955
|
+
template <class CharT,class Container>
|
|
956
|
+
typename std::enable_if<traits_extension::is_char32<CharT>::value
|
|
957
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
958
|
+
&& traits_extension::is_char8<typename Container::value_type>::value,
|
|
959
|
+
convert_result<CharT>>::type
|
|
960
|
+
convert(const CharT* data, std::size_t length,
|
|
961
|
+
Container& target,
|
|
962
|
+
conv_flags flags = conv_flags::strict)
|
|
963
|
+
{
|
|
964
|
+
conv_errc result = conv_errc();
|
|
965
|
+
const CharT* last = data + length;
|
|
966
|
+
while (data < last)
|
|
967
|
+
{
|
|
968
|
+
unsigned short bytes_to_write = 0;
|
|
969
|
+
const uint32_t byteMask = 0xBF;
|
|
970
|
+
const uint32_t byteMark = 0x80;
|
|
971
|
+
uint32_t ch = *data++;
|
|
972
|
+
if (flags == conv_flags::strict )
|
|
973
|
+
{
|
|
974
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
975
|
+
if (is_surrogate(ch))
|
|
976
|
+
{
|
|
977
|
+
--data; /* return to the illegal value itself */
|
|
978
|
+
result = conv_errc::illegal_surrogate_value;
|
|
979
|
+
break;
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
/*
|
|
983
|
+
* Figure out how many bytes the result will require. Turn any
|
|
984
|
+
* illegally large UTF32 things (> Plane 17) into replacement chars.
|
|
985
|
+
*/
|
|
986
|
+
if (ch < (uint32_t)0x80) { bytes_to_write = 1;
|
|
987
|
+
} else if (ch < (uint32_t)0x800) { bytes_to_write = 2;
|
|
988
|
+
} else if (ch < (uint32_t)0x10000) { bytes_to_write = 3;
|
|
989
|
+
} else if (ch <= max_legal_utf32) { bytes_to_write = 4;
|
|
990
|
+
} else {
|
|
991
|
+
bytes_to_write = 3;
|
|
992
|
+
ch = replacement_char;
|
|
993
|
+
result = conv_errc::source_illegal;
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
uint8_t byte1 = 0;
|
|
997
|
+
uint8_t byte2 = 0;
|
|
998
|
+
uint8_t byte3 = 0;
|
|
999
|
+
uint8_t byte4 = 0;
|
|
1000
|
+
|
|
1001
|
+
switch (bytes_to_write) {
|
|
1002
|
+
case 4:
|
|
1003
|
+
byte4 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
1004
|
+
JSONCONS_FALLTHROUGH;
|
|
1005
|
+
case 3:
|
|
1006
|
+
byte3 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
1007
|
+
JSONCONS_FALLTHROUGH;
|
|
1008
|
+
case 2:
|
|
1009
|
+
byte2 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
1010
|
+
JSONCONS_FALLTHROUGH;
|
|
1011
|
+
case 1:
|
|
1012
|
+
byte1 = (uint8_t) (ch | first_byte_mark[bytes_to_write]);
|
|
1013
|
+
break;
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
switch (bytes_to_write)
|
|
1017
|
+
{
|
|
1018
|
+
case 4:
|
|
1019
|
+
target.push_back(byte1);
|
|
1020
|
+
target.push_back(byte2);
|
|
1021
|
+
target.push_back(byte3);
|
|
1022
|
+
target.push_back(byte4);
|
|
1023
|
+
break;
|
|
1024
|
+
case 3:
|
|
1025
|
+
target.push_back(byte1);
|
|
1026
|
+
target.push_back(byte2);
|
|
1027
|
+
target.push_back(byte3);
|
|
1028
|
+
break;
|
|
1029
|
+
case 2:
|
|
1030
|
+
target.push_back(byte1);
|
|
1031
|
+
target.push_back(byte2);
|
|
1032
|
+
break;
|
|
1033
|
+
case 1:
|
|
1034
|
+
target.push_back(byte1);
|
|
1035
|
+
break;
|
|
1036
|
+
}
|
|
1037
|
+
}
|
|
1038
|
+
return convert_result<CharT>{data,result} ;
|
|
1039
|
+
}
|
|
1040
|
+
|
|
1041
|
+
template <class CharT,class Container>
|
|
1042
|
+
typename std::enable_if<traits_extension::is_char32<CharT>::value
|
|
1043
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
1044
|
+
&& traits_extension::is_char16<typename Container::value_type>::value,
|
|
1045
|
+
convert_result<CharT>>::type
|
|
1046
|
+
convert(const CharT* data, std::size_t length,
|
|
1047
|
+
Container& target,
|
|
1048
|
+
conv_flags flags = conv_flags::strict)
|
|
1049
|
+
{
|
|
1050
|
+
conv_errc result = conv_errc();
|
|
1051
|
+
|
|
1052
|
+
const CharT* last = data + length;
|
|
1053
|
+
while (data != last)
|
|
1054
|
+
{
|
|
1055
|
+
uint32_t ch = *data++;
|
|
1056
|
+
if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */
|
|
1057
|
+
/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
|
|
1058
|
+
if (is_surrogate(ch) )
|
|
1059
|
+
{
|
|
1060
|
+
if (flags == conv_flags::strict) {
|
|
1061
|
+
--data; /* return to the illegal value itself */
|
|
1062
|
+
result = conv_errc::source_illegal;
|
|
1063
|
+
break;
|
|
1064
|
+
} else {
|
|
1065
|
+
target.push_back(replacement_char);
|
|
1066
|
+
}
|
|
1067
|
+
} else {
|
|
1068
|
+
target.push_back((uint16_t)ch); /* normal case */
|
|
1069
|
+
}
|
|
1070
|
+
} else if (ch > max_legal_utf32) {
|
|
1071
|
+
if (flags == conv_flags::strict) {
|
|
1072
|
+
result = conv_errc::source_illegal;
|
|
1073
|
+
} else {
|
|
1074
|
+
target.push_back(replacement_char);
|
|
1075
|
+
}
|
|
1076
|
+
} else {
|
|
1077
|
+
/* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
1078
|
+
ch -= half_base;
|
|
1079
|
+
target.push_back((uint16_t)((ch >> half_shift) + sur_high_start));
|
|
1080
|
+
target.push_back((uint16_t)((ch & half_mask) + sur_low_start));
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
return convert_result<CharT>{data,result} ;
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
template <class CharT,class Container>
|
|
1087
|
+
typename std::enable_if<traits_extension::is_char32<CharT>::value
|
|
1088
|
+
&& traits_extension::is_back_insertable<Container>::value
|
|
1089
|
+
&& traits_extension::is_char32<typename Container::value_type>::value,
|
|
1090
|
+
convert_result<CharT>>::type
|
|
1091
|
+
convert(const CharT* data, std::size_t length,
|
|
1092
|
+
Container& target,
|
|
1093
|
+
conv_flags flags = conv_flags::strict)
|
|
1094
|
+
{
|
|
1095
|
+
conv_errc result = conv_errc();
|
|
1096
|
+
|
|
1097
|
+
const CharT* last = data + length;
|
|
1098
|
+
while (data != last)
|
|
1099
|
+
{
|
|
1100
|
+
uint32_t ch = *data++;
|
|
1101
|
+
if (flags == conv_flags::strict )
|
|
1102
|
+
{
|
|
1103
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
1104
|
+
if (is_surrogate(ch))
|
|
1105
|
+
{
|
|
1106
|
+
--data; /* return to the illegal value itself */
|
|
1107
|
+
result = conv_errc::illegal_surrogate_value;
|
|
1108
|
+
break;
|
|
1109
|
+
}
|
|
1110
|
+
}
|
|
1111
|
+
if (ch <= max_legal_utf32)
|
|
1112
|
+
{
|
|
1113
|
+
target.push_back(ch);
|
|
1114
|
+
}
|
|
1115
|
+
else
|
|
1116
|
+
{
|
|
1117
|
+
target.push_back(replacement_char);
|
|
1118
|
+
result = conv_errc::source_illegal;
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
return convert_result<CharT>{data,result} ;
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
// validate
|
|
1125
|
+
|
|
1126
|
+
template <class CharT>
|
|
1127
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value,
|
|
1128
|
+
convert_result<CharT>>::type
|
|
1129
|
+
validate(const CharT* data, std::size_t length) noexcept
|
|
1130
|
+
{
|
|
1131
|
+
conv_errc result = conv_errc();
|
|
1132
|
+
const CharT* last = data + length;
|
|
1133
|
+
while (data != last)
|
|
1134
|
+
{
|
|
1135
|
+
std::size_t len = static_cast<std::size_t>(trailing_bytes_for_utf8[static_cast<uint8_t>(*data)]) + 1;
|
|
1136
|
+
if (len > (std::size_t)(last - data))
|
|
1137
|
+
{
|
|
1138
|
+
return convert_result<CharT>{data, conv_errc::source_exhausted};
|
|
1139
|
+
}
|
|
1140
|
+
if ((result=is_legal_utf8(data, len)) != conv_errc())
|
|
1141
|
+
{
|
|
1142
|
+
return convert_result<CharT>{data,result} ;
|
|
1143
|
+
}
|
|
1144
|
+
data += len;
|
|
1145
|
+
}
|
|
1146
|
+
return convert_result<CharT>{data,result} ;
|
|
1147
|
+
}
|
|
1148
|
+
|
|
1149
|
+
// utf16
|
|
1150
|
+
|
|
1151
|
+
template <class CharT>
|
|
1152
|
+
typename std::enable_if<traits_extension::is_char16<CharT>::value,
|
|
1153
|
+
convert_result<CharT>>::type
|
|
1154
|
+
validate(const CharT* data, std::size_t length) noexcept
|
|
1155
|
+
{
|
|
1156
|
+
conv_errc result = conv_errc();
|
|
1157
|
+
|
|
1158
|
+
const CharT* last = data + length;
|
|
1159
|
+
while (data != last)
|
|
1160
|
+
{
|
|
1161
|
+
uint32_t ch = *data++;
|
|
1162
|
+
/* If we have a surrogate pair, validate to uint32_t data. */
|
|
1163
|
+
if (is_high_surrogate(ch))
|
|
1164
|
+
{
|
|
1165
|
+
/* If the 16 bits following the high surrogate are in the data buffer... */
|
|
1166
|
+
if (data < last) {
|
|
1167
|
+
uint32_t ch2 = *data;
|
|
1168
|
+
/* If ptr's a low surrogate, */
|
|
1169
|
+
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
1170
|
+
++data;
|
|
1171
|
+
} else {
|
|
1172
|
+
--data; /* return to the illegal value itself */
|
|
1173
|
+
result = conv_errc::unpaired_high_surrogate;
|
|
1174
|
+
break;
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1177
|
+
else // We don't have the 16 bits following the high surrogate.
|
|
1178
|
+
{
|
|
1179
|
+
--data; /* return to the high surrogate */
|
|
1180
|
+
result = conv_errc::source_exhausted;
|
|
1181
|
+
break;
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
else if (is_low_surrogate(ch))
|
|
1185
|
+
{
|
|
1186
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
1187
|
+
--data; /* return to the illegal value itself */
|
|
1188
|
+
result = conv_errc::source_illegal;
|
|
1189
|
+
break;
|
|
1190
|
+
}
|
|
1191
|
+
}
|
|
1192
|
+
return convert_result<CharT>{data,result} ;
|
|
1193
|
+
}
|
|
1194
|
+
|
|
1195
|
+
// utf32
|
|
1196
|
+
|
|
1197
|
+
template <class CharT>
|
|
1198
|
+
typename std::enable_if<traits_extension::is_char32<CharT>::value,
|
|
1199
|
+
convert_result<CharT>>::type
|
|
1200
|
+
validate(const CharT* data, std::size_t length) noexcept
|
|
1201
|
+
{
|
|
1202
|
+
conv_errc result = conv_errc();
|
|
1203
|
+
|
|
1204
|
+
const CharT* last = data + length;
|
|
1205
|
+
while (data != last)
|
|
1206
|
+
{
|
|
1207
|
+
uint32_t ch = *data++;
|
|
1208
|
+
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
1209
|
+
if (is_surrogate(ch))
|
|
1210
|
+
{
|
|
1211
|
+
--data; /* return to the illegal value itself */
|
|
1212
|
+
result = conv_errc::illegal_surrogate_value;
|
|
1213
|
+
break;
|
|
1214
|
+
}
|
|
1215
|
+
if (!(ch <= max_legal_utf32))
|
|
1216
|
+
{
|
|
1217
|
+
result = conv_errc::source_illegal;
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
return convert_result<CharT>{data, result} ;
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
// Encodings that detect_encoding can report.
enum class encoding {u8,u16le,u16be,u32le,u32be,undetected};

// Result of detect_encoding: `it` is where the text starts (one past the
// byte order mark when one was recognised, otherwise the start of the
// input) and `ec` is the detected encoding.
template <class Iterator>
struct determine_encoding_result
{
    Iterator it;
    encoding ec;
};

// Guesses the encoding of the byte range [first, last).
//
// With fewer than four bytes, only an exact three-byte UTF-8 byte order
// mark is recognised; anything else is reported as undetected.
// With four or more bytes, a UTF-32, UTF-16 or UTF-8 byte order mark is
// looked for first and `it` is positioned one past it. Without a byte
// order mark the encoding is inferred from which of the first four bytes
// are zero, and `it` is the start of the input.
//
// Iterator must be a multi-pass iterator: the range is measured with
// std::distance and then read.
template <class Iterator>
typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value && sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint8_t),
                        determine_encoding_result<Iterator>>::type
detect_encoding(Iterator first, Iterator last) noexcept
{
    const auto count = std::distance(first, last);
    if (count < 4)
    {
        if (count == 3)
        {
            Iterator second = first;
            ++second;
            Iterator third = second;
            ++third;
            if (static_cast<uint8_t>(*first) == 0xEF
                && static_cast<uint8_t>(*second) == 0xBB
                && static_cast<uint8_t>(*third) == 0xBF)
            {
                // The whole input is the UTF-8 byte order mark.
                return determine_encoding_result<Iterator>{last, encoding::u8};
            }
        }
        return determine_encoding_result<Iterator>{first, encoding::undetected};
    }

    // Positions one past a 1-, 2-, 3- and 4-byte prefix.
    Iterator after1 = first;
    ++after1;
    Iterator after2 = after1;
    ++after2;
    Iterator after3 = after2;
    ++after3;
    Iterator after4 = after3;
    ++after4;

    // Widen before shifting so the top byte never lands in a signed int.
    const uint32_t b1 = static_cast<uint8_t>(*first);
    const uint32_t b2 = static_cast<uint8_t>(*after1);
    const uint32_t b3 = static_cast<uint8_t>(*after2);
    const uint32_t b4 = static_cast<uint8_t>(*after3);

    // First four bytes read as a little-endian word.
    const uint32_t bom = b1 | (b2 << 8) | (b3 << 16) | (b4 << 24);

    // The UTF-32 marks must be tested before the UTF-16 ones: FF FE 00 00
    // also starts with the UTF-16LE mark FF FE.
    if (bom == 0xFFFE0000)
    {
        // 00 00 FE FF
        return determine_encoding_result<Iterator>{after4, encoding::u32be};
    }
    if (bom == 0x0000FEFF)
    {
        // FF FE 00 00
        return determine_encoding_result<Iterator>{after4, encoding::u32le};
    }
    if ((bom & 0xFFFF) == 0xFFFE)
    {
        // FE FF
        return determine_encoding_result<Iterator>{after2, encoding::u16be};
    }
    if ((bom & 0xFFFF) == 0xFEFF)
    {
        // FF FE
        return determine_encoding_result<Iterator>{after2, encoding::u16le};
    }
    if ((bom & 0xFFFFFF) == 0xBFBBEF)
    {
        // EF BB BF
        return determine_encoding_result<Iterator>{after3, encoding::u8};
    }

    // No byte order mark: one bit per non-zero byte, lowest bit = first byte.
    const uint32_t pattern = (b1 ? 1u : 0u) | (b2 ? 2u : 0u) | (b3 ? 4u : 0u) | (b4 ? 8u : 0u);
    switch (pattern)
    {
        case 0x08: // 00 00 00 xx
            return determine_encoding_result<Iterator>{first, encoding::u32be};
        case 0x0A: // 00 xx 00 xx
            return determine_encoding_result<Iterator>{first, encoding::u16be};
        case 0x01: // xx 00 00 00
            return determine_encoding_result<Iterator>{first, encoding::u32le};
        case 0x05: // xx 00 xx 00
            return determine_encoding_result<Iterator>{first, encoding::u16le};
        case 0x0F: // xx xx xx xx
            return determine_encoding_result<Iterator>{first, encoding::u8};
        default:
            return determine_encoding_result<Iterator>{first, encoding::undetected};
    }
}
|
|
1298
|
+
|
|
1299
|
+
// count_codepoints
|
|
1300
|
+
|
|
1301
|
+
template <class CharT>
|
|
1302
|
+
typename std::enable_if<traits_extension::is_char8<CharT>::value || traits_extension::is_char16<CharT>::value || traits_extension::is_char32<CharT>::value, std::size_t>::type
|
|
1303
|
+
count_codepoints(const CharT* data, std::size_t length,
|
|
1304
|
+
conv_flags flags = conv_flags::strict) noexcept
|
|
1305
|
+
{
|
|
1306
|
+
conv_errc ec = conv_errc();
|
|
1307
|
+
|
|
1308
|
+
std::size_t count = 0;
|
|
1309
|
+
const CharT* ptr = data;
|
|
1310
|
+
const CharT* last = data + length;
|
|
1311
|
+
|
|
1312
|
+
for (; ptr < last; ++count)
|
|
1313
|
+
{
|
|
1314
|
+
uint32_t cp = 0;
|
|
1315
|
+
auto r = to_codepoint(ptr, last, cp, flags);
|
|
1316
|
+
if (r.ec != conv_errc())
|
|
1317
|
+
{
|
|
1318
|
+
ec = r.ec;
|
|
1319
|
+
break;
|
|
1320
|
+
}
|
|
1321
|
+
ptr = r.ptr;
|
|
1322
|
+
}
|
|
1323
|
+
return ec == conv_errc() && ptr == last ? count : 0;
|
|
1324
|
+
}
|
|
1325
|
+
|
|
1326
|
+
} // unicode_traits
|
|
1327
|
+
} // jsoncons
|
|
1328
|
+
|
|
1329
|
+
#endif
|
|
1330
|
+
|