catboost 1.25.1 → 1.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/DEPLOYMENT.md +22 -15
- package/README.md +37 -27
- package/binding.gyp +5 -7
- package/build_scripts/bootstrap.js +2 -1
- package/build_scripts/out/build.js +46 -68
- package/build_scripts/out/build_model.js +1 -1
- package/build_scripts/out/{build_ya.js → build_native.js} +1 -1
- package/build_scripts/out/ci.js +5 -5
- package/build_scripts/out/config.js +32 -18
- package/build_scripts/out/install.js +5 -3
- package/build_scripts/out/package_prepublish.js +1 -1
- package/build_scripts/out/packaging.js +1 -19
- package/build_scripts/out/run_tests.js +1 -1
- package/build_scripts/out/test.js +8 -3
- package/config.json +18 -11
- package/inc/catboost/libs/model_interface/c_api.h +349 -3
- package/lib/catboost.d.ts +65 -21
- package/package.json +4 -4
- package/src/api_helpers.cpp +100 -24
- package/src/api_helpers.h +8 -7
- package/src/api_module.cpp +1 -2
- package/src/model.cpp +483 -83
- package/src/model.h +24 -9
- package/inc/contrib/libs/cxxsupp/system_stl/include/stlfwd +0 -14
- package/inc/util/charset/recode_result.h +0 -9
- package/inc/util/charset/unicode_table.h +0 -123
- package/inc/util/charset/unidata.h +0 -421
- package/inc/util/charset/utf8.h +0 -384
- package/inc/util/charset/wide.h +0 -843
- package/inc/util/charset/wide_specific.h +0 -22
- package/inc/util/datetime/base.h +0 -669
- package/inc/util/datetime/constants.h +0 -7
- package/inc/util/datetime/cputimer.h +0 -124
- package/inc/util/datetime/parser.h +0 -292
- package/inc/util/datetime/systime.h +0 -47
- package/inc/util/datetime/uptime.h +0 -8
- package/inc/util/digest/city.h +0 -88
- package/inc/util/digest/fnv.h +0 -73
- package/inc/util/digest/multi.h +0 -14
- package/inc/util/digest/murmur.h +0 -57
- package/inc/util/digest/numeric.h +0 -86
- package/inc/util/digest/sequence.h +0 -48
- package/inc/util/draft/date.h +0 -129
- package/inc/util/draft/datetime.h +0 -184
- package/inc/util/draft/enum.h +0 -136
- package/inc/util/draft/holder_vector.h +0 -102
- package/inc/util/draft/ip.h +0 -131
- package/inc/util/draft/matrix.h +0 -108
- package/inc/util/draft/memory.h +0 -40
- package/inc/util/folder/dirent_win.h +0 -46
- package/inc/util/folder/dirut.h +0 -121
- package/inc/util/folder/filelist.h +0 -81
- package/inc/util/folder/fts.h +0 -108
- package/inc/util/folder/iterator.h +0 -109
- package/inc/util/folder/lstat_win.h +0 -20
- package/inc/util/folder/path.h +0 -225
- package/inc/util/folder/pathsplit.h +0 -113
- package/inc/util/folder/tempdir.h +0 -42
- package/inc/util/generic/adaptor.h +0 -134
- package/inc/util/generic/algorithm.h +0 -765
- package/inc/util/generic/array_ref.h +0 -282
- package/inc/util/generic/array_size.h +0 -24
- package/inc/util/generic/benchmark/vector_count_ctor/f.h +0 -9
- package/inc/util/generic/bitmap.h +0 -1115
- package/inc/util/generic/bitops.h +0 -459
- package/inc/util/generic/bt_exception.h +0 -24
- package/inc/util/generic/buffer.h +0 -232
- package/inc/util/generic/cast.h +0 -176
- package/inc/util/generic/deque.h +0 -24
- package/inc/util/generic/explicit_type.h +0 -42
- package/inc/util/generic/fastqueue.h +0 -55
- package/inc/util/generic/flags.h +0 -244
- package/inc/util/generic/function.h +0 -103
- package/inc/util/generic/fwd.h +0 -171
- package/inc/util/generic/guid.h +0 -61
- package/inc/util/generic/hash.h +0 -2032
- package/inc/util/generic/hash_primes.h +0 -140
- package/inc/util/generic/hash_set.h +0 -490
- package/inc/util/generic/hide_ptr.h +0 -3
- package/inc/util/generic/intrlist.h +0 -876
- package/inc/util/generic/is_in.h +0 -53
- package/inc/util/generic/iterator.h +0 -137
- package/inc/util/generic/iterator_range.h +0 -105
- package/inc/util/generic/lazy_value.h +0 -66
- package/inc/util/generic/list.h +0 -22
- package/inc/util/generic/map.h +0 -44
- package/inc/util/generic/mapfindptr.h +0 -60
- package/inc/util/generic/maybe.h +0 -713
- package/inc/util/generic/maybe_traits.h +0 -164
- package/inc/util/generic/mem_copy.h +0 -55
- package/inc/util/generic/noncopyable.h +0 -38
- package/inc/util/generic/object_counter.h +0 -53
- package/inc/util/generic/ptr.h +0 -1113
- package/inc/util/generic/queue.h +0 -57
- package/inc/util/generic/refcount.h +0 -162
- package/inc/util/generic/reserve.h +0 -11
- package/inc/util/generic/scope.h +0 -65
- package/inc/util/generic/serialized_enum.h +0 -406
- package/inc/util/generic/set.h +0 -42
- package/inc/util/generic/singleton.h +0 -136
- package/inc/util/generic/size_literals.h +0 -65
- package/inc/util/generic/stack.h +0 -18
- package/inc/util/generic/store_policy.h +0 -120
- package/inc/util/generic/strbase.h +0 -612
- package/inc/util/generic/strbuf.h +0 -552
- package/inc/util/generic/strfcpy.h +0 -17
- package/inc/util/generic/string.h +0 -1572
- package/inc/util/generic/string_hash.h +0 -21
- package/inc/util/generic/string_ut.h +0 -1175
- package/inc/util/generic/type_name.h +0 -34
- package/inc/util/generic/typelist.h +0 -114
- package/inc/util/generic/typetraits.h +0 -325
- package/inc/util/generic/utility.h +0 -132
- package/inc/util/generic/va_args.h +0 -400
- package/inc/util/generic/variant.h +0 -631
- package/inc/util/generic/variant_traits.h +0 -171
- package/inc/util/generic/vector.h +0 -119
- package/inc/util/generic/xrange.h +0 -258
- package/inc/util/generic/yexception.h +0 -212
- package/inc/util/generic/yexception_ut.h +0 -14
- package/inc/util/generic/ylimits.h +0 -92
- package/inc/util/generic/ymath.h +0 -206
- package/inc/util/memory/addstorage.h +0 -93
- package/inc/util/memory/alloc.h +0 -27
- package/inc/util/memory/blob.h +0 -296
- package/inc/util/memory/mmapalloc.h +0 -8
- package/inc/util/memory/pool.h +0 -432
- package/inc/util/memory/segmented_string_pool.h +0 -194
- package/inc/util/memory/segpool_alloc.h +0 -118
- package/inc/util/memory/smallobj.h +0 -141
- package/inc/util/memory/tempbuf.h +0 -111
- package/inc/util/network/address.h +0 -136
- package/inc/util/network/endpoint.h +0 -61
- package/inc/util/network/hostip.h +0 -16
- package/inc/util/network/init.h +0 -60
- package/inc/util/network/interface.h +0 -17
- package/inc/util/network/iovec.h +0 -65
- package/inc/util/network/ip.h +0 -116
- package/inc/util/network/nonblock.h +0 -8
- package/inc/util/network/pair.h +0 -9
- package/inc/util/network/poller.h +0 -58
- package/inc/util/network/pollerimpl.h +0 -707
- package/inc/util/network/sock.h +0 -608
- package/inc/util/network/socket.h +0 -421
- package/inc/util/random/common_ops.h +0 -130
- package/inc/util/random/easy.h +0 -47
- package/inc/util/random/entropy.h +0 -21
- package/inc/util/random/fast.h +0 -101
- package/inc/util/random/init_atfork.h +0 -3
- package/inc/util/random/lcg_engine.h +0 -66
- package/inc/util/random/mersenne.h +0 -46
- package/inc/util/random/mersenne32.h +0 -50
- package/inc/util/random/mersenne64.h +0 -50
- package/inc/util/random/normal.h +0 -38
- package/inc/util/random/random.h +0 -30
- package/inc/util/random/shuffle.h +0 -39
- package/inc/util/str_stl.h +0 -266
- package/inc/util/stream/aligned.h +0 -99
- package/inc/util/stream/buffer.h +0 -119
- package/inc/util/stream/buffered.h +0 -225
- package/inc/util/stream/debug.h +0 -53
- package/inc/util/stream/direct_io.h +0 -43
- package/inc/util/stream/file.h +0 -108
- package/inc/util/stream/format.h +0 -444
- package/inc/util/stream/fwd.h +0 -100
- package/inc/util/stream/hex.h +0 -8
- package/inc/util/stream/holder.h +0 -44
- package/inc/util/stream/input.h +0 -273
- package/inc/util/stream/labeled.h +0 -19
- package/inc/util/stream/length.h +0 -100
- package/inc/util/stream/mem.h +0 -255
- package/inc/util/stream/multi.h +0 -32
- package/inc/util/stream/null.h +0 -61
- package/inc/util/stream/output.h +0 -304
- package/inc/util/stream/pipe.h +0 -112
- package/inc/util/stream/printf.h +0 -25
- package/inc/util/stream/str.h +0 -207
- package/inc/util/stream/tee.h +0 -28
- package/inc/util/stream/tempbuf.h +0 -21
- package/inc/util/stream/tokenizer.h +0 -214
- package/inc/util/stream/trace.h +0 -60
- package/inc/util/stream/walk.h +0 -35
- package/inc/util/stream/zerocopy.h +0 -91
- package/inc/util/stream/zerocopy_output.h +0 -57
- package/inc/util/stream/zlib.h +0 -173
- package/inc/util/string/ascii.h +0 -236
- package/inc/util/string/builder.h +0 -39
- package/inc/util/string/cast.h +0 -347
- package/inc/util/string/cstriter.h +0 -14
- package/inc/util/string/escape.h +0 -70
- package/inc/util/string/hex.h +0 -59
- package/inc/util/string/join.h +0 -194
- package/inc/util/string/printf.h +0 -13
- package/inc/util/string/reverse.h +0 -16
- package/inc/util/string/split.h +0 -1080
- package/inc/util/string/strip.h +0 -257
- package/inc/util/string/strspn.h +0 -65
- package/inc/util/string/subst.h +0 -56
- package/inc/util/string/type.h +0 -50
- package/inc/util/string/util.h +0 -195
- package/inc/util/string/vector.h +0 -132
- package/inc/util/system/align.h +0 -50
- package/inc/util/system/atexit.h +0 -22
- package/inc/util/system/atomic.h +0 -51
- package/inc/util/system/atomic_gcc.h +0 -90
- package/inc/util/system/atomic_ops.h +0 -189
- package/inc/util/system/atomic_win.h +0 -114
- package/inc/util/system/backtrace.h +0 -39
- package/inc/util/system/byteorder.h +0 -186
- package/inc/util/system/compat.h +0 -84
- package/inc/util/system/compiler.h +0 -620
- package/inc/util/system/condvar.h +0 -71
- package/inc/util/system/context.h +0 -181
- package/inc/util/system/context_aarch64.h +0 -8
- package/inc/util/system/context_i686.h +0 -9
- package/inc/util/system/context_x86.h +0 -12
- package/inc/util/system/context_x86_64.h +0 -7
- package/inc/util/system/cpu_id.h +0 -159
- package/inc/util/system/daemon.h +0 -28
- package/inc/util/system/datetime.h +0 -98
- package/inc/util/system/defaults.h +0 -149
- package/inc/util/system/demangle.h +0 -5
- package/inc/util/system/demangle_impl.h +0 -23
- package/inc/util/system/direct_io.h +0 -71
- package/inc/util/system/dynlib.h +0 -119
- package/inc/util/system/env.h +0 -32
- package/inc/util/system/error.h +0 -95
- package/inc/util/system/event.h +0 -122
- package/inc/util/system/execpath.h +0 -17
- package/inc/util/system/fasttime.h +0 -6
- package/inc/util/system/fhandle.h +0 -27
- package/inc/util/system/file.h +0 -210
- package/inc/util/system/file_lock.h +0 -34
- package/inc/util/system/filemap.h +0 -383
- package/inc/util/system/flock.h +0 -35
- package/inc/util/system/fs.h +0 -156
- package/inc/util/system/fs_win.h +0 -29
- package/inc/util/system/fstat.h +0 -46
- package/inc/util/system/getpid.h +0 -12
- package/inc/util/system/guard.h +0 -179
- package/inc/util/system/hi_lo.h +0 -139
- package/inc/util/system/hostname.h +0 -10
- package/inc/util/system/hp_timer.h +0 -36
- package/inc/util/system/info.h +0 -12
- package/inc/util/system/interrupt_signals.h +0 -22
- package/inc/util/system/madvise.h +0 -30
- package/inc/util/system/maxlen.h +0 -32
- package/inc/util/system/mem_info.h +0 -18
- package/inc/util/system/mincore.h +0 -38
- package/inc/util/system/mktemp.h +0 -11
- package/inc/util/system/mlock.h +0 -43
- package/inc/util/system/mutex.h +0 -67
- package/inc/util/system/nice.h +0 -3
- package/inc/util/system/pipe.h +0 -90
- package/inc/util/system/platform.h +0 -246
- package/inc/util/system/progname.h +0 -13
- package/inc/util/system/protect.h +0 -25
- package/inc/util/system/rusage.h +0 -26
- package/inc/util/system/rwlock.h +0 -78
- package/inc/util/system/sanitizers.h +0 -122
- package/inc/util/system/sem.h +0 -41
- package/inc/util/system/shellcommand.h +0 -472
- package/inc/util/system/shmat.h +0 -32
- package/inc/util/system/sigset.h +0 -78
- package/inc/util/system/spin_wait.h +0 -10
- package/inc/util/system/spinlock.h +0 -121
- package/inc/util/system/src_location.h +0 -25
- package/inc/util/system/src_root.h +0 -68
- package/inc/util/system/sys_alloc.h +0 -43
- package/inc/util/system/sysstat.h +0 -52
- package/inc/util/system/tempfile.h +0 -34
- package/inc/util/system/thread.h +0 -167
- package/inc/util/system/tls.h +0 -307
- package/inc/util/system/types.h +0 -119
- package/inc/util/system/unaligned_mem.h +0 -67
- package/inc/util/system/user.h +0 -5
- package/inc/util/system/utime.h +0 -6
- package/inc/util/system/valgrind.h +0 -48
- package/inc/util/system/winint.h +0 -43
- package/inc/util/system/yassert.h +0 -121
- package/inc/util/system/yield.h +0 -4
- package/inc/util/thread/factory.h +0 -65
- package/inc/util/thread/fwd.h +0 -30
- package/inc/util/thread/lfqueue.h +0 -406
- package/inc/util/thread/lfstack.h +0 -188
- package/inc/util/thread/pool.h +0 -388
- package/inc/util/thread/singleton.h +0 -42
- package/inc/util/ysafeptr.h +0 -427
- package/inc/util/ysaveload.h +0 -700
package/inc/util/charset/utf8.h
DELETED
|
@@ -1,384 +0,0 @@
|
|
|
1
|
-
#pragma once
|
|
2
|
-
|
|
3
|
-
#include "recode_result.h"
|
|
4
|
-
|
|
5
|
-
#include <util/generic/strbuf.h>
|
|
6
|
-
#include <util/generic/string.h>
|
|
7
|
-
#include <util/generic/yexception.h>
|
|
8
|
-
#include <util/system/defaults.h>
|
|
9
|
-
#include <util/system/yassert.h>
|
|
10
|
-
|
|
11
|
-
extern const wchar32 BROKEN_RUNE;
|
|
12
|
-
|
|
13
|
-
inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
|
|
14
|
-
// Y_ASSERT (utf8_rune_len <= 4);
|
|
15
|
-
return "\0\0\037\017\007"[utf8_rune_len];
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
inline size_t UTF8RuneLen(const unsigned char lead_byte) {
|
|
19
|
-
//b0XXXXXXX
|
|
20
|
-
if ((lead_byte & 0x80) == 0x00) {
|
|
21
|
-
return 1;
|
|
22
|
-
}
|
|
23
|
-
//b110XXXXX
|
|
24
|
-
if ((lead_byte & 0xe0) == 0xc0) {
|
|
25
|
-
return 2;
|
|
26
|
-
}
|
|
27
|
-
//b1110XXXX
|
|
28
|
-
if ((lead_byte & 0xf0) == 0xe0) {
|
|
29
|
-
return 3;
|
|
30
|
-
}
|
|
31
|
-
//b11110XXX
|
|
32
|
-
if ((lead_byte & 0xf8) == 0xf0) {
|
|
33
|
-
return 4;
|
|
34
|
-
}
|
|
35
|
-
//b10XXXXXX
|
|
36
|
-
return 0;
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
inline size_t UTF8RuneLenByUCS(wchar32 rune) {
|
|
40
|
-
if (rune < 0x80)
|
|
41
|
-
return 1U;
|
|
42
|
-
else if (rune < 0x800)
|
|
43
|
-
return 2U;
|
|
44
|
-
else if (rune < 0x10000)
|
|
45
|
-
return 3U;
|
|
46
|
-
else if (rune < 0x200000)
|
|
47
|
-
return 4U;
|
|
48
|
-
else if (rune < 0x4000000)
|
|
49
|
-
return 5U;
|
|
50
|
-
else
|
|
51
|
-
return 6U;
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
|
|
55
|
-
rune = c;
|
|
56
|
-
rune &= UTF8LeadByteMask(len);
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
|
|
60
|
-
rune <<= 6;
|
|
61
|
-
rune |= c & 0x3F;
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
inline bool IsUTF8ContinuationByte(unsigned char c) {
|
|
65
|
-
return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
//! returns length of the current UTF8 character
|
|
69
|
-
//! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
|
|
70
|
-
//! @param p pointer to the current character
|
|
71
|
-
//! @param e end of the character sequence
|
|
72
|
-
inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
|
|
73
|
-
Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
|
|
74
|
-
switch (UTF8RuneLen(*p)) {
|
|
75
|
-
case 0:
|
|
76
|
-
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
|
|
77
|
-
|
|
78
|
-
case 1:
|
|
79
|
-
n = 1;
|
|
80
|
-
return RECODE_OK;
|
|
81
|
-
|
|
82
|
-
case 2:
|
|
83
|
-
if (p + 2 > e) {
|
|
84
|
-
return RECODE_EOINPUT;
|
|
85
|
-
} else if (!IsUTF8ContinuationByte(p[1])) {
|
|
86
|
-
return RECODE_BROKENSYMBOL;
|
|
87
|
-
} else {
|
|
88
|
-
n = 2;
|
|
89
|
-
return RECODE_OK;
|
|
90
|
-
}
|
|
91
|
-
case 3:
|
|
92
|
-
if (p + 3 > e) {
|
|
93
|
-
return RECODE_EOINPUT;
|
|
94
|
-
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
|
|
95
|
-
return RECODE_BROKENSYMBOL;
|
|
96
|
-
} else {
|
|
97
|
-
n = 3;
|
|
98
|
-
return RECODE_OK;
|
|
99
|
-
}
|
|
100
|
-
default: // actually 4
|
|
101
|
-
if (p + 4 > e) {
|
|
102
|
-
return RECODE_EOINPUT;
|
|
103
|
-
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
|
|
104
|
-
return RECODE_BROKENSYMBOL;
|
|
105
|
-
} else {
|
|
106
|
-
n = 4;
|
|
107
|
-
return RECODE_OK;
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
//! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
|
|
113
|
-
//! @param text UTF8 encoded text
|
|
114
|
-
//! @param len the length of the text in bytes
|
|
115
|
-
//! @param number number of encoded symbols in the text
|
|
116
|
-
inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
|
|
117
|
-
const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
|
|
118
|
-
const unsigned char* const last = cur + len;
|
|
119
|
-
number = 0;
|
|
120
|
-
size_t runeLen;
|
|
121
|
-
bool res = true;
|
|
122
|
-
while (cur != last) {
|
|
123
|
-
if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
|
|
124
|
-
res = false;
|
|
125
|
-
break;
|
|
126
|
-
}
|
|
127
|
-
cur += runeLen;
|
|
128
|
-
Y_ASSERT(cur <= last);
|
|
129
|
-
++number;
|
|
130
|
-
}
|
|
131
|
-
return res;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
|
|
135
|
-
size_t number;
|
|
136
|
-
if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
|
|
137
|
-
ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
|
|
138
|
-
}
|
|
139
|
-
return number;
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
//! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
|
|
143
|
-
//! @param rune value of the current character
|
|
144
|
-
//! @param rune_len length of the UTF8 bytes sequence that has been read
|
|
145
|
-
//! @param s pointer to the current character
|
|
146
|
-
//! @param end the end of the character sequence
|
|
147
|
-
inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
|
|
148
|
-
rune = BROKEN_RUNE;
|
|
149
|
-
rune_len = 0;
|
|
150
|
-
wchar32 _rune;
|
|
151
|
-
|
|
152
|
-
size_t _len = UTF8RuneLen(*s);
|
|
153
|
-
if (s + _len > end)
|
|
154
|
-
return RECODE_EOINPUT; //[EOINPUT]
|
|
155
|
-
if (_len == 0)
|
|
156
|
-
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
|
|
157
|
-
_rune = *s++; //[00000000 0XXXXXXX]
|
|
158
|
-
|
|
159
|
-
if (_len > 1) {
|
|
160
|
-
_rune &= UTF8LeadByteMask(_len);
|
|
161
|
-
unsigned char ch = *s++;
|
|
162
|
-
if (!IsUTF8ContinuationByte(ch))
|
|
163
|
-
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
|
|
164
|
-
PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY]
|
|
165
|
-
if (_len > 2) {
|
|
166
|
-
ch = *s++;
|
|
167
|
-
if (!IsUTF8ContinuationByte(ch))
|
|
168
|
-
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
|
|
169
|
-
PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ]
|
|
170
|
-
if (_len > 3) {
|
|
171
|
-
ch = *s;
|
|
172
|
-
if (!IsUTF8ContinuationByte(ch))
|
|
173
|
-
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
|
|
174
|
-
PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ]
|
|
175
|
-
if (_rune > 0x10FFFF) // it is not a valid Unicode code point
|
|
176
|
-
return RECODE_BROKENSYMBOL;
|
|
177
|
-
if (_rune < 0x10000) // check for overlong encoding
|
|
178
|
-
return RECODE_BROKENSYMBOL;
|
|
179
|
-
} else {
|
|
180
|
-
if (_rune < 0x800) // check for overlong encoding
|
|
181
|
-
return RECODE_BROKENSYMBOL;
|
|
182
|
-
}
|
|
183
|
-
} else {
|
|
184
|
-
if (_rune < 0x80) // check for overlong encoding
|
|
185
|
-
return RECODE_BROKENSYMBOL;
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
rune_len = _len;
|
|
189
|
-
rune = _rune;
|
|
190
|
-
return RECODE_OK;
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
//! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
|
|
194
|
-
//! @param c value of the current character
|
|
195
|
-
//! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
|
|
196
|
-
//! @param e the end of the character sequence
|
|
197
|
-
Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
|
|
198
|
-
Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
|
|
199
|
-
switch (UTF8RuneLen(*p)) {
|
|
200
|
-
case 0:
|
|
201
|
-
rune = BROKEN_RUNE;
|
|
202
|
-
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
|
|
203
|
-
|
|
204
|
-
case 1:
|
|
205
|
-
rune = *p; //[00000000 0XXXXXXX]
|
|
206
|
-
++p;
|
|
207
|
-
return RECODE_OK;
|
|
208
|
-
|
|
209
|
-
case 2:
|
|
210
|
-
if (p + 2 > e) {
|
|
211
|
-
return RECODE_EOINPUT;
|
|
212
|
-
} else if (!IsUTF8ContinuationByte(p[1])) {
|
|
213
|
-
rune = BROKEN_RUNE;
|
|
214
|
-
return RECODE_BROKENSYMBOL;
|
|
215
|
-
} else {
|
|
216
|
-
PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
|
|
217
|
-
PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
|
|
218
|
-
if (Y_UNLIKELY(rune < 0x80)) { // overlong encoding
|
|
219
|
-
p -= 2;
|
|
220
|
-
rune = BROKEN_RUNE;
|
|
221
|
-
return RECODE_BROKENSYMBOL;
|
|
222
|
-
}
|
|
223
|
-
return RECODE_OK;
|
|
224
|
-
}
|
|
225
|
-
case 3:
|
|
226
|
-
if (p + 3 > e) {
|
|
227
|
-
return RECODE_EOINPUT;
|
|
228
|
-
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
|
|
229
|
-
rune = BROKEN_RUNE;
|
|
230
|
-
return RECODE_BROKENSYMBOL;
|
|
231
|
-
} else {
|
|
232
|
-
PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
|
|
233
|
-
PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
|
|
234
|
-
PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
|
|
235
|
-
if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
|
|
236
|
-
p -= 3;
|
|
237
|
-
rune = BROKEN_RUNE;
|
|
238
|
-
return RECODE_BROKENSYMBOL;
|
|
239
|
-
}
|
|
240
|
-
return RECODE_OK;
|
|
241
|
-
}
|
|
242
|
-
case 4:
|
|
243
|
-
if (p + 4 > e) {
|
|
244
|
-
return RECODE_EOINPUT;
|
|
245
|
-
} else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
|
|
246
|
-
rune = BROKEN_RUNE;
|
|
247
|
-
return RECODE_BROKENSYMBOL;
|
|
248
|
-
} else {
|
|
249
|
-
PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
|
|
250
|
-
PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY]
|
|
251
|
-
PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ]
|
|
252
|
-
PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ]
|
|
253
|
-
if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
|
|
254
|
-
p -= 4;
|
|
255
|
-
rune = BROKEN_RUNE;
|
|
256
|
-
return RECODE_BROKENSYMBOL;
|
|
257
|
-
}
|
|
258
|
-
return RECODE_OK;
|
|
259
|
-
}
|
|
260
|
-
default: // >4
|
|
261
|
-
rune = BROKEN_RUNE;
|
|
262
|
-
return RECODE_BROKENSYMBOL;
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
//! writes one unicode symbol into a character sequence encoded UTF8
|
|
267
|
-
//! checks for end of the buffer and returns the result of encoding
|
|
268
|
-
//! @param rune value of the current character
|
|
269
|
-
//! @param rune_len length of the UTF8 byte sequence that has been written
|
|
270
|
-
//! @param s pointer to the output buffer
|
|
271
|
-
//! @param tail available size of the buffer
|
|
272
|
-
inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
|
|
273
|
-
rune_len = 0;
|
|
274
|
-
if (rune < 0x80) {
|
|
275
|
-
if (tail <= 0)
|
|
276
|
-
return RECODE_EOOUTPUT;
|
|
277
|
-
*s = static_cast<unsigned char>(rune);
|
|
278
|
-
rune_len = 1;
|
|
279
|
-
return RECODE_OK;
|
|
280
|
-
}
|
|
281
|
-
if (rune < 0x800) {
|
|
282
|
-
if (tail <= 1)
|
|
283
|
-
return RECODE_EOOUTPUT;
|
|
284
|
-
*s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
|
|
285
|
-
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
|
|
286
|
-
rune_len = 2;
|
|
287
|
-
return RECODE_OK;
|
|
288
|
-
}
|
|
289
|
-
if (rune < 0x10000) {
|
|
290
|
-
if (tail <= 2)
|
|
291
|
-
return RECODE_EOOUTPUT;
|
|
292
|
-
*s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
|
|
293
|
-
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
|
|
294
|
-
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
|
|
295
|
-
rune_len = 3;
|
|
296
|
-
return RECODE_OK;
|
|
297
|
-
}
|
|
298
|
-
/*if (rune < 0x200000)*/ {
|
|
299
|
-
if (tail <= 3)
|
|
300
|
-
return RECODE_EOOUTPUT;
|
|
301
|
-
*s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
|
|
302
|
-
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
|
|
303
|
-
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
|
|
304
|
-
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
|
|
305
|
-
rune_len = 4;
|
|
306
|
-
return RECODE_OK;
|
|
307
|
-
}
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
|
|
311
|
-
return SafeWriteUTF8Char(rune, rune_len, s, end - s);
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
//! writes one unicode symbol into a character sequence encoded UTF8
|
|
315
|
-
//! @attention this function works as @c SafeWriteUTF8Char it does not check
|
|
316
|
-
//! the size of the output buffer, it supposes that buffer is long enough
|
|
317
|
-
//! @param rune value of the current character
|
|
318
|
-
//! @param rune_len length of the UTF8 byte sequence that has been written
|
|
319
|
-
//! @param s pointer to the output buffer
|
|
320
|
-
inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
|
|
321
|
-
if (rune < 0x80) {
|
|
322
|
-
*s = static_cast<unsigned char>(rune);
|
|
323
|
-
rune_len = 1;
|
|
324
|
-
return;
|
|
325
|
-
}
|
|
326
|
-
if (rune < 0x800) {
|
|
327
|
-
*s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
|
|
328
|
-
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
|
|
329
|
-
rune_len = 2;
|
|
330
|
-
return;
|
|
331
|
-
}
|
|
332
|
-
if (rune < 0x10000) {
|
|
333
|
-
*s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
|
|
334
|
-
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
|
|
335
|
-
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
|
|
336
|
-
rune_len = 3;
|
|
337
|
-
return;
|
|
338
|
-
}
|
|
339
|
-
/*if (rune < 0x200000)*/ {
|
|
340
|
-
*s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
|
|
341
|
-
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
|
|
342
|
-
*s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
|
|
343
|
-
*s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
|
|
344
|
-
rune_len = 4;
|
|
345
|
-
}
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len);
|
|
349
|
-
|
|
350
|
-
enum EUTF8Detect {
|
|
351
|
-
NotUTF8,
|
|
352
|
-
UTF8,
|
|
353
|
-
ASCII
|
|
354
|
-
};
|
|
355
|
-
|
|
356
|
-
EUTF8Detect UTF8Detect(const char* s, size_t len);
|
|
357
|
-
|
|
358
|
-
inline EUTF8Detect UTF8Detect(const TStringBuf input) {
|
|
359
|
-
return UTF8Detect(input.data(), input.size());
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
inline bool IsUtf(const char* input, size_t len) {
|
|
363
|
-
return UTF8Detect(input, len) != NotUTF8;
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
inline bool IsUtf(const TStringBuf input) {
|
|
367
|
-
return IsUtf(input.data(), input.size());
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
//! returns true, if result is not the same as input, and put it in newString
|
|
371
|
-
//! returns false, if result is unmodified
|
|
372
|
-
bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
|
|
373
|
-
|
|
374
|
-
TString ToLowerUTF8(const TString& s);
|
|
375
|
-
TString ToLowerUTF8(TStringBuf s);
|
|
376
|
-
TString ToLowerUTF8(const char* s);
|
|
377
|
-
|
|
378
|
-
//! returns true, if result is not the same as input, and put it in newString
|
|
379
|
-
//! returns false, if result is unmodified
|
|
380
|
-
bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
|
|
381
|
-
|
|
382
|
-
TString ToUpperUTF8(const TString& s);
|
|
383
|
-
TString ToUpperUTF8(TStringBuf s);
|
|
384
|
-
TString ToUpperUTF8(const char* s);
|