catboost 1.25.1 → 1.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (289) hide show
  1. package/DEPLOYMENT.md +22 -15
  2. package/README.md +37 -27
  3. package/binding.gyp +5 -7
  4. package/build_scripts/bootstrap.js +2 -1
  5. package/build_scripts/out/build.js +46 -68
  6. package/build_scripts/out/build_model.js +1 -1
  7. package/build_scripts/out/{build_ya.js → build_native.js} +1 -1
  8. package/build_scripts/out/ci.js +5 -5
  9. package/build_scripts/out/config.js +32 -18
  10. package/build_scripts/out/install.js +5 -3
  11. package/build_scripts/out/package_prepublish.js +1 -1
  12. package/build_scripts/out/packaging.js +1 -19
  13. package/build_scripts/out/run_tests.js +1 -1
  14. package/build_scripts/out/test.js +8 -3
  15. package/config.json +18 -11
  16. package/inc/catboost/libs/model_interface/c_api.h +349 -3
  17. package/lib/catboost.d.ts +65 -21
  18. package/package.json +4 -4
  19. package/src/api_helpers.cpp +100 -24
  20. package/src/api_helpers.h +8 -7
  21. package/src/api_module.cpp +1 -2
  22. package/src/model.cpp +483 -83
  23. package/src/model.h +24 -9
  24. package/inc/contrib/libs/cxxsupp/system_stl/include/stlfwd +0 -14
  25. package/inc/util/charset/recode_result.h +0 -9
  26. package/inc/util/charset/unicode_table.h +0 -123
  27. package/inc/util/charset/unidata.h +0 -421
  28. package/inc/util/charset/utf8.h +0 -384
  29. package/inc/util/charset/wide.h +0 -843
  30. package/inc/util/charset/wide_specific.h +0 -22
  31. package/inc/util/datetime/base.h +0 -669
  32. package/inc/util/datetime/constants.h +0 -7
  33. package/inc/util/datetime/cputimer.h +0 -124
  34. package/inc/util/datetime/parser.h +0 -292
  35. package/inc/util/datetime/systime.h +0 -47
  36. package/inc/util/datetime/uptime.h +0 -8
  37. package/inc/util/digest/city.h +0 -88
  38. package/inc/util/digest/fnv.h +0 -73
  39. package/inc/util/digest/multi.h +0 -14
  40. package/inc/util/digest/murmur.h +0 -57
  41. package/inc/util/digest/numeric.h +0 -86
  42. package/inc/util/digest/sequence.h +0 -48
  43. package/inc/util/draft/date.h +0 -129
  44. package/inc/util/draft/datetime.h +0 -184
  45. package/inc/util/draft/enum.h +0 -136
  46. package/inc/util/draft/holder_vector.h +0 -102
  47. package/inc/util/draft/ip.h +0 -131
  48. package/inc/util/draft/matrix.h +0 -108
  49. package/inc/util/draft/memory.h +0 -40
  50. package/inc/util/folder/dirent_win.h +0 -46
  51. package/inc/util/folder/dirut.h +0 -121
  52. package/inc/util/folder/filelist.h +0 -81
  53. package/inc/util/folder/fts.h +0 -108
  54. package/inc/util/folder/iterator.h +0 -109
  55. package/inc/util/folder/lstat_win.h +0 -20
  56. package/inc/util/folder/path.h +0 -225
  57. package/inc/util/folder/pathsplit.h +0 -113
  58. package/inc/util/folder/tempdir.h +0 -42
  59. package/inc/util/generic/adaptor.h +0 -134
  60. package/inc/util/generic/algorithm.h +0 -765
  61. package/inc/util/generic/array_ref.h +0 -282
  62. package/inc/util/generic/array_size.h +0 -24
  63. package/inc/util/generic/benchmark/vector_count_ctor/f.h +0 -9
  64. package/inc/util/generic/bitmap.h +0 -1115
  65. package/inc/util/generic/bitops.h +0 -459
  66. package/inc/util/generic/bt_exception.h +0 -24
  67. package/inc/util/generic/buffer.h +0 -232
  68. package/inc/util/generic/cast.h +0 -176
  69. package/inc/util/generic/deque.h +0 -24
  70. package/inc/util/generic/explicit_type.h +0 -42
  71. package/inc/util/generic/fastqueue.h +0 -55
  72. package/inc/util/generic/flags.h +0 -244
  73. package/inc/util/generic/function.h +0 -103
  74. package/inc/util/generic/fwd.h +0 -171
  75. package/inc/util/generic/guid.h +0 -61
  76. package/inc/util/generic/hash.h +0 -2032
  77. package/inc/util/generic/hash_primes.h +0 -140
  78. package/inc/util/generic/hash_set.h +0 -490
  79. package/inc/util/generic/hide_ptr.h +0 -3
  80. package/inc/util/generic/intrlist.h +0 -876
  81. package/inc/util/generic/is_in.h +0 -53
  82. package/inc/util/generic/iterator.h +0 -137
  83. package/inc/util/generic/iterator_range.h +0 -105
  84. package/inc/util/generic/lazy_value.h +0 -66
  85. package/inc/util/generic/list.h +0 -22
  86. package/inc/util/generic/map.h +0 -44
  87. package/inc/util/generic/mapfindptr.h +0 -60
  88. package/inc/util/generic/maybe.h +0 -713
  89. package/inc/util/generic/maybe_traits.h +0 -164
  90. package/inc/util/generic/mem_copy.h +0 -55
  91. package/inc/util/generic/noncopyable.h +0 -38
  92. package/inc/util/generic/object_counter.h +0 -53
  93. package/inc/util/generic/ptr.h +0 -1113
  94. package/inc/util/generic/queue.h +0 -57
  95. package/inc/util/generic/refcount.h +0 -162
  96. package/inc/util/generic/reserve.h +0 -11
  97. package/inc/util/generic/scope.h +0 -65
  98. package/inc/util/generic/serialized_enum.h +0 -406
  99. package/inc/util/generic/set.h +0 -42
  100. package/inc/util/generic/singleton.h +0 -136
  101. package/inc/util/generic/size_literals.h +0 -65
  102. package/inc/util/generic/stack.h +0 -18
  103. package/inc/util/generic/store_policy.h +0 -120
  104. package/inc/util/generic/strbase.h +0 -612
  105. package/inc/util/generic/strbuf.h +0 -552
  106. package/inc/util/generic/strfcpy.h +0 -17
  107. package/inc/util/generic/string.h +0 -1572
  108. package/inc/util/generic/string_hash.h +0 -21
  109. package/inc/util/generic/string_ut.h +0 -1175
  110. package/inc/util/generic/type_name.h +0 -34
  111. package/inc/util/generic/typelist.h +0 -114
  112. package/inc/util/generic/typetraits.h +0 -325
  113. package/inc/util/generic/utility.h +0 -132
  114. package/inc/util/generic/va_args.h +0 -400
  115. package/inc/util/generic/variant.h +0 -631
  116. package/inc/util/generic/variant_traits.h +0 -171
  117. package/inc/util/generic/vector.h +0 -119
  118. package/inc/util/generic/xrange.h +0 -258
  119. package/inc/util/generic/yexception.h +0 -212
  120. package/inc/util/generic/yexception_ut.h +0 -14
  121. package/inc/util/generic/ylimits.h +0 -92
  122. package/inc/util/generic/ymath.h +0 -206
  123. package/inc/util/memory/addstorage.h +0 -93
  124. package/inc/util/memory/alloc.h +0 -27
  125. package/inc/util/memory/blob.h +0 -296
  126. package/inc/util/memory/mmapalloc.h +0 -8
  127. package/inc/util/memory/pool.h +0 -432
  128. package/inc/util/memory/segmented_string_pool.h +0 -194
  129. package/inc/util/memory/segpool_alloc.h +0 -118
  130. package/inc/util/memory/smallobj.h +0 -141
  131. package/inc/util/memory/tempbuf.h +0 -111
  132. package/inc/util/network/address.h +0 -136
  133. package/inc/util/network/endpoint.h +0 -61
  134. package/inc/util/network/hostip.h +0 -16
  135. package/inc/util/network/init.h +0 -60
  136. package/inc/util/network/interface.h +0 -17
  137. package/inc/util/network/iovec.h +0 -65
  138. package/inc/util/network/ip.h +0 -116
  139. package/inc/util/network/nonblock.h +0 -8
  140. package/inc/util/network/pair.h +0 -9
  141. package/inc/util/network/poller.h +0 -58
  142. package/inc/util/network/pollerimpl.h +0 -707
  143. package/inc/util/network/sock.h +0 -608
  144. package/inc/util/network/socket.h +0 -421
  145. package/inc/util/random/common_ops.h +0 -130
  146. package/inc/util/random/easy.h +0 -47
  147. package/inc/util/random/entropy.h +0 -21
  148. package/inc/util/random/fast.h +0 -101
  149. package/inc/util/random/init_atfork.h +0 -3
  150. package/inc/util/random/lcg_engine.h +0 -66
  151. package/inc/util/random/mersenne.h +0 -46
  152. package/inc/util/random/mersenne32.h +0 -50
  153. package/inc/util/random/mersenne64.h +0 -50
  154. package/inc/util/random/normal.h +0 -38
  155. package/inc/util/random/random.h +0 -30
  156. package/inc/util/random/shuffle.h +0 -39
  157. package/inc/util/str_stl.h +0 -266
  158. package/inc/util/stream/aligned.h +0 -99
  159. package/inc/util/stream/buffer.h +0 -119
  160. package/inc/util/stream/buffered.h +0 -225
  161. package/inc/util/stream/debug.h +0 -53
  162. package/inc/util/stream/direct_io.h +0 -43
  163. package/inc/util/stream/file.h +0 -108
  164. package/inc/util/stream/format.h +0 -444
  165. package/inc/util/stream/fwd.h +0 -100
  166. package/inc/util/stream/hex.h +0 -8
  167. package/inc/util/stream/holder.h +0 -44
  168. package/inc/util/stream/input.h +0 -273
  169. package/inc/util/stream/labeled.h +0 -19
  170. package/inc/util/stream/length.h +0 -100
  171. package/inc/util/stream/mem.h +0 -255
  172. package/inc/util/stream/multi.h +0 -32
  173. package/inc/util/stream/null.h +0 -61
  174. package/inc/util/stream/output.h +0 -304
  175. package/inc/util/stream/pipe.h +0 -112
  176. package/inc/util/stream/printf.h +0 -25
  177. package/inc/util/stream/str.h +0 -207
  178. package/inc/util/stream/tee.h +0 -28
  179. package/inc/util/stream/tempbuf.h +0 -21
  180. package/inc/util/stream/tokenizer.h +0 -214
  181. package/inc/util/stream/trace.h +0 -60
  182. package/inc/util/stream/walk.h +0 -35
  183. package/inc/util/stream/zerocopy.h +0 -91
  184. package/inc/util/stream/zerocopy_output.h +0 -57
  185. package/inc/util/stream/zlib.h +0 -173
  186. package/inc/util/string/ascii.h +0 -236
  187. package/inc/util/string/builder.h +0 -39
  188. package/inc/util/string/cast.h +0 -347
  189. package/inc/util/string/cstriter.h +0 -14
  190. package/inc/util/string/escape.h +0 -70
  191. package/inc/util/string/hex.h +0 -59
  192. package/inc/util/string/join.h +0 -194
  193. package/inc/util/string/printf.h +0 -13
  194. package/inc/util/string/reverse.h +0 -16
  195. package/inc/util/string/split.h +0 -1080
  196. package/inc/util/string/strip.h +0 -257
  197. package/inc/util/string/strspn.h +0 -65
  198. package/inc/util/string/subst.h +0 -56
  199. package/inc/util/string/type.h +0 -50
  200. package/inc/util/string/util.h +0 -195
  201. package/inc/util/string/vector.h +0 -132
  202. package/inc/util/system/align.h +0 -50
  203. package/inc/util/system/atexit.h +0 -22
  204. package/inc/util/system/atomic.h +0 -51
  205. package/inc/util/system/atomic_gcc.h +0 -90
  206. package/inc/util/system/atomic_ops.h +0 -189
  207. package/inc/util/system/atomic_win.h +0 -114
  208. package/inc/util/system/backtrace.h +0 -39
  209. package/inc/util/system/byteorder.h +0 -186
  210. package/inc/util/system/compat.h +0 -84
  211. package/inc/util/system/compiler.h +0 -620
  212. package/inc/util/system/condvar.h +0 -71
  213. package/inc/util/system/context.h +0 -181
  214. package/inc/util/system/context_aarch64.h +0 -8
  215. package/inc/util/system/context_i686.h +0 -9
  216. package/inc/util/system/context_x86.h +0 -12
  217. package/inc/util/system/context_x86_64.h +0 -7
  218. package/inc/util/system/cpu_id.h +0 -159
  219. package/inc/util/system/daemon.h +0 -28
  220. package/inc/util/system/datetime.h +0 -98
  221. package/inc/util/system/defaults.h +0 -149
  222. package/inc/util/system/demangle.h +0 -5
  223. package/inc/util/system/demangle_impl.h +0 -23
  224. package/inc/util/system/direct_io.h +0 -71
  225. package/inc/util/system/dynlib.h +0 -119
  226. package/inc/util/system/env.h +0 -32
  227. package/inc/util/system/error.h +0 -95
  228. package/inc/util/system/event.h +0 -122
  229. package/inc/util/system/execpath.h +0 -17
  230. package/inc/util/system/fasttime.h +0 -6
  231. package/inc/util/system/fhandle.h +0 -27
  232. package/inc/util/system/file.h +0 -210
  233. package/inc/util/system/file_lock.h +0 -34
  234. package/inc/util/system/filemap.h +0 -383
  235. package/inc/util/system/flock.h +0 -35
  236. package/inc/util/system/fs.h +0 -156
  237. package/inc/util/system/fs_win.h +0 -29
  238. package/inc/util/system/fstat.h +0 -46
  239. package/inc/util/system/getpid.h +0 -12
  240. package/inc/util/system/guard.h +0 -179
  241. package/inc/util/system/hi_lo.h +0 -139
  242. package/inc/util/system/hostname.h +0 -10
  243. package/inc/util/system/hp_timer.h +0 -36
  244. package/inc/util/system/info.h +0 -12
  245. package/inc/util/system/interrupt_signals.h +0 -22
  246. package/inc/util/system/madvise.h +0 -30
  247. package/inc/util/system/maxlen.h +0 -32
  248. package/inc/util/system/mem_info.h +0 -18
  249. package/inc/util/system/mincore.h +0 -38
  250. package/inc/util/system/mktemp.h +0 -11
  251. package/inc/util/system/mlock.h +0 -43
  252. package/inc/util/system/mutex.h +0 -67
  253. package/inc/util/system/nice.h +0 -3
  254. package/inc/util/system/pipe.h +0 -90
  255. package/inc/util/system/platform.h +0 -246
  256. package/inc/util/system/progname.h +0 -13
  257. package/inc/util/system/protect.h +0 -25
  258. package/inc/util/system/rusage.h +0 -26
  259. package/inc/util/system/rwlock.h +0 -78
  260. package/inc/util/system/sanitizers.h +0 -122
  261. package/inc/util/system/sem.h +0 -41
  262. package/inc/util/system/shellcommand.h +0 -472
  263. package/inc/util/system/shmat.h +0 -32
  264. package/inc/util/system/sigset.h +0 -78
  265. package/inc/util/system/spin_wait.h +0 -10
  266. package/inc/util/system/spinlock.h +0 -121
  267. package/inc/util/system/src_location.h +0 -25
  268. package/inc/util/system/src_root.h +0 -68
  269. package/inc/util/system/sys_alloc.h +0 -43
  270. package/inc/util/system/sysstat.h +0 -52
  271. package/inc/util/system/tempfile.h +0 -34
  272. package/inc/util/system/thread.h +0 -167
  273. package/inc/util/system/tls.h +0 -307
  274. package/inc/util/system/types.h +0 -119
  275. package/inc/util/system/unaligned_mem.h +0 -67
  276. package/inc/util/system/user.h +0 -5
  277. package/inc/util/system/utime.h +0 -6
  278. package/inc/util/system/valgrind.h +0 -48
  279. package/inc/util/system/winint.h +0 -43
  280. package/inc/util/system/yassert.h +0 -121
  281. package/inc/util/system/yield.h +0 -4
  282. package/inc/util/thread/factory.h +0 -65
  283. package/inc/util/thread/fwd.h +0 -30
  284. package/inc/util/thread/lfqueue.h +0 -406
  285. package/inc/util/thread/lfstack.h +0 -188
  286. package/inc/util/thread/pool.h +0 -388
  287. package/inc/util/thread/singleton.h +0 -42
  288. package/inc/util/ysafeptr.h +0 -427
  289. package/inc/util/ysaveload.h +0 -700
@@ -1,384 +0,0 @@
1
- #pragma once
2
-
3
- #include "recode_result.h"
4
-
5
- #include <util/generic/strbuf.h>
6
- #include <util/generic/string.h>
7
- #include <util/generic/yexception.h>
8
- #include <util/system/defaults.h>
9
- #include <util/system/yassert.h>
10
-
11
- extern const wchar32 BROKEN_RUNE;
12
-
13
- inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
14
- // Y_ASSERT (utf8_rune_len <= 4);
15
- return "\0\0\037\017\007"[utf8_rune_len];
16
- }
17
-
18
- inline size_t UTF8RuneLen(const unsigned char lead_byte) {
19
- //b0XXXXXXX
20
- if ((lead_byte & 0x80) == 0x00) {
21
- return 1;
22
- }
23
- //b110XXXXX
24
- if ((lead_byte & 0xe0) == 0xc0) {
25
- return 2;
26
- }
27
- //b1110XXXX
28
- if ((lead_byte & 0xf0) == 0xe0) {
29
- return 3;
30
- }
31
- //b11110XXX
32
- if ((lead_byte & 0xf8) == 0xf0) {
33
- return 4;
34
- }
35
- //b10XXXXXX
36
- return 0;
37
- }
38
-
39
- inline size_t UTF8RuneLenByUCS(wchar32 rune) {
40
- if (rune < 0x80)
41
- return 1U;
42
- else if (rune < 0x800)
43
- return 2U;
44
- else if (rune < 0x10000)
45
- return 3U;
46
- else if (rune < 0x200000)
47
- return 4U;
48
- else if (rune < 0x4000000)
49
- return 5U;
50
- else
51
- return 6U;
52
- }
53
-
54
- inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
55
- rune = c;
56
- rune &= UTF8LeadByteMask(len);
57
- }
58
-
59
- inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
60
- rune <<= 6;
61
- rune |= c & 0x3F;
62
- }
63
-
64
- inline bool IsUTF8ContinuationByte(unsigned char c) {
65
- return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
66
- }
67
-
68
- //! returns length of the current UTF8 character
69
- //! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
70
- //! @param p pointer to the current character
71
- //! @param e end of the character sequence
72
- inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
73
- Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
74
- switch (UTF8RuneLen(*p)) {
75
- case 0:
76
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
77
-
78
- case 1:
79
- n = 1;
80
- return RECODE_OK;
81
-
82
- case 2:
83
- if (p + 2 > e) {
84
- return RECODE_EOINPUT;
85
- } else if (!IsUTF8ContinuationByte(p[1])) {
86
- return RECODE_BROKENSYMBOL;
87
- } else {
88
- n = 2;
89
- return RECODE_OK;
90
- }
91
- case 3:
92
- if (p + 3 > e) {
93
- return RECODE_EOINPUT;
94
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
95
- return RECODE_BROKENSYMBOL;
96
- } else {
97
- n = 3;
98
- return RECODE_OK;
99
- }
100
- default: // actually 4
101
- if (p + 4 > e) {
102
- return RECODE_EOINPUT;
103
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
104
- return RECODE_BROKENSYMBOL;
105
- } else {
106
- n = 4;
107
- return RECODE_OK;
108
- }
109
- }
110
- }
111
-
112
- //! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
113
- //! @param text UTF8 encoded text
114
- //! @param len the length of the text in bytes
115
- //! @param number number of encoded symbols in the text
116
- inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
117
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
118
- const unsigned char* const last = cur + len;
119
- number = 0;
120
- size_t runeLen;
121
- bool res = true;
122
- while (cur != last) {
123
- if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
124
- res = false;
125
- break;
126
- }
127
- cur += runeLen;
128
- Y_ASSERT(cur <= last);
129
- ++number;
130
- }
131
- return res;
132
- }
133
-
134
- inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
135
- size_t number;
136
- if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
137
- ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
138
- }
139
- return number;
140
- }
141
-
142
- //! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
143
- //! @param rune value of the current character
144
- //! @param rune_len length of the UTF8 bytes sequence that has been read
145
- //! @param s pointer to the current character
146
- //! @param end the end of the character sequence
147
- inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
148
- rune = BROKEN_RUNE;
149
- rune_len = 0;
150
- wchar32 _rune;
151
-
152
- size_t _len = UTF8RuneLen(*s);
153
- if (s + _len > end)
154
- return RECODE_EOINPUT; //[EOINPUT]
155
- if (_len == 0)
156
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
157
- _rune = *s++; //[00000000 0XXXXXXX]
158
-
159
- if (_len > 1) {
160
- _rune &= UTF8LeadByteMask(_len);
161
- unsigned char ch = *s++;
162
- if (!IsUTF8ContinuationByte(ch))
163
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
164
- PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY]
165
- if (_len > 2) {
166
- ch = *s++;
167
- if (!IsUTF8ContinuationByte(ch))
168
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
169
- PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ]
170
- if (_len > 3) {
171
- ch = *s;
172
- if (!IsUTF8ContinuationByte(ch))
173
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
174
- PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ]
175
- if (_rune > 0x10FFFF) // it is not a valid Unicode code point
176
- return RECODE_BROKENSYMBOL;
177
- if (_rune < 0x10000) // check for overlong encoding
178
- return RECODE_BROKENSYMBOL;
179
- } else {
180
- if (_rune < 0x800) // check for overlong encoding
181
- return RECODE_BROKENSYMBOL;
182
- }
183
- } else {
184
- if (_rune < 0x80) // check for overlong encoding
185
- return RECODE_BROKENSYMBOL;
186
- }
187
- }
188
- rune_len = _len;
189
- rune = _rune;
190
- return RECODE_OK;
191
- }
192
-
193
- //! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
194
- //! @param c value of the current character
195
- //! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
196
- //! @param e the end of the character sequence
197
- Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
198
- Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
199
- switch (UTF8RuneLen(*p)) {
200
- case 0:
201
- rune = BROKEN_RUNE;
202
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
203
-
204
- case 1:
205
- rune = *p; //[00000000 0XXXXXXX]
206
- ++p;
207
- return RECODE_OK;
208
-
209
- case 2:
210
- if (p + 2 > e) {
211
- return RECODE_EOINPUT;
212
- } else if (!IsUTF8ContinuationByte(p[1])) {
213
- rune = BROKEN_RUNE;
214
- return RECODE_BROKENSYMBOL;
215
- } else {
216
- PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
217
- PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
218
- if (Y_UNLIKELY(rune < 0x80)) { // overlong encoding
219
- p -= 2;
220
- rune = BROKEN_RUNE;
221
- return RECODE_BROKENSYMBOL;
222
- }
223
- return RECODE_OK;
224
- }
225
- case 3:
226
- if (p + 3 > e) {
227
- return RECODE_EOINPUT;
228
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
229
- rune = BROKEN_RUNE;
230
- return RECODE_BROKENSYMBOL;
231
- } else {
232
- PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
233
- PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
234
- PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
235
- if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
236
- p -= 3;
237
- rune = BROKEN_RUNE;
238
- return RECODE_BROKENSYMBOL;
239
- }
240
- return RECODE_OK;
241
- }
242
- case 4:
243
- if (p + 4 > e) {
244
- return RECODE_EOINPUT;
245
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
246
- rune = BROKEN_RUNE;
247
- return RECODE_BROKENSYMBOL;
248
- } else {
249
- PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
250
- PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY]
251
- PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ]
252
- PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ]
253
- if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
254
- p -= 4;
255
- rune = BROKEN_RUNE;
256
- return RECODE_BROKENSYMBOL;
257
- }
258
- return RECODE_OK;
259
- }
260
- default: // >4
261
- rune = BROKEN_RUNE;
262
- return RECODE_BROKENSYMBOL;
263
- }
264
- }
265
-
266
- //! writes one unicode symbol into a character sequence encoded UTF8
267
- //! checks for end of the buffer and returns the result of encoding
268
- //! @param rune value of the current character
269
- //! @param rune_len length of the UTF8 byte sequence that has been written
270
- //! @param s pointer to the output buffer
271
- //! @param tail available size of the buffer
272
- inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
273
- rune_len = 0;
274
- if (rune < 0x80) {
275
- if (tail <= 0)
276
- return RECODE_EOOUTPUT;
277
- *s = static_cast<unsigned char>(rune);
278
- rune_len = 1;
279
- return RECODE_OK;
280
- }
281
- if (rune < 0x800) {
282
- if (tail <= 1)
283
- return RECODE_EOOUTPUT;
284
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
285
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
286
- rune_len = 2;
287
- return RECODE_OK;
288
- }
289
- if (rune < 0x10000) {
290
- if (tail <= 2)
291
- return RECODE_EOOUTPUT;
292
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
293
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
294
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
295
- rune_len = 3;
296
- return RECODE_OK;
297
- }
298
- /*if (rune < 0x200000)*/ {
299
- if (tail <= 3)
300
- return RECODE_EOOUTPUT;
301
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
302
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
303
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
304
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
305
- rune_len = 4;
306
- return RECODE_OK;
307
- }
308
- }
309
-
310
- inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
311
- return SafeWriteUTF8Char(rune, rune_len, s, end - s);
312
- }
313
-
314
- //! writes one unicode symbol into a character sequence encoded UTF8
315
- //! @attention this function works as @c SafeWriteUTF8Char it does not check
316
- //! the size of the output buffer, it supposes that buffer is long enough
317
- //! @param rune value of the current character
318
- //! @param rune_len length of the UTF8 byte sequence that has been written
319
- //! @param s pointer to the output buffer
320
- inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
321
- if (rune < 0x80) {
322
- *s = static_cast<unsigned char>(rune);
323
- rune_len = 1;
324
- return;
325
- }
326
- if (rune < 0x800) {
327
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
328
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
329
- rune_len = 2;
330
- return;
331
- }
332
- if (rune < 0x10000) {
333
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
334
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
335
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
336
- rune_len = 3;
337
- return;
338
- }
339
- /*if (rune < 0x200000)*/ {
340
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
341
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
342
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
343
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
344
- rune_len = 4;
345
- }
346
- }
347
-
348
- TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len);
349
-
350
- enum EUTF8Detect {
351
- NotUTF8,
352
- UTF8,
353
- ASCII
354
- };
355
-
356
- EUTF8Detect UTF8Detect(const char* s, size_t len);
357
-
358
- inline EUTF8Detect UTF8Detect(const TStringBuf input) {
359
- return UTF8Detect(input.data(), input.size());
360
- }
361
-
362
- inline bool IsUtf(const char* input, size_t len) {
363
- return UTF8Detect(input, len) != NotUTF8;
364
- }
365
-
366
- inline bool IsUtf(const TStringBuf input) {
367
- return IsUtf(input.data(), input.size());
368
- }
369
-
370
- //! returns true, if result is not the same as input, and put it in newString
371
- //! returns false, if result is unmodified
372
- bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
373
-
374
- TString ToLowerUTF8(const TString& s);
375
- TString ToLowerUTF8(TStringBuf s);
376
- TString ToLowerUTF8(const char* s);
377
-
378
- //! returns true, if result is not the same as input, and put it in newString
379
- //! returns false, if result is unmodified
380
- bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
381
-
382
- TString ToUpperUTF8(const TString& s);
383
- TString ToUpperUTF8(TStringBuf s);
384
- TString ToUpperUTF8(const char* s);