catboost 1.25.1 → 1.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. package/DEPLOYMENT.md +22 -15
  2. package/README.md +43 -27
  3. package/binding.gyp +5 -7
  4. package/build_scripts/bootstrap.js +2 -1
  5. package/build_scripts/out/build.js +46 -68
  6. package/build_scripts/out/build_model.js +1 -1
  7. package/build_scripts/out/{build_ya.js → build_native.js} +1 -1
  8. package/build_scripts/out/ci.js +5 -5
  9. package/build_scripts/out/common.js +1 -1
  10. package/build_scripts/out/config.js +32 -18
  11. package/build_scripts/out/install.js +5 -3
  12. package/build_scripts/out/package_prepublish.js +1 -1
  13. package/build_scripts/out/packaging.js +1 -19
  14. package/build_scripts/out/run_tests.js +1 -1
  15. package/build_scripts/out/test.js +8 -3
  16. package/config.json +18 -11
  17. package/inc/catboost/libs/model_interface/c_api.h +367 -5
  18. package/lib/catboost.d.ts +65 -21
  19. package/package.json +4 -4
  20. package/src/api_helpers.cpp +100 -24
  21. package/src/api_helpers.h +8 -7
  22. package/src/api_module.cpp +1 -2
  23. package/src/model.cpp +483 -83
  24. package/src/model.h +24 -9
  25. package/inc/contrib/libs/cxxsupp/system_stl/include/stlfwd +0 -14
  26. package/inc/util/charset/recode_result.h +0 -9
  27. package/inc/util/charset/unicode_table.h +0 -123
  28. package/inc/util/charset/unidata.h +0 -421
  29. package/inc/util/charset/utf8.h +0 -384
  30. package/inc/util/charset/wide.h +0 -843
  31. package/inc/util/charset/wide_specific.h +0 -22
  32. package/inc/util/datetime/base.h +0 -669
  33. package/inc/util/datetime/constants.h +0 -7
  34. package/inc/util/datetime/cputimer.h +0 -124
  35. package/inc/util/datetime/parser.h +0 -292
  36. package/inc/util/datetime/systime.h +0 -47
  37. package/inc/util/datetime/uptime.h +0 -8
  38. package/inc/util/digest/city.h +0 -88
  39. package/inc/util/digest/fnv.h +0 -73
  40. package/inc/util/digest/multi.h +0 -14
  41. package/inc/util/digest/murmur.h +0 -57
  42. package/inc/util/digest/numeric.h +0 -86
  43. package/inc/util/digest/sequence.h +0 -48
  44. package/inc/util/draft/date.h +0 -129
  45. package/inc/util/draft/datetime.h +0 -184
  46. package/inc/util/draft/enum.h +0 -136
  47. package/inc/util/draft/holder_vector.h +0 -102
  48. package/inc/util/draft/ip.h +0 -131
  49. package/inc/util/draft/matrix.h +0 -108
  50. package/inc/util/draft/memory.h +0 -40
  51. package/inc/util/folder/dirent_win.h +0 -46
  52. package/inc/util/folder/dirut.h +0 -121
  53. package/inc/util/folder/filelist.h +0 -81
  54. package/inc/util/folder/fts.h +0 -108
  55. package/inc/util/folder/iterator.h +0 -109
  56. package/inc/util/folder/lstat_win.h +0 -20
  57. package/inc/util/folder/path.h +0 -225
  58. package/inc/util/folder/pathsplit.h +0 -113
  59. package/inc/util/folder/tempdir.h +0 -42
  60. package/inc/util/generic/adaptor.h +0 -134
  61. package/inc/util/generic/algorithm.h +0 -765
  62. package/inc/util/generic/array_ref.h +0 -282
  63. package/inc/util/generic/array_size.h +0 -24
  64. package/inc/util/generic/benchmark/vector_count_ctor/f.h +0 -9
  65. package/inc/util/generic/bitmap.h +0 -1115
  66. package/inc/util/generic/bitops.h +0 -459
  67. package/inc/util/generic/bt_exception.h +0 -24
  68. package/inc/util/generic/buffer.h +0 -232
  69. package/inc/util/generic/cast.h +0 -176
  70. package/inc/util/generic/deque.h +0 -24
  71. package/inc/util/generic/explicit_type.h +0 -42
  72. package/inc/util/generic/fastqueue.h +0 -55
  73. package/inc/util/generic/flags.h +0 -244
  74. package/inc/util/generic/function.h +0 -103
  75. package/inc/util/generic/fwd.h +0 -171
  76. package/inc/util/generic/guid.h +0 -61
  77. package/inc/util/generic/hash.h +0 -2032
  78. package/inc/util/generic/hash_primes.h +0 -140
  79. package/inc/util/generic/hash_set.h +0 -490
  80. package/inc/util/generic/hide_ptr.h +0 -3
  81. package/inc/util/generic/intrlist.h +0 -876
  82. package/inc/util/generic/is_in.h +0 -53
  83. package/inc/util/generic/iterator.h +0 -137
  84. package/inc/util/generic/iterator_range.h +0 -105
  85. package/inc/util/generic/lazy_value.h +0 -66
  86. package/inc/util/generic/list.h +0 -22
  87. package/inc/util/generic/map.h +0 -44
  88. package/inc/util/generic/mapfindptr.h +0 -60
  89. package/inc/util/generic/maybe.h +0 -713
  90. package/inc/util/generic/maybe_traits.h +0 -164
  91. package/inc/util/generic/mem_copy.h +0 -55
  92. package/inc/util/generic/noncopyable.h +0 -38
  93. package/inc/util/generic/object_counter.h +0 -53
  94. package/inc/util/generic/ptr.h +0 -1113
  95. package/inc/util/generic/queue.h +0 -57
  96. package/inc/util/generic/refcount.h +0 -162
  97. package/inc/util/generic/reserve.h +0 -11
  98. package/inc/util/generic/scope.h +0 -65
  99. package/inc/util/generic/serialized_enum.h +0 -406
  100. package/inc/util/generic/set.h +0 -42
  101. package/inc/util/generic/singleton.h +0 -136
  102. package/inc/util/generic/size_literals.h +0 -65
  103. package/inc/util/generic/stack.h +0 -18
  104. package/inc/util/generic/store_policy.h +0 -120
  105. package/inc/util/generic/strbase.h +0 -612
  106. package/inc/util/generic/strbuf.h +0 -552
  107. package/inc/util/generic/strfcpy.h +0 -17
  108. package/inc/util/generic/string.h +0 -1572
  109. package/inc/util/generic/string_hash.h +0 -21
  110. package/inc/util/generic/string_ut.h +0 -1175
  111. package/inc/util/generic/type_name.h +0 -34
  112. package/inc/util/generic/typelist.h +0 -114
  113. package/inc/util/generic/typetraits.h +0 -325
  114. package/inc/util/generic/utility.h +0 -132
  115. package/inc/util/generic/va_args.h +0 -400
  116. package/inc/util/generic/variant.h +0 -631
  117. package/inc/util/generic/variant_traits.h +0 -171
  118. package/inc/util/generic/vector.h +0 -119
  119. package/inc/util/generic/xrange.h +0 -258
  120. package/inc/util/generic/yexception.h +0 -212
  121. package/inc/util/generic/yexception_ut.h +0 -14
  122. package/inc/util/generic/ylimits.h +0 -92
  123. package/inc/util/generic/ymath.h +0 -206
  124. package/inc/util/memory/addstorage.h +0 -93
  125. package/inc/util/memory/alloc.h +0 -27
  126. package/inc/util/memory/blob.h +0 -296
  127. package/inc/util/memory/mmapalloc.h +0 -8
  128. package/inc/util/memory/pool.h +0 -432
  129. package/inc/util/memory/segmented_string_pool.h +0 -194
  130. package/inc/util/memory/segpool_alloc.h +0 -118
  131. package/inc/util/memory/smallobj.h +0 -141
  132. package/inc/util/memory/tempbuf.h +0 -111
  133. package/inc/util/network/address.h +0 -136
  134. package/inc/util/network/endpoint.h +0 -61
  135. package/inc/util/network/hostip.h +0 -16
  136. package/inc/util/network/init.h +0 -60
  137. package/inc/util/network/interface.h +0 -17
  138. package/inc/util/network/iovec.h +0 -65
  139. package/inc/util/network/ip.h +0 -116
  140. package/inc/util/network/nonblock.h +0 -8
  141. package/inc/util/network/pair.h +0 -9
  142. package/inc/util/network/poller.h +0 -58
  143. package/inc/util/network/pollerimpl.h +0 -707
  144. package/inc/util/network/sock.h +0 -608
  145. package/inc/util/network/socket.h +0 -421
  146. package/inc/util/random/common_ops.h +0 -130
  147. package/inc/util/random/easy.h +0 -47
  148. package/inc/util/random/entropy.h +0 -21
  149. package/inc/util/random/fast.h +0 -101
  150. package/inc/util/random/init_atfork.h +0 -3
  151. package/inc/util/random/lcg_engine.h +0 -66
  152. package/inc/util/random/mersenne.h +0 -46
  153. package/inc/util/random/mersenne32.h +0 -50
  154. package/inc/util/random/mersenne64.h +0 -50
  155. package/inc/util/random/normal.h +0 -38
  156. package/inc/util/random/random.h +0 -30
  157. package/inc/util/random/shuffle.h +0 -39
  158. package/inc/util/str_stl.h +0 -266
  159. package/inc/util/stream/aligned.h +0 -99
  160. package/inc/util/stream/buffer.h +0 -119
  161. package/inc/util/stream/buffered.h +0 -225
  162. package/inc/util/stream/debug.h +0 -53
  163. package/inc/util/stream/direct_io.h +0 -43
  164. package/inc/util/stream/file.h +0 -108
  165. package/inc/util/stream/format.h +0 -444
  166. package/inc/util/stream/fwd.h +0 -100
  167. package/inc/util/stream/hex.h +0 -8
  168. package/inc/util/stream/holder.h +0 -44
  169. package/inc/util/stream/input.h +0 -273
  170. package/inc/util/stream/labeled.h +0 -19
  171. package/inc/util/stream/length.h +0 -100
  172. package/inc/util/stream/mem.h +0 -255
  173. package/inc/util/stream/multi.h +0 -32
  174. package/inc/util/stream/null.h +0 -61
  175. package/inc/util/stream/output.h +0 -304
  176. package/inc/util/stream/pipe.h +0 -112
  177. package/inc/util/stream/printf.h +0 -25
  178. package/inc/util/stream/str.h +0 -207
  179. package/inc/util/stream/tee.h +0 -28
  180. package/inc/util/stream/tempbuf.h +0 -21
  181. package/inc/util/stream/tokenizer.h +0 -214
  182. package/inc/util/stream/trace.h +0 -60
  183. package/inc/util/stream/walk.h +0 -35
  184. package/inc/util/stream/zerocopy.h +0 -91
  185. package/inc/util/stream/zerocopy_output.h +0 -57
  186. package/inc/util/stream/zlib.h +0 -173
  187. package/inc/util/string/ascii.h +0 -236
  188. package/inc/util/string/builder.h +0 -39
  189. package/inc/util/string/cast.h +0 -347
  190. package/inc/util/string/cstriter.h +0 -14
  191. package/inc/util/string/escape.h +0 -70
  192. package/inc/util/string/hex.h +0 -59
  193. package/inc/util/string/join.h +0 -194
  194. package/inc/util/string/printf.h +0 -13
  195. package/inc/util/string/reverse.h +0 -16
  196. package/inc/util/string/split.h +0 -1080
  197. package/inc/util/string/strip.h +0 -257
  198. package/inc/util/string/strspn.h +0 -65
  199. package/inc/util/string/subst.h +0 -56
  200. package/inc/util/string/type.h +0 -50
  201. package/inc/util/string/util.h +0 -195
  202. package/inc/util/string/vector.h +0 -132
  203. package/inc/util/system/align.h +0 -50
  204. package/inc/util/system/atexit.h +0 -22
  205. package/inc/util/system/atomic.h +0 -51
  206. package/inc/util/system/atomic_gcc.h +0 -90
  207. package/inc/util/system/atomic_ops.h +0 -189
  208. package/inc/util/system/atomic_win.h +0 -114
  209. package/inc/util/system/backtrace.h +0 -39
  210. package/inc/util/system/byteorder.h +0 -186
  211. package/inc/util/system/compat.h +0 -84
  212. package/inc/util/system/compiler.h +0 -620
  213. package/inc/util/system/condvar.h +0 -71
  214. package/inc/util/system/context.h +0 -181
  215. package/inc/util/system/context_aarch64.h +0 -8
  216. package/inc/util/system/context_i686.h +0 -9
  217. package/inc/util/system/context_x86.h +0 -12
  218. package/inc/util/system/context_x86_64.h +0 -7
  219. package/inc/util/system/cpu_id.h +0 -159
  220. package/inc/util/system/daemon.h +0 -28
  221. package/inc/util/system/datetime.h +0 -98
  222. package/inc/util/system/defaults.h +0 -149
  223. package/inc/util/system/demangle.h +0 -5
  224. package/inc/util/system/demangle_impl.h +0 -23
  225. package/inc/util/system/direct_io.h +0 -71
  226. package/inc/util/system/dynlib.h +0 -119
  227. package/inc/util/system/env.h +0 -32
  228. package/inc/util/system/error.h +0 -95
  229. package/inc/util/system/event.h +0 -122
  230. package/inc/util/system/execpath.h +0 -17
  231. package/inc/util/system/fasttime.h +0 -6
  232. package/inc/util/system/fhandle.h +0 -27
  233. package/inc/util/system/file.h +0 -210
  234. package/inc/util/system/file_lock.h +0 -34
  235. package/inc/util/system/filemap.h +0 -383
  236. package/inc/util/system/flock.h +0 -35
  237. package/inc/util/system/fs.h +0 -156
  238. package/inc/util/system/fs_win.h +0 -29
  239. package/inc/util/system/fstat.h +0 -46
  240. package/inc/util/system/getpid.h +0 -12
  241. package/inc/util/system/guard.h +0 -179
  242. package/inc/util/system/hi_lo.h +0 -139
  243. package/inc/util/system/hostname.h +0 -10
  244. package/inc/util/system/hp_timer.h +0 -36
  245. package/inc/util/system/info.h +0 -12
  246. package/inc/util/system/interrupt_signals.h +0 -22
  247. package/inc/util/system/madvise.h +0 -30
  248. package/inc/util/system/maxlen.h +0 -32
  249. package/inc/util/system/mem_info.h +0 -18
  250. package/inc/util/system/mincore.h +0 -38
  251. package/inc/util/system/mktemp.h +0 -11
  252. package/inc/util/system/mlock.h +0 -43
  253. package/inc/util/system/mutex.h +0 -67
  254. package/inc/util/system/nice.h +0 -3
  255. package/inc/util/system/pipe.h +0 -90
  256. package/inc/util/system/platform.h +0 -246
  257. package/inc/util/system/progname.h +0 -13
  258. package/inc/util/system/protect.h +0 -25
  259. package/inc/util/system/rusage.h +0 -26
  260. package/inc/util/system/rwlock.h +0 -78
  261. package/inc/util/system/sanitizers.h +0 -122
  262. package/inc/util/system/sem.h +0 -41
  263. package/inc/util/system/shellcommand.h +0 -472
  264. package/inc/util/system/shmat.h +0 -32
  265. package/inc/util/system/sigset.h +0 -78
  266. package/inc/util/system/spin_wait.h +0 -10
  267. package/inc/util/system/spinlock.h +0 -121
  268. package/inc/util/system/src_location.h +0 -25
  269. package/inc/util/system/src_root.h +0 -68
  270. package/inc/util/system/sys_alloc.h +0 -43
  271. package/inc/util/system/sysstat.h +0 -52
  272. package/inc/util/system/tempfile.h +0 -34
  273. package/inc/util/system/thread.h +0 -167
  274. package/inc/util/system/tls.h +0 -307
  275. package/inc/util/system/types.h +0 -119
  276. package/inc/util/system/unaligned_mem.h +0 -67
  277. package/inc/util/system/user.h +0 -5
  278. package/inc/util/system/utime.h +0 -6
  279. package/inc/util/system/valgrind.h +0 -48
  280. package/inc/util/system/winint.h +0 -43
  281. package/inc/util/system/yassert.h +0 -121
  282. package/inc/util/system/yield.h +0 -4
  283. package/inc/util/thread/factory.h +0 -65
  284. package/inc/util/thread/fwd.h +0 -30
  285. package/inc/util/thread/lfqueue.h +0 -406
  286. package/inc/util/thread/lfstack.h +0 -188
  287. package/inc/util/thread/pool.h +0 -388
  288. package/inc/util/thread/singleton.h +0 -42
  289. package/inc/util/ysafeptr.h +0 -427
  290. package/inc/util/ysaveload.h +0 -700
@@ -1,384 +0,0 @@
1
- #pragma once
2
-
3
- #include "recode_result.h"
4
-
5
- #include <util/generic/strbuf.h>
6
- #include <util/generic/string.h>
7
- #include <util/generic/yexception.h>
8
- #include <util/system/defaults.h>
9
- #include <util/system/yassert.h>
10
-
11
- extern const wchar32 BROKEN_RUNE;
12
-
13
- inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
14
- // Y_ASSERT (utf8_rune_len <= 4);
15
- return "\0\0\037\017\007"[utf8_rune_len];
16
- }
17
-
18
- inline size_t UTF8RuneLen(const unsigned char lead_byte) {
19
- //b0XXXXXXX
20
- if ((lead_byte & 0x80) == 0x00) {
21
- return 1;
22
- }
23
- //b110XXXXX
24
- if ((lead_byte & 0xe0) == 0xc0) {
25
- return 2;
26
- }
27
- //b1110XXXX
28
- if ((lead_byte & 0xf0) == 0xe0) {
29
- return 3;
30
- }
31
- //b11110XXX
32
- if ((lead_byte & 0xf8) == 0xf0) {
33
- return 4;
34
- }
35
- //b10XXXXXX
36
- return 0;
37
- }
38
-
39
- inline size_t UTF8RuneLenByUCS(wchar32 rune) {
40
- if (rune < 0x80)
41
- return 1U;
42
- else if (rune < 0x800)
43
- return 2U;
44
- else if (rune < 0x10000)
45
- return 3U;
46
- else if (rune < 0x200000)
47
- return 4U;
48
- else if (rune < 0x4000000)
49
- return 5U;
50
- else
51
- return 6U;
52
- }
53
-
54
- inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
55
- rune = c;
56
- rune &= UTF8LeadByteMask(len);
57
- }
58
-
59
- inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
60
- rune <<= 6;
61
- rune |= c & 0x3F;
62
- }
63
-
64
- inline bool IsUTF8ContinuationByte(unsigned char c) {
65
- return (c & static_cast<unsigned char>(0xC0)) == static_cast<unsigned char>(0x80);
66
- }
67
-
68
- //! returns length of the current UTF8 character
69
- //! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
70
- //! @param p pointer to the current character
71
- //! @param e end of the character sequence
72
- inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
73
- Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
74
- switch (UTF8RuneLen(*p)) {
75
- case 0:
76
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
77
-
78
- case 1:
79
- n = 1;
80
- return RECODE_OK;
81
-
82
- case 2:
83
- if (p + 2 > e) {
84
- return RECODE_EOINPUT;
85
- } else if (!IsUTF8ContinuationByte(p[1])) {
86
- return RECODE_BROKENSYMBOL;
87
- } else {
88
- n = 2;
89
- return RECODE_OK;
90
- }
91
- case 3:
92
- if (p + 3 > e) {
93
- return RECODE_EOINPUT;
94
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
95
- return RECODE_BROKENSYMBOL;
96
- } else {
97
- n = 3;
98
- return RECODE_OK;
99
- }
100
- default: // actually 4
101
- if (p + 4 > e) {
102
- return RECODE_EOINPUT;
103
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
104
- return RECODE_BROKENSYMBOL;
105
- } else {
106
- n = 4;
107
- return RECODE_OK;
108
- }
109
- }
110
- }
111
-
112
- //! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
113
- //! @param text UTF8 encoded text
114
- //! @param len the length of the text in bytes
115
- //! @param number number of encoded symbols in the text
116
- inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
117
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
118
- const unsigned char* const last = cur + len;
119
- number = 0;
120
- size_t runeLen;
121
- bool res = true;
122
- while (cur != last) {
123
- if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
124
- res = false;
125
- break;
126
- }
127
- cur += runeLen;
128
- Y_ASSERT(cur <= last);
129
- ++number;
130
- }
131
- return res;
132
- }
133
-
134
- inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
135
- size_t number;
136
- if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
137
- ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
138
- }
139
- return number;
140
- }
141
-
142
- //! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
143
- //! @param rune value of the current character
144
- //! @param rune_len length of the UTF8 bytes sequence that has been read
145
- //! @param s pointer to the current character
146
- //! @param end the end of the character sequence
147
- inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
148
- rune = BROKEN_RUNE;
149
- rune_len = 0;
150
- wchar32 _rune;
151
-
152
- size_t _len = UTF8RuneLen(*s);
153
- if (s + _len > end)
154
- return RECODE_EOINPUT; //[EOINPUT]
155
- if (_len == 0)
156
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
157
- _rune = *s++; //[00000000 0XXXXXXX]
158
-
159
- if (_len > 1) {
160
- _rune &= UTF8LeadByteMask(_len);
161
- unsigned char ch = *s++;
162
- if (!IsUTF8ContinuationByte(ch))
163
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
164
- PutUTF8SixBits(_rune, ch); //[00000XXX XXYYYYYY]
165
- if (_len > 2) {
166
- ch = *s++;
167
- if (!IsUTF8ContinuationByte(ch))
168
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
169
- PutUTF8SixBits(_rune, ch); //[XXXXYYYY YYZZZZZZ]
170
- if (_len > 3) {
171
- ch = *s;
172
- if (!IsUTF8ContinuationByte(ch))
173
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
174
- PutUTF8SixBits(_rune, ch); //[XXXYY YYYYZZZZ ZZQQQQQQ]
175
- if (_rune > 0x10FFFF) // it is not a valid Unicode code point
176
- return RECODE_BROKENSYMBOL;
177
- if (_rune < 0x10000) // check for overlong encoding
178
- return RECODE_BROKENSYMBOL;
179
- } else {
180
- if (_rune < 0x800) // check for overlong encoding
181
- return RECODE_BROKENSYMBOL;
182
- }
183
- } else {
184
- if (_rune < 0x80) // check for overlong encoding
185
- return RECODE_BROKENSYMBOL;
186
- }
187
- }
188
- rune_len = _len;
189
- rune = _rune;
190
- return RECODE_OK;
191
- }
192
-
193
- //! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
194
- //! @param c value of the current character
195
- //! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
196
- //! @param e the end of the character sequence
197
- Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
198
- Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
199
- switch (UTF8RuneLen(*p)) {
200
- case 0:
201
- rune = BROKEN_RUNE;
202
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
203
-
204
- case 1:
205
- rune = *p; //[00000000 0XXXXXXX]
206
- ++p;
207
- return RECODE_OK;
208
-
209
- case 2:
210
- if (p + 2 > e) {
211
- return RECODE_EOINPUT;
212
- } else if (!IsUTF8ContinuationByte(p[1])) {
213
- rune = BROKEN_RUNE;
214
- return RECODE_BROKENSYMBOL;
215
- } else {
216
- PutUTF8LeadBits(rune, *p++, 2); //[00000000 000XXXXX]
217
- PutUTF8SixBits(rune, *p++); //[00000XXX XXYYYYYY]
218
- if (Y_UNLIKELY(rune < 0x80)) { // overlong encoding
219
- p -= 2;
220
- rune = BROKEN_RUNE;
221
- return RECODE_BROKENSYMBOL;
222
- }
223
- return RECODE_OK;
224
- }
225
- case 3:
226
- if (p + 3 > e) {
227
- return RECODE_EOINPUT;
228
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
229
- rune = BROKEN_RUNE;
230
- return RECODE_BROKENSYMBOL;
231
- } else {
232
- PutUTF8LeadBits(rune, *p++, 3); //[00000000 0000XXXX]
233
- PutUTF8SixBits(rune, *p++); //[000000XX XXYYYYYY]
234
- PutUTF8SixBits(rune, *p++); //[XXXXYYYY YYZZZZZZ]
235
- if (Y_UNLIKELY(rune < 0x800)) { // overlong encoding
236
- p -= 3;
237
- rune = BROKEN_RUNE;
238
- return RECODE_BROKENSYMBOL;
239
- }
240
- return RECODE_OK;
241
- }
242
- case 4:
243
- if (p + 4 > e) {
244
- return RECODE_EOINPUT;
245
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
246
- rune = BROKEN_RUNE;
247
- return RECODE_BROKENSYMBOL;
248
- } else {
249
- PutUTF8LeadBits(rune, *p++, 4); //[00000000 00000000 00000XXX]
250
- PutUTF8SixBits(rune, *p++); //[00000000 0000000X XXYYYYYY]
251
- PutUTF8SixBits(rune, *p++); //[00000000 0XXXYYYY YYZZZZZZ]
252
- PutUTF8SixBits(rune, *p++); //[000XXXYY YYYYZZZZ ZZQQQQQQ]
253
- if (Y_UNLIKELY(rune < 0x10000 || rune > 0x10FFFF)) { // overlong encoding or non-valid code point
254
- p -= 4;
255
- rune = BROKEN_RUNE;
256
- return RECODE_BROKENSYMBOL;
257
- }
258
- return RECODE_OK;
259
- }
260
- default: // >4
261
- rune = BROKEN_RUNE;
262
- return RECODE_BROKENSYMBOL;
263
- }
264
- }
265
-
266
- //! writes one unicode symbol into a character sequence encoded UTF8
267
- //! checks for end of the buffer and returns the result of encoding
268
- //! @param rune value of the current character
269
- //! @param rune_len length of the UTF8 byte sequence that has been written
270
- //! @param s pointer to the output buffer
271
- //! @param tail available size of the buffer
272
- inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
273
- rune_len = 0;
274
- if (rune < 0x80) {
275
- if (tail <= 0)
276
- return RECODE_EOOUTPUT;
277
- *s = static_cast<unsigned char>(rune);
278
- rune_len = 1;
279
- return RECODE_OK;
280
- }
281
- if (rune < 0x800) {
282
- if (tail <= 1)
283
- return RECODE_EOOUTPUT;
284
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
285
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
286
- rune_len = 2;
287
- return RECODE_OK;
288
- }
289
- if (rune < 0x10000) {
290
- if (tail <= 2)
291
- return RECODE_EOOUTPUT;
292
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
293
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
294
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
295
- rune_len = 3;
296
- return RECODE_OK;
297
- }
298
- /*if (rune < 0x200000)*/ {
299
- if (tail <= 3)
300
- return RECODE_EOOUTPUT;
301
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
302
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
303
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
304
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
305
- rune_len = 4;
306
- return RECODE_OK;
307
- }
308
- }
309
-
310
- inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
311
- return SafeWriteUTF8Char(rune, rune_len, s, end - s);
312
- }
313
-
314
- //! writes one unicode symbol into a character sequence encoded UTF8
315
- //! @attention this function works as @c SafeWriteUTF8Char it does not check
316
- //! the size of the output buffer, it supposes that buffer is long enough
317
- //! @param rune value of the current character
318
- //! @param rune_len length of the UTF8 byte sequence that has been written
319
- //! @param s pointer to the output buffer
320
- inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
321
- if (rune < 0x80) {
322
- *s = static_cast<unsigned char>(rune);
323
- rune_len = 1;
324
- return;
325
- }
326
- if (rune < 0x800) {
327
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
328
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
329
- rune_len = 2;
330
- return;
331
- }
332
- if (rune < 0x10000) {
333
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
334
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
335
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
336
- rune_len = 3;
337
- return;
338
- }
339
- /*if (rune < 0x200000)*/ {
340
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
341
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
342
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
343
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
344
- rune_len = 4;
345
- }
346
- }
347
-
348
- TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len);
349
-
350
- enum EUTF8Detect {
351
- NotUTF8,
352
- UTF8,
353
- ASCII
354
- };
355
-
356
- EUTF8Detect UTF8Detect(const char* s, size_t len);
357
-
358
- inline EUTF8Detect UTF8Detect(const TStringBuf input) {
359
- return UTF8Detect(input.data(), input.size());
360
- }
361
-
362
- inline bool IsUtf(const char* input, size_t len) {
363
- return UTF8Detect(input, len) != NotUTF8;
364
- }
365
-
366
- inline bool IsUtf(const TStringBuf input) {
367
- return IsUtf(input.data(), input.size());
368
- }
369
-
370
- //! returns true, if result is not the same as input, and put it in newString
371
- //! returns false, if result is unmodified
372
- bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString);
373
-
374
- TString ToLowerUTF8(const TString& s);
375
- TString ToLowerUTF8(TStringBuf s);
376
- TString ToLowerUTF8(const char* s);
377
-
378
- //! returns true, if result is not the same as input, and put it in newString
379
- //! returns false, if result is unmodified
380
- bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString);
381
-
382
- TString ToUpperUTF8(const TString& s);
383
- TString ToUpperUTF8(TStringBuf s);
384
- TString ToUpperUTF8(const char* s);