duckdb 0.5.1-dev225.0 → 0.5.1-dev237.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +70 -35
- package/src/duckdb.hpp +2 -2
- package/src/parquet-amalgamation.cpp +37555 -37555
package/package.json
CHANGED
package/src/duckdb.cpp
CHANGED
|
@@ -39580,7 +39580,7 @@ public:
|
|
|
39580
39580
|
namespace duckdb {
|
|
39581
39581
|
|
|
39582
39582
|
enum class UnicodeType { INVALID, ASCII, UNICODE };
|
|
39583
|
-
enum class UnicodeInvalidReason { BYTE_MISMATCH, NULL_BYTE };
|
|
39583
|
+
enum class UnicodeInvalidReason { BYTE_MISMATCH, NULL_BYTE, INVALID_UNICODE };
|
|
39584
39584
|
|
|
39585
39585
|
class Utf8Proc {
|
|
39586
39586
|
public:
|
|
@@ -260490,49 +260490,84 @@ static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t
|
|
|
260490
260490
|
}
|
|
260491
260491
|
}
|
|
260492
260492
|
|
|
260493
|
-
|
|
260494
|
-
|
|
260495
|
-
|
|
260496
|
-
|
|
260497
|
-
|
|
260498
|
-
|
|
260499
|
-
|
|
260500
|
-
|
|
260501
|
-
|
|
260502
|
-
|
|
260503
|
-
|
|
260504
|
-
|
|
260505
|
-
|
|
260506
|
-
|
|
260507
|
-
if ((s[++i] & 0xC0) != 0x80) {
|
|
260508
|
-
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
|
|
260509
|
-
return UnicodeType::INVALID;
|
|
260510
|
-
}
|
|
260511
|
-
if ((c & 0xE0) == 0xC0) {
|
|
260512
|
-
continue;
|
|
260513
|
-
}
|
|
260514
|
-
if ((s[++i] & 0xC0) != 0x80) {
|
|
260515
|
-
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
|
|
260516
|
-
return UnicodeType::INVALID;
|
|
260517
|
-
}
|
|
260518
|
-
if ((c & 0xF0) == 0xE0) {
|
|
260519
|
-
continue;
|
|
260520
|
-
}
|
|
260521
|
-
if ((s[++i] & 0xC0) != 0x80) {
|
|
260493
|
+
template <const int nextra_bytes, const int mask>
|
|
260494
|
+
static inline UnicodeType
|
|
260495
|
+
UTF8ExtraByteLoop(const int first_pos_seq, int utf8char, size_t& i,
|
|
260496
|
+
const char *s, const size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
|
|
260497
|
+
if ((len - i) < (nextra_bytes + 1)) {
|
|
260498
|
+
/* incomplete byte sequence */
|
|
260499
|
+
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH);
|
|
260500
|
+
return UnicodeType::INVALID;
|
|
260501
|
+
}
|
|
260502
|
+
for (size_t j = 0 ; j < nextra_bytes; j++) {
|
|
260503
|
+
int c = (int) s[++i];
|
|
260504
|
+
/* now validate the extra bytes */
|
|
260505
|
+
if ((c & 0xC0) != 0x80) {
|
|
260506
|
+
/* extra byte is not in the format 10xxxxxx */
|
|
260522
260507
|
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
|
|
260523
260508
|
return UnicodeType::INVALID;
|
|
260524
260509
|
}
|
|
260525
|
-
|
|
260526
|
-
|
|
260527
|
-
|
|
260528
|
-
|
|
260510
|
+
utf8char = (utf8char << 6) | (c & 0x3F);
|
|
260511
|
+
}
|
|
260512
|
+
if ((utf8char & mask) == 0) {
|
|
260513
|
+
/* invalid UTF-8 codepoint, not shortest possible */
|
|
260514
|
+
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
|
|
260515
|
+
return UnicodeType::INVALID;
|
|
260516
|
+
}
|
|
260517
|
+
if (utf8char > 0x10FFFF) {
|
|
260518
|
+
/* value not representable by Unicode */
|
|
260519
|
+
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
|
|
260520
|
+
return UnicodeType::INVALID;
|
|
260521
|
+
}
|
|
260522
|
+
if ((utf8char & 0x1FFF800) == 0xD800) {
|
|
260523
|
+
/* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 */
|
|
260524
|
+
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
|
|
260529
260525
|
return UnicodeType::INVALID;
|
|
260530
260526
|
}
|
|
260527
|
+
return UnicodeType::UNICODE;
|
|
260528
|
+
}
|
|
260531
260529
|
|
|
260530
|
+
UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
|
|
260531
|
+
UnicodeType type = UnicodeType::ASCII;
|
|
260532
|
+
|
|
260533
|
+
for (size_t i = 0; i < len; i++) {
|
|
260534
|
+
int c = (int) s[i];
|
|
260535
|
+
|
|
260536
|
+
if ((c & 0x80) == 0) {
|
|
260537
|
+
/* 1 byte sequence */
|
|
260538
|
+
if (c == '\0') {
|
|
260539
|
+
/* NULL byte not allowed */
|
|
260540
|
+
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
|
|
260541
|
+
return UnicodeType::INVALID;
|
|
260542
|
+
}
|
|
260543
|
+
} else {
|
|
260544
|
+
int first_pos_seq = i;
|
|
260545
|
+
|
|
260546
|
+
if ((c & 0xE0) == 0xC0) {
|
|
260547
|
+
/* 2 byte sequence */
|
|
260548
|
+
int utf8char = c & 0x1F;
|
|
260549
|
+
type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
|
|
260550
|
+
} else if ((c & 0xF0) == 0xE0) {
|
|
260551
|
+
/* 3 byte sequence */
|
|
260552
|
+
int utf8char = c & 0x0F;
|
|
260553
|
+
type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
|
|
260554
|
+
} else if ((c & 0xF8) == 0xF0) {
|
|
260555
|
+
/* 4 byte sequence */
|
|
260556
|
+
int utf8char = c & 0x07;
|
|
260557
|
+
type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
|
|
260558
|
+
} else {
|
|
260559
|
+
/* invalid UTF-8 start byte */
|
|
260560
|
+
AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
|
|
260561
|
+
return UnicodeType::INVALID;
|
|
260562
|
+
}
|
|
260563
|
+
if (type == UnicodeType::INVALID) {
|
|
260564
|
+
return type;
|
|
260565
|
+
}
|
|
260566
|
+
}
|
|
260567
|
+
}
|
|
260532
260568
|
return type;
|
|
260533
260569
|
}
|
|
260534
260570
|
|
|
260535
|
-
|
|
260536
260571
|
char* Utf8Proc::Normalize(const char *s, size_t len) {
|
|
260537
260572
|
assert(s);
|
|
260538
260573
|
assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
|
package/src/duckdb.hpp
CHANGED
|
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
|
|
|
11
11
|
#pragma once
|
|
12
12
|
#define DUCKDB_AMALGAMATION 1
|
|
13
13
|
#define DUCKDB_AMALGAMATION_EXTENDED 1
|
|
14
|
-
#define DUCKDB_SOURCE_ID "
|
|
15
|
-
#define DUCKDB_VERSION "v0.5.1-
|
|
14
|
+
#define DUCKDB_SOURCE_ID "a991beaf1"
|
|
15
|
+
#define DUCKDB_VERSION "v0.5.1-dev237"
|
|
16
16
|
//===----------------------------------------------------------------------===//
|
|
17
17
|
// DuckDB
|
|
18
18
|
//
|