duckdb 0.5.1-dev225.0 → 0.5.1-dev237.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
- "version": "0.5.1-dev225.0",
4
+ "version": "0.5.1-dev237.0",
5
5
  "description": "DuckDB node.js API",
6
6
  "gypfile": true,
7
7
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -39580,7 +39580,7 @@ public:
39580
39580
  namespace duckdb {
39581
39581
 
39582
39582
  enum class UnicodeType { INVALID, ASCII, UNICODE };
39583
- enum class UnicodeInvalidReason { BYTE_MISMATCH, NULL_BYTE };
39583
+ enum class UnicodeInvalidReason { BYTE_MISMATCH, NULL_BYTE, INVALID_UNICODE };
39584
39584
 
39585
39585
  class Utf8Proc {
39586
39586
  public:
@@ -260490,49 +260490,84 @@ static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t
260490
260490
  }
260491
260491
  }
260492
260492
 
260493
- UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
260494
- UnicodeType type = UnicodeType::ASCII;
260495
- char c;
260496
- for (size_t i = 0; i < len; i++) {
260497
- c = s[i];
260498
- if (c == '\0') {
260499
- AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
260500
- return UnicodeType::INVALID;
260501
- }
260502
- // 1 Byte / ASCII
260503
- if ((c & 0x80) == 0) {
260504
- continue;
260505
- }
260506
- type = UnicodeType::UNICODE;
260507
- if ((s[++i] & 0xC0) != 0x80) {
260508
- AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
260509
- return UnicodeType::INVALID;
260510
- }
260511
- if ((c & 0xE0) == 0xC0) {
260512
- continue;
260513
- }
260514
- if ((s[++i] & 0xC0) != 0x80) {
260515
- AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
260516
- return UnicodeType::INVALID;
260517
- }
260518
- if ((c & 0xF0) == 0xE0) {
260519
- continue;
260520
- }
260521
- if ((s[++i] & 0xC0) != 0x80) {
260493
+ template <const int nextra_bytes, const int mask>
260494
+ static inline UnicodeType
260495
+ UTF8ExtraByteLoop(const int first_pos_seq, int utf8char, size_t& i,
260496
+ const char *s, const size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
260497
+ if ((len - i) < (nextra_bytes + 1)) {
260498
+ /* incomplete byte sequence */
260499
+ AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH);
260500
+ return UnicodeType::INVALID;
260501
+ }
260502
+ for (size_t j = 0 ; j < nextra_bytes; j++) {
260503
+ int c = (int) s[++i];
260504
+ /* now validate the extra bytes */
260505
+ if ((c & 0xC0) != 0x80) {
260506
+ /* extra byte is not in the format 10xxxxxx */
260522
260507
  AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
260523
260508
  return UnicodeType::INVALID;
260524
260509
  }
260525
- if ((c & 0xF8) == 0xF0) {
260526
- continue;
260527
- }
260528
- AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
260510
+ utf8char = (utf8char << 6) | (c & 0x3F);
260511
+ }
260512
+ if ((utf8char & mask) == 0) {
260513
+ /* invalid UTF-8 codepoint, not shortest possible */
260514
+ AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
260515
+ return UnicodeType::INVALID;
260516
+ }
260517
+ if (utf8char > 0x10FFFF) {
260518
+ /* value not representable by Unicode */
260519
+ AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
260520
+ return UnicodeType::INVALID;
260521
+ }
260522
+ if ((utf8char & 0x1FFF800) == 0xD800) {
260523
+ /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 */
260524
+ AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
260529
260525
  return UnicodeType::INVALID;
260530
260526
  }
260527
+ return UnicodeType::UNICODE;
260528
+ }
260531
260529
 
260530
+ UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
260531
+ UnicodeType type = UnicodeType::ASCII;
260532
+
260533
+ for (size_t i = 0; i < len; i++) {
260534
+ int c = (int) s[i];
260535
+
260536
+ if ((c & 0x80) == 0) {
260537
+ /* 1 byte sequence */
260538
+ if (c == '\0') {
260539
+ /* NULL byte not allowed */
260540
+ AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
260541
+ return UnicodeType::INVALID;
260542
+ }
260543
+ } else {
260544
+ int first_pos_seq = i;
260545
+
260546
+ if ((c & 0xE0) == 0xC0) {
260547
+ /* 2 byte sequence */
260548
+ int utf8char = c & 0x1F;
260549
+ type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
260550
+ } else if ((c & 0xF0) == 0xE0) {
260551
+ /* 3 byte sequence */
260552
+ int utf8char = c & 0x0F;
260553
+ type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
260554
+ } else if ((c & 0xF8) == 0xF0) {
260555
+ /* 4 byte sequence */
260556
+ int utf8char = c & 0x07;
260557
+ type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
260558
+ } else {
260559
+ /* invalid UTF-8 start byte */
260560
+ AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
260561
+ return UnicodeType::INVALID;
260562
+ }
260563
+ if (type == UnicodeType::INVALID) {
260564
+ return type;
260565
+ }
260566
+ }
260567
+ }
260532
260568
  return type;
260533
260569
  }
260534
260570
 
260535
-
260536
260571
  char* Utf8Proc::Normalize(const char *s, size_t len) {
260537
260572
  assert(s);
260538
260573
  assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
11
11
  #pragma once
12
12
  #define DUCKDB_AMALGAMATION 1
13
13
  #define DUCKDB_AMALGAMATION_EXTENDED 1
14
- #define DUCKDB_SOURCE_ID "1f2e0822a"
15
- #define DUCKDB_VERSION "v0.5.1-dev225"
14
+ #define DUCKDB_SOURCE_ID "a991beaf1"
15
+ #define DUCKDB_VERSION "v0.5.1-dev237"
16
16
  //===----------------------------------------------------------------------===//
17
17
  // DuckDB
18
18
  //