npm - duckdb - Versions diffs - 0.5.2-dev1819.0 → 0.5.2-dev1840.0 - Mend

duckdb 0.5.2-dev1819.0 → 0.5.2-dev1840.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/src/duckdb.cpp +156 -100
package/src/duckdb.hpp +32 -5
package/src/parquet-amalgamation.cpp +28175 -28179

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.5.2-dev1819.0",
+  "version": "0.5.2-dev1840.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb.cpp CHANGED Viewed

@@ -8617,7 +8617,7 @@ private:
 namespace duckdb {
 enum class UnicodeType { INVALID, ASCII, UNICODE };
-enum class UnicodeInvalidReason { BYTE_MISMATCH, NULL_BYTE, INVALID_UNICODE };
+enum class UnicodeInvalidReason { BYTE_MISMATCH, INVALID_UNICODE };
 class Utf8Proc {
 public:
@@ -8845,7 +8845,7 @@ list<ColumnDataCollection> BoxRenderer::FetchRenderCollections(ClientContext &co
 }
 string ConvertRenderValue(const string &input) {
-	return StringUtil::Replace(input, "\n", "\\n");
+	return StringUtil::Replace(StringUtil::Replace(input, "\n", "\\n"), string("\0", 1), "\\0");
 }
 string BoxRenderer::GetRenderValue(ColumnDataRowCollection &rows, idx_t c, idx_t r) {
@@ -30071,15 +30071,15 @@ PreservedError::PreservedError() : initialized(false) {
 }
 PreservedError::PreservedError(const Exception &exception)
-    : initialized(true), type(exception.type), raw_message(exception.RawMessage()) {
+    : initialized(true), type(exception.type), raw_message(SanitizeErrorMessage(exception.RawMessage())) {
 }
 PreservedError::PreservedError(const std::exception &exception)
-    : initialized(true), type(ExceptionType::INVALID), raw_message(exception.what()) {
+    : initialized(true), type(ExceptionType::INVALID), raw_message(SanitizeErrorMessage(exception.what())) {
 }
 PreservedError::PreservedError(const string &message)
-    : initialized(true), type(ExceptionType::INVALID), raw_message(message) {
+    : initialized(true), type(ExceptionType::INVALID), raw_message(SanitizeErrorMessage(message)) {
 }
 const string &PreservedError::Message() {
@@ -30089,6 +30089,10 @@ const string &PreservedError::Message() {
 	return final_message;
 }
+string PreservedError::SanitizeErrorMessage(string error) {
+	return StringUtil::Replace(move(error), string("\0", 1), "\\0");
+}
 void PreservedError::Throw(const string &prepended_message) const {
 	D_ASSERT(initialized);
 	if (!prepended_message.empty()) {
@@ -40041,6 +40045,9 @@ vector<string> StringUtil::Split(const string &input, const string &split) {
 }
 string StringUtil::Replace(string source, const string &from, const string &to) {
+	if (from.empty()) {
+		throw InternalException("Invalid argument to StringUtil::Replace - empty FROM");
+	}
 	idx_t start_pos = 0;
 	while ((start_pos = source.find(from, start_pos)) != string::npos) {
 		source.replace(start_pos, from.length(), to);
@@ -41812,7 +41819,7 @@ bool Blob::TryGetBlobSize(string_t str, idx_t &str_len, string *error_message) {
 			}
 			str_len++;
 			i += 3;
-		} else if (data[i] >= 32 && data[i] <= 127) {
+		} else if (data[i] <= 127) {
 			str_len++;
 		} else {
 			string error = "Invalid byte encountered in STRING -> BLOB conversion. All non-ascii characters "
@@ -41846,7 +41853,7 @@ void Blob::ToBlob(string_t str, data_ptr_t output) {
 			D_ASSERT(data[i + 1] == 'x');
 			output[blob_idx++] = (byte_a << 4) + byte_b;
 			i += 3;
-		} else if (data[i] >= 32 && data[i] <= 127) {
+		} else if (data[i] <= 127) {
 			output[blob_idx++] = data_t(data[i]);
 		} else {
 			throw ConversionException("Invalid byte encountered in STRING -> BLOB conversion. All non-ascii characters "
@@ -47731,12 +47738,6 @@ void string_t::Verify() const {
 	}
 }
-void string_t::VerifyNull() const {
-	for (idx_t i = 0; i < GetSize(); i++) {
-		D_ASSERT(GetDataUnsafe()[i] != '\0');
-	}
-}
 } // namespace duckdb
@@ -50430,6 +50431,17 @@ bool Value::NotDistinctFrom(const Value &lvalue, const Value &rvalue) {
 	return ValueOperations::NotDistinctFrom(lvalue, rvalue);
 }
+static string SanitizeValue(string input) {
+	// some results might contain padding spaces, e.g. when rendering
+	// VARCHAR(10) and the string only has 6 characters, they will be padded
+	// with spaces to 10 in the rendering. We don't do that here yet as we
+	// are looking at internal structures. So just ignore any extra spaces
+	// on the right
+	StringUtil::RTrim(input);
+	// for result checking code, replace null bytes with their escaped value (\0)
+	return StringUtil::Replace(input, string("\0", 1), "\\0");
+}
 bool Value::ValuesAreEqual(CastFunctionSet &set, GetCastFunctionInput &get_input, const Value &result_value,
                            const Value &value) {
 	if (result_value.IsNull() != value.IsNull()) {
@@ -50454,15 +50466,8 @@ bool Value::ValuesAreEqual(CastFunctionSet &set, GetCastFunctionInput &get_input
 	}
 	case LogicalTypeId::VARCHAR: {
 		auto other = result_value.CastAs(set, get_input, LogicalType::VARCHAR);
-		// some results might contain padding spaces, e.g. when rendering
-		// VARCHAR(10) and the string only has 6 characters, they will be padded
-		// with spaces to 10 in the rendering. We don't do that here yet as we
-		// are looking at internal structures. So just ignore any extra spaces
-		// on the right
-		string left = other.str_value;
-		string right = value.str_value;
-		StringUtil::RTrim(left);
-		StringUtil::RTrim(right);
+		string left = SanitizeValue(other.str_value);
+		string right = SanitizeValue(value.str_value);
 		return left == right;
 	}
 	default:
@@ -51767,7 +51772,7 @@ void Vector::Verify(Vector &vector_p, const SelectionVector &sel_p, idx_t count)
 		D_ASSERT(!vector->auxiliary);
 	}
 	if (type.id() == LogicalTypeId::VARCHAR || type.id() == LogicalTypeId::JSON) {
-		// verify that there are no '\0' bytes in string values
+		// verify that the string is correct unicode
 		switch (vtype) {
 		case VectorType::FLAT_VECTOR: {
 			auto &validity = FlatVector::Validity(*vector);
@@ -51775,7 +51780,7 @@ void Vector::Verify(Vector &vector_p, const SelectionVector &sel_p, idx_t count)
 			for (idx_t i = 0; i < count; i++) {
 				auto oidx = sel->get_index(i);
 				if (validity.RowIsValid(oidx)) {
-					strings[oidx].VerifyNull();
+					strings[oidx].Verify();
 				}
 			}
 			break;
@@ -79839,7 +79844,7 @@ normal:
 	} while (ReadBuffer(start));
 	goto final_state;
 add_value:
-	AddValue(buffer.get() + start, position - start - offset, column, escape_positions, has_quotes);
+	AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
 	// increase position by 1 and move start to the new position
 	offset = 0;
 	has_quotes = false;
@@ -79852,7 +79857,7 @@ add_value:
 add_row : {
 	// check type of newline (\r or \n)
 	bool carriage_return = buffer[position] == '\r';
-	AddValue(buffer.get() + start, position - start - offset, column, escape_positions, has_quotes);
+	AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
 	finished_chunk = AddRow(insert_chunk, column);
 	// increase position by 1 and move start to the new position
 	offset = 0;
@@ -79990,7 +79995,7 @@ final_state:
 	}
 	if (column > 0 || position > start) {
 		// remaining values to be added to the chunk
-		AddValue(buffer.get() + start, position - start - offset, column, escape_positions, has_quotes);
+		AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
 		finished_chunk = AddRow(insert_chunk, column);
 	}
 	// final stage, only reached after parsing the file is finished
@@ -80050,7 +80055,7 @@ normal:
 	// file ends during normal scan: go to end state
 	goto final_state;
 add_value:
-	AddValue(buffer.get() + start, position - start - offset, column, escape_positions, has_quotes);
+	AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
 	// increase position by 1 and move start to the new position
 	offset = 0;
 	has_quotes = false;
@@ -80063,7 +80068,7 @@ add_value:
 add_row : {
 	// check type of newline (\r or \n)
 	bool carriage_return = buffer[position] == '\r';
-	AddValue(buffer.get() + start, position - start - offset, column, escape_positions, has_quotes);
+	AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
 	finished_chunk = AddRow(insert_chunk, column);
 	// increase position by 1 and move start to the new position
 	offset = 0;
@@ -80174,7 +80179,7 @@ final_state:
 	if (column > 0 || position > start) {
 		// remaining values to be added to the chunk
-		AddValue(buffer.get() + start, position - start - offset, column, escape_positions, has_quotes);
+		AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
 		finished_chunk = AddRow(insert_chunk, column);
 	}
@@ -80275,8 +80280,8 @@ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_ch
 	}
 }
-void BufferedCSVReader::AddValue(char *str_val, idx_t length, idx_t &column, vector<idx_t> &escape_positions,
-                                 bool has_quotes) {
+void BufferedCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes) {
+	auto length = str_val.GetSize();
 	if (length == 0 && column == 0) {
 		row_empty = true;
 	} else {
@@ -80305,18 +80310,16 @@ void BufferedCSVReader::AddValue(char *str_val, idx_t length, idx_t &column, vec
 	// insert the line number into the chunk
 	idx_t row_entry = parse_chunk.size();
-	str_val[length] = '\0';
 	// test against null string, but only if the value was not quoted
 	if ((!has_quotes || sql_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
-	    strcmp(options.null_str.c_str(), str_val) == 0) {
+	    Equals::Operation(str_val, string_t(options.null_str))) {
 		FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
 	} else {
 		auto &v = parse_chunk.data[column];
 		auto parse_data = FlatVector::GetData<string_t>(v);
 		if (!escape_positions.empty()) {
 			// remove escape characters (if any)
-			string old_val = str_val;
+			string old_val = str_val.GetString();
 			string new_val = "";
 			idx_t prev_pos = 0;
 			for (idx_t i = 0; i < escape_positions.size(); i++) {
@@ -80333,7 +80336,7 @@ void BufferedCSVReader::AddValue(char *str_val, idx_t length, idx_t &column, vec
 			escape_positions.clear();
 			parse_data[row_entry] = StringVector::AddStringOrBlob(v, string_t(new_val));
 		} else {
-			parse_data[row_entry] = string_t(str_val, length);
+			parse_data[row_entry] = str_val;
 		}
 	}
@@ -115875,20 +115878,44 @@ void UpperFun::RegisterFunction(BuiltinFunctions &set) {
 namespace duckdb {
 struct ChrOperator {
-	template <class TA, class TR>
-	static inline TR Operation(const TA &input) {
-		char c[5] = {'\0', '\0', '\0', '\0', '\0'};
-		int utf8_bytes = 4;
+	static void GetCodepoint(int32_t input, char c[], int &utf8_bytes) {
 		if (input < 0 || !Utf8Proc::CodepointToUtf8(input, utf8_bytes, &c[0])) {
 			throw InvalidInputException("Invalid UTF8 Codepoint %d", input);
 		}
-		return string_t(&c[0]);
+	}
+	template <class TA, class TR>
+	static inline TR Operation(const TA &input) {
+		char c[5] = {'\0', '\0', '\0', '\0', '\0'};
+		int utf8_bytes;
+		GetCodepoint(input, c, utf8_bytes);
+		return string_t(&c[0], utf8_bytes);
 	}
 };
+#ifdef DUCKDB_DEBUG_NO_INLINE
+// the chr function depends on the data always being inlined (which is always possible, since it outputs max 4 bytes)
+// to enable chr when string inlining is disabled we create a special function here
+static void ChrFunction(DataChunk &args, ExpressionState &state, Vector &result) {
+	auto &code_vec = args.data[0];
+	char c[5] = {'\0', '\0', '\0', '\0', '\0'};
+	int utf8_bytes;
+	UnaryExecutor::Execute<int32_t, string_t>(code_vec, result, args.size(), [&](int32_t input) {
+		ChrOperator::GetCodepoint(input, c, utf8_bytes);
+		return StringVector::AddString(result, &c[0], utf8_bytes);
+	});
+}
+#endif
 void CHR::RegisterFunction(BuiltinFunctions &set) {
 	ScalarFunction chr("chr", {LogicalType::INTEGER}, LogicalType::VARCHAR,
-	                   ScalarFunction::UnaryFunction<int32_t, string_t, ChrOperator>);
+#ifdef DUCKDB_DEBUG_NO_INLINE
+	                   ChrFunction
+#else
+	                   ScalarFunction::UnaryFunction<int32_t, string_t, ChrOperator>
+#endif
+	);
 	set.AddFunction(chr);
 }
@@ -117985,14 +118012,14 @@ struct ASCIILCaseReader {
 	}
 };
-template <char PERCENTAGE, char UNDERSCORE, class READER = StandardCharacterReader>
+template <char PERCENTAGE, char UNDERSCORE, bool HAS_ESCAPE, class READER = StandardCharacterReader>
 bool TemplatedLikeOperator(const char *sdata, idx_t slen, const char *pdata, idx_t plen, char escape) {
 	idx_t pidx = 0;
 	idx_t sidx = 0;
 	for (; pidx < plen && sidx < slen; pidx++) {
 		char pchar = READER::Operation(pdata, pidx);
 		char schar = READER::Operation(sdata, sidx);
-		if (pchar == escape) {
+		if (HAS_ESCAPE && pchar == escape) {
 			pidx++;
 			if (pidx == plen) {
 				throw SyntaxException("Like pattern must not end with escape character!");
@@ -118012,8 +118039,8 @@ bool TemplatedLikeOperator(const char *sdata, idx_t slen, const char *pdata, idx
 				return true; /* tail is acceptable */
 			}
 			for (; sidx < slen; sidx++) {
-				if (TemplatedLikeOperator<PERCENTAGE, UNDERSCORE, READER>(sdata + sidx, slen - sidx, pdata + pidx,
-				                                                          plen - pidx, escape)) {
+				if (TemplatedLikeOperator<PERCENTAGE, UNDERSCORE, HAS_ESCAPE, READER>(
+				        sdata + sidx, slen - sidx, pdata + pidx, plen - pidx, escape)) {
 					return true;
 				}
 			}
@@ -118170,10 +118197,18 @@ static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarF
 }
 bool LikeOperatorFunction(const char *s, idx_t slen, const char *pattern, idx_t plen, char escape) {
-	return TemplatedLikeOperator<'%', '_'>(s, slen, pattern, plen, escape);
+	return TemplatedLikeOperator<'%', '_', true>(s, slen, pattern, plen, escape);
+}
+bool LikeOperatorFunction(const char *s, idx_t slen, const char *pattern, idx_t plen) {
+	return TemplatedLikeOperator<'%', '_', false>(s, slen, pattern, plen, '\0');
 }
-bool LikeOperatorFunction(string_t &s, string_t &pat, char escape = '\0') {
+bool LikeOperatorFunction(string_t &s, string_t &pat) {
+	return LikeOperatorFunction(s.GetDataUnsafe(), s.GetSize(), pat.GetDataUnsafe(), pat.GetSize());
+}
+bool LikeOperatorFunction(string_t &s, string_t &pat, char escape) {
 	return LikeOperatorFunction(s.GetDataUnsafe(), s.GetSize(), pat.GetDataUnsafe(), pat.GetSize(), escape);
 }
@@ -118400,8 +118435,8 @@ struct NotILikeOperator {
 struct ILikeOperatorASCII {
 	template <class TA, class TB, class TR>
 	static inline TR Operation(TA str, TB pattern) {
-		return TemplatedLikeOperator<'%', '_', ASCIILCaseReader>(str.GetDataUnsafe(), str.GetSize(),
-		                                                         pattern.GetDataUnsafe(), pattern.GetSize(), '\0');
+		return TemplatedLikeOperator<'%', '_', false, ASCIILCaseReader>(
+		    str.GetDataUnsafe(), str.GetSize(), pattern.GetDataUnsafe(), pattern.GetSize(), '\0');
 	}
 };
@@ -128040,7 +128075,8 @@ vector<TestType> TestAllTypesFun::GetTestTypes() {
 	result.emplace_back(LogicalType::INTERVAL, "interval", Value::INTERVAL(min_interval),
 	                    Value::INTERVAL(max_interval));
 	// strings/blobs
-	result.emplace_back(LogicalType::VARCHAR, "varchar", Value("🦆🦆🦆🦆🦆🦆"), Value("goose"));
+	result.emplace_back(LogicalType::VARCHAR, "varchar", Value("🦆🦆🦆🦆🦆🦆"),
+	                    Value(string("goo\x00se", 6)));
 	result.emplace_back(LogicalType::JSON, "json", Value("🦆🦆🦆🦆🦆🦆"), Value("goose"));
 	result.emplace_back(LogicalType::BLOB, "blob", Value::BLOB("thisisalongblob\\x00withnullbytes"),
 	                    Value::BLOB("\\x00\\x00\\x00a"));
@@ -130142,6 +130178,8 @@ interval_t FetchDefaultValue::Operation();
 template <>
 char *FetchDefaultValue::Operation();
 template <>
+duckdb_string FetchDefaultValue::Operation();
+template <>
 duckdb_blob FetchDefaultValue::Operation();
 //===--------------------------------------------------------------------===//
@@ -130165,9 +130203,11 @@ struct ToCStringCastWrapper {
 		auto result_size = result_string.GetSize();
 		auto result_data = result_string.GetDataUnsafe();
-		result = (char *)duckdb_malloc(result_size + 1);
-		memcpy(result, result_data, result_size);
-		result[result_size] = '\0';
+		char *allocated_data = (char *)duckdb_malloc(result_size + 1);
+		memcpy(allocated_data, result_data, result_size);
+		allocated_data[result_size] = '\0';
+		result.data = allocated_data;
+		result.size = result_size;
 		return true;
 	}
 };
@@ -130183,7 +130223,7 @@ struct FromCBlobCastWrapper {
 };
 template <>
-bool FromCBlobCastWrapper::Operation(duckdb_blob input, char *&result);
+bool FromCBlobCastWrapper::Operation(duckdb_blob input, duckdb_string &result);
 template <class SOURCE_TYPE, class RESULT_TYPE, class OP>
 RESULT_TYPE TryCastCInternal(duckdb_result *result, idx_t col, idx_t row) {
@@ -130236,7 +130276,7 @@ bool CastDecimalCInternal(duckdb_result *source, RESULT_TYPE &result, idx_t col,
 //! DECIMAL -> VARCHAR
 template <>
-bool CastDecimalCInternal(duckdb_result *source, char *&result, idx_t col, idx_t row);
+bool CastDecimalCInternal(duckdb_result *source, duckdb_string &result, idx_t col, idx_t row);
 //! DECIMAL -> DECIMAL (internal fetch)
 template <>
@@ -130264,7 +130304,7 @@ namespace duckdb {
 //! DECIMAL -> VARCHAR
 template <>
-bool CastDecimalCInternal(duckdb_result *source, char *&result, idx_t col, idx_t row) {
+bool CastDecimalCInternal(duckdb_result *source, duckdb_string &result, idx_t col, idx_t row) {
 	auto result_data = (duckdb::DuckDBResultData *)source->internal_data;
 	auto &query_result = result_data->result;
 	auto &source_type = query_result->types[col];
@@ -130293,9 +130333,10 @@ bool CastDecimalCInternal(duckdb_result *source, char *&result, idx_t col, idx_t
 	default:
 		throw duckdb::InternalException("Unimplemented internal type for decimal");
 	}
-	result = (char *)duckdb_malloc(sizeof(char) * (result_string.GetSize() + 1));
-	memcpy(result, result_string.GetDataUnsafe(), result_string.GetSize());
-	result[result_string.GetSize()] = '\0';
+	result.data = (char *)duckdb_malloc(sizeof(char) * (result_string.GetSize() + 1));
+	memcpy(result.data, result_string.GetDataUnsafe(), result_string.GetSize());
+	result.data[result_string.GetSize()] = '\0';
+	result.size = result_string.GetSize();
 	return true;
 }
@@ -130424,6 +130465,14 @@ char *FetchDefaultValue::Operation() {
 	return nullptr;
 }
+template <>
+duckdb_string FetchDefaultValue::Operation() {
+	duckdb_string result;
+	result.data = nullptr;
+	result.size = 0;
+	return result;
+}
 template <>
 duckdb_blob FetchDefaultValue::Operation() {
 	duckdb_blob result;
@@ -130437,9 +130486,9 @@ duckdb_blob FetchDefaultValue::Operation() {
 //===--------------------------------------------------------------------===//
 template <>
-bool FromCBlobCastWrapper::Operation(duckdb_blob input, char *&result) {
+bool FromCBlobCastWrapper::Operation(duckdb_blob input, duckdb_string &result) {
 	string_t input_str((const char *)input.data, input.size);
-	return ToCStringCastWrapper<duckdb::CastFromBlob>::template Operation<string_t, char *>(input_str, result);
+	return ToCStringCastWrapper<duckdb::CastFromBlob>::template Operation<string_t, duckdb_string>(input_str, result);
 }
 } // namespace duckdb
@@ -132976,6 +133025,8 @@ RESULT_TYPE GetInternalCValue(duckdb_result *result, idx_t col, idx_t row) {
 } // namespace duckdb
+#include <cstring>
 using duckdb::date_t;
 using duckdb::dtime_t;
 using duckdb::FetchDefaultValue;
@@ -133088,17 +133139,31 @@ duckdb_interval duckdb_value_interval(duckdb_result *result, idx_t col, idx_t ro
 }
 char *duckdb_value_varchar(duckdb_result *result, idx_t col, idx_t row) {
-	return GetInternalCValue<char *, ToCStringCastWrapper<StringCast>>(result, col, row);
+	return duckdb_value_string(result, col, row).data;
+}
+duckdb_string duckdb_value_string(duckdb_result *result, idx_t col, idx_t row) {
+	return GetInternalCValue<duckdb_string, ToCStringCastWrapper<StringCast>>(result, col, row);
 }
 char *duckdb_value_varchar_internal(duckdb_result *result, idx_t col, idx_t row) {
+	return duckdb_value_string_internal(result, col, row).data;
+}
+duckdb_string duckdb_value_string_internal(duckdb_result *result, idx_t col, idx_t row) {
 	if (!CanFetchValue(result, col, row)) {
-		return nullptr;
+		return FetchDefaultValue::Operation<duckdb_string>();
 	}
 	if (duckdb_column_type(result, col) != DUCKDB_TYPE_VARCHAR) {
-		return nullptr;
-	}
-	return UnsafeFetch<char *>(result, col, row);
+		return FetchDefaultValue::Operation<duckdb_string>();
+	}
+	// FIXME: this obviously does not work when there are null bytes in the string
+	// we need to remove the deprecated C result materialization to get that to work correctly
+	// since the deprecated C result materialization stores strings as null-terminated
+	duckdb_string res;
+	res.data = UnsafeFetch<char *>(result, col, row);
+	res.size = strlen(res.data);
+	return res;
 }
 duckdb_blob duckdb_value_blob(duckdb_result *result, idx_t col, idx_t row) {
@@ -136411,9 +136476,6 @@ string ErrorManager::InvalidUnicodeError(const string &input, const string &cont
 	}
 	string base_message;
 	switch (reason) {
-	case UnicodeInvalidReason::NULL_BYTE:
-		base_message = "Null-byte (\\0)";
-		break;
 	case UnicodeInvalidReason::BYTE_MISMATCH:
 		base_message = "Invalid unicode (byte sequence mismatch)";
 		break;
@@ -145605,7 +145667,7 @@ string MaterializedQueryResult::ToString() {
 					result += "\t";
 				}
 				auto val = row.GetValue(col_idx);
-				result += val.IsNull() ? "NULL" : val.ToString();
+				result += val.IsNull() ? "NULL" : StringUtil::Replace(val.ToString(), string("\0", 1), "\\0");
 			}
 			result += "\n";
 		}
@@ -276403,35 +276465,29 @@ UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *i
 		int c = (int) s[i];
 		if ((c & 0x80) == 0) {
-			/* 1 byte sequence */
-			if (c == '\0') {
-				/* NULL byte not allowed */
-				AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::NULL_BYTE);
-				return UnicodeType::INVALID;
-			}
+			continue;
+		}
+		int first_pos_seq = i;
+		if ((c & 0xE0) == 0xC0) {
+			/* 2 byte sequence */
+			int utf8char = c & 0x1F;
+			type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
+		} else if ((c & 0xF0) == 0xE0) {
+			/* 3 byte sequence */
+			int utf8char = c & 0x0F;
+			type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
+		} else if ((c & 0xF8) == 0xF0) {
+			/* 4 byte sequence */
+			int utf8char = c & 0x07;
+			type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
 		} else {
-			int first_pos_seq = i;
-			if ((c & 0xE0) == 0xC0) {
-				/* 2 byte sequence */
-				int utf8char = c & 0x1F;
-				type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
-			} else if ((c & 0xF0) == 0xE0) {
-				/* 3 byte sequence */
-				int utf8char = c & 0x0F;
-				type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
-			} else if ((c & 0xF8) == 0xF0) {
-				/* 4 byte sequence */
-				int utf8char = c & 0x07;
-				type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos);
-			} else {
-				/* invalid UTF-8 start byte */
-				AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
-				return UnicodeType::INVALID;
-			}
-			if (type == UnicodeType::INVALID) {
-				return type;
-			}
+			/* invalid UTF-8 start byte */
+			AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
+			return UnicodeType::INVALID;
+		}
+		if (type == UnicodeType::INVALID) {
+			return type;
 		}
 	}
 	return type;

package/src/duckdb.hpp CHANGED Viewed

@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
 #pragma once
 #define DUCKDB_AMALGAMATION 1
 #define DUCKDB_AMALGAMATION_EXTENDED 1
-#define DUCKDB_SOURCE_ID "bf5b49fcee"
-#define DUCKDB_VERSION "v0.5.2-dev1819"
+#define DUCKDB_SOURCE_ID "a24535880b"
+#define DUCKDB_VERSION "v0.5.2-dev1840"
 //===----------------------------------------------------------------------===//
 //                         DuckDB
 //
@@ -4807,6 +4807,9 @@ private:
 	string raw_message;
 	//! The final message (stored in the preserved error for compatibility reasons with C-API)
 	string final_message;
+private:
+	static string SanitizeErrorMessage(string error);
 };
 } // namespace duckdb
@@ -17041,6 +17044,11 @@ typedef struct {
 	duckdb_hugeint value;
 } duckdb_decimal;
+typedef struct {
+	char *data;
+	idx_t size;
+} duckdb_string;
 typedef struct {
 	void *data;
 	idx_t size;
@@ -17488,12 +17496,21 @@ DUCKDB_API duckdb_timestamp duckdb_value_timestamp(duckdb_result *result, idx_t
 DUCKDB_API duckdb_interval duckdb_value_interval(duckdb_result *result, idx_t col, idx_t row);
 /*!
-* returns: The char* value at the specified location, or nullptr if the value cannot be converted.
-The result must be freed with `duckdb_free`.
+* DEPRECATED: use duckdb_value_string instead. This function does not work correctly if the string contains null bytes.
+* returns: The text value at the specified location as a null-terminated string, or nullptr if the value cannot be
+converted. The result must be freed with `duckdb_free`.
 */
 DUCKDB_API char *duckdb_value_varchar(duckdb_result *result, idx_t col, idx_t row);
+/*!
+* returns: The string value at the specified location.
+The `data` member of the result must be freed with `duckdb_free`.
+*/
+DUCKDB_API duckdb_string duckdb_value_string(duckdb_result *result, idx_t col, idx_t row);
 /*!
+* DEPRECATED: use duckdb_value_string_internal instead. This function does not work correctly if the string contains
+null bytes.
 * returns: The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast.
 If the column is NOT a VARCHAR column this function will return NULL.
@@ -17501,6 +17518,16 @@ The result must NOT be freed.
 */
 DUCKDB_API char *duckdb_value_varchar_internal(duckdb_result *result, idx_t col, idx_t row);
+/*!
+* returns: The string value at the specified location. ONLY works on VARCHAR columns and does not auto-cast.
+If the column is NOT a VARCHAR column this function will return a string with `data` set to nullptr and `size` set to 0.
+NOTE: the size is currently computed with `strlen`, so a string that contains null bytes is reported as truncated at
+the first null byte.
+The result must NOT be freed.
+*/
+DUCKDB_API duckdb_string duckdb_value_string_internal(duckdb_result *result, idx_t col, idx_t row);
 /*!
 * returns: The duckdb_blob value at the specified location. Returns a blob with blob.data set to nullptr if the
 value cannot be converted. The resulting "blob.data" must be freed with `duckdb_free.`
@@ -27933,7 +27960,7 @@ private:
 	bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message);
 	//! Adds a value to the current row
-	void AddValue(char *str_val, idx_t length, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
+	void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
 	//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
 	bool AddRow(DataChunk &insert_chunk, idx_t &column);
 	//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk