duckdb 0.5.2-dev523.0 → 0.5.2-dev547.0

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "duckdb",
  "main": "./lib/duckdb.js",
- "version": "0.5.2-dev523.0",
+ "version": "0.5.2-dev547.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -75716,6 +75716,8 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
  has_escape = true;
  } else if (loption == "ignore_errors") {
  ignore_errors = ParseBoolean(value, loption);
+ } else if (loption == "union_by_name") {
+ union_by_name = ParseBoolean(value, loption);
  } else {
  throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
  }
@@ -75927,6 +75929,7 @@ void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
  SkipRowsAndReadHeader(options.skip_rows, options.header);
  }
  InitParseChunk(sql_types.size());
+ InitInsertChunkIdx(sql_types.size());
  // we only need reset support during the automatic CSV type detection
  // since reset support might require caching (in the case of streams), we disable it for the remainder
  file_handle->DisableReset();
@@ -76072,6 +76075,12 @@ void BufferedCSVReader::InitParseChunk(idx_t num_cols) {
  }
  }

+ void BufferedCSVReader::InitInsertChunkIdx(idx_t num_cols) {
+ for (idx_t col = 0; col < num_cols; ++col) {
+ insert_cols_idx.push_back(col);
+ }
+ }
+
  void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) {
  ResetBuffer();
  ResetStream();
@@ -77371,6 +77380,13 @@ bool BufferedCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column) {
  return false;
  }

+ void BufferedCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
+ for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
+ insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
+ ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
+ }
+ }
+
  void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
  if (parse_chunk.size() == 0) {
  return;
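
The new SetNullUnionCols is the execution-side half of the feature: every union column that the current file does not provide is flipped to a constant NULL vector before the chunk is emitted. A minimal standalone sketch of the same idea in TypeScript (illustrative only; a row-major chunk stands in for DuckDB's vectorized DataChunk):

function setNullUnionCols(chunk: unknown[][], insertNullsIdx: number[]): void {
  // insertNullsIdx plays the role of insert_nulls_idx: the union columns absent
  // from the current file. DuckDB marks each as a single constant NULL vector;
  // here we simply null the slot in every row.
  for (const colIdx of insertNullsIdx) {
    for (const row of chunk) {
      row[colIdx] = null;
    }
  }
}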
@@ -77401,22 +77417,25 @@ void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
  }
  }
  }
- insert_chunk.data[col_idx].Reference(parse_chunk.data[col_idx]);
+ insert_chunk.data[insert_cols_idx[col_idx]].Reference(parse_chunk.data[col_idx]);
  } else {
  string error_message;
  bool success;
  if (options.has_format[LogicalTypeId::DATE] && sql_types[col_idx].id() == LogicalTypeId::DATE) {
  // use the date format to cast the chunk
- success = TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[col_idx],
- parse_chunk.size(), error_message);
+ success =
+ TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
+ parse_chunk.size(), error_message);
  } else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
  sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
  // use the timestamp format to cast the chunk
- success = TryCastTimestampVector(options, parse_chunk.data[col_idx], insert_chunk.data[col_idx],
- parse_chunk.size(), error_message);
+ success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
+ insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
+ error_message);
  } else {
  // target type is not varchar: perform a cast
- success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx], insert_chunk.data[col_idx],
+ success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
+ insert_chunk.data[insert_cols_idx[col_idx]],
  parse_chunk.size(), &error_message);
  }
  if (success) {
@@ -118249,6 +118268,9 @@ struct ReadCSVData : public BaseCSVData {
  //! The initial reader (if any): this is used when automatic detection is used during binding.
  //! In this case, the CSV reader is already created and might as well be re-used.
  unique_ptr<BufferedCSVReader> initial_reader;
+ //! The union readers are created during binding (when the CSV union_by_name option is on).
+ //! These readers can be re-used during ReadCSVFunction.
+ vector<unique_ptr<BufferedCSVReader>> union_readers;
  };

  struct CSVCopyFunction {
@@ -119506,6 +119528,64 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
  result->sql_types = return_types;
  D_ASSERT(return_types.size() == names.size());
  }
+
+ // union_col_names will exclude filename and hivepartition
+ if (options.union_by_name) {
+ idx_t union_names_index = 0;
+ case_insensitive_map_t<idx_t> union_names_map;
+ vector<string> union_col_names;
+ vector<LogicalType> union_col_types;
+
+ for (idx_t file_idx = 0; file_idx < result->files.size(); ++file_idx) {
+ options.file_path = result->files[file_idx];
+ auto reader = make_unique<BufferedCSVReader>(context, options);
+ auto &col_names = reader->col_names;
+ auto &sql_types = reader->sql_types;
+ D_ASSERT(col_names.size() == sql_types.size());
+
+ for (idx_t col = 0; col < col_names.size(); ++col) {
+ auto union_find = union_names_map.find(col_names[col]);
+
+ if (union_find != union_names_map.end()) {
+ // given the same name, union_col's type must be compatible with col's type
+ LogicalType compatible_type;
+ compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
+ union_col_types[union_find->second] = compatible_type;
+ } else {
+ union_names_map[col_names[col]] = union_names_index;
+ union_names_index++;
+
+ union_col_names.emplace_back(col_names[col]);
+ union_col_types.emplace_back(sql_types[col]);
+ }
+ }
+ result->union_readers.push_back(move(reader));
+ }
+
+ for (auto &reader : result->union_readers) {
+ auto &col_names = reader->col_names;
+ vector<bool> is_null_cols(union_col_names.size(), true);
+
+ for (idx_t col = 0; col < col_names.size(); ++col) {
+ idx_t remap_col = union_names_map[col_names[col]];
+ reader->insert_cols_idx[col] = remap_col;
+ is_null_cols[remap_col] = false;
+ }
+ for (idx_t col = 0; col < union_col_names.size(); ++col) {
+ if (is_null_cols[col]) {
+ reader->insert_nulls_idx.push_back(col);
+ }
+ }
+ }
+
+ const idx_t first_file_index = 0;
+ result->initial_reader = move(result->union_readers[first_file_index]);
+
+ names.assign(union_col_names.begin(), union_col_names.end());
+ return_types.assign(union_col_types.begin(), union_col_types.end());
+ D_ASSERT(names.size() == return_types.size());
+ }
+
  if (result->options.include_file_name) {
  result->filename_col_idx = names.size();
  return_types.emplace_back(LogicalType::VARCHAR);
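
For orientation, this bind-time hunk does three things: it unions column names across all files (case-insensitively, via union_names_map), widens the type of a repeated name to a compatible one with LogicalType::MaxLogicalType, and computes per-file remap tables (insert_cols_idx, plus insert_nulls_idx for the union columns a file lacks). A standalone TypeScript sketch of that algorithm follows; the widening order is a deliberate simplification standing in for MaxLogicalType, not DuckDB's real rule:

type Col = { name: string; type: string };

// Simplified stand-in for LogicalType::MaxLogicalType (not DuckDB's real rule).
function maxLogicalType(a: string, b: string): string {
  const order = ["BOOLEAN", "BIGINT", "DOUBLE", "VARCHAR"];
  return order[Math.max(order.indexOf(a), order.indexOf(b))];
}

function unionByName(files: Col[][]) {
  const unionNamesMap = new Map<string, number>(); // case-insensitive, like case_insensitive_map_t
  const unionCols: Col[] = [];
  for (const cols of files) {
    for (const col of cols) {
      const key = col.name.toLowerCase();
      const found = unionNamesMap.get(key);
      if (found !== undefined) {
        // same name seen before: widen the union column's type
        unionCols[found].type = maxLogicalType(unionCols[found].type, col.type);
      } else {
        unionNamesMap.set(key, unionCols.length);
        unionCols.push({ ...col });
      }
    }
  }
  // per-file remapping: insertColsIdx says where each local column lands in the
  // union schema; insertNullsIdx lists union columns the file must NULL-fill
  const perFile = files.map((cols) => {
    const insertColsIdx = cols.map((c) => unionNamesMap.get(c.name.toLowerCase())!);
    const insertNullsIdx = unionCols
      .map((_, i) => i)
      .filter((i) => !insertColsIdx.includes(i));
    return { insertColsIdx, insertNullsIdx };
  });
  return { unionCols, perFile };
}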
@@ -119574,13 +119654,22 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p,
  // exhausted this file, but we have more files we can read
  // open the next file and increment the counter
  bind_data.options.file_path = bind_data.files[data.file_index];
- data.csv_reader = make_unique<BufferedCSVReader>(context, bind_data.options, data.csv_reader->sql_types);
+ // re-use the csv_readers that were created during binding
+ if (bind_data.options.union_by_name) {
+ data.csv_reader = move(bind_data.union_readers[data.file_index]);
+ } else {
+ data.csv_reader =
+ make_unique<BufferedCSVReader>(context, bind_data.options, data.csv_reader->sql_types);
+ }
  data.file_index++;
  } else {
  break;
  }
  } while (true);

+ if (bind_data.options.union_by_name) {
+ data.csv_reader->SetNullUnionCols(output);
+ }
  if (bind_data.options.include_file_name) {
  auto &col = output.data[bind_data.filename_col_idx];
  col.SetValue(0, Value(data.csv_reader->options.file_path));
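
At scan time the pre-built readers then replace per-file reader construction: when union_by_name is on, the next file's reader is moved out of union_readers (already remapped at bind time), and after each chunk the file's missing union columns are NULL-filled via SetNullUnionCols. A rough TypeScript sketch of that control flow, with invented names for illustration:

interface Reader { read(): unknown[][] | null; insertNullsIdx: number[]; }

// Invented scan-state shape; mirrors the reuse branch in the hunk above.
function nextReader(
  state: { fileIndex: number },
  bind: { unionByName: boolean; unionReaders: Reader[]; open: (i: number) => Reader }
): Reader {
  const reader = bind.unionByName
    ? bind.unionReaders[state.fileIndex] // re-use the reader created during binding
    : bind.open(state.fileIndex); // otherwise construct a fresh reader per file
  state.fileIndex++;
  return reader;
}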
@@ -119633,6 +119722,7 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
  table_function.named_parameters["max_line_size"] = LogicalType::VARCHAR;
  table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR;
  table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
+ table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
  }

  double CSVReaderProgress(ClientContext &context, const FunctionData *bind_data_p,
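
Net effect of these hunks: the CSV table functions gain a union_by_name named parameter (a BOOLEAN; the header change below defaults it to false). A usage sketch through the Node.js API this package wraps; the file names and column layouts are made up for illustration:

// Assumes TypeScript with esModuleInterop; plain Node.js can use
// const duckdb = require("duckdb") instead.
import duckdb from "duckdb";

const db = new duckdb.Database(":memory:");
// Suppose data/a.csv has columns (id, name) and data/b.csv has (id, city):
// with union_by_name=true the result schema is (id, name, city), and each
// file's absent column comes back as NULL.
db.all(
  "SELECT * FROM read_csv_auto('data/*.csv', union_by_name=true)",
  (err: Error | null, rows: unknown[]) => {
    if (err) throw err;
    console.log(rows);
  }
);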
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
  #pragma once
  #define DUCKDB_AMALGAMATION 1
  #define DUCKDB_AMALGAMATION_EXTENDED 1
- #define DUCKDB_SOURCE_ID "97c2f4e7e"
- #define DUCKDB_VERSION "v0.5.2-dev523"
+ #define DUCKDB_SOURCE_ID "83aff54f1"
+ #define DUCKDB_VERSION "v0.5.2-dev547"
  //===----------------------------------------------------------------------===//
  // DuckDB
  //
@@ -27165,6 +27165,8 @@ struct BufferedCSVReaderOptions {
  bool include_file_name = false;
  //! Whether or not to include parsed hive partition columns
  bool include_parsed_hive_partitions = false;
+ //! Whether or not to union files with different (but compatible) columns
+ bool union_by_name = false;

  //===--------------------------------------------------------------------===//
  // WriteCSVOptions
@@ -27224,6 +27226,12 @@ public:
  BufferedCSVReaderOptions options;
  vector<LogicalType> sql_types;
  vector<string> col_names;
+
+ //! remap parse_chunk cols to insert_chunk cols, because when the
+ //! union_by_name option is on, insert_chunk may have more cols
+ vector<idx_t> insert_cols_idx;
+ vector<idx_t> insert_nulls_idx;
+
  unique_ptr<CSVFileHandle> file_handle;

  unique_ptr<char[]> buffer;
@@ -27258,11 +27266,16 @@ public:

  idx_t GetFileSize();

+ //! Fill nulls into the cols that mismatch the union names
+ void SetNullUnionCols(DataChunk &insert_chunk);
+
  private:
  //! Initialize Parser
  void Initialize(const vector<LogicalType> &requested_types);
  //! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
  void InitParseChunk(idx_t num_cols);
+ //! Initializes the insert_chunk idx for mapping parse_chunk cols to insert_chunk cols
+ void InitInsertChunkIdx(idx_t num_cols);
  //! Initializes the TextSearchShiftArrays for complex parser
  void PrepareComplexParser();
  //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.