duckdb 0.5.2-dev523.0 → 0.5.2-dev547.0

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "duckdb",
  "main": "./lib/duckdb.js",
- "version": "0.5.2-dev523.0",
+ "version": "0.5.2-dev547.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -75716,6 +75716,8 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
  has_escape = true;
  } else if (loption == "ignore_errors") {
  ignore_errors = ParseBoolean(value, loption);
+ } else if (loption == "union_by_name") {
+ union_by_name = ParseBoolean(value, loption);
  } else {
  throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
  }
@@ -75927,6 +75929,7 @@ void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
  SkipRowsAndReadHeader(options.skip_rows, options.header);
  }
  InitParseChunk(sql_types.size());
+ InitInsertChunkIdx(sql_types.size());
  // we only need reset support during the automatic CSV type detection
  // since reset support might require caching (in the case of streams), we disable it for the remainder
  file_handle->DisableReset();
@@ -76072,6 +76075,12 @@ void BufferedCSVReader::InitParseChunk(idx_t num_cols) {
  }
  }

+ void BufferedCSVReader::InitInsertChunkIdx(idx_t num_cols) {
+ for (idx_t col = 0; col < num_cols; ++col) {
+ insert_cols_idx.push_back(col);
+ }
+ }
+
  void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) {
  ResetBuffer();
  ResetStream();
@@ -77371,6 +77380,13 @@ bool BufferedCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column) {
  return false;
  }

+ void BufferedCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
+ for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
+ insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
+ ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
+ }
+ }
+
  void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
  if (parse_chunk.size() == 0) {
  return;
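
The new SetNullUnionCols is the execution-side half of the feature: every union column that the current file does not provide is flipped to a constant NULL vector before the chunk is emitted. A minimal standalone sketch of the same idea in TypeScript (illustrative only; a row-major chunk stands in for DuckDB's vectorized DataChunk):

function setNullUnionCols(chunk: unknown[][], insertNullsIdx: number[]): void {
  // insertNullsIdx plays the role of insert_nulls_idx: the union columns absent
  // from the current file. DuckDB marks each as a single constant NULL vector;
  // here we simply null the slot in every row.
  for (const colIdx of insertNullsIdx) {
    for (const row of chunk) {
      row[colIdx] = null;
    }
  }
}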
@@ -77401,22 +77417,25 @@ void BufferedCSVReader::Flush(DataChunk &insert_chunk) {
  }
  }
  }
- insert_chunk.data[col_idx].Reference(parse_chunk.data[col_idx]);
+ insert_chunk.data[insert_cols_idx[col_idx]].Reference(parse_chunk.data[col_idx]);
  } else {
  string error_message;
  bool success;
  if (options.has_format[LogicalTypeId::DATE] && sql_types[col_idx].id() == LogicalTypeId::DATE) {
  // use the date format to cast the chunk
- success = TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[col_idx],
- parse_chunk.size(), error_message);
+ success =
+ TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_cols_idx[col_idx]],
+ parse_chunk.size(), error_message);
  } else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
  sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
  // use the timestamp format to cast the chunk
- success = TryCastTimestampVector(options, parse_chunk.data[col_idx], insert_chunk.data[col_idx],
- parse_chunk.size(), error_message);
+ success = TryCastTimestampVector(options, parse_chunk.data[col_idx],
+ insert_chunk.data[insert_cols_idx[col_idx]], parse_chunk.size(),
+ error_message);
  } else {
  // target type is not varchar: perform a cast
- success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx], insert_chunk.data[col_idx],
+ success = VectorOperations::DefaultTryCast(parse_chunk.data[col_idx],
+ insert_chunk.data[insert_cols_idx[col_idx]],
  parse_chunk.size(), &error_message);
  }
  if (success) {
@@ -118249,6 +118268,9 @@ struct ReadCSVData : public BaseCSVData {
  //! The initial reader (if any): this is used when automatic detection is used during binding.
  //! In this case, the CSV reader is already created and might as well be re-used.
  unique_ptr<BufferedCSVReader> initial_reader;
+ //! The union readers are created during binding (when the CSV union_by_name option is on).
+ //! These readers can be re-used during ReadCSVFunction.
+ vector<unique_ptr<BufferedCSVReader>> union_readers;
  };

  struct CSVCopyFunction {
@@ -119506,6 +119528,64 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
  result->sql_types = return_types;
  D_ASSERT(return_types.size() == names.size());
  }
+
+ // union_col_names will exclude filename and hivepartition
+ if (options.union_by_name) {
+ idx_t union_names_index = 0;
+ case_insensitive_map_t<idx_t> union_names_map;
+ vector<string> union_col_names;
+ vector<LogicalType> union_col_types;
+
+ for (idx_t file_idx = 0; file_idx < result->files.size(); ++file_idx) {
+ options.file_path = result->files[file_idx];
+ auto reader = make_unique<BufferedCSVReader>(context, options);
+ auto &col_names = reader->col_names;
+ auto &sql_types = reader->sql_types;
+ D_ASSERT(col_names.size() == sql_types.size());
+
+ for (idx_t col = 0; col < col_names.size(); ++col) {
+ auto union_find = union_names_map.find(col_names[col]);
+
+ if (union_find != union_names_map.end()) {
+ // given the same name, union_col's type must be compatible with col's type
+ LogicalType compatible_type;
+ compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
+ union_col_types[union_find->second] = compatible_type;
+ } else {
+ union_names_map[col_names[col]] = union_names_index;
+ union_names_index++;
+
+ union_col_names.emplace_back(col_names[col]);
+ union_col_types.emplace_back(sql_types[col]);
+ }
+ }
+ result->union_readers.push_back(move(reader));
+ }
+
+ for (auto &reader : result->union_readers) {
+ auto &col_names = reader->col_names;
+ vector<bool> is_null_cols(union_col_names.size(), true);
+
+ for (idx_t col = 0; col < col_names.size(); ++col) {
+ idx_t remap_col = union_names_map[col_names[col]];
+ reader->insert_cols_idx[col] = remap_col;
+ is_null_cols[remap_col] = false;
+ }
+ for (idx_t col = 0; col < union_col_names.size(); ++col) {
+ if (is_null_cols[col]) {
+ reader->insert_nulls_idx.push_back(col);
+ }
+ }
+ }
+
+ const idx_t first_file_index = 0;
+ result->initial_reader = move(result->union_readers[first_file_index]);
+
+ names.assign(union_col_names.begin(), union_col_names.end());
+ return_types.assign(union_col_types.begin(), union_col_types.end());
+ D_ASSERT(names.size() == return_types.size());
+ }
+
  if (result->options.include_file_name) {
  result->filename_col_idx = names.size();
  return_types.emplace_back(LogicalType::VARCHAR);
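
For orientation, this bind-time hunk does three things: it unions column names across all files (case-insensitively, via union_names_map), widens the type of a repeated name to a compatible one with LogicalType::MaxLogicalType, and computes per-file remap tables (insert_cols_idx, plus insert_nulls_idx for the union columns a file lacks). A standalone TypeScript sketch of that algorithm follows; the widening order is a deliberate simplification standing in for MaxLogicalType, not DuckDB's real rule:

type Col = { name: string; type: string };

// Simplified stand-in for LogicalType::MaxLogicalType (not DuckDB's real rule).
function maxLogicalType(a: string, b: string): string {
  const order = ["BOOLEAN", "BIGINT", "DOUBLE", "VARCHAR"];
  return order[Math.max(order.indexOf(a), order.indexOf(b))];
}

function unionByName(files: Col[][]) {
  const unionNamesMap = new Map<string, number>(); // case-insensitive, like case_insensitive_map_t
  const unionCols: Col[] = [];
  for (const cols of files) {
    for (const col of cols) {
      const key = col.name.toLowerCase();
      const found = unionNamesMap.get(key);
      if (found !== undefined) {
        // same name seen before: widen the union column's type
        unionCols[found].type = maxLogicalType(unionCols[found].type, col.type);
      } else {
        unionNamesMap.set(key, unionCols.length);
        unionCols.push({ ...col });
      }
    }
  }
  // per-file remapping: insertColsIdx says where each local column lands in the
  // union schema; insertNullsIdx lists union columns the file must NULL-fill
  const perFile = files.map((cols) => {
    const insertColsIdx = cols.map((c) => unionNamesMap.get(c.name.toLowerCase())!);
    const insertNullsIdx = unionCols
      .map((_, i) => i)
      .filter((i) => !insertColsIdx.includes(i));
    return { insertColsIdx, insertNullsIdx };
  });
  return { unionCols, perFile };
}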
@@ -119574,13 +119654,22 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p,
  // exhausted this file, but we have more files we can read
  // open the next file and increment the counter
  bind_data.options.file_path = bind_data.files[data.file_index];
- data.csv_reader = make_unique<BufferedCSVReader>(context, bind_data.options, data.csv_reader->sql_types);
+ // re-use the csv_readers that were created during binding
+ if (bind_data.options.union_by_name) {
+ data.csv_reader = move(bind_data.union_readers[data.file_index]);
+ } else {
+ data.csv_reader =
+ make_unique<BufferedCSVReader>(context, bind_data.options, data.csv_reader->sql_types);
+ }
  data.file_index++;
  } else {
  break;
  }
  } while (true);

+ if (bind_data.options.union_by_name) {
+ data.csv_reader->SetNullUnionCols(output);
+ }
  if (bind_data.options.include_file_name) {
  auto &col = output.data[bind_data.filename_col_idx];
  col.SetValue(0, Value(data.csv_reader->options.file_path));
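
At scan time the pre-built readers then replace per-file reader construction: when union_by_name is on, the next file's reader is moved out of union_readers (already remapped at bind time), and after each chunk the file's missing union columns are NULL-filled via SetNullUnionCols. A rough TypeScript sketch of that control flow, with invented names for illustration:

interface Reader { read(): unknown[][] | null; insertNullsIdx: number[]; }

// Invented scan-state shape; mirrors the reuse branch in the hunk above.
function nextReader(
  state: { fileIndex: number },
  bind: { unionByName: boolean; unionReaders: Reader[]; open: (i: number) => Reader }
): Reader {
  const reader = bind.unionByName
    ? bind.unionReaders[state.fileIndex] // re-use the reader created during binding
    : bind.open(state.fileIndex); // otherwise construct a fresh reader per file
  state.fileIndex++;
  return reader;
}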
@@ -119633,6 +119722,7 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
  table_function.named_parameters["max_line_size"] = LogicalType::VARCHAR;
  table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR;
  table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
+ table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
  }

  double CSVReaderProgress(ClientContext &context, const FunctionData *bind_data_p,
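
Net effect of these hunks: the CSV table functions gain a union_by_name named parameter (a BOOLEAN; the header change below defaults it to false). A usage sketch through the Node.js API this package wraps; the file names and column layouts are made up for illustration:

// Assumes TypeScript with esModuleInterop; plain Node.js can use
// const duckdb = require("duckdb") instead.
import duckdb from "duckdb";

const db = new duckdb.Database(":memory:");
// Suppose data/a.csv has columns (id, name) and data/b.csv has (id, city):
// with union_by_name=true the result schema is (id, name, city), and each
// file's absent column comes back as NULL.
db.all(
  "SELECT * FROM read_csv_auto('data/*.csv', union_by_name=true)",
  (err: Error | null, rows: unknown[]) => {
    if (err) throw err;
    console.log(rows);
  }
);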
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
  #pragma once
  #define DUCKDB_AMALGAMATION 1
  #define DUCKDB_AMALGAMATION_EXTENDED 1
- #define DUCKDB_SOURCE_ID "97c2f4e7e"
- #define DUCKDB_VERSION "v0.5.2-dev523"
+ #define DUCKDB_SOURCE_ID "83aff54f1"
+ #define DUCKDB_VERSION "v0.5.2-dev547"
  //===----------------------------------------------------------------------===//
  // DuckDB
  //
@@ -27165,6 +27165,8 @@ struct BufferedCSVReaderOptions {
  bool include_file_name = false;
  //! Whether or not to include parsed hive partition columns
  bool include_parsed_hive_partitions = false;
+ //! Whether or not to union files with different (but compatible) columns
+ bool union_by_name = false;

  //===--------------------------------------------------------------------===//
  // WriteCSVOptions
@@ -27224,6 +27226,12 @@ public:
  BufferedCSVReaderOptions options;
  vector<LogicalType> sql_types;
  vector<string> col_names;
+
+ //! remap parse_chunk cols to insert_chunk cols, because when the
+ //! union_by_name option is on, insert_chunk may have more cols
+ vector<idx_t> insert_cols_idx;
+ vector<idx_t> insert_nulls_idx;
+
  unique_ptr<CSVFileHandle> file_handle;

  unique_ptr<char[]> buffer;
@@ -27258,11 +27266,16 @@ public:

  idx_t GetFileSize();

+ //! Fill nulls into the cols that mismatch the union names
+ void SetNullUnionCols(DataChunk &insert_chunk);
+
  private:
  //! Initialize Parser
  void Initialize(const vector<LogicalType> &requested_types);
  //! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
  void InitParseChunk(idx_t num_cols);
+ //! Initializes the insert_chunk idx for mapping parse_chunk cols to insert_chunk cols
+ void InitInsertChunkIdx(idx_t num_cols);
  //! Initializes the TextSearchShiftArrays for complex parser
  void PrepareComplexParser();
  //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.