duckdb 0.6.2-dev1124.0 → 0.6.2-dev1160.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.6.2-dev1124.0",
5
+ "version": "0.6.2-dev1160.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -1723,6 +1723,38 @@ void ListColumnWriter::FinalizeAnalyze(ColumnWriterState &state_p) {
1723
1723
  child_writer->FinalizeAnalyze(*state.child_state);
1724
1724
  }
1725
1725
 
1726
+ idx_t GetConsecutiveChildList(Vector &list, idx_t count, Vector &result) {
1727
+ auto list_data = FlatVector::GetData<list_entry_t>(list);
1728
+ auto &validity = FlatVector::Validity(list);
1729
+ bool consecutive_flat_list = true;
1730
+ idx_t child_count = 0;
1731
+ for (idx_t i = 0; i < count; i++) {
1732
+ if (!validity.RowIsValid(i)) {
1733
+ continue;
1734
+ }
1735
+ if (list_data[i].offset != child_count) {
1736
+ consecutive_flat_list = false;
1737
+ }
1738
+ child_count += list_data[i].length;
1739
+ }
1740
+ if (!consecutive_flat_list) {
1741
+ SelectionVector child_sel(child_count);
1742
+ idx_t entry = 0;
1743
+ for (idx_t i = 0; i < count; i++) {
1744
+ if (!validity.RowIsValid(i)) {
1745
+ continue;
1746
+ }
1747
+ for (idx_t k = 0; k < list_data[i].length; k++) {
1748
+ child_sel.set_index(entry++, list_data[i].offset + k);
1749
+ }
1750
+ }
1751
+
1752
+ result.Slice(child_sel, child_count);
1753
+ result.Flatten(child_count);
1754
+ }
1755
+ return child_count;
1756
+ }
1757
+
1726
1758
  void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *parent, Vector &vector, idx_t count) {
1727
1759
  auto &state = (ListColumnWriterState &)state_p;
1728
1760
 
@@ -1775,8 +1807,9 @@ void ListColumnWriter::Prepare(ColumnWriterState &state_p, ColumnWriterState *pa
1775
1807
  state.parent_index += vcount;
1776
1808
 
1777
1809
  auto &list_child = ListVector::GetEntry(vector);
1778
- auto list_count = ListVector::GetListSize(vector);
1779
- child_writer->Prepare(*state.child_state, &state_p, list_child, list_count);
1810
+ Vector child_list(list_child);
1811
+ idx_t child_count = GetConsecutiveChildList(vector, count, child_list);
1812
+ child_writer->Prepare(*state.child_state, &state_p, child_list, child_count);
1780
1813
  }
1781
1814
 
1782
1815
  void ListColumnWriter::BeginWrite(ColumnWriterState &state_p) {
@@ -1788,8 +1821,9 @@ void ListColumnWriter::Write(ColumnWriterState &state_p, Vector &vector, idx_t c
1788
1821
  auto &state = (ListColumnWriterState &)state_p;
1789
1822
 
1790
1823
  auto &list_child = ListVector::GetEntry(vector);
1791
- auto list_count = ListVector::GetListSize(vector);
1792
- child_writer->Write(*state.child_state, list_child, list_count);
1824
+ Vector child_list(list_child);
1825
+ idx_t child_count = GetConsecutiveChildList(vector, count, child_list);
1826
+ child_writer->Write(*state.child_state, child_list, child_count);
1793
1827
  }
1794
1828
 
1795
1829
  void ListColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
@@ -73,6 +73,7 @@ struct ParquetOptions {
73
73
  bool filename = false;
74
74
  bool file_row_number = false;
75
75
  bool hive_partitioning = false;
76
+ bool union_by_name = false;
76
77
 
77
78
  public:
78
79
  void Serialize(FieldWriter &writer) const;
@@ -109,6 +110,17 @@ public:
109
110
  shared_ptr<ParquetFileMetadataCache> metadata;
110
111
  ParquetOptions parquet_options;
111
112
 
113
+ //! when reading multiple parquet files (with union by name option)
114
+ //! TableFunction might return more cols than any single parquet file. Even if all parquet files have the same
115
+ //! cols, those files might have cols at different positions and with different logical types.
116
+ //! e.g. p1.parquet (a INT , b VARCHAR) p2.parquet (c VARCHAR, a VARCHAR)
117
+ vector<idx_t> union_idx_map;
118
+ //! If the parquet file does not have union column 5, union_null_cols[5] will be true.
119
+ //! some parquet files may not have all union cols.
120
+ vector<bool> union_null_cols;
121
+ //! All union cols will be cast to the same type.
122
+ vector<LogicalType> union_col_types;
123
+
112
124
  public:
113
125
  void InitializeScan(ParquetReaderScanState &state, vector<column_t> column_ids, vector<idx_t> groups_to_read,
114
126
  TableFilterSet *table_filters);
@@ -139,6 +151,7 @@ private:
139
151
  uint64_t GetGroupSpan(ParquetReaderScanState &state);
140
152
  void PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t out_col_idx);
141
153
  LogicalType DeriveLogicalType(const SchemaElement &s_ele);
154
+ void RearrangeChildReaders(unique_ptr<duckdb::ColumnReader> &root_reader, vector<column_t> &column_ids);
142
155
 
143
156
  template <typename... Args>
144
157
  std::runtime_error FormatException(const string fmt_str, Args... params) {
@@ -20,6 +20,7 @@
20
20
  #include "duckdb/common/field_writer.hpp"
21
21
  #include "duckdb/common/file_system.hpp"
22
22
  #include "duckdb/common/hive_partitioning.hpp"
23
+ #include "duckdb/common/union_by_name.hpp"
23
24
  #include "duckdb/common/types/chunk_collection.hpp"
24
25
  #include "duckdb/function/copy_function.hpp"
25
26
  #include "duckdb/function/table_function.hpp"
@@ -45,6 +46,10 @@ struct ParquetReadBindData : public TableFunctionData {
45
46
  vector<string> names;
46
47
  vector<LogicalType> types;
47
48
 
49
+ // The union readers are created (when parquet union_by_name option is on) during binding
50
+ // Those readers can be re-used during ParquetParallelStateNext
51
+ vector<shared_ptr<ParquetReader>> union_readers;
52
+
48
53
  // These come from the initial_reader, but need to be stored in case the initial_reader is removed by a filter
49
54
  idx_t initial_file_cardinality;
50
55
  idx_t initial_file_row_groups;
@@ -127,6 +132,7 @@ void ParquetOptions::Serialize(FieldWriter &writer) const {
127
132
  writer.WriteField<bool>(filename);
128
133
  writer.WriteField<bool>(file_row_number);
129
134
  writer.WriteField<bool>(hive_partitioning);
135
+ writer.WriteField<bool>(union_by_name);
130
136
  }
131
137
 
132
138
  void ParquetOptions::Deserialize(FieldReader &reader) {
@@ -134,6 +140,7 @@ void ParquetOptions::Deserialize(FieldReader &reader) {
134
140
  filename = reader.ReadRequired<bool>();
135
141
  file_row_number = reader.ReadRequired<bool>();
136
142
  hive_partitioning = reader.ReadRequired<bool>();
143
+ union_by_name = reader.ReadRequired<bool>();
137
144
  }
138
145
 
139
146
  BindInfo ParquetGetBatchInfo(const FunctionData *bind_data) {
@@ -148,6 +155,7 @@ BindInfo ParquetGetBatchInfo(const FunctionData *bind_data) {
148
155
  bind_info.InsertOption("filename", Value::BOOLEAN(parquet_bind->parquet_options.filename));
149
156
  bind_info.InsertOption("file_row_number", Value::BOOLEAN(parquet_bind->parquet_options.file_row_number));
150
157
  bind_info.InsertOption("hive_partitioning", Value::BOOLEAN(parquet_bind->parquet_options.hive_partitioning));
158
+ bind_info.InsertOption("union_by_name", Value::BOOLEAN(parquet_bind->parquet_options.union_by_name));
151
159
  return bind_info;
152
160
  }
153
161
 
@@ -164,6 +172,7 @@ public:
164
172
  table_function.named_parameters["filename"] = LogicalType::BOOLEAN;
165
173
  table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN;
166
174
  table_function.named_parameters["hive_partitioning"] = LogicalType::BOOLEAN;
175
+ table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
167
176
  table_function.get_batch_index = ParquetScanGetBatchIndex;
168
177
  table_function.serialize = ParquetScanSerialize;
169
178
  table_function.deserialize = ParquetScanDeserialize;
@@ -180,6 +189,7 @@ public:
180
189
  table_function.named_parameters["filename"] = LogicalType::BOOLEAN;
181
190
  table_function.named_parameters["file_row_number"] = LogicalType::BOOLEAN;
182
191
  table_function.named_parameters["hive_partitioning"] = LogicalType::BOOLEAN;
192
+ table_function.named_parameters["union_by_name"] = LogicalType::BOOLEAN;
183
193
  set.AddFunction(table_function);
184
194
  return set;
185
195
  }
@@ -201,22 +211,31 @@ public:
201
211
  parquet_options.file_row_number = true;
202
212
  } else if (loption == "hive_partitioning") {
203
213
  parquet_options.hive_partitioning = true;
214
+ } else if (loption == "union_by_name") {
215
+ parquet_options.union_by_name = true;
204
216
  } else {
205
217
  throw NotImplementedException("Unsupported option for COPY FROM parquet: %s", option.first);
206
218
  }
207
219
  }
208
- auto result = make_unique<ParquetReadBindData>();
209
220
 
210
221
  FileSystem &fs = FileSystem::GetFileSystem(context);
211
- result->files = fs.Glob(info.file_path, context);
212
- if (result->files.empty()) {
222
+ auto files = fs.Glob(info.file_path, context);
223
+ if (files.empty()) {
213
224
  throw IOException("No files found that match the pattern \"%s\"", info.file_path);
214
225
  }
215
- result->SetInitialReader(
216
- make_shared<ParquetReader>(context, result->files[0], expected_types, parquet_options));
217
- result->names = result->initial_reader->names;
218
- result->types = result->initial_reader->return_types;
219
- return std::move(result);
226
+
227
+ // The most likely path (Parquet read without union by name option)
228
+ if (!parquet_options.union_by_name) {
229
+ auto result = make_unique<ParquetReadBindData>();
230
+ result->files = std::move(files);
231
+ result->SetInitialReader(
232
+ make_shared<ParquetReader>(context, result->files[0], expected_types, parquet_options));
233
+ result->names = result->initial_reader->names;
234
+ result->types = result->initial_reader->return_types;
235
+ return std::move(result);
236
+ } else {
237
+ return ParquetUnionNamesBind(context, files, expected_types, expected_names, parquet_options);
238
+ }
220
239
  }
221
240
 
222
241
  static unique_ptr<BaseStatistics> ParquetScanStats(ClientContext &context, const FunctionData *bind_data_p,
@@ -303,11 +322,40 @@ public:
303
322
  vector<LogicalType> &return_types, vector<string> &names,
304
323
  ParquetOptions parquet_options) {
305
324
  auto result = make_unique<ParquetReadBindData>();
325
+
326
+ // The most likely path (Parquet Scan without union by name option)
327
+ if (!parquet_options.union_by_name) {
328
+ result->files = std::move(files);
329
+ result->SetInitialReader(make_shared<ParquetReader>(context, result->files[0], parquet_options));
330
+ return_types = result->types = result->initial_reader->return_types;
331
+ names = result->names = result->initial_reader->names;
332
+ return std::move(result);
333
+ } else {
334
+ return ParquetUnionNamesBind(context, files, return_types, names, parquet_options);
335
+ }
336
+ }
337
+
338
+ static unique_ptr<FunctionData> ParquetUnionNamesBind(ClientContext &context, vector<string> files,
339
+ vector<LogicalType> &return_types, vector<string> &names,
340
+ ParquetOptions parquet_options) {
341
+ auto result = make_unique<ParquetReadBindData>();
306
342
  result->files = std::move(files);
307
343
 
308
- result->SetInitialReader(make_shared<ParquetReader>(context, result->files[0], parquet_options));
309
- return_types = result->types = result->initial_reader->return_types;
310
- names = result->names = result->initial_reader->names;
344
+ case_insensitive_map_t<idx_t> union_names_map;
345
+ vector<string> union_col_names;
346
+ vector<LogicalType> union_col_types;
347
+ auto dummy_readers = UnionByName<ParquetReader, ParquetOptions>::UnionCols(
348
+ context, result->files, union_col_types, union_col_names, union_names_map, parquet_options);
349
+
350
+ dummy_readers = UnionByName<ParquetReader, ParquetOptions>::CreateUnionMap(
351
+ std::move(dummy_readers), union_col_types, union_col_names, union_names_map);
352
+
353
+ std::move(dummy_readers.begin(), dummy_readers.end(), std::back_inserter(result->union_readers));
354
+ names.assign(union_col_names.begin(), union_col_names.end());
355
+ return_types.assign(union_col_types.begin(), union_col_types.end());
356
+ result->SetInitialReader(result->union_readers[0]);
357
+ D_ASSERT(names.size() == return_types.size());
358
+
311
359
  return std::move(result);
312
360
  }
313
361
 
@@ -337,6 +385,8 @@ public:
337
385
  parquet_options.file_row_number = BooleanValue::Get(kv.second);
338
386
  } else if (loption == "hive_partitioning") {
339
387
  parquet_options.hive_partitioning = BooleanValue::Get(kv.second);
388
+ } else if (loption == "union_by_name") {
389
+ parquet_options.union_by_name = BooleanValue::Get(kv.second);
340
390
  }
341
391
  }
342
392
  FileSystem &fs = FileSystem::GetFileSystem(context);
@@ -370,6 +420,8 @@ public:
370
420
  parquet_options.file_row_number = BooleanValue::Get(kv.second);
371
421
  } else if (loption == "hive_partitioning") {
372
422
  parquet_options.hive_partitioning = BooleanValue::Get(kv.second);
423
+ } else if (loption == "union_by_name") {
424
+ parquet_options.union_by_name = true;
373
425
  }
374
426
  }
375
427
  return ParquetScanBindInternal(context, std::move(files), return_types, names, parquet_options);
@@ -417,20 +469,24 @@ public:
417
469
 
418
470
  result->file_opening = std::vector<bool>(bind_data.files.size(), false);
419
471
  result->file_mutexes = std::unique_ptr<mutex[]>(new mutex[bind_data.files.size()]);
420
- result->readers = std::vector<shared_ptr<ParquetReader>>(bind_data.files.size(), nullptr);
421
-
422
- if (bind_data.initial_reader) {
423
- result->initial_reader = bind_data.initial_reader;
424
- result->readers[0] = bind_data.initial_reader;
425
- } else {
426
- if (bind_data.files.empty()) {
427
- result->initial_reader = nullptr;
472
+ if (!bind_data.parquet_options.union_by_name) {
473
+ result->readers = std::vector<shared_ptr<ParquetReader>>(bind_data.files.size(), nullptr);
474
+ if (bind_data.initial_reader) {
475
+ result->initial_reader = bind_data.initial_reader;
476
+ result->readers[0] = bind_data.initial_reader;
428
477
  } else {
429
- result->initial_reader =
430
- make_shared<ParquetReader>(context, bind_data.files[0], bind_data.names, bind_data.types,
431
- input.column_ids, bind_data.parquet_options, bind_data.files[0]);
432
- result->readers[0] = result->initial_reader;
478
+ if (bind_data.files.empty()) {
479
+ result->initial_reader = nullptr;
480
+ } else {
481
+ result->initial_reader =
482
+ make_shared<ParquetReader>(context, bind_data.files[0], bind_data.names, bind_data.types,
483
+ input.column_ids, bind_data.parquet_options, bind_data.files[0]);
484
+ result->readers[0] = result->initial_reader;
485
+ }
433
486
  }
487
+ } else {
488
+ result->readers = std::move(bind_data.union_readers);
489
+ result->initial_reader = result->readers[0];
434
490
  }
435
491
 
436
492
  result->row_group_index = 0;
@@ -497,6 +553,9 @@ public:
497
553
 
498
554
  bind_data.chunk_count++;
499
555
  if (output.size() > 0) {
556
+ if (bind_data.parquet_options.union_by_name) {
557
+ UnionByName<ParquetReader, ParquetOptions>::SetNullUnionCols(output, data.reader->union_null_cols);
558
+ }
500
559
  return;
501
560
  }
502
561
  if (!ParquetParallelStateNext(context, bind_data, data, gstate)) {
@@ -533,6 +592,12 @@ public:
533
592
  D_ASSERT(parallel_state.initial_reader);
534
593
 
535
594
  if (parallel_state.readers[parallel_state.file_index]) {
595
+ const auto &current_reader = parallel_state.readers[parallel_state.file_index];
596
+ if (current_reader->union_null_cols.empty()) {
597
+ current_reader->union_null_cols.resize(current_reader->return_types.size());
598
+ std::fill(current_reader->union_null_cols.begin(), current_reader->union_null_cols.end(), false);
599
+ }
600
+
536
601
  if (parallel_state.row_group_index <
537
602
  parallel_state.readers[parallel_state.file_index]->NumRowGroups()) {
538
603
  // The current reader has rowgroups left to be scanned
@@ -352,7 +352,6 @@ unique_ptr<ColumnReader> ParquetReader::CreateReader(const duckdb_parquet::forma
352
352
  D_ASSERT(file_meta_data->row_groups.empty() || next_file_idx == file_meta_data->row_groups[0].columns.size());
353
353
 
354
354
  auto &root_struct_reader = (StructColumnReader &)*ret;
355
-
356
355
  // add casts if required
357
356
  for (auto &entry : cast_map) {
358
357
  auto column_idx = entry.first;
@@ -700,11 +699,35 @@ void ParquetReader::InitializeScan(ParquetReaderScanState &state, vector<column_
700
699
 
701
700
  state.thrift_file_proto = CreateThriftProtocol(allocator, *state.file_handle, *file_opener, state.prefetch_mode);
702
701
  state.root_reader = CreateReader(GetFileMetadata());
702
+ if (parquet_options.union_by_name) {
703
+ RearrangeChildReaders(state.root_reader, state.column_ids);
704
+ }
703
705
 
704
706
  state.define_buf.resize(allocator, STANDARD_VECTOR_SIZE);
705
707
  state.repeat_buf.resize(allocator, STANDARD_VECTOR_SIZE);
706
708
  }
707
709
 
710
+ void ParquetReader::RearrangeChildReaders(unique_ptr<duckdb::ColumnReader> &root_reader, vector<column_t> &column_ids) {
711
+ auto &root_struct_reader = (StructColumnReader &)*root_reader;
712
+ unordered_map<idx_t, idx_t> reverse_union_idx;
713
+
714
+ for (idx_t col = 0; col < union_idx_map.size(); ++col) {
715
+ auto child_reader = move(root_struct_reader.child_readers[col]);
716
+ auto cast_reader = make_unique<CastColumnReader>(move(child_reader), union_col_types[union_idx_map[col]]);
717
+ root_struct_reader.child_readers[col] = move(cast_reader);
718
+ reverse_union_idx[union_idx_map[col]] = col;
719
+ }
720
+
721
+ vector<bool> column_id_nulls(column_ids.size(), true);
722
+ for (idx_t col = 0; col < column_ids.size(); ++col) {
723
+ auto find = reverse_union_idx.find(column_ids[col]);
724
+ if (find != reverse_union_idx.end()) {
725
+ column_ids[col] = find->second;
726
+ column_id_nulls[col] = false;
727
+ }
728
+ }
729
+ union_null_cols = move(column_id_nulls);
730
+ }
708
731
  void FilterIsNull(Vector &v, parquet_filter_t &filter_mask, idx_t count) {
709
732
  if (v.GetVectorType() == VectorType::CONSTANT_VECTOR) {
710
733
  auto &mask = ConstantVector::Validity(v);
@@ -898,6 +921,8 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
898
921
  return false;
899
922
  }
900
923
 
924
+ D_ASSERT(union_null_cols.size() >= result.ColumnCount());
925
+
901
926
  // see if we have to switch to the next row group in the parquet file
902
927
  if (state.current_group < 0 || (int64_t)state.group_offset >= GetGroup(state).num_rows) {
903
928
  state.current_group++;
@@ -915,7 +940,7 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
915
940
  uint64_t to_scan_compressed_bytes = 0;
916
941
  for (idx_t out_col_idx = 0; out_col_idx < result.ColumnCount(); out_col_idx++) {
917
942
  // this is a special case where we are not interested in the actual contents of the file
918
- if (IsRowIdColumnId(state.column_ids[out_col_idx])) {
943
+ if (IsRowIdColumnId(state.column_ids[out_col_idx]) || union_null_cols[out_col_idx]) {
919
944
  continue;
920
945
  }
921
946
 
@@ -956,7 +981,7 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
956
981
  // Prefetch column-wise
957
982
  for (idx_t out_col_idx = 0; out_col_idx < result.ColumnCount(); out_col_idx++) {
958
983
 
959
- if (IsRowIdColumnId(state.column_ids[out_col_idx])) {
984
+ if (IsRowIdColumnId(state.column_ids[out_col_idx]) || union_null_cols[out_col_idx]) {
960
985
  continue;
961
986
  }
962
987
 
@@ -1007,6 +1032,10 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
1007
1032
  if (state.filters) {
1008
1033
  vector<bool> need_to_read(result.ColumnCount(), true);
1009
1034
 
1035
+ for (idx_t col = 0; col < need_to_read.size(); ++col) {
1036
+ need_to_read[col] = need_to_read[col] && !union_null_cols[col];
1037
+ }
1038
+
1010
1039
  // first load the columns that are used in filters
1011
1040
  for (auto &filter_col : state.filters->filters) {
1012
1041
  auto file_col_idx = state.column_ids[filter_col.first];
@@ -1058,6 +1087,9 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
1058
1087
  result.data[out_col_idx].Reference(constant_42);
1059
1088
  continue;
1060
1089
  }
1090
+ if (union_null_cols[out_col_idx]) {
1091
+ continue;
1092
+ }
1061
1093
 
1062
1094
  auto rows_read = root_reader->GetChildReader(file_col_idx)
1063
1095
  ->Read(result.size(), filter_mask, define_ptr, repeat_ptr, result.data[out_col_idx]);
@@ -160,7 +160,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
160
160
  } else {
161
161
  row_empty = false;
162
162
  }
163
- if (!sql_types.empty() && column == sql_types.size() && length == 0) {
163
+ if (!return_types.empty() && column == return_types.size() && length == 0) {
164
164
  // skip a single trailing delimiter in last column
165
165
  return;
166
166
  }
@@ -168,14 +168,14 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
168
168
  column++;
169
169
  return;
170
170
  }
171
- if (column >= sql_types.size()) {
171
+ if (column >= return_types.size()) {
172
172
  if (options.ignore_errors) {
173
173
  error_column_overflow = true;
174
174
  return;
175
175
  } else {
176
176
  throw InvalidInputException(
177
177
  "Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
178
- GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), options.ToString());
178
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(), options.ToString());
179
179
  }
180
180
  }
181
181
 
@@ -183,7 +183,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
183
183
  idx_t row_entry = parse_chunk.size();
184
184
 
185
185
  // test against null string, but only if the value was not quoted
186
- if ((!has_quotes || sql_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
186
+ if ((!has_quotes || return_types[column].id() != LogicalTypeId::VARCHAR) && !options.force_not_null[column] &&
187
187
  Equals::Operation(str_val, string_t(options.null_str))) {
188
188
  FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
189
189
  } else {
@@ -221,7 +221,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
221
221
 
222
222
  if (row_empty) {
223
223
  row_empty = false;
224
- if (sql_types.size() != 1) {
224
+ if (return_types.size() != 1) {
225
225
  if (mode == ParserMode::PARSING) {
226
226
  FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false);
227
227
  }
@@ -238,7 +238,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
238
238
  return false;
239
239
  }
240
240
 
241
- if (column < sql_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
241
+ if (column < return_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
242
242
  if (options.ignore_errors) {
243
243
  column = 0;
244
244
  return false;
@@ -249,7 +249,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
249
249
  } else {
250
250
  throw InvalidInputException(
251
251
  "Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s",
252
- options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), sql_types.size(), column,
252
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), return_types.size(), column,
253
253
  options.ToString());
254
254
  }
255
255
  }
@@ -282,13 +282,6 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
282
282
  return false;
283
283
  }
284
284
 
285
- void BaseCSVReader::SetNullUnionCols(DataChunk &insert_chunk) {
286
- for (idx_t col = 0; col < insert_nulls_idx.size(); ++col) {
287
- insert_chunk.data[insert_nulls_idx[col]].SetVectorType(VectorType::CONSTANT_VECTOR);
288
- ConstantVector::SetNull(insert_chunk.data[insert_nulls_idx[col]], true);
289
- }
290
- }
291
-
292
285
  void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) {
293
286
  D_ASSERT(col_idx < chunk.data.size());
294
287
  D_ASSERT(row_idx < chunk.size());
@@ -302,8 +295,8 @@ void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, i
302
295
  auto utf_type = Utf8Proc::Analyze(s.GetDataUnsafe(), s.GetSize());
303
296
  if (utf_type == UnicodeType::INVALID) {
304
297
  string col_name = to_string(col_idx);
305
- if (col_idx < col_names.size()) {
306
- col_name = "\"" + col_names[col_idx] + "\"";
298
+ if (col_idx < names.size()) {
299
+ col_name = "\"" + names[col_idx] + "\"";
307
300
  }
308
301
  int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
309
302
  D_ASSERT(error_line >= 0);
@@ -330,9 +323,9 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
330
323
 
331
324
  // convert the columns in the parsed chunk to the types of the table
332
325
  insert_chunk.SetCardinality(parse_chunk);
333
- for (idx_t col_idx = 0; col_idx < sql_types.size(); col_idx++) {
326
+ for (idx_t col_idx = 0; col_idx < return_types.size(); col_idx++) {
334
327
  auto insert_idx = insert_cols_idx[col_idx];
335
- auto &type = sql_types[col_idx];
328
+ auto &type = return_types[col_idx];
336
329
  if (type.id() == LogicalTypeId::VARCHAR) {
337
330
  // target type is varchar: no need to convert
338
331
  // just test that all strings are valid utf-8 strings
@@ -345,7 +338,8 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
345
338
  // use the date format to cast the chunk
346
339
  success = TryCastDateVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
347
340
  parse_chunk.size(), error_message);
348
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) {
341
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
342
+ return_types[col_idx].id() == LogicalTypeId::TIMESTAMP) {
349
343
  // use the date format to cast the chunk
350
344
  success = TryCastTimestampVector(options, parse_chunk.data[col_idx], insert_chunk.data[insert_idx],
351
345
  parse_chunk.size(), error_message);
@@ -365,8 +359,8 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
365
359
  continue;
366
360
  }
367
361
  string col_name = to_string(col_idx);
368
- if (col_idx < col_names.size()) {
369
- col_name = "\"" + col_names[col_idx] + "\"";
362
+ if (col_idx < names.size()) {
363
+ col_name = "\"" + names[col_idx] + "\"";
370
364
  }
371
365
 
372
366
  // figure out the exact line number
@@ -401,7 +395,7 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, bool try_add_line) {
401
395
 
402
396
  for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
403
397
  bool failed = false;
404
- for (idx_t column_idx = 0; column_idx < sql_types.size(); column_idx++) {
398
+ for (idx_t column_idx = 0; column_idx < return_types.size(); column_idx++) {
405
399
 
406
400
  auto &inserted_column = insert_chunk.data[column_idx];
407
401
  auto &parsed_column = parse_chunk.data[column_idx];
@@ -37,6 +37,16 @@ BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOp
37
37
  std::move(options_p), requested_types) {
38
38
  }
39
39
 
40
+ BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options_p,
41
+ const vector<LogicalType> &requested_types)
42
+ : BaseCSVReader(FileSystem::GetFileSystem(context), Allocator::Get(context), FileSystem::GetFileOpener(context),
43
+ move(options_p), requested_types),
44
+ buffer_size(0), position(0), start(0) {
45
+ options.file_path = move(filename);
46
+ file_handle = OpenCSV(options);
47
+ Initialize(requested_types);
48
+ }
49
+
40
50
  BufferedCSVReader::~BufferedCSVReader() {
41
51
  }
42
52
 
@@ -236,20 +246,20 @@ static string NormalizeColumnName(const string &col_name) {
236
246
  void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
237
247
  PrepareComplexParser();
238
248
  if (options.auto_detect) {
239
- sql_types = SniffCSV(requested_types);
240
- if (sql_types.empty()) {
249
+ return_types = SniffCSV(requested_types);
250
+ if (return_types.empty()) {
241
251
  throw Exception("Failed to detect column types from CSV: is the file a valid CSV file?");
242
252
  }
243
253
  if (cached_chunks.empty()) {
244
254
  JumpToBeginning(options.skip_rows, options.header);
245
255
  }
246
256
  } else {
247
- sql_types = requested_types;
257
+ return_types = requested_types;
248
258
  ResetBuffer();
249
259
  SkipRowsAndReadHeader(options.skip_rows, options.header);
250
260
  }
251
- InitParseChunk(sql_types.size());
252
- InitInsertChunkIdx(sql_types.size());
261
+ InitParseChunk(return_types.size());
262
+ InitInsertChunkIdx(return_types.size());
253
263
  // we only need reset support during the automatic CSV type detection
254
264
  // since reset support might require caching (in the case of streams), we disable it for the remainder
255
265
  file_handle->DisableReset();
@@ -297,7 +307,7 @@ void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header)
297
307
 
298
308
  if (skip_header) {
299
309
  // ignore the first line as a header line
300
- InitParseChunk(sql_types.size());
310
+ InitParseChunk(return_types.size());
301
311
  ParseCSV(ParserMode::PARSING_HEADER);
302
312
  }
303
313
  }
@@ -520,14 +530,14 @@ void BufferedCSVReader::DetectCandidateTypes(const vector<LogicalType> &type_can
520
530
  format_candidates[t.first].clear();
521
531
  }
522
532
 
523
- // set all sql_types to VARCHAR so we can do datatype detection based on VARCHAR values
524
- sql_types.clear();
525
- sql_types.assign(options.num_cols, LogicalType::VARCHAR);
533
+ // set all return_types to VARCHAR so we can do datatype detection based on VARCHAR values
534
+ return_types.clear();
535
+ return_types.assign(options.num_cols, LogicalType::VARCHAR);
526
536
 
527
537
  // jump to beginning and skip potential header
528
538
  JumpToBeginning(options.skip_rows, true);
529
539
  DataChunk header_row;
530
- header_row.Initialize(allocator, sql_types);
540
+ header_row.Initialize(allocator, return_types);
531
541
  parse_chunk.Copy(header_row);
532
542
 
533
543
  if (header_row.size() == 0) {
@@ -535,7 +545,7 @@ void BufferedCSVReader::DetectCandidateTypes(const vector<LogicalType> &type_can
535
545
  }
536
546
 
537
547
  // init parse chunk and read csv with info candidate
538
- InitParseChunk(sql_types.size());
548
+ InitParseChunk(return_types.size());
539
549
  if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) {
540
550
  continue;
541
551
  }
@@ -713,7 +723,7 @@ void BufferedCSVReader::DetectHeader(const vector<vector<LogicalType>> &best_sql
713
723
  col_name = col_name + "_" + to_string(name_collision_count[col_name]);
714
724
  }
715
725
 
716
- col_names.push_back(col_name);
726
+ names.push_back(col_name);
717
727
  name_collision_count[col_name] = 0;
718
728
  }
719
729
 
@@ -721,7 +731,7 @@ void BufferedCSVReader::DetectHeader(const vector<vector<LogicalType>> &best_sql
721
731
  options.header = false;
722
732
  for (idx_t col = 0; col < options.num_cols; col++) {
723
733
  string column_name = GenerateColumnName(options.num_cols, col);
724
- col_names.push_back(column_name);
734
+ names.push_back(column_name);
725
735
  }
726
736
  }
727
737
  }
@@ -731,8 +741,8 @@ vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalT
731
741
  vector<vector<LogicalType>> &best_sql_types_candidates,
732
742
  map<LogicalTypeId, vector<string>> &best_format_candidates) {
733
743
  // for the type refine we set the SQL types to VARCHAR for all columns
734
- sql_types.clear();
735
- sql_types.assign(options.num_cols, LogicalType::VARCHAR);
744
+ return_types.clear();
745
+ return_types.assign(options.num_cols, LogicalType::VARCHAR);
736
746
 
737
747
  vector<LogicalType> detected_types;
738
748
 
@@ -747,11 +757,11 @@ vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalT
747
757
  }
748
758
  } else if (options.all_varchar) {
749
759
  // return all types varchar
750
- detected_types = sql_types;
760
+ detected_types = return_types;
751
761
  } else {
752
762
  // jump through the rest of the file and continue to refine the sql type guess
753
763
  while (JumpToNextSample()) {
754
- InitParseChunk(sql_types.size());
764
+ InitParseChunk(return_types.size());
755
765
  // if jump ends up a bad line, we just skip this chunk
756
766
  if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) {
757
767
  continue;
@@ -878,11 +888,11 @@ vector<LogicalType> BufferedCSVReader::SniffCSV(const vector<LogicalType> &reque
878
888
  options.num_cols = best_num_cols;
879
889
  DetectHeader(best_sql_types_candidates, best_header_row);
880
890
  auto sql_types_per_column = options.sql_types_per_column;
881
- for (idx_t i = 0; i < col_names.size(); i++) {
882
- auto it = sql_types_per_column.find(col_names[i]);
891
+ for (idx_t i = 0; i < names.size(); i++) {
892
+ auto it = sql_types_per_column.find(names[i]);
883
893
  if (it != sql_types_per_column.end()) {
884
894
  best_sql_types_candidates[i] = {it->second};
885
- sql_types_per_column.erase(col_names[i]);
895
+ sql_types_per_column.erase(names[i]);
886
896
  }
887
897
  }
888
898
  if (!sql_types_per_column.empty()) {
@@ -38,9 +38,9 @@ ParallelCSVReader::~ParallelCSVReader() {
38
38
  }
39
39
 
40
40
  void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
41
- sql_types = requested_types;
42
- InitParseChunk(sql_types.size());
43
- InitInsertChunkIdx(sql_types.size());
41
+ return_types = requested_types;
42
+ InitParseChunk(return_types.size());
43
+ InitInsertChunkIdx(return_types.size());
44
44
  }
45
45
 
46
46
  bool ParallelCSVReader::SetPosition(DataChunk &insert_chunk) {
@@ -360,7 +360,7 @@ final_state : {
360
360
  // remaining values to be added to the chunk
361
361
  AddValue(buffer->GetValue(start_buffer, position_buffer, offset), column, escape_positions, has_quotes);
362
362
  if (try_add_line) {
363
- bool success = column == sql_types.size();
363
+ bool success = column == return_types.size();
364
364
  if (success) {
365
365
  AddRow(insert_chunk, column, error_message);
366
366
  success = Flush(insert_chunk);
@@ -4,6 +4,7 @@
4
4
  #include "duckdb/main/database.hpp"
5
5
  #include "duckdb/common/string_util.hpp"
6
6
  #include "duckdb/common/hive_partitioning.hpp"
7
+ #include "duckdb/common/union_by_name.hpp"
7
8
  #include "duckdb/main/config.hpp"
8
9
  #include "duckdb/parser/expression/constant_expression.hpp"
9
10
  #include "duckdb/parser/expression/function_expression.hpp"
@@ -127,9 +128,9 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
127
128
  if (options.auto_detect) {
128
129
  options.file_path = result->files[0];
129
130
  auto initial_reader = make_unique<BufferedCSVReader>(context, options);
130
- return_types.assign(initial_reader->sql_types.begin(), initial_reader->sql_types.end());
131
+ return_types.assign(initial_reader->return_types.begin(), initial_reader->return_types.end());
131
132
  if (names.empty()) {
132
- names.assign(initial_reader->col_names.begin(), initial_reader->col_names.end());
133
+ names.assign(initial_reader->names.begin(), initial_reader->names.end());
133
134
  } else {
134
135
  if (explicitly_set_columns) {
135
136
  // The user has influenced the names, can't assume they are valid anymore
@@ -143,7 +144,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
143
144
  }
144
145
  }
145
146
  options = initial_reader->options;
146
- result->sql_types = initial_reader->sql_types;
147
+ result->sql_types = initial_reader->return_types;
147
148
  result->initial_reader = std::move(initial_reader);
148
149
  } else {
149
150
  result->sql_types = return_types;
@@ -152,58 +153,25 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
152
153
 
153
154
  // union_col_names will exclude filename and hivepartition
154
155
  if (options.union_by_name) {
155
- idx_t union_names_index = 0;
156
156
  case_insensitive_map_t<idx_t> union_names_map;
157
157
  vector<string> union_col_names;
158
158
  vector<LogicalType> union_col_types;
159
159
 
160
- for (idx_t file_idx = 0; file_idx < result->files.size(); ++file_idx) {
161
- options.file_path = result->files[file_idx];
162
- auto reader = make_unique<BufferedCSVReader>(context, options);
163
- auto &col_names = reader->col_names;
164
- auto &sql_types = reader->sql_types;
165
- D_ASSERT(col_names.size() == sql_types.size());
166
-
167
- for (idx_t col = 0; col < col_names.size(); ++col) {
168
- auto union_find = union_names_map.find(col_names[col]);
169
-
170
- if (union_find != union_names_map.end()) {
171
- // given same name , union_col's type must compatible with col's type
172
- LogicalType compatible_type;
173
- compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
174
- union_col_types[union_find->second] = compatible_type;
175
- } else {
176
- union_names_map[col_names[col]] = union_names_index;
177
- union_names_index++;
178
-
179
- union_col_names.emplace_back(col_names[col]);
180
- union_col_types.emplace_back(sql_types[col]);
181
- }
182
- }
183
- result->union_readers.push_back(std::move(reader));
184
- }
160
+ auto dummy_readers = UnionByName<BufferedCSVReader, BufferedCSVReaderOptions>::UnionCols(
161
+ context, result->files, union_col_types, union_col_names, union_names_map, options);
185
162
 
186
- for (auto &reader : result->union_readers) {
187
- auto &col_names = reader->col_names;
188
- vector<bool> is_null_cols(union_col_names.size(), true);
163
+ dummy_readers = UnionByName<BufferedCSVReader, BufferedCSVReaderOptions>::CreateUnionMap(
164
+ std::move(dummy_readers), union_col_types, union_col_names, union_names_map);
189
165
 
190
- for (idx_t col = 0; col < col_names.size(); ++col) {
191
- idx_t remap_col = union_names_map[col_names[col]];
192
- reader->insert_cols_idx[col] = remap_col;
193
- is_null_cols[remap_col] = false;
194
- }
195
- for (idx_t col = 0; col < union_col_names.size(); ++col) {
196
- if (is_null_cols[col]) {
197
- reader->insert_nulls_idx.push_back(col);
198
- }
199
- }
166
+ std::move(dummy_readers.begin(), dummy_readers.end(), std::back_inserter(result->union_readers));
167
+ for (auto &reader : result->union_readers) {
168
+ reader->insert_cols_idx = reader->union_idx_map;
200
169
  }
201
170
 
202
- const idx_t first_file_index = 0;
203
- result->initial_reader = std::move(result->union_readers[first_file_index]);
204
-
205
171
  names.assign(union_col_names.begin(), union_col_names.end());
206
172
  return_types.assign(union_col_types.begin(), union_col_types.end());
173
+ const idx_t first_file_index = 0;
174
+ result->initial_reader = std::move(result->union_readers[first_file_index]);
207
175
  D_ASSERT(names.size() == return_types.size());
208
176
  }
209
177
 
@@ -544,7 +512,7 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
544
512
  }
545
513
  result->next_file = 1;
546
514
  if (result->initial_reader) {
547
- result->sql_types = result->initial_reader->sql_types;
515
+ result->sql_types = result->initial_reader->return_types;
548
516
  }
549
517
  return std::move(result);
550
518
  }
@@ -603,7 +571,8 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
603
571
  } while (true);
604
572
 
605
573
  if (bind_data.options.union_by_name) {
606
- lstate.csv_reader->SetNullUnionCols(output);
574
+ UnionByName<BufferedCSVReader, BufferedCSVReaderOptions>::SetNullUnionCols(output,
575
+ lstate.csv_reader->union_null_cols);
607
576
  }
608
577
  if (bind_data.options.include_file_name) {
609
578
  auto &col = output.data[bind_data.filename_col_idx];
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.6.2-dev1124"
2
+ #define DUCKDB_VERSION "0.6.2-dev1160"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "aa44cebfc5"
5
+ #define DUCKDB_SOURCE_ID "351d01503a"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -0,0 +1,93 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/common/union_by_name.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+ #include <vector>
11
+ #include <string>
12
+ #include "duckdb/common/types.hpp"
13
+
14
+ using std::string;
15
+ using std::vector;
16
+
17
+ namespace duckdb {
18
+
19
+ template <class READER_TYPE, class OPTION_TYPE>
20
+ class UnionByName {
21
+
22
+ public:
23
+ //! Union all files (readers) by their column names
24
+ static vector<unique_ptr<READER_TYPE>>
25
+ UnionCols(ClientContext &context, const vector<string> &files, vector<LogicalType> &union_col_types,
26
+ vector<string> &union_col_names, case_insensitive_map_t<idx_t> &union_names_map, OPTION_TYPE options) {
27
+ idx_t union_names_index = 0;
28
+ vector<unique_ptr<READER_TYPE>> union_readers;
29
+
30
+ for (idx_t file_idx = 0; file_idx < files.size(); ++file_idx) {
31
+ const auto file_name = files[file_idx];
32
+ auto reader = make_unique<READER_TYPE>(context, file_name, options);
33
+
34
+ auto &col_names = reader->names;
35
+ auto &sql_types = reader->return_types;
36
+ D_ASSERT(col_names.size() == sql_types.size());
37
+
38
+ for (idx_t col = 0; col < col_names.size(); ++col) {
39
+ auto union_find = union_names_map.find(col_names[col]);
40
+
41
+ if (union_find != union_names_map.end()) {
42
+ // given the same name, the union column's type must be compatible with this column's type
43
+ LogicalType compatible_type;
44
+ compatible_type = LogicalType::MaxLogicalType(union_col_types[union_find->second], sql_types[col]);
45
+ union_col_types[union_find->second] = compatible_type;
46
+ } else {
47
+ union_names_map[col_names[col]] = union_names_index;
48
+ union_names_index++;
49
+
50
+ union_col_names.emplace_back(col_names[col]);
51
+ union_col_types.emplace_back(sql_types[col]);
52
+ }
53
+ }
54
+ union_readers.push_back(move(reader));
55
+ }
56
+ return union_readers;
57
+ }
58
+
59
+ //! Create information for reader's col mapping to union cols
60
+ static vector<unique_ptr<READER_TYPE>> CreateUnionMap(vector<unique_ptr<READER_TYPE>> union_readers,
61
+ vector<LogicalType> &union_col_types,
62
+ vector<string> &union_col_names,
63
+ case_insensitive_map_t<idx_t> &union_names_map) {
64
+ for (auto &reader : union_readers) {
65
+ auto &col_names = reader->names;
66
+ vector<bool> union_null_cols(union_col_names.size(), true);
67
+ vector<idx_t> union_idx_map(col_names.size(), 0);
68
+
69
+ for (idx_t col = 0; col < col_names.size(); ++col) {
70
+ idx_t union_idx = union_names_map[col_names[col]];
71
+ union_idx_map[col] = union_idx;
72
+ union_null_cols[union_idx] = false;
73
+ }
74
+
75
+ reader->union_col_types = union_col_types;
76
+ reader->union_idx_map = move(union_idx_map);
77
+ reader->union_null_cols = move(union_null_cols);
78
+ }
79
+ return union_readers;
80
+ }
81
+
82
+ //! Set NULL into the union cols that have no matching column in this reader
83
+ static void SetNullUnionCols(DataChunk &result, const vector<bool> &union_null_cols) {
84
+ for (idx_t col = 0; col < union_null_cols.size(); ++col) {
85
+ if (union_null_cols[col]) {
86
+ result.data[col].SetVectorType(VectorType::CONSTANT_VECTOR);
87
+ ConstantVector::SetNull(result.data[col], true);
88
+ }
89
+ }
90
+ }
91
+ };
92
+
93
+ } // namespace duckdb
@@ -44,13 +44,15 @@ public:
44
44
  Allocator &allocator;
45
45
  FileOpener *opener;
46
46
  BufferedCSVReaderOptions options;
47
- vector<LogicalType> sql_types;
48
- vector<string> col_names;
47
+ vector<LogicalType> return_types;
48
+ vector<string> names;
49
49
 
50
50
  //! remap parse_chunk col to insert_chunk col, because when
51
51
  //! union_by_name option on insert_chunk may have more cols
52
52
  vector<idx_t> insert_cols_idx;
53
- vector<idx_t> insert_nulls_idx;
53
+ vector<idx_t> union_idx_map;
54
+ vector<bool> union_null_cols;
55
+ vector<LogicalType> union_col_types;
54
56
 
55
57
  idx_t linenr = 0;
56
58
  bool linenr_estimated = false;
@@ -70,10 +72,6 @@ public:
70
72
 
71
73
  ParserMode mode;
72
74
 
73
- public:
74
- //! Fill nulls into the cols that mismtach union names
75
- void SetNullUnionCols(DataChunk &insert_chunk);
76
-
77
75
  protected:
78
76
  //! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
79
77
  void InitParseChunk(idx_t num_cols);
@@ -100,7 +98,7 @@ protected:
100
98
  static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
101
99
 
102
100
  protected:
103
- //! Whether or not the current row's columns have overflown sql_types.size()
101
+ //! Whether or not the current row's columns have overflown return_types.size()
104
102
  bool error_column_overflow = false;
105
103
  //! Number of sniffed columns - only used when auto-detecting
106
104
  vector<idx_t> sniffed_column_counts;
@@ -57,6 +57,8 @@ public:
57
57
  const vector<LogicalType> &requested_types = vector<LogicalType>());
58
58
  BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
59
59
  const vector<LogicalType> &requested_types = vector<LogicalType>());
60
+ BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
61
+ const vector<LogicalType> &requested_types = vector<LogicalType>());
60
62
  ~BufferedCSVReader();
61
63
 
62
64
  unique_ptr<char[]> buffer;
@@ -19,9 +19,12 @@ public:
19
19
  }
20
20
  virtual ~SegmentBase() {
21
21
  }
22
-
23
22
  SegmentBase *Next() {
23
+ #ifndef DUCKDB_R_BUILD
24
24
  return next.load();
25
+ #else
26
+ return next;
27
+ #endif
25
28
  }
26
29
 
27
30
  //! The start row id of this chunk
@@ -29,7 +32,12 @@ public:
29
32
  //! The amount of entries in this storage chunk
30
33
  atomic<idx_t> count;
31
34
  //! The next segment after this one
35
+
36
+ #ifndef DUCKDB_R_BUILD
32
37
  atomic<SegmentBase *> next;
38
+ #else
39
+ SegmentBase *next;
40
+ #endif
33
41
  };
34
42
 
35
43
  } // namespace duckdb
@@ -224,8 +224,8 @@ shared_ptr<Relation> Connection::ReadCSV(const string &csv_file) {
224
224
  options.auto_detect = true;
225
225
  BufferedCSVReader reader(*context, options);
226
226
  vector<ColumnDefinition> column_list;
227
- for (idx_t i = 0; i < reader.sql_types.size(); i++) {
228
- column_list.emplace_back(reader.col_names[i], reader.sql_types[i]);
227
+ for (idx_t i = 0; i < reader.return_types.size(); i++) {
228
+ column_list.emplace_back(reader.names[i], reader.return_types[i]);
229
229
  }
230
230
  return make_shared<ReadCSVRelation>(context, csv_file, std::move(column_list), true);
231
231
  }
@@ -8,14 +8,14 @@ namespace duckdb {
8
8
 
9
9
  BoundLambdaRefExpression::BoundLambdaRefExpression(string alias_p, LogicalType type, ColumnBinding binding,
10
10
  idx_t lambda_index, idx_t depth)
11
- : Expression(ExpressionType::BOUND_LAMBDA_REF, ExpressionClass::BOUND_LAMBDA_REF, move(type)), binding(binding),
12
- lambda_index(lambda_index), depth(depth) {
13
- this->alias = move(alias_p);
11
+ : Expression(ExpressionType::BOUND_LAMBDA_REF, ExpressionClass::BOUND_LAMBDA_REF, std::move(type)),
12
+ binding(binding), lambda_index(lambda_index), depth(depth) {
13
+ this->alias = std::move(alias_p);
14
14
  }
15
15
 
16
16
  BoundLambdaRefExpression::BoundLambdaRefExpression(LogicalType type, ColumnBinding binding, idx_t lambda_index,
17
17
  idx_t depth)
18
- : BoundLambdaRefExpression(string(), move(type), binding, lambda_index, depth) {
18
+ : BoundLambdaRefExpression(string(), std::move(type), binding, lambda_index, depth) {
19
19
  }
20
20
 
21
21
  unique_ptr<Expression> BoundLambdaRefExpression::Copy() {
@@ -616,7 +616,7 @@ struct UpdateSelectElement {
616
616
 
617
617
  template <>
618
618
  string_t UpdateSelectElement::Operation(UpdateSegment *segment, string_t element) {
619
- return element.IsInlined() ? element : segment->GetStringHeap().AddString(element);
619
+ return element.IsInlined() ? element : segment->GetStringHeap().AddBlob(element);
620
620
  }
621
621
 
622
622
  template <class T>
@@ -942,7 +942,7 @@ idx_t UpdateStringStatistics(UpdateSegment *segment, SegmentStatistics &stats, V
942
942
  for (idx_t i = 0; i < count; i++) {
943
943
  ((StringStatistics &)*stats.statistics).Update(update_data[i]);
944
944
  if (!update_data[i].IsInlined()) {
945
- update_data[i] = segment->GetStringHeap().AddString(update_data[i]);
945
+ update_data[i] = segment->GetStringHeap().AddBlob(update_data[i]);
946
946
  }
947
947
  }
948
948
  sel.Initialize(nullptr);
@@ -955,7 +955,7 @@ idx_t UpdateStringStatistics(UpdateSegment *segment, SegmentStatistics &stats, V
955
955
  sel.set_index(not_null_count++, i);
956
956
  ((StringStatistics &)*stats.statistics).Update(update_data[i]);
957
957
  if (!update_data[i].IsInlined()) {
958
- update_data[i] = segment->GetStringHeap().AddString(update_data[i]);
958
+ update_data[i] = segment->GetStringHeap().AddBlob(update_data[i]);
959
959
  }
960
960
  }
961
961
  }