duckdb 0.8.2-dev4376.0 → 0.8.2-dev4474.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/binding.gyp +1 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/parquet/column_writer.cpp +1 -1
  4. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +4 -3
  5. package/src/duckdb/extension/parquet/parquet_writer.cpp +33 -15
  6. package/src/duckdb/src/common/enum_util.cpp +5 -0
  7. package/src/duckdb/src/common/types/date.cpp +1 -1
  8. package/src/duckdb/src/common/types/vector.cpp +3 -0
  9. package/src/duckdb/src/common/types.cpp +1 -1
  10. package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +3 -10
  11. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +6 -3
  12. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +3 -0
  13. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +8 -2
  14. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +5 -1
  15. package/src/duckdb/src/function/cast/union/from_struct.cpp +114 -0
  16. package/src/duckdb/src/function/cast/union_casts.cpp +20 -36
  17. package/src/duckdb/src/function/table/read_csv.cpp +5 -22
  18. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  19. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +1 -1
  20. package/src/duckdb/src/include/duckdb/function/cast/bound_cast_data.hpp +32 -0
  21. package/src/duckdb/src/include/duckdb/function/copy_function.hpp +3 -1
  22. package/src/duckdb/src/include/duckdb/main/query_result.hpp +1 -1
  23. package/src/duckdb/src/parser/keyword_helper.cpp +1 -1
  24. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +109 -12
  25. package/src/duckdb/src/storage/checkpoint_manager.cpp +37 -36
  26. package/src/duckdb/src/storage/compression/rle.cpp +16 -5
  27. package/src/duckdb/src/storage/local_storage.cpp +8 -1
  28. package/src/duckdb/ub_src_function_cast_union.cpp +2 -0
  29. package/test/prepare.test.ts +10 -1
  30. package/test/test_all_types.test.ts +4 -4
package/binding.gyp CHANGED
@@ -80,6 +80,7 @@
80
80
  "src/duckdb/ub_src_function_aggregate.cpp",
81
81
  "src/duckdb/ub_src_function.cpp",
82
82
  "src/duckdb/ub_src_function_cast.cpp",
83
+ "src/duckdb/ub_src_function_cast_union.cpp",
83
84
  "src/duckdb/ub_src_function_pragma.cpp",
84
85
  "src/duckdb/ub_src_function_scalar_compressed_materialization.cpp",
85
86
  "src/duckdb/ub_src_function_scalar.cpp",
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.8.2-dev4376.0",
5
+ "version": "0.8.2-dev4474.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -1825,7 +1825,7 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
1825
1825
  }
1826
1826
  }
1827
1827
 
1828
- if (type.id() == LogicalTypeId::STRUCT) {
1828
+ if (type.id() == LogicalTypeId::STRUCT || type.id() == LogicalTypeId::UNION) {
1829
1829
  auto &child_types = StructType::GetChildTypes(type);
1830
1830
  // set up the schema element for this struct
1831
1831
  duckdb_parquet::format::SchemaElement schema_element;
@@ -15,6 +15,7 @@
15
15
  #include "duckdb/common/mutex.hpp"
16
16
  #include "duckdb/common/serializer/buffered_file_writer.hpp"
17
17
  #include "duckdb/common/types/column/column_data_collection.hpp"
18
+ #include "duckdb/function/copy_function.hpp"
18
19
  #endif
19
20
 
20
21
  #include "column_writer.hpp"
@@ -75,11 +76,11 @@ public:
75
76
  return *writer;
76
77
  }
77
78
 
78
- static bool TypeIsSupported(const LogicalType &type);
79
+ static CopyTypeSupport TypeIsSupported(const LogicalType &type);
79
80
 
80
81
  private:
81
- static bool DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
82
- duckdb_parquet::format::Type::type &type);
82
+ static CopyTypeSupport DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
83
+ duckdb_parquet::format::Type::type &type);
83
84
  string file_name;
84
85
  vector<LogicalType> sql_types;
85
86
  vector<string> column_names;
@@ -77,7 +77,8 @@ private:
77
77
  WriteStream &serializer;
78
78
  };
79
79
 
80
- bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type, Type::type &parquet_type) {
80
+ CopyTypeSupport ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
81
+ Type::type &parquet_type) {
81
82
  switch (duckdb_type.id()) {
82
83
  case LogicalTypeId::BOOLEAN:
83
84
  parquet_type = Type::BOOLEAN;
@@ -95,9 +96,11 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
95
96
  parquet_type = Type::FLOAT;
96
97
  break;
97
98
  case LogicalTypeId::DOUBLE:
98
- case LogicalTypeId::HUGEINT:
99
99
  parquet_type = Type::DOUBLE;
100
100
  break;
101
+ case LogicalTypeId::HUGEINT:
102
+ parquet_type = Type::DOUBLE;
103
+ return CopyTypeSupport::LOSSY;
101
104
  case LogicalTypeId::ENUM:
102
105
  case LogicalTypeId::BLOB:
103
106
  case LogicalTypeId::VARCHAR:
@@ -141,47 +144,62 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
141
144
  }
142
145
  break;
143
146
  default:
144
- // Anything that is not supported returns false
145
- return false;
147
+ // Anything that is not supported
148
+ return CopyTypeSupport::UNSUPPORTED;
146
149
  }
147
- return true;
150
+ return CopyTypeSupport::SUPPORTED;
148
151
  }
149
152
 
150
153
  Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type) {
151
154
  Type::type result;
152
- if (!DuckDBTypeToParquetTypeInternal(duckdb_type, result)) {
155
+ auto type_supports = DuckDBTypeToParquetTypeInternal(duckdb_type, result);
156
+ if (type_supports == CopyTypeSupport::UNSUPPORTED) {
153
157
  throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
154
158
  }
155
159
  return result;
156
160
  }
157
161
 
158
- bool ParquetWriter::TypeIsSupported(const LogicalType &type) {
162
+ CopyTypeSupport ParquetWriter::TypeIsSupported(const LogicalType &type) {
159
163
  Type::type unused;
160
164
  auto id = type.id();
161
165
  if (id == LogicalTypeId::LIST) {
162
166
  auto &child_type = ListType::GetChildType(type);
163
167
  return TypeIsSupported(child_type);
164
168
  }
169
+ if (id == LogicalTypeId::UNION) {
170
+ auto count = UnionType::GetMemberCount(type);
171
+ for (idx_t i = 0; i < count; i++) {
172
+ auto &member_type = UnionType::GetMemberType(type, i);
173
+ auto type_support = TypeIsSupported(member_type);
174
+ if (type_support != CopyTypeSupport::SUPPORTED) {
175
+ return type_support;
176
+ }
177
+ }
178
+ return CopyTypeSupport::SUPPORTED;
179
+ }
165
180
  if (id == LogicalTypeId::STRUCT) {
166
181
  auto &children = StructType::GetChildTypes(type);
167
182
  for (auto &child : children) {
168
183
  auto &child_type = child.second;
169
- if (!TypeIsSupported(child_type)) {
170
- return false;
184
+ auto type_support = TypeIsSupported(child_type);
185
+ if (type_support != CopyTypeSupport::SUPPORTED) {
186
+ return type_support;
171
187
  }
172
188
  }
173
- return true;
189
+ return CopyTypeSupport::SUPPORTED;
174
190
  }
175
191
  if (id == LogicalTypeId::MAP) {
176
192
  auto &key_type = MapType::KeyType(type);
177
193
  auto &value_type = MapType::ValueType(type);
178
- if (!TypeIsSupported(key_type)) {
179
- return false;
194
+ auto key_type_support = TypeIsSupported(key_type);
195
+ if (key_type_support != CopyTypeSupport::SUPPORTED) {
196
+ return key_type_support;
180
197
  }
181
- if (!TypeIsSupported(value_type)) {
182
- return false;
198
+ auto value_type_support = TypeIsSupported(value_type);
199
+ if (value_type_support != CopyTypeSupport::SUPPORTED) {
200
+ return value_type_support;
183
201
  }
184
- return true;
202
+ return CopyTypeSupport::SUPPORTED;
185
203
  }
186
204
  return DuckDBTypeToParquetTypeInternal(type, unused);
187
205
  }
@@ -5974,6 +5974,8 @@ const char* EnumUtil::ToChars<UnionInvalidReason>(UnionInvalidReason value) {
5974
5974
  return "NO_MEMBERS";
5975
5975
  case UnionInvalidReason::VALIDITY_OVERLAP:
5976
5976
  return "VALIDITY_OVERLAP";
5977
+ case UnionInvalidReason::TAG_MISMATCH:
5978
+ return "TAG_MISMATCH";
5977
5979
  default:
5978
5980
  throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
5979
5981
  }
@@ -5993,6 +5995,9 @@ UnionInvalidReason EnumUtil::FromString<UnionInvalidReason>(const char *value) {
5993
5995
  if (StringUtil::Equals(value, "VALIDITY_OVERLAP")) {
5994
5996
  return UnionInvalidReason::VALIDITY_OVERLAP;
5995
5997
  }
5998
+ if (StringUtil::Equals(value, "TAG_MISMATCH")) {
5999
+ return UnionInvalidReason::TAG_MISMATCH;
6000
+ }
5996
6001
  throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
5997
6002
  }
5998
6003
 
@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {
492
492
 
493
493
  int64_t Date::ExtractJulianDay(date_t date) {
494
494
  // Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
495
- static const auto JULIAN_EPOCH = -2440588;
495
+ static const int64_t JULIAN_EPOCH = -2440588;
496
496
  return date.days - JULIAN_EPOCH;
497
497
  }
498
498
 
@@ -2007,6 +2007,9 @@ UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector, idx_t count,
2007
2007
  return UnionInvalidReason::VALIDITY_OVERLAP;
2008
2008
  }
2009
2009
  found_valid = true;
2010
+ if (tag != static_cast<union_tag_t>(member_idx)) {
2011
+ return UnionInvalidReason::TAG_MISMATCH;
2012
+ }
2010
2013
  }
2011
2014
  }
2012
2015
  }
@@ -398,7 +398,7 @@ string LogicalType::ToString() const {
398
398
  if (i > 0) {
399
399
  ret += ", ";
400
400
  }
401
- ret += "'" + KeywordHelper::WriteOptionallyQuoted(EnumType::GetString(*this, i).GetString(), '\'') + "'";
401
+ ret += KeywordHelper::WriteQuoted(EnumType::GetString(*this, i).GetString(), '\'');
402
402
  }
403
403
  ret += ")";
404
404
  return ret;
@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {
148
148
 
149
149
  uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
150
150
 
151
- // this function calls Get() on the buffer, so the buffer must already be in memory
152
- D_ASSERT(InMemory());
153
-
154
151
  // get the bitmask data
155
152
  auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
156
153
  ValidityMask mask(bitmask_ptr);
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
200
197
 
201
198
  uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
202
199
 
203
- // this function calls Get() on the buffer, so the buffer must already be in memory
200
+ // this function calls Get() on the buffer
204
201
  D_ASSERT(InMemory());
205
202
 
206
203
  // finds the maximum zero bit in a bitmask, and adds one to it,
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
259
256
  }
260
257
 
261
258
  // there are no allocations in this buffer
262
- // FIXME: put this line back in and then fix the missing vacuum bug in
263
- // FIXME: test_index_large_aborted_append.test with force_restart
264
- // FIXME: test if we still have non-dirty buffer to serialize after fixing this
265
- // throw InternalException("tried to serialize empty buffer");
266
- return 0;
259
+ throw InternalException("tried to serialize empty buffer");
267
260
  }
268
261
 
269
262
  void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
270
263
  const idx_t offset, const idx_t bitmask_offset) {
271
264
 
272
- // this function calls Get() on the buffer, so the buffer must already be in memory
265
+ // this function calls Get() on the buffer
273
266
  D_ASSERT(InMemory());
274
267
 
275
268
  auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
89
89
  position_buffer++;
90
90
  }
91
91
  if (position_buffer > end_buffer) {
92
+ VerifyLineLength(position_buffer, buffer->batch_index);
92
93
  return false;
93
94
  }
94
95
  SkipEmptyLines();
95
96
  if (verification_positions.beginning_of_first_line == 0) {
96
97
  verification_positions.beginning_of_first_line = position_buffer;
97
98
  }
98
-
99
+ VerifyLineLength(position_buffer, buffer->batch_index);
99
100
  verification_positions.end_of_last_line = position_buffer;
100
101
  return true;
101
102
  }
102
103
  }
104
+ VerifyLineLength(position_buffer, buffer->batch_index);
103
105
  return false;
104
106
  }
105
107
  SkipEmptyLines();
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
143
145
  break;
144
146
  }
145
147
 
146
- if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
148
+ auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
149
+ if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
147
150
  break;
148
151
  }
149
152
 
150
153
  if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
151
- (*buffer)[position_buffer - 1] == '\n') {
154
+ (*buffer)[pos_check] == '\n') {
152
155
  break;
153
156
  }
154
157
  idx_t position_set = position_buffer;
@@ -55,6 +55,9 @@ struct SniffDialect {
55
55
  if (machine.state == CSVState::INVALID) {
56
56
  return;
57
57
  }
58
+ if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
59
+ sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
60
+ }
58
61
  if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
59
62
  sniffed_column_counts[machine.cur_rows++] = machine.column_count;
60
63
  }
@@ -148,12 +148,18 @@ void CSVSniffer::DetectHeader() {
148
148
  names.push_back(col_name);
149
149
  name_collision_count[col_name] = 0;
150
150
  }
151
+ if (best_header_row.size() < best_candidate->dialect_options.num_cols && options.null_padding) {
152
+ for (idx_t col = best_header_row.size(); col < best_candidate->dialect_options.num_cols; col++) {
153
+ names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
154
+ }
155
+ } else if (best_header_row.size() < best_candidate->dialect_options.num_cols) {
156
+ throw InternalException("Detected header has number of columns inferior to dialect detection");
157
+ }
151
158
 
152
159
  } else {
153
160
  best_candidate->dialect_options.header = false;
154
161
  for (idx_t col = 0; col < best_candidate->dialect_options.num_cols; col++) {
155
- string column_name = GenerateColumnName(best_candidate->dialect_options.num_cols, col);
156
- names.push_back(column_name);
162
+ names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
157
163
  }
158
164
  }
159
165
 
@@ -183,6 +183,10 @@ struct SniffValue {
183
183
  }
184
184
 
185
185
  inline static void Finalize(CSVStateMachine &machine, vector<TupleSniffing> &sniffed_values) {
186
+ if (machine.cur_rows < sniffed_values.size() && machine.state == CSVState::DELIMITER) {
187
+ // Started a new empty value
188
+ sniffed_values[machine.cur_rows].values.push_back(Value(machine.value));
189
+ }
186
190
  if (machine.cur_rows < sniffed_values.size() && machine.state != CSVState::EMPTY_LINE) {
187
191
  machine.VerifyUTF8();
188
192
  sniffed_values[machine.cur_rows].line_number = machine.rows_read;
@@ -300,7 +304,7 @@ void CSVSniffer::DetectTypes() {
300
304
 
301
305
  // Potentially Skip Notes (I also find this dirty, but it is what the original code does)
302
306
  while (true_start < tuples.size()) {
303
- if (tuples[true_start].values.size() < max_columns_found) {
307
+ if (tuples[true_start].values.size() < max_columns_found && !options.null_padding) {
304
308
  true_start = tuples[true_start].line_number;
305
309
  values_start++;
306
310
  } else {
@@ -0,0 +1,114 @@
1
+ #include "duckdb/function/cast/bound_cast_data.hpp"
2
+
3
+ namespace duckdb {
4
+
5
+ bool StructToUnionCast::AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target) {
6
+ if (source.id() != LogicalTypeId::STRUCT) {
7
+ return false;
8
+ }
9
+ auto target_fields = StructType::GetChildTypes(target);
10
+ auto fields = StructType::GetChildTypes(source);
11
+ if (target_fields.size() != fields.size()) {
12
+ // Struct should have the same amount of fields as the union
13
+ return false;
14
+ }
15
+ for (idx_t i = 0; i < target_fields.size(); i++) {
16
+ auto &target_field = target_fields[i].second;
17
+ auto &target_field_name = target_fields[i].first;
18
+ auto &field = fields[i].second;
19
+ auto &field_name = fields[i].first;
20
+ if (i == 0) {
21
+ // For the tag field we don't accept a type substitute as varchar
22
+ if (target_field != field) {
23
+ return false;
24
+ }
25
+ continue;
26
+ }
27
+ if (!StringUtil::CIEquals(target_field_name, field_name)) {
28
+ return false;
29
+ }
30
+ if (target_field != field && field != LogicalType::VARCHAR) {
31
+ // We allow the field to be VARCHAR, since unsupported types get cast to VARCHAR by EXPORT DATABASE (format
32
+ // PARQUET) i.e UNION(a BIT) becomes STRUCT(a VARCHAR)
33
+ return false;
34
+ }
35
+ }
36
+ return true;
37
+ }
38
+
39
+ // Physical Cast execution
40
+
41
+ bool StructToUnionCast::Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
42
+ auto &cast_data = parameters.cast_data->Cast<StructBoundCastData>();
43
+ auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
44
+
45
+ D_ASSERT(source.GetType().id() == LogicalTypeId::STRUCT);
46
+ D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
47
+ D_ASSERT(cast_data.target.id() == LogicalTypeId::UNION);
48
+
49
+ auto &source_children = StructVector::GetEntries(source);
50
+ auto &target_children = StructVector::GetEntries(result);
51
+
52
+ for (idx_t i = 0; i < source_children.size(); i++) {
53
+ auto &result_child_vector = *target_children[i];
54
+ auto &source_child_vector = *source_children[i];
55
+ CastParameters child_parameters(parameters, cast_data.child_cast_info[i].cast_data, lstate.local_states[i]);
56
+ auto converted =
57
+ cast_data.child_cast_info[i].function(source_child_vector, result_child_vector, count, child_parameters);
58
+ (void)converted;
59
+ D_ASSERT(converted);
60
+ }
61
+
62
+ auto check_tags = UnionVector::CheckUnionValidity(result, count);
63
+ switch (check_tags) {
64
+ case UnionInvalidReason::TAG_OUT_OF_RANGE:
65
+ throw ConversionException("One or more of the tags do not point to a valid union member");
66
+ case UnionInvalidReason::VALIDITY_OVERLAP:
67
+ throw ConversionException("One or more rows in the produced UNION have validity set for more than 1 member");
68
+ case UnionInvalidReason::TAG_MISMATCH:
69
+ throw ConversionException(
70
+ "One or more rows in the produced UNION have tags that don't point to the valid member");
71
+ case UnionInvalidReason::VALID:
72
+ break;
73
+ default:
74
+ throw InternalException("Struct to union cast failed for unknown reason");
75
+ }
76
+
77
+ if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
78
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
79
+ ConstantVector::SetNull(result, ConstantVector::IsNull(source));
80
+ } else {
81
+ source.Flatten(count);
82
+ FlatVector::Validity(result) = FlatVector::Validity(source);
83
+ }
84
+ result.Verify(count);
85
+ return true;
86
+ }
87
+
88
+ // Bind cast
89
+
90
+ unique_ptr<BoundCastData> StructToUnionCast::BindData(BindCastInput &input, const LogicalType &source,
91
+ const LogicalType &target) {
92
+ vector<BoundCastInfo> child_cast_info;
93
+ D_ASSERT(source.id() == LogicalTypeId::STRUCT);
94
+ D_ASSERT(target.id() == LogicalTypeId::UNION);
95
+
96
+ auto result_child_count = StructType::GetChildCount(target);
97
+ D_ASSERT(result_child_count == StructType::GetChildCount(source));
98
+
99
+ for (idx_t i = 0; i < result_child_count; i++) {
100
+ auto &source_child = StructType::GetChildType(source, i);
101
+ auto &target_child = StructType::GetChildType(target, i);
102
+
103
+ auto child_cast = input.GetCastFunction(source_child, target_child);
104
+ child_cast_info.push_back(std::move(child_cast));
105
+ }
106
+ return make_uniq<StructBoundCastData>(std::move(child_cast_info), target);
107
+ }
108
+
109
+ BoundCastInfo StructToUnionCast::Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
110
+ auto cast_data = StructToUnionCast::BindData(input, source, target);
111
+ return BoundCastInfo(&StructToUnionCast::Cast, std::move(cast_data), StructBoundCastData::InitStructCastLocalState);
112
+ }
113
+
114
+ } // namespace duckdb
@@ -11,33 +11,10 @@ namespace duckdb {
11
11
  //--------------------------------------------------------------------------------------------------
12
12
  // if the source can be implicitly cast to a member of the target union, the cast is valid
13
13
 
14
- struct ToUnionBoundCastData : public BoundCastData {
15
- ToUnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
16
- BoundCastInfo member_cast_info)
17
- : tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
18
- member_cast_info(std::move(member_cast_info)) {
19
- }
20
-
21
- union_tag_t tag;
22
- string name;
23
- LogicalType type;
24
- int64_t cost;
25
- BoundCastInfo member_cast_info;
26
-
27
- public:
28
- unique_ptr<BoundCastData> Copy() const override {
29
- return make_uniq<ToUnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
30
- }
31
-
32
- static bool SortByCostAscending(const ToUnionBoundCastData &left, const ToUnionBoundCastData &right) {
33
- return left.cost < right.cost;
34
- }
35
- };
36
-
37
14
  unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
38
15
  D_ASSERT(target.id() == LogicalTypeId::UNION);
39
16
 
40
- vector<ToUnionBoundCastData> candidates;
17
+ vector<UnionBoundCastData> candidates;
41
18
 
42
19
  for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(target); member_idx++) {
43
20
  auto member_type = UnionType::GetMemberType(target, member_idx);
@@ -68,7 +45,7 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
68
45
  }
69
46
 
70
47
  // sort the candidate casts by cost
71
- std::sort(candidates.begin(), candidates.end(), ToUnionBoundCastData::SortByCostAscending);
48
+ std::sort(candidates.begin(), candidates.end(), UnionBoundCastData::SortByCostAscending);
72
49
 
73
50
  // select the lowest possible cost cast
74
51
  auto &selected_cast = candidates[0];
@@ -95,11 +72,11 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
95
72
  }
96
73
 
97
74
  // otherwise, return the selected cast
98
- return make_uniq<ToUnionBoundCastData>(std::move(selected_cast));
75
+ return make_uniq<UnionBoundCastData>(std::move(selected_cast));
99
76
  }
100
77
 
101
78
  unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &parameters) {
102
- auto &cast_data = parameters.cast_data->Cast<ToUnionBoundCastData>();
79
+ auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
103
80
  if (!cast_data.member_cast_info.init_local_state) {
104
81
  return nullptr;
105
82
  }
@@ -109,7 +86,7 @@ unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &p
109
86
 
110
87
  static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
111
88
  D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
112
- auto &cast_data = parameters.cast_data->Cast<ToUnionBoundCastData>();
89
+ auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
113
90
  auto &selected_member_vector = UnionVector::GetMember(result, cast_data.tag);
114
91
 
115
92
  CastParameters child_parameters(parameters, cast_data.member_cast_info.cast_data, parameters.local_state);
@@ -127,7 +104,13 @@ static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParamet
127
104
 
128
105
  BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const LogicalType &source,
129
106
  const LogicalType &target) {
130
- return BoundCastInfo(&ToUnionCast, BindToUnionCast(input, source, target), InitToUnionLocalState);
107
+
108
+ D_ASSERT(target.id() == LogicalTypeId::UNION);
109
+ if (StructToUnionCast::AllowImplicitCastFromStruct(source, target)) {
110
+ return StructToUnionCast::Bind(input, source, target);
111
+ }
112
+ auto cast_data = BindToUnionCast(input, source, target);
113
+ return BoundCastInfo(&ToUnionCast, std::move(cast_data), InitToUnionLocalState);
131
114
  }
132
115
 
133
116
  //--------------------------------------------------------------------------------------------------
@@ -143,7 +126,7 @@ BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const Logi
143
126
  // INVALID: UNION(A, B) -> UNION(A, C) if B can't be implicitly cast to C
144
127
  // INVALID: UNION(A, B, D) -> UNION(A, B, C)
145
128
 
146
- struct UnionToUnionBoundCastData : public BoundCastData {
129
+ struct UnionUnionBoundCastData : public BoundCastData {
147
130
 
148
131
  // mapping from source member index to target member index
149
132
  // these are always the same size as the source member count
@@ -153,7 +136,7 @@ struct UnionToUnionBoundCastData : public BoundCastData {
153
136
 
154
137
  LogicalType target_type;
155
138
 
156
- UnionToUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
139
+ UnionUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
157
140
  : tag_map(std::move(tag_map)), member_casts(std::move(member_casts)), target_type(std::move(target_type)) {
158
141
  }
159
142
 
@@ -163,7 +146,7 @@ public:
163
146
  for (auto &member_cast : member_casts) {
164
147
  member_casts_copy.push_back(member_cast.Copy());
165
148
  }
166
- return make_uniq<UnionToUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
149
+ return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
167
150
  }
168
151
  };
169
152
 
@@ -203,11 +186,11 @@ unique_ptr<BoundCastData> BindUnionToUnionCast(BindCastInput &input, const Logic
203
186
  }
204
187
  }
205
188
 
206
- return make_uniq<UnionToUnionBoundCastData>(tag_map, std::move(member_casts), target);
189
+ return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts), target);
207
190
  }
208
191
 
209
192
  unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParameters &parameters) {
210
- auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
193
+ auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
211
194
  auto result = make_uniq<StructCastLocalState>();
212
195
 
213
196
  for (auto &entry : cast_data.member_casts) {
@@ -222,7 +205,7 @@ unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParamete
222
205
  }
223
206
 
224
207
  static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
225
- auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
208
+ auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
226
209
  auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
227
210
 
228
211
  auto source_member_count = UnionType::GetMemberCount(source.GetType());
@@ -313,7 +296,7 @@ static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastPa
313
296
  static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
314
297
  auto constant = source.GetVectorType() == VectorType::CONSTANT_VECTOR;
315
298
  // first cast all union members to varchar
316
- auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
299
+ auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
317
300
  Vector varchar_union(cast_data.target_type, count);
318
301
 
319
302
  UnionToUnionCast(source, varchar_union, count, parameters);
@@ -356,6 +339,7 @@ static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, Cast
356
339
 
357
340
  BoundCastInfo DefaultCasts::UnionCastSwitch(BindCastInput &input, const LogicalType &source,
358
341
  const LogicalType &target) {
342
+ D_ASSERT(source.id() == LogicalTypeId::UNION);
359
343
  switch (target.id()) {
360
344
  case LogicalTypeId::VARCHAR: {
361
345
  // bind a cast in which we convert all members to VARCHAR first
@@ -300,7 +300,7 @@ public:
300
300
  const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
301
301
  bool force_parallelism_p, vector<column_t> column_ids_p)
302
302
  : buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
303
- buffer_size(options.buffer_size), force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
303
+ force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
304
304
  line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
305
305
  current_file_path = files_path_p[0];
306
306
  CSVFileHandle *file_handle_ptr;
@@ -316,16 +316,6 @@ public:
316
316
  first_file_size = file_size;
317
317
  on_disk_file = file_handle_ptr->OnDiskFile();
318
318
  bytes_read = 0;
319
- if (buffer_size < file_size || file_size == 0) {
320
- bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
321
- } else {
322
- bytes_per_local_state = file_size / MaxThreads();
323
- }
324
- if (bytes_per_local_state == 0) {
325
- // In practice, I think this won't happen, it only happens because we are mocking up test scenarios
326
- // this boy needs to be at least one.
327
- bytes_per_local_state = 1;
328
- }
329
319
  running_threads = MaxThreads();
330
320
 
331
321
  // Initialize all the book-keeping variables
@@ -368,8 +358,6 @@ public:
368
358
 
369
359
  void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
370
360
 
371
- void IncrementThread();
372
-
373
361
  void DecrementThread();
374
362
 
375
363
  bool Finished();
@@ -402,16 +390,12 @@ private:
402
390
  mutex main_mutex;
403
391
  //! Byte set from for last thread
404
392
  idx_t next_byte = 0;
405
- //! How many bytes we should execute per local state
406
- idx_t bytes_per_local_state;
407
393
  //! Size of first file
408
394
  idx_t first_file_size = 0;
409
395
  //! Whether or not this is an on-disk file
410
396
  bool on_disk_file = true;
411
397
  //! Basically max number of threads in DuckDB
412
398
  idx_t system_threads;
413
- //! Size of the buffers
414
- idx_t buffer_size;
415
399
  //! Current batch index
416
400
  idx_t batch_index = 0;
417
401
  idx_t local_batch_index = 0;
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
454
438
  return system_threads;
455
439
  }
456
440
 
457
- void ParallelCSVGlobalState::IncrementThread() {
458
- lock_guard<mutex> parallel_lock(main_mutex);
459
- running_threads++;
460
- }
461
-
462
441
  void ParallelCSVGlobalState::DecrementThread() {
463
442
  lock_guard<mutex> parallel_lock(main_mutex);
464
443
  D_ASSERT(running_threads > 0);
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
572
551
  }
573
552
  // set up the current buffer
574
553
  line_info.current_batches[file_index - 1].insert(local_batch_index);
554
+ idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
575
555
  auto result = make_uniq<CSVBufferRead>(
576
556
  buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
577
557
  next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta
1135
1115
  if (StringUtil::EndsWith(lower_name, ".gz")) {
1136
1116
  lower_name = lower_name.substr(0, lower_name.size() - 3);
1137
1117
  } else if (StringUtil::EndsWith(lower_name, ".zst")) {
1118
+ if (!Catalog::TryAutoLoad(context, "parquet")) {
1119
+ throw MissingExtensionException("parquet extension is required for reading zst compressed file");
1120
+ }
1138
1121
  lower_name = lower_name.substr(0, lower_name.size() - 4);
1139
1122
  }
1140
1123
  if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.8.2-dev4376"
2
+ #define DUCKDB_VERSION "0.8.2-dev4474"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "312b995450"
5
+ #define DUCKDB_SOURCE_ID "ba71015ee7"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -447,7 +447,7 @@ struct StructVector {
447
447
  DUCKDB_API static vector<unique_ptr<Vector>> &GetEntries(Vector &vector);
448
448
  };
449
449
 
450
- enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP };
450
+ enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP, TAG_MISMATCH };
451
451
 
452
452
  struct UnionVector {
453
453
  // Unions are stored as structs, but the first child is always the "tag"
@@ -81,4 +81,36 @@ public:
81
81
  unique_ptr<FunctionLocalState> value_state;
82
82
  };
83
83
 
84
+ struct UnionBoundCastData : public BoundCastData {
85
+ UnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
86
+ BoundCastInfo member_cast_info)
87
+ : tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
88
+ member_cast_info(std::move(member_cast_info)) {
89
+ }
90
+
91
+ union_tag_t tag;
92
+ string name;
93
+ LogicalType type;
94
+ int64_t cost;
95
+ BoundCastInfo member_cast_info;
96
+
97
+ public:
98
+ unique_ptr<BoundCastData> Copy() const override {
99
+ return make_uniq<UnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
100
+ }
101
+
102
+ static bool SortByCostAscending(const UnionBoundCastData &left, const UnionBoundCastData &right) {
103
+ return left.cost < right.cost;
104
+ }
105
+ };
106
+
107
+ struct StructToUnionCast {
108
+ public:
109
+ static bool AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target);
110
+ static bool Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters);
111
+ static unique_ptr<BoundCastData> BindData(BindCastInput &input, const LogicalType &source,
112
+ const LogicalType &target);
113
+ static BoundCastInfo Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target);
114
+ };
115
+
84
116
  } // namespace duckdb
@@ -99,7 +99,9 @@ typedef void (*copy_flush_batch_t)(ClientContext &context, FunctionData &bind_da
99
99
  PreparedBatchData &batch);
100
100
  typedef idx_t (*copy_desired_batch_size_t)(ClientContext &context, FunctionData &bind_data);
101
101
 
102
- typedef bool (*copy_supports_type_t)(const LogicalType &type);
102
+ enum class CopyTypeSupport { SUPPORTED, LOSSY, UNSUPPORTED };
103
+
104
+ typedef CopyTypeSupport (*copy_supports_type_t)(const LogicalType &type);
103
105
 
104
106
  class CopyFunction : public Function {
105
107
  public:
@@ -40,7 +40,7 @@ public:
40
40
  vector<string> names;
41
41
 
42
42
  public:
43
- DUCKDB_API void ThrowError(const string &prepended_message = "") const;
43
+ [[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
44
44
  DUCKDB_API void SetError(PreservedError error);
45
45
  DUCKDB_API bool HasError() const;
46
46
  DUCKDB_API const ExceptionType &GetErrorType() const;
@@ -36,7 +36,7 @@ string KeywordHelper::EscapeQuotes(const string &text, char quote) {
36
36
  string KeywordHelper::WriteQuoted(const string &text, char quote) {
37
37
  // 1. Escapes all occurences of 'quote' by doubling them (escape in SQL)
38
38
  // 2. Adds quotes around the string
39
- return string(1, quote) + EscapeQuotes(text) + string(1, quote);
39
+ return string(1, quote) + EscapeQuotes(text, quote) + string(1, quote);
40
40
  }
41
41
 
42
42
  string KeywordHelper::WriteOptionallyQuoted(const string &text, char quote, bool allow_caps) {
@@ -115,7 +115,95 @@ string CreateFileName(const string &id_suffix, TableCatalogEntry &table, const s
115
115
  return StringUtil::Format("%s_%s%s.%s", schema, name, id_suffix, extension);
116
116
  }
117
117
 
118
- unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_ptr<ParsedExpression>> select_list) {
118
+ static bool IsSupported(CopyTypeSupport support_level) {
119
+ // For export purposes we don't want to lose information, so we only accept fully supported types
120
+ return support_level == CopyTypeSupport::SUPPORTED;
121
+ }
122
+
123
+ static LogicalType AlterLogicalType(const LogicalType &original, copy_supports_type_t type_check) {
124
+ D_ASSERT(type_check);
125
+ auto id = original.id();
126
+ switch (id) {
127
+ case LogicalTypeId::LIST: {
128
+ auto child = AlterLogicalType(ListType::GetChildType(original), type_check);
129
+ return LogicalType::LIST(child);
130
+ }
131
+ case LogicalTypeId::STRUCT: {
132
+ auto &original_children = StructType::GetChildTypes(original);
133
+ child_list_t<LogicalType> new_children;
134
+ for (auto &child : original_children) {
135
+ auto &child_name = child.first;
136
+ auto &child_type = child.second;
137
+
138
+ LogicalType new_type;
139
+ if (!IsSupported(type_check(child_type))) {
140
+ new_type = AlterLogicalType(child_type, type_check);
141
+ } else {
142
+ new_type = child_type;
143
+ }
144
+ new_children.push_back(std::make_pair(child_name, new_type));
145
+ }
146
+ return LogicalType::STRUCT(std::move(new_children));
147
+ }
148
+ case LogicalTypeId::UNION: {
149
+ auto member_count = UnionType::GetMemberCount(original);
150
+ child_list_t<LogicalType> new_children;
151
+ for (idx_t i = 0; i < member_count; i++) {
152
+ auto &child_name = UnionType::GetMemberName(original, i);
153
+ auto &child_type = UnionType::GetMemberType(original, i);
154
+
155
+ LogicalType new_type;
156
+ if (!IsSupported(type_check(child_type))) {
157
+ new_type = AlterLogicalType(child_type, type_check);
158
+ } else {
159
+ new_type = child_type;
160
+ }
161
+
162
+ new_children.push_back(std::make_pair(child_name, new_type));
163
+ }
164
+ return LogicalType::UNION(std::move(new_children));
165
+ }
166
+ case LogicalTypeId::MAP: {
167
+ auto &key_type = MapType::KeyType(original);
168
+ auto &value_type = MapType::ValueType(original);
169
+
170
+ LogicalType new_key_type;
171
+ LogicalType new_value_type;
172
+ if (!IsSupported(type_check(key_type))) {
173
+ new_key_type = AlterLogicalType(key_type, type_check);
174
+ } else {
175
+ new_key_type = key_type;
176
+ }
177
+
178
+ if (!IsSupported(type_check(value_type))) {
179
+ new_value_type = AlterLogicalType(value_type, type_check);
180
+ } else {
181
+ new_value_type = value_type;
182
+ }
183
+ return LogicalType::MAP(new_key_type, new_value_type);
184
+ }
185
+ default: {
186
+ D_ASSERT(!IsSupported(type_check(original)));
187
+ return LogicalType::VARCHAR;
188
+ }
189
+ }
190
+ }
191
+
192
+ static bool NeedsCast(LogicalType &type, copy_supports_type_t type_check) {
193
+ if (!type_check) {
194
+ return false;
195
+ }
196
+ if (IsSupported(type_check(type))) {
197
+ // The type is supported in it's entirety, no cast is required
198
+ return false;
199
+ }
200
+ // Change the type to something that is supported
201
+ type = AlterLogicalType(type, type_check);
202
+ return true;
203
+ }
204
+
205
+ static unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, child_list_t<LogicalType> &select_list,
206
+ copy_supports_type_t type_check) {
119
207
  auto ref = make_uniq<BaseTableRef>();
120
208
  ref->catalog_name = stmt.info->catalog;
121
209
  ref->schema_name = stmt.info->schema;
@@ -123,7 +211,21 @@ unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_p
123
211
 
124
212
  auto statement = make_uniq<SelectNode>();
125
213
  statement->from_table = std::move(ref);
126
- statement->select_list = std::move(select_list);
214
+
215
+ vector<unique_ptr<ParsedExpression>> expressions;
216
+ for (auto &col : select_list) {
217
+ auto &name = col.first;
218
+ auto &type = col.second;
219
+
220
+ auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(name);
221
+ if (NeedsCast(type, type_check)) {
222
+ // Add a cast to a type supported by the copy function
223
+ expression = make_uniq_base<ParsedExpression, CastExpression>(type, std::move(expression));
224
+ }
225
+ expressions.push_back(std::move(expression));
226
+ }
227
+
228
+ statement->select_list = std::move(expressions);
127
229
  return std::move(statement);
128
230
  }
129
231
 
@@ -194,16 +296,10 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
194
296
  info->table = table.name;
195
297
 
196
298
  // We can not export generated columns
197
- vector<unique_ptr<ParsedExpression>> expressions;
299
+ child_list_t<LogicalType> select_list;
300
+
198
301
  for (auto &col : table.GetColumns().Physical()) {
199
- auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(col.GetName());
200
- auto is_supported = copy_function.function.supports_type;
201
- if (is_supported && !is_supported(col.Type())) {
202
- expression =
203
- make_uniq_base<ParsedExpression, CastExpression>(LogicalType::VARCHAR, std::move(expression));
204
- }
205
- expressions.push_back(std::move(expression));
206
- info->select_list.push_back(col.GetName());
302
+ select_list.push_back(std::make_pair(col.Name(), col.Type()));
207
303
  }
208
304
 
209
305
  ExportedTableData exported_data;
@@ -220,7 +316,8 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
220
316
  // generate the copy statement and bind it
221
317
  CopyStatement copy_stmt;
222
318
  copy_stmt.info = std::move(info);
223
- copy_stmt.select_statement = CreateSelectStatement(copy_stmt, std::move(expressions));
319
+ copy_stmt.select_statement =
320
+ CreateSelectStatement(copy_stmt, select_list, copy_function.function.supports_type);
224
321
 
225
322
  auto copy_binder = Binder::CreateBinder(context, this);
226
323
  auto bound_statement = copy_binder->Bind(copy_stmt);
@@ -363,63 +363,64 @@ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog, Serializer &
363
363
 
364
364
  void CheckpointReader::ReadIndex(ClientContext &context, Deserializer &deserializer) {
365
365
 
366
- // Deserialize the index metadata
367
- auto info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
368
- auto &index_info = info->Cast<CreateIndexInfo>();
369
-
370
- // Create the index in the catalog
371
- auto &schema_catalog = catalog.GetSchema(context, info->schema);
372
- auto &table_catalog =
373
- catalog.GetEntry(context, CatalogType::TABLE_ENTRY, info->schema, index_info.table).Cast<DuckTableEntry>();
374
- auto &index_catalog = schema_catalog.CreateIndex(context, index_info, table_catalog)->Cast<DuckIndexEntry>();
375
- index_catalog.info = table_catalog.GetStorage().info;
376
-
377
- // We deserialize the index lazily, i.e., we do not need to load any node information
366
+ // deserialize the index create info
367
+ auto create_info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
368
+ auto &info = create_info->Cast<CreateIndexInfo>();
369
+
370
+ // create the index in the catalog
371
+ auto &schema = catalog.GetSchema(context, create_info->schema);
372
+ auto &table =
373
+ catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info->schema, info.table).Cast<DuckTableEntry>();
374
+
375
+ auto &index = schema.CreateIndex(context, info, table)->Cast<DuckIndexEntry>();
376
+
377
+ index.info = table.GetStorage().info;
378
+ // insert the parsed expressions into the stored index so that we correctly (de)serialize it during consecutive
379
+ // checkpoints
380
+ for (auto &parsed_expr : info.parsed_expressions) {
381
+ index.parsed_expressions.push_back(parsed_expr->Copy());
382
+ }
383
+
384
+ // we deserialize the index lazily, i.e., we do not need to load any node information
378
385
  // except the root block pointer
379
- auto index_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
386
+ auto root_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
380
387
 
381
- // obtain the expressions of the ART from the index metadata
382
- vector<unique_ptr<Expression>> unbound_expressions;
388
+ // obtain the parsed expressions of the ART from the index metadata
383
389
  vector<unique_ptr<ParsedExpression>> parsed_expressions;
384
- for (auto &p_exp : index_info.parsed_expressions) {
385
- parsed_expressions.push_back(p_exp->Copy());
390
+ for (auto &parsed_expr : info.parsed_expressions) {
391
+ parsed_expressions.push_back(parsed_expr->Copy());
386
392
  }
393
+ D_ASSERT(!parsed_expressions.empty());
387
394
 
388
- // bind the parsed expressions
389
- // add the table to the bind context
395
+ // add the table to the bind context to bind the parsed expressions
390
396
  auto binder = Binder::CreateBinder(context);
391
397
  vector<LogicalType> column_types;
392
398
  vector<string> column_names;
393
- for (auto &col : table_catalog.GetColumns().Logical()) {
399
+ for (auto &col : table.GetColumns().Logical()) {
394
400
  column_types.push_back(col.Type());
395
401
  column_names.push_back(col.Name());
396
402
  }
403
+
404
+ // create a binder to bind the parsed expressions
397
405
  vector<column_t> column_ids;
398
- binder->bind_context.AddBaseTable(0, index_info.table, column_names, column_types, column_ids, &table_catalog);
406
+ binder->bind_context.AddBaseTable(0, info.table, column_names, column_types, column_ids, &table);
399
407
  IndexBinder idx_binder(*binder, context);
408
+
409
+ // bind the parsed expressions to create unbound expressions
410
+ vector<unique_ptr<Expression>> unbound_expressions;
400
411
  unbound_expressions.reserve(parsed_expressions.size());
401
412
  for (auto &expr : parsed_expressions) {
402
413
  unbound_expressions.push_back(idx_binder.Bind(expr));
403
414
  }
404
415
 
405
- if (parsed_expressions.empty()) {
406
- // this is a PK/FK index: we create the necessary bound column ref expressions
407
- unbound_expressions.reserve(index_info.column_ids.size());
408
- for (idx_t key_nr = 0; key_nr < index_info.column_ids.size(); key_nr++) {
409
- auto &col = table_catalog.GetColumn(LogicalIndex(index_info.column_ids[key_nr]));
410
- unbound_expressions.push_back(
411
- make_uniq<BoundColumnRefExpression>(col.GetName(), col.GetType(), ColumnBinding(0, key_nr)));
412
- }
413
- }
414
-
415
416
  // create the index and add it to the storage
416
- switch (index_info.index_type) {
417
+ switch (info.index_type) {
417
418
  case IndexType::ART: {
418
- auto &storage = table_catalog.GetStorage();
419
- auto art = make_uniq<ART>(index_info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
420
- index_info.constraint_type, storage.db, nullptr, index_block_pointer);
419
+ auto &storage = table.GetStorage();
420
+ auto art = make_uniq<ART>(info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
421
+ info.constraint_type, storage.db, nullptr, root_block_pointer);
421
422
 
422
- index_catalog.index = art.get();
423
+ index.index = art.get();
423
424
  storage.info->indexes.AddIndex(std::move(art));
424
425
  } break;
425
426
  default:
@@ -292,7 +292,11 @@ void RLESkip(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count) {
292
292
  scan_state.Skip(segment, skip_count);
293
293
  }
294
294
 
295
+ template <bool ENTIRE_VECTOR>
295
296
  static bool CanEmitConstantVector(idx_t position, idx_t run_length, idx_t scan_count) {
297
+ if (!ENTIRE_VECTOR) {
298
+ return false;
299
+ }
296
300
  if (scan_count != STANDARD_VECTOR_SIZE) {
297
301
  // Only when we can fill an entire Vector can we emit a ConstantVector, because subsequent scans require the
298
302
  // input Vector to be flat
@@ -330,9 +334,9 @@ static void RLEScanConstant(RLEScanState<T> &scan_state, rle_count_t *index_poin
330
334
  return;
331
335
  }
332
336
 
333
- template <class T>
334
- void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
335
- idx_t result_offset) {
337
+ template <class T, bool ENTIRE_VECTOR>
338
+ void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
339
+ idx_t result_offset) {
336
340
  auto &scan_state = state.scan_state->Cast<RLEScanState<T>>();
337
341
 
338
342
  auto data = scan_state.handle.Ptr() + segment.GetBlockOffset();
@@ -340,7 +344,8 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
340
344
  auto index_pointer = reinterpret_cast<rle_count_t *>(data + scan_state.rle_count_offset);
341
345
 
342
346
  // If we are scanning an entire Vector and it contains only a single run
343
- if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], scan_count)) {
347
+ if (CanEmitConstantVector<ENTIRE_VECTOR>(scan_state.position_in_entry, index_pointer[scan_state.entry_pos],
348
+ scan_count)) {
344
349
  RLEScanConstant<T>(scan_state, index_pointer, data_pointer, scan_count, result);
345
350
  return;
346
351
  }
@@ -357,9 +362,15 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
357
362
  }
358
363
  }
359
364
 
365
+ template <class T>
366
+ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
367
+ idx_t result_offset) {
368
+ return RLEScanPartialInternal<T, false>(segment, state, scan_count, result, result_offset);
369
+ }
370
+
360
371
  template <class T>
361
372
  void RLEScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result) {
362
- RLEScanPartial<T>(segment, state, scan_count, result, 0);
373
+ RLEScanPartialInternal<T, true>(segment, state, scan_count, result, 0);
363
374
  }
364
375
 
365
376
  //===--------------------------------------------------------------------===//
@@ -159,7 +159,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
159
159
  AppendToIndexes(transaction, *row_groups, table.info->indexes, table.GetTypes(), append_state.current_row);
160
160
  }
161
161
  if (error) {
162
- // need to revert the append
162
+ // need to revert all appended row ids
163
163
  row_t current_row = append_state.row_start;
164
164
  // remove the data from the indexes, if there are any indexes
165
165
  row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
@@ -184,6 +184,13 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
184
184
  if (append_to_table) {
185
185
  table.RevertAppendInternal(append_state.row_start, append_count);
186
186
  }
187
+
188
+ // we need to vacuum the indexes to remove any buffers that are now empty
189
+ // due to reverting the appends
190
+ table.info->indexes.Scan([&](Index &index) {
191
+ index.Vacuum();
192
+ return false;
193
+ });
187
194
  error.Throw();
188
195
  }
189
196
  }
@@ -0,0 +1,2 @@
1
+ #include "src/function/cast/union/from_struct.cpp"
2
+
@@ -652,7 +652,16 @@ describe('prepare', function() {
652
652
  });
653
653
  it("should aggregate kurtosis(num)", function (done) {
654
654
  db.all("SELECT kurtosis(num) as kurtosis FROM foo", function (err: null | Error, res: TableData) {
655
- assert.equal(res[0].kurtosis, -1.1999999999999997);
655
+ // The `num` column of table `foo` contains each integer from 0 to 999,999 exactly once.
656
+ // This is a uniform distribution. The excess kurtosis for a uniform distribution is exactly -1.2.
657
+ // See https://en.wikipedia.org/wiki/Kurtosis#Other_well-known_distributions
658
+ const expected = -1.2;
659
+
660
+ // The calculated value can differ from the exact answer by small amounts on different platforms due
661
+ // to floating-point errors. This tolerance was determined experimentally.
662
+ const tolerance = Number.EPSILON * 10;
663
+
664
+ assert.ok(Math.abs(res[0].kurtosis - expected) < tolerance);
656
665
  done(err);
657
666
  });
658
667
  });
@@ -90,7 +90,7 @@ const correct_answer_map: Record<string, any[]> = {
90
90
  date_array: [
91
91
  [],
92
92
  [
93
- new Date(1970, 0, 1),
93
+ new Date(Date.UTC(1970, 0, 1)),
94
94
  null,
95
95
  new Date("0001-01-01T00:00:00.000Z"),
96
96
  new Date("9999-12-31T00:00:00.000Z"),
@@ -100,7 +100,7 @@ const correct_answer_map: Record<string, any[]> = {
100
100
  timestamp_array: [
101
101
  [],
102
102
  [
103
- new Date(1970, 0, 1),
103
+ new Date(Date.UTC(1970, 0, 1)),
104
104
  null,
105
105
  new Date("0001-01-01T00:00:00.000Z"),
106
106
  new Date("9999-12-31T23:59:59.999Z"),
@@ -111,7 +111,7 @@ const correct_answer_map: Record<string, any[]> = {
111
111
  timestamptz_array: [
112
112
  [],
113
113
  [
114
- new Date(1970, 0, 1),
114
+ new Date(Date.UTC(1970, 0, 1)),
115
115
  null,
116
116
  new Date("0001-01-01T00:00:00.000Z"),
117
117
  new Date("9999-12-31T23:59:59.999Z"),
@@ -171,7 +171,7 @@ const correct_answer_map: Record<string, any[]> = {
171
171
  ],
172
172
 
173
173
  timestamp: [
174
- new Date("1990-01-01T00:00"),
174
+ new Date(Date.UTC(1990, 0, 1)),
175
175
  new Date("9999-12-31T23:59:59.000Z"),
176
176
  null,
177
177
  ],