duckdb 0.8.2-dev4424.0 → 0.8.2-dev4474.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/binding.gyp CHANGED
@@ -80,6 +80,7 @@
80
80
  "src/duckdb/ub_src_function_aggregate.cpp",
81
81
  "src/duckdb/ub_src_function.cpp",
82
82
  "src/duckdb/ub_src_function_cast.cpp",
83
+ "src/duckdb/ub_src_function_cast_union.cpp",
83
84
  "src/duckdb/ub_src_function_pragma.cpp",
84
85
  "src/duckdb/ub_src_function_scalar_compressed_materialization.cpp",
85
86
  "src/duckdb/ub_src_function_scalar.cpp",
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.8.2-dev4424.0",
5
+ "version": "0.8.2-dev4474.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -1825,7 +1825,7 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
1825
1825
  }
1826
1826
  }
1827
1827
 
1828
- if (type.id() == LogicalTypeId::STRUCT) {
1828
+ if (type.id() == LogicalTypeId::STRUCT || type.id() == LogicalTypeId::UNION) {
1829
1829
  auto &child_types = StructType::GetChildTypes(type);
1830
1830
  // set up the schema element for this struct
1831
1831
  duckdb_parquet::format::SchemaElement schema_element;
@@ -15,6 +15,7 @@
15
15
  #include "duckdb/common/mutex.hpp"
16
16
  #include "duckdb/common/serializer/buffered_file_writer.hpp"
17
17
  #include "duckdb/common/types/column/column_data_collection.hpp"
18
+ #include "duckdb/function/copy_function.hpp"
18
19
  #endif
19
20
 
20
21
  #include "column_writer.hpp"
@@ -75,11 +76,11 @@ public:
75
76
  return *writer;
76
77
  }
77
78
 
78
- static bool TypeIsSupported(const LogicalType &type);
79
+ static CopyTypeSupport TypeIsSupported(const LogicalType &type);
79
80
 
80
81
  private:
81
- static bool DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
82
- duckdb_parquet::format::Type::type &type);
82
+ static CopyTypeSupport DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
83
+ duckdb_parquet::format::Type::type &type);
83
84
  string file_name;
84
85
  vector<LogicalType> sql_types;
85
86
  vector<string> column_names;
@@ -77,7 +77,8 @@ private:
77
77
  WriteStream &serializer;
78
78
  };
79
79
 
80
- bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type, Type::type &parquet_type) {
80
+ CopyTypeSupport ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
81
+ Type::type &parquet_type) {
81
82
  switch (duckdb_type.id()) {
82
83
  case LogicalTypeId::BOOLEAN:
83
84
  parquet_type = Type::BOOLEAN;
@@ -95,9 +96,11 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
95
96
  parquet_type = Type::FLOAT;
96
97
  break;
97
98
  case LogicalTypeId::DOUBLE:
98
- case LogicalTypeId::HUGEINT:
99
99
  parquet_type = Type::DOUBLE;
100
100
  break;
101
+ case LogicalTypeId::HUGEINT:
102
+ parquet_type = Type::DOUBLE;
103
+ return CopyTypeSupport::LOSSY;
101
104
  case LogicalTypeId::ENUM:
102
105
  case LogicalTypeId::BLOB:
103
106
  case LogicalTypeId::VARCHAR:
@@ -141,47 +144,62 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
141
144
  }
142
145
  break;
143
146
  default:
144
- // Anything that is not supported returns false
145
- return false;
147
+ // Anything that is not supported
148
+ return CopyTypeSupport::UNSUPPORTED;
146
149
  }
147
- return true;
150
+ return CopyTypeSupport::SUPPORTED;
148
151
  }
149
152
 
150
153
  Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type) {
151
154
  Type::type result;
152
- if (!DuckDBTypeToParquetTypeInternal(duckdb_type, result)) {
155
+ auto type_supports = DuckDBTypeToParquetTypeInternal(duckdb_type, result);
156
+ if (type_supports == CopyTypeSupport::UNSUPPORTED) {
153
157
  throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
154
158
  }
155
159
  return result;
156
160
  }
157
161
 
158
- bool ParquetWriter::TypeIsSupported(const LogicalType &type) {
162
+ CopyTypeSupport ParquetWriter::TypeIsSupported(const LogicalType &type) {
159
163
  Type::type unused;
160
164
  auto id = type.id();
161
165
  if (id == LogicalTypeId::LIST) {
162
166
  auto &child_type = ListType::GetChildType(type);
163
167
  return TypeIsSupported(child_type);
164
168
  }
169
+ if (id == LogicalTypeId::UNION) {
170
+ auto count = UnionType::GetMemberCount(type);
171
+ for (idx_t i = 0; i < count; i++) {
172
+ auto &member_type = UnionType::GetMemberType(type, i);
173
+ auto type_support = TypeIsSupported(member_type);
174
+ if (type_support != CopyTypeSupport::SUPPORTED) {
175
+ return type_support;
176
+ }
177
+ }
178
+ return CopyTypeSupport::SUPPORTED;
179
+ }
165
180
  if (id == LogicalTypeId::STRUCT) {
166
181
  auto &children = StructType::GetChildTypes(type);
167
182
  for (auto &child : children) {
168
183
  auto &child_type = child.second;
169
- if (!TypeIsSupported(child_type)) {
170
- return false;
184
+ auto type_support = TypeIsSupported(child_type);
185
+ if (type_support != CopyTypeSupport::SUPPORTED) {
186
+ return type_support;
171
187
  }
172
188
  }
173
- return true;
189
+ return CopyTypeSupport::SUPPORTED;
174
190
  }
175
191
  if (id == LogicalTypeId::MAP) {
176
192
  auto &key_type = MapType::KeyType(type);
177
193
  auto &value_type = MapType::ValueType(type);
178
- if (!TypeIsSupported(key_type)) {
179
- return false;
194
+ auto key_type_support = TypeIsSupported(key_type);
195
+ if (key_type_support != CopyTypeSupport::SUPPORTED) {
196
+ return key_type_support;
180
197
  }
181
- if (!TypeIsSupported(value_type)) {
182
- return false;
198
+ auto value_type_support = TypeIsSupported(value_type);
199
+ if (value_type_support != CopyTypeSupport::SUPPORTED) {
200
+ return value_type_support;
183
201
  }
184
- return true;
202
+ return CopyTypeSupport::SUPPORTED;
185
203
  }
186
204
  return DuckDBTypeToParquetTypeInternal(type, unused);
187
205
  }
@@ -5974,6 +5974,8 @@ const char* EnumUtil::ToChars<UnionInvalidReason>(UnionInvalidReason value) {
5974
5974
  return "NO_MEMBERS";
5975
5975
  case UnionInvalidReason::VALIDITY_OVERLAP:
5976
5976
  return "VALIDITY_OVERLAP";
5977
+ case UnionInvalidReason::TAG_MISMATCH:
5978
+ return "TAG_MISMATCH";
5977
5979
  default:
5978
5980
  throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
5979
5981
  }
@@ -5993,6 +5995,9 @@ UnionInvalidReason EnumUtil::FromString<UnionInvalidReason>(const char *value) {
5993
5995
  if (StringUtil::Equals(value, "VALIDITY_OVERLAP")) {
5994
5996
  return UnionInvalidReason::VALIDITY_OVERLAP;
5995
5997
  }
5998
+ if (StringUtil::Equals(value, "TAG_MISMATCH")) {
5999
+ return UnionInvalidReason::TAG_MISMATCH;
6000
+ }
5996
6001
  throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
5997
6002
  }
5998
6003
 
@@ -2007,6 +2007,9 @@ UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector, idx_t count,
2007
2007
  return UnionInvalidReason::VALIDITY_OVERLAP;
2008
2008
  }
2009
2009
  found_valid = true;
2010
+ if (tag != static_cast<union_tag_t>(member_idx)) {
2011
+ return UnionInvalidReason::TAG_MISMATCH;
2012
+ }
2010
2013
  }
2011
2014
  }
2012
2015
  }
@@ -398,7 +398,7 @@ string LogicalType::ToString() const {
398
398
  if (i > 0) {
399
399
  ret += ", ";
400
400
  }
401
- ret += "'" + KeywordHelper::WriteOptionallyQuoted(EnumType::GetString(*this, i).GetString(), '\'') + "'";
401
+ ret += KeywordHelper::WriteQuoted(EnumType::GetString(*this, i).GetString(), '\'');
402
402
  }
403
403
  ret += ")";
404
404
  return ret;
@@ -55,6 +55,9 @@ struct SniffDialect {
55
55
  if (machine.state == CSVState::INVALID) {
56
56
  return;
57
57
  }
58
+ if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
59
+ sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
60
+ }
58
61
  if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
59
62
  sniffed_column_counts[machine.cur_rows++] = machine.column_count;
60
63
  }
@@ -148,12 +148,18 @@ void CSVSniffer::DetectHeader() {
148
148
  names.push_back(col_name);
149
149
  name_collision_count[col_name] = 0;
150
150
  }
151
+ if (best_header_row.size() < best_candidate->dialect_options.num_cols && options.null_padding) {
152
+ for (idx_t col = best_header_row.size(); col < best_candidate->dialect_options.num_cols; col++) {
153
+ names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
154
+ }
155
+ } else if (best_header_row.size() < best_candidate->dialect_options.num_cols) {
156
+ throw InternalException("Detected header has number of columns inferior to dialect detection");
157
+ }
151
158
 
152
159
  } else {
153
160
  best_candidate->dialect_options.header = false;
154
161
  for (idx_t col = 0; col < best_candidate->dialect_options.num_cols; col++) {
155
- string column_name = GenerateColumnName(best_candidate->dialect_options.num_cols, col);
156
- names.push_back(column_name);
162
+ names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
157
163
  }
158
164
  }
159
165
 
@@ -183,6 +183,10 @@ struct SniffValue {
183
183
  }
184
184
 
185
185
  inline static void Finalize(CSVStateMachine &machine, vector<TupleSniffing> &sniffed_values) {
186
+ if (machine.cur_rows < sniffed_values.size() && machine.state == CSVState::DELIMITER) {
187
+ // Started a new empty value
188
+ sniffed_values[machine.cur_rows].values.push_back(Value(machine.value));
189
+ }
186
190
  if (machine.cur_rows < sniffed_values.size() && machine.state != CSVState::EMPTY_LINE) {
187
191
  machine.VerifyUTF8();
188
192
  sniffed_values[machine.cur_rows].line_number = machine.rows_read;
@@ -300,7 +304,7 @@ void CSVSniffer::DetectTypes() {
300
304
 
301
305
  // Potentially Skip Notes (I also find this dirty, but it is what the original code does)
302
306
  while (true_start < tuples.size()) {
303
- if (tuples[true_start].values.size() < max_columns_found) {
307
+ if (tuples[true_start].values.size() < max_columns_found && !options.null_padding) {
304
308
  true_start = tuples[true_start].line_number;
305
309
  values_start++;
306
310
  } else {
@@ -0,0 +1,114 @@
1
+ #include "duckdb/function/cast/bound_cast_data.hpp"
2
+
3
+ namespace duckdb {
4
+
5
+ bool StructToUnionCast::AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target) {
6
+ if (source.id() != LogicalTypeId::STRUCT) {
7
+ return false;
8
+ }
9
+ auto target_fields = StructType::GetChildTypes(target);
10
+ auto fields = StructType::GetChildTypes(source);
11
+ if (target_fields.size() != fields.size()) {
12
+ // Struct should have the same amount of fields as the union
13
+ return false;
14
+ }
15
+ for (idx_t i = 0; i < target_fields.size(); i++) {
16
+ auto &target_field = target_fields[i].second;
17
+ auto &target_field_name = target_fields[i].first;
18
+ auto &field = fields[i].second;
19
+ auto &field_name = fields[i].first;
20
+ if (i == 0) {
21
+ // For the tag field we don't accept a type substitute as varchar
22
+ if (target_field != field) {
23
+ return false;
24
+ }
25
+ continue;
26
+ }
27
+ if (!StringUtil::CIEquals(target_field_name, field_name)) {
28
+ return false;
29
+ }
30
+ if (target_field != field && field != LogicalType::VARCHAR) {
31
+ // We allow the field to be VARCHAR, since unsupported types get cast to VARCHAR by EXPORT DATABASE (format
32
+ // PARQUET) i.e UNION(a BIT) becomes STRUCT(a VARCHAR)
33
+ return false;
34
+ }
35
+ }
36
+ return true;
37
+ }
38
+
39
+ // Physical Cast execution
40
+
41
+ bool StructToUnionCast::Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
42
+ auto &cast_data = parameters.cast_data->Cast<StructBoundCastData>();
43
+ auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
44
+
45
+ D_ASSERT(source.GetType().id() == LogicalTypeId::STRUCT);
46
+ D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
47
+ D_ASSERT(cast_data.target.id() == LogicalTypeId::UNION);
48
+
49
+ auto &source_children = StructVector::GetEntries(source);
50
+ auto &target_children = StructVector::GetEntries(result);
51
+
52
+ for (idx_t i = 0; i < source_children.size(); i++) {
53
+ auto &result_child_vector = *target_children[i];
54
+ auto &source_child_vector = *source_children[i];
55
+ CastParameters child_parameters(parameters, cast_data.child_cast_info[i].cast_data, lstate.local_states[i]);
56
+ auto converted =
57
+ cast_data.child_cast_info[i].function(source_child_vector, result_child_vector, count, child_parameters);
58
+ (void)converted;
59
+ D_ASSERT(converted);
60
+ }
61
+
62
+ auto check_tags = UnionVector::CheckUnionValidity(result, count);
63
+ switch (check_tags) {
64
+ case UnionInvalidReason::TAG_OUT_OF_RANGE:
65
+ throw ConversionException("One or more of the tags do not point to a valid union member");
66
+ case UnionInvalidReason::VALIDITY_OVERLAP:
67
+ throw ConversionException("One or more rows in the produced UNION have validity set for more than 1 member");
68
+ case UnionInvalidReason::TAG_MISMATCH:
69
+ throw ConversionException(
70
+ "One or more rows in the produced UNION have tags that don't point to the valid member");
71
+ case UnionInvalidReason::VALID:
72
+ break;
73
+ default:
74
+ throw InternalException("Struct to union cast failed for unknown reason");
75
+ }
76
+
77
+ if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
78
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
79
+ ConstantVector::SetNull(result, ConstantVector::IsNull(source));
80
+ } else {
81
+ source.Flatten(count);
82
+ FlatVector::Validity(result) = FlatVector::Validity(source);
83
+ }
84
+ result.Verify(count);
85
+ return true;
86
+ }
87
+
88
+ // Bind cast
89
+
90
+ unique_ptr<BoundCastData> StructToUnionCast::BindData(BindCastInput &input, const LogicalType &source,
91
+ const LogicalType &target) {
92
+ vector<BoundCastInfo> child_cast_info;
93
+ D_ASSERT(source.id() == LogicalTypeId::STRUCT);
94
+ D_ASSERT(target.id() == LogicalTypeId::UNION);
95
+
96
+ auto result_child_count = StructType::GetChildCount(target);
97
+ D_ASSERT(result_child_count == StructType::GetChildCount(source));
98
+
99
+ for (idx_t i = 0; i < result_child_count; i++) {
100
+ auto &source_child = StructType::GetChildType(source, i);
101
+ auto &target_child = StructType::GetChildType(target, i);
102
+
103
+ auto child_cast = input.GetCastFunction(source_child, target_child);
104
+ child_cast_info.push_back(std::move(child_cast));
105
+ }
106
+ return make_uniq<StructBoundCastData>(std::move(child_cast_info), target);
107
+ }
108
+
109
+ BoundCastInfo StructToUnionCast::Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
110
+ auto cast_data = StructToUnionCast::BindData(input, source, target);
111
+ return BoundCastInfo(&StructToUnionCast::Cast, std::move(cast_data), StructBoundCastData::InitStructCastLocalState);
112
+ }
113
+
114
+ } // namespace duckdb
@@ -11,33 +11,10 @@ namespace duckdb {
11
11
  //--------------------------------------------------------------------------------------------------
12
12
  // if the source can be implicitly cast to a member of the target union, the cast is valid
13
13
 
14
- struct ToUnionBoundCastData : public BoundCastData {
15
- ToUnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
16
- BoundCastInfo member_cast_info)
17
- : tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
18
- member_cast_info(std::move(member_cast_info)) {
19
- }
20
-
21
- union_tag_t tag;
22
- string name;
23
- LogicalType type;
24
- int64_t cost;
25
- BoundCastInfo member_cast_info;
26
-
27
- public:
28
- unique_ptr<BoundCastData> Copy() const override {
29
- return make_uniq<ToUnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
30
- }
31
-
32
- static bool SortByCostAscending(const ToUnionBoundCastData &left, const ToUnionBoundCastData &right) {
33
- return left.cost < right.cost;
34
- }
35
- };
36
-
37
14
  unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
38
15
  D_ASSERT(target.id() == LogicalTypeId::UNION);
39
16
 
40
- vector<ToUnionBoundCastData> candidates;
17
+ vector<UnionBoundCastData> candidates;
41
18
 
42
19
  for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(target); member_idx++) {
43
20
  auto member_type = UnionType::GetMemberType(target, member_idx);
@@ -68,7 +45,7 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
68
45
  }
69
46
 
70
47
  // sort the candidate casts by cost
71
- std::sort(candidates.begin(), candidates.end(), ToUnionBoundCastData::SortByCostAscending);
48
+ std::sort(candidates.begin(), candidates.end(), UnionBoundCastData::SortByCostAscending);
72
49
 
73
50
  // select the lowest possible cost cast
74
51
  auto &selected_cast = candidates[0];
@@ -95,11 +72,11 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
95
72
  }
96
73
 
97
74
  // otherwise, return the selected cast
98
- return make_uniq<ToUnionBoundCastData>(std::move(selected_cast));
75
+ return make_uniq<UnionBoundCastData>(std::move(selected_cast));
99
76
  }
100
77
 
101
78
  unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &parameters) {
102
- auto &cast_data = parameters.cast_data->Cast<ToUnionBoundCastData>();
79
+ auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
103
80
  if (!cast_data.member_cast_info.init_local_state) {
104
81
  return nullptr;
105
82
  }
@@ -109,7 +86,7 @@ unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &p
109
86
 
110
87
  static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
111
88
  D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
112
- auto &cast_data = parameters.cast_data->Cast<ToUnionBoundCastData>();
89
+ auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
113
90
  auto &selected_member_vector = UnionVector::GetMember(result, cast_data.tag);
114
91
 
115
92
  CastParameters child_parameters(parameters, cast_data.member_cast_info.cast_data, parameters.local_state);
@@ -127,7 +104,13 @@ static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParamet
127
104
 
128
105
  BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const LogicalType &source,
129
106
  const LogicalType &target) {
130
- return BoundCastInfo(&ToUnionCast, BindToUnionCast(input, source, target), InitToUnionLocalState);
107
+
108
+ D_ASSERT(target.id() == LogicalTypeId::UNION);
109
+ if (StructToUnionCast::AllowImplicitCastFromStruct(source, target)) {
110
+ return StructToUnionCast::Bind(input, source, target);
111
+ }
112
+ auto cast_data = BindToUnionCast(input, source, target);
113
+ return BoundCastInfo(&ToUnionCast, std::move(cast_data), InitToUnionLocalState);
131
114
  }
132
115
 
133
116
  //--------------------------------------------------------------------------------------------------
@@ -143,7 +126,7 @@ BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const Logi
143
126
  // INVALID: UNION(A, B) -> UNION(A, C) if B can't be implicitly cast to C
144
127
  // INVALID: UNION(A, B, D) -> UNION(A, B, C)
145
128
 
146
- struct UnionToUnionBoundCastData : public BoundCastData {
129
+ struct UnionUnionBoundCastData : public BoundCastData {
147
130
 
148
131
  // mapping from source member index to target member index
149
132
  // these are always the same size as the source member count
@@ -153,7 +136,7 @@ struct UnionToUnionBoundCastData : public BoundCastData {
153
136
 
154
137
  LogicalType target_type;
155
138
 
156
- UnionToUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
139
+ UnionUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
157
140
  : tag_map(std::move(tag_map)), member_casts(std::move(member_casts)), target_type(std::move(target_type)) {
158
141
  }
159
142
 
@@ -163,7 +146,7 @@ public:
163
146
  for (auto &member_cast : member_casts) {
164
147
  member_casts_copy.push_back(member_cast.Copy());
165
148
  }
166
- return make_uniq<UnionToUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
149
+ return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
167
150
  }
168
151
  };
169
152
 
@@ -203,11 +186,11 @@ unique_ptr<BoundCastData> BindUnionToUnionCast(BindCastInput &input, const Logic
203
186
  }
204
187
  }
205
188
 
206
- return make_uniq<UnionToUnionBoundCastData>(tag_map, std::move(member_casts), target);
189
+ return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts), target);
207
190
  }
208
191
 
209
192
  unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParameters &parameters) {
210
- auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
193
+ auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
211
194
  auto result = make_uniq<StructCastLocalState>();
212
195
 
213
196
  for (auto &entry : cast_data.member_casts) {
@@ -222,7 +205,7 @@ unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParamete
222
205
  }
223
206
 
224
207
  static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
225
- auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
208
+ auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
226
209
  auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
227
210
 
228
211
  auto source_member_count = UnionType::GetMemberCount(source.GetType());
@@ -313,7 +296,7 @@ static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastPa
313
296
  static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
314
297
  auto constant = source.GetVectorType() == VectorType::CONSTANT_VECTOR;
315
298
  // first cast all union members to varchar
316
- auto &cast_data = parameters.cast_data->Cast<UnionToUnionBoundCastData>();
299
+ auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
317
300
  Vector varchar_union(cast_data.target_type, count);
318
301
 
319
302
  UnionToUnionCast(source, varchar_union, count, parameters);
@@ -356,6 +339,7 @@ static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, Cast
356
339
 
357
340
  BoundCastInfo DefaultCasts::UnionCastSwitch(BindCastInput &input, const LogicalType &source,
358
341
  const LogicalType &target) {
342
+ D_ASSERT(source.id() == LogicalTypeId::UNION);
359
343
  switch (target.id()) {
360
344
  case LogicalTypeId::VARCHAR: {
361
345
  // bind a cast in which we convert all members to VARCHAR first
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.8.2-dev4424"
2
+ #define DUCKDB_VERSION "0.8.2-dev4474"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "b78b24ad26"
5
+ #define DUCKDB_SOURCE_ID "ba71015ee7"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -447,7 +447,7 @@ struct StructVector {
447
447
  DUCKDB_API static vector<unique_ptr<Vector>> &GetEntries(Vector &vector);
448
448
  };
449
449
 
450
- enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP };
450
+ enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP, TAG_MISMATCH };
451
451
 
452
452
  struct UnionVector {
453
453
  // Unions are stored as structs, but the first child is always the "tag"
@@ -81,4 +81,36 @@ public:
81
81
  unique_ptr<FunctionLocalState> value_state;
82
82
  };
83
83
 
84
+ struct UnionBoundCastData : public BoundCastData {
85
+ UnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
86
+ BoundCastInfo member_cast_info)
87
+ : tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
88
+ member_cast_info(std::move(member_cast_info)) {
89
+ }
90
+
91
+ union_tag_t tag;
92
+ string name;
93
+ LogicalType type;
94
+ int64_t cost;
95
+ BoundCastInfo member_cast_info;
96
+
97
+ public:
98
+ unique_ptr<BoundCastData> Copy() const override {
99
+ return make_uniq<UnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
100
+ }
101
+
102
+ static bool SortByCostAscending(const UnionBoundCastData &left, const UnionBoundCastData &right) {
103
+ return left.cost < right.cost;
104
+ }
105
+ };
106
+
107
+ struct StructToUnionCast {
108
+ public:
109
+ static bool AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target);
110
+ static bool Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters);
111
+ static unique_ptr<BoundCastData> BindData(BindCastInput &input, const LogicalType &source,
112
+ const LogicalType &target);
113
+ static BoundCastInfo Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target);
114
+ };
115
+
84
116
  } // namespace duckdb
@@ -99,7 +99,9 @@ typedef void (*copy_flush_batch_t)(ClientContext &context, FunctionData &bind_da
99
99
  PreparedBatchData &batch);
100
100
  typedef idx_t (*copy_desired_batch_size_t)(ClientContext &context, FunctionData &bind_data);
101
101
 
102
- typedef bool (*copy_supports_type_t)(const LogicalType &type);
102
+ enum class CopyTypeSupport { SUPPORTED, LOSSY, UNSUPPORTED };
103
+
104
+ typedef CopyTypeSupport (*copy_supports_type_t)(const LogicalType &type);
103
105
 
104
106
  class CopyFunction : public Function {
105
107
  public:
@@ -36,7 +36,7 @@ string KeywordHelper::EscapeQuotes(const string &text, char quote) {
36
36
  string KeywordHelper::WriteQuoted(const string &text, char quote) {
37
37
  // 1. Escapes all occurences of 'quote' by doubling them (escape in SQL)
38
38
  // 2. Adds quotes around the string
39
- return string(1, quote) + EscapeQuotes(text) + string(1, quote);
39
+ return string(1, quote) + EscapeQuotes(text, quote) + string(1, quote);
40
40
  }
41
41
 
42
42
  string KeywordHelper::WriteOptionallyQuoted(const string &text, char quote, bool allow_caps) {
@@ -115,7 +115,95 @@ string CreateFileName(const string &id_suffix, TableCatalogEntry &table, const s
115
115
  return StringUtil::Format("%s_%s%s.%s", schema, name, id_suffix, extension);
116
116
  }
117
117
 
118
- unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_ptr<ParsedExpression>> select_list) {
118
+ static bool IsSupported(CopyTypeSupport support_level) {
119
+ // For export purposes we don't want to lose information, so we only accept fully supported types
120
+ return support_level == CopyTypeSupport::SUPPORTED;
121
+ }
122
+
123
+ static LogicalType AlterLogicalType(const LogicalType &original, copy_supports_type_t type_check) {
124
+ D_ASSERT(type_check);
125
+ auto id = original.id();
126
+ switch (id) {
127
+ case LogicalTypeId::LIST: {
128
+ auto child = AlterLogicalType(ListType::GetChildType(original), type_check);
129
+ return LogicalType::LIST(child);
130
+ }
131
+ case LogicalTypeId::STRUCT: {
132
+ auto &original_children = StructType::GetChildTypes(original);
133
+ child_list_t<LogicalType> new_children;
134
+ for (auto &child : original_children) {
135
+ auto &child_name = child.first;
136
+ auto &child_type = child.second;
137
+
138
+ LogicalType new_type;
139
+ if (!IsSupported(type_check(child_type))) {
140
+ new_type = AlterLogicalType(child_type, type_check);
141
+ } else {
142
+ new_type = child_type;
143
+ }
144
+ new_children.push_back(std::make_pair(child_name, new_type));
145
+ }
146
+ return LogicalType::STRUCT(std::move(new_children));
147
+ }
148
+ case LogicalTypeId::UNION: {
149
+ auto member_count = UnionType::GetMemberCount(original);
150
+ child_list_t<LogicalType> new_children;
151
+ for (idx_t i = 0; i < member_count; i++) {
152
+ auto &child_name = UnionType::GetMemberName(original, i);
153
+ auto &child_type = UnionType::GetMemberType(original, i);
154
+
155
+ LogicalType new_type;
156
+ if (!IsSupported(type_check(child_type))) {
157
+ new_type = AlterLogicalType(child_type, type_check);
158
+ } else {
159
+ new_type = child_type;
160
+ }
161
+
162
+ new_children.push_back(std::make_pair(child_name, new_type));
163
+ }
164
+ return LogicalType::UNION(std::move(new_children));
165
+ }
166
+ case LogicalTypeId::MAP: {
167
+ auto &key_type = MapType::KeyType(original);
168
+ auto &value_type = MapType::ValueType(original);
169
+
170
+ LogicalType new_key_type;
171
+ LogicalType new_value_type;
172
+ if (!IsSupported(type_check(key_type))) {
173
+ new_key_type = AlterLogicalType(key_type, type_check);
174
+ } else {
175
+ new_key_type = key_type;
176
+ }
177
+
178
+ if (!IsSupported(type_check(value_type))) {
179
+ new_value_type = AlterLogicalType(value_type, type_check);
180
+ } else {
181
+ new_value_type = value_type;
182
+ }
183
+ return LogicalType::MAP(new_key_type, new_value_type);
184
+ }
185
+ default: {
186
+ D_ASSERT(!IsSupported(type_check(original)));
187
+ return LogicalType::VARCHAR;
188
+ }
189
+ }
190
+ }
191
+
192
+ static bool NeedsCast(LogicalType &type, copy_supports_type_t type_check) {
193
+ if (!type_check) {
194
+ return false;
195
+ }
196
+ if (IsSupported(type_check(type))) {
197
+ // The type is supported in it's entirety, no cast is required
198
+ return false;
199
+ }
200
+ // Change the type to something that is supported
201
+ type = AlterLogicalType(type, type_check);
202
+ return true;
203
+ }
204
+
205
+ static unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, child_list_t<LogicalType> &select_list,
206
+ copy_supports_type_t type_check) {
119
207
  auto ref = make_uniq<BaseTableRef>();
120
208
  ref->catalog_name = stmt.info->catalog;
121
209
  ref->schema_name = stmt.info->schema;
@@ -123,7 +211,21 @@ unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_p
123
211
 
124
212
  auto statement = make_uniq<SelectNode>();
125
213
  statement->from_table = std::move(ref);
126
- statement->select_list = std::move(select_list);
214
+
215
+ vector<unique_ptr<ParsedExpression>> expressions;
216
+ for (auto &col : select_list) {
217
+ auto &name = col.first;
218
+ auto &type = col.second;
219
+
220
+ auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(name);
221
+ if (NeedsCast(type, type_check)) {
222
+ // Add a cast to a type supported by the copy function
223
+ expression = make_uniq_base<ParsedExpression, CastExpression>(type, std::move(expression));
224
+ }
225
+ expressions.push_back(std::move(expression));
226
+ }
227
+
228
+ statement->select_list = std::move(expressions);
127
229
  return std::move(statement);
128
230
  }
129
231
 
@@ -194,16 +296,10 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
194
296
  info->table = table.name;
195
297
 
196
298
  // We can not export generated columns
197
- vector<unique_ptr<ParsedExpression>> expressions;
299
+ child_list_t<LogicalType> select_list;
300
+
198
301
  for (auto &col : table.GetColumns().Physical()) {
199
- auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(col.GetName());
200
- auto is_supported = copy_function.function.supports_type;
201
- if (is_supported && !is_supported(col.Type())) {
202
- expression =
203
- make_uniq_base<ParsedExpression, CastExpression>(LogicalType::VARCHAR, std::move(expression));
204
- }
205
- expressions.push_back(std::move(expression));
206
- info->select_list.push_back(col.GetName());
302
+ select_list.push_back(std::make_pair(col.Name(), col.Type()));
207
303
  }
208
304
 
209
305
  ExportedTableData exported_data;
@@ -220,7 +316,8 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
220
316
  // generate the copy statement and bind it
221
317
  CopyStatement copy_stmt;
222
318
  copy_stmt.info = std::move(info);
223
- copy_stmt.select_statement = CreateSelectStatement(copy_stmt, std::move(expressions));
319
+ copy_stmt.select_statement =
320
+ CreateSelectStatement(copy_stmt, select_list, copy_function.function.supports_type);
224
321
 
225
322
  auto copy_binder = Binder::CreateBinder(context, this);
226
323
  auto bound_statement = copy_binder->Bind(copy_stmt);
@@ -292,7 +292,11 @@ void RLESkip(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count) {
292
292
  scan_state.Skip(segment, skip_count);
293
293
  }
294
294
 
295
+ template <bool ENTIRE_VECTOR>
295
296
  static bool CanEmitConstantVector(idx_t position, idx_t run_length, idx_t scan_count) {
297
+ if (!ENTIRE_VECTOR) {
298
+ return false;
299
+ }
296
300
  if (scan_count != STANDARD_VECTOR_SIZE) {
297
301
  // Only when we can fill an entire Vector can we emit a ConstantVector, because subsequent scans require the
298
302
  // input Vector to be flat
@@ -330,9 +334,9 @@ static void RLEScanConstant(RLEScanState<T> &scan_state, rle_count_t *index_poin
330
334
  return;
331
335
  }
332
336
 
333
- template <class T>
334
- void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
335
- idx_t result_offset) {
337
+ template <class T, bool ENTIRE_VECTOR>
338
+ void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
339
+ idx_t result_offset) {
336
340
  auto &scan_state = state.scan_state->Cast<RLEScanState<T>>();
337
341
 
338
342
  auto data = scan_state.handle.Ptr() + segment.GetBlockOffset();
@@ -340,7 +344,8 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
340
344
  auto index_pointer = reinterpret_cast<rle_count_t *>(data + scan_state.rle_count_offset);
341
345
 
342
346
  // If we are scanning an entire Vector and it contains only a single run
343
- if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], scan_count)) {
347
+ if (CanEmitConstantVector<ENTIRE_VECTOR>(scan_state.position_in_entry, index_pointer[scan_state.entry_pos],
348
+ scan_count)) {
344
349
  RLEScanConstant<T>(scan_state, index_pointer, data_pointer, scan_count, result);
345
350
  return;
346
351
  }
@@ -357,9 +362,15 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
357
362
  }
358
363
  }
359
364
 
365
+ template <class T>
366
+ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
367
+ idx_t result_offset) {
368
+ return RLEScanPartialInternal<T, false>(segment, state, scan_count, result, result_offset);
369
+ }
370
+
360
371
  template <class T>
361
372
  void RLEScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result) {
362
- RLEScanPartial<T>(segment, state, scan_count, result, 0);
373
+ RLEScanPartialInternal<T, true>(segment, state, scan_count, result, 0);
363
374
  }
364
375
 
365
376
  //===--------------------------------------------------------------------===//
@@ -0,0 +1,2 @@
1
+ #include "src/function/cast/union/from_struct.cpp"
2
+