duckdb 0.8.2-dev4376.0 → 0.8.2-dev4474.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +1 -0
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_writer.cpp +1 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +4 -3
- package/src/duckdb/extension/parquet/parquet_writer.cpp +33 -15
- package/src/duckdb/src/common/enum_util.cpp +5 -0
- package/src/duckdb/src/common/types/date.cpp +1 -1
- package/src/duckdb/src/common/types/vector.cpp +3 -0
- package/src/duckdb/src/common/types.cpp +1 -1
- package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +3 -10
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +6 -3
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +3 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +8 -2
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +5 -1
- package/src/duckdb/src/function/cast/union/from_struct.cpp +114 -0
- package/src/duckdb/src/function/cast/union_casts.cpp +20 -36
- package/src/duckdb/src/function/table/read_csv.cpp +5 -22
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/cast/bound_cast_data.hpp +32 -0
- package/src/duckdb/src/include/duckdb/function/copy_function.hpp +3 -1
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +1 -1
- package/src/duckdb/src/parser/keyword_helper.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +109 -12
- package/src/duckdb/src/storage/checkpoint_manager.cpp +37 -36
- package/src/duckdb/src/storage/compression/rle.cpp +16 -5
- package/src/duckdb/src/storage/local_storage.cpp +8 -1
- package/src/duckdb/ub_src_function_cast_union.cpp +2 -0
- package/test/prepare.test.ts +10 -1
- package/test/test_all_types.test.ts +4 -4
package/binding.gyp
CHANGED
@@ -80,6 +80,7 @@
|
|
80
80
|
"src/duckdb/ub_src_function_aggregate.cpp",
|
81
81
|
"src/duckdb/ub_src_function.cpp",
|
82
82
|
"src/duckdb/ub_src_function_cast.cpp",
|
83
|
+
"src/duckdb/ub_src_function_cast_union.cpp",
|
83
84
|
"src/duckdb/ub_src_function_pragma.cpp",
|
84
85
|
"src/duckdb/ub_src_function_scalar_compressed_materialization.cpp",
|
85
86
|
"src/duckdb/ub_src_function_scalar.cpp",
|
package/package.json
CHANGED
@@ -1825,7 +1825,7 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
|
|
1825
1825
|
}
|
1826
1826
|
}
|
1827
1827
|
|
1828
|
-
if (type.id() == LogicalTypeId::STRUCT) {
|
1828
|
+
if (type.id() == LogicalTypeId::STRUCT || type.id() == LogicalTypeId::UNION) {
|
1829
1829
|
auto &child_types = StructType::GetChildTypes(type);
|
1830
1830
|
// set up the schema element for this struct
|
1831
1831
|
duckdb_parquet::format::SchemaElement schema_element;
|
@@ -15,6 +15,7 @@
|
|
15
15
|
#include "duckdb/common/mutex.hpp"
|
16
16
|
#include "duckdb/common/serializer/buffered_file_writer.hpp"
|
17
17
|
#include "duckdb/common/types/column/column_data_collection.hpp"
|
18
|
+
#include "duckdb/function/copy_function.hpp"
|
18
19
|
#endif
|
19
20
|
|
20
21
|
#include "column_writer.hpp"
|
@@ -75,11 +76,11 @@ public:
|
|
75
76
|
return *writer;
|
76
77
|
}
|
77
78
|
|
78
|
-
static
|
79
|
+
static CopyTypeSupport TypeIsSupported(const LogicalType &type);
|
79
80
|
|
80
81
|
private:
|
81
|
-
static
|
82
|
-
|
82
|
+
static CopyTypeSupport DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
|
83
|
+
duckdb_parquet::format::Type::type &type);
|
83
84
|
string file_name;
|
84
85
|
vector<LogicalType> sql_types;
|
85
86
|
vector<string> column_names;
|
@@ -77,7 +77,8 @@ private:
|
|
77
77
|
WriteStream &serializer;
|
78
78
|
};
|
79
79
|
|
80
|
-
|
80
|
+
CopyTypeSupport ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_type,
|
81
|
+
Type::type &parquet_type) {
|
81
82
|
switch (duckdb_type.id()) {
|
82
83
|
case LogicalTypeId::BOOLEAN:
|
83
84
|
parquet_type = Type::BOOLEAN;
|
@@ -95,9 +96,11 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
|
|
95
96
|
parquet_type = Type::FLOAT;
|
96
97
|
break;
|
97
98
|
case LogicalTypeId::DOUBLE:
|
98
|
-
case LogicalTypeId::HUGEINT:
|
99
99
|
parquet_type = Type::DOUBLE;
|
100
100
|
break;
|
101
|
+
case LogicalTypeId::HUGEINT:
|
102
|
+
parquet_type = Type::DOUBLE;
|
103
|
+
return CopyTypeSupport::LOSSY;
|
101
104
|
case LogicalTypeId::ENUM:
|
102
105
|
case LogicalTypeId::BLOB:
|
103
106
|
case LogicalTypeId::VARCHAR:
|
@@ -141,47 +144,62 @@ bool ParquetWriter::DuckDBTypeToParquetTypeInternal(const LogicalType &duckdb_ty
|
|
141
144
|
}
|
142
145
|
break;
|
143
146
|
default:
|
144
|
-
// Anything that is not supported
|
145
|
-
return
|
147
|
+
// Anything that is not supported
|
148
|
+
return CopyTypeSupport::UNSUPPORTED;
|
146
149
|
}
|
147
|
-
return
|
150
|
+
return CopyTypeSupport::SUPPORTED;
|
148
151
|
}
|
149
152
|
|
150
153
|
Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type) {
|
151
154
|
Type::type result;
|
152
|
-
|
155
|
+
auto type_supports = DuckDBTypeToParquetTypeInternal(duckdb_type, result);
|
156
|
+
if (type_supports == CopyTypeSupport::UNSUPPORTED) {
|
153
157
|
throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
|
154
158
|
}
|
155
159
|
return result;
|
156
160
|
}
|
157
161
|
|
158
|
-
|
162
|
+
CopyTypeSupport ParquetWriter::TypeIsSupported(const LogicalType &type) {
|
159
163
|
Type::type unused;
|
160
164
|
auto id = type.id();
|
161
165
|
if (id == LogicalTypeId::LIST) {
|
162
166
|
auto &child_type = ListType::GetChildType(type);
|
163
167
|
return TypeIsSupported(child_type);
|
164
168
|
}
|
169
|
+
if (id == LogicalTypeId::UNION) {
|
170
|
+
auto count = UnionType::GetMemberCount(type);
|
171
|
+
for (idx_t i = 0; i < count; i++) {
|
172
|
+
auto &member_type = UnionType::GetMemberType(type, i);
|
173
|
+
auto type_support = TypeIsSupported(member_type);
|
174
|
+
if (type_support != CopyTypeSupport::SUPPORTED) {
|
175
|
+
return type_support;
|
176
|
+
}
|
177
|
+
}
|
178
|
+
return CopyTypeSupport::SUPPORTED;
|
179
|
+
}
|
165
180
|
if (id == LogicalTypeId::STRUCT) {
|
166
181
|
auto &children = StructType::GetChildTypes(type);
|
167
182
|
for (auto &child : children) {
|
168
183
|
auto &child_type = child.second;
|
169
|
-
|
170
|
-
|
184
|
+
auto type_support = TypeIsSupported(child_type);
|
185
|
+
if (type_support != CopyTypeSupport::SUPPORTED) {
|
186
|
+
return type_support;
|
171
187
|
}
|
172
188
|
}
|
173
|
-
return
|
189
|
+
return CopyTypeSupport::SUPPORTED;
|
174
190
|
}
|
175
191
|
if (id == LogicalTypeId::MAP) {
|
176
192
|
auto &key_type = MapType::KeyType(type);
|
177
193
|
auto &value_type = MapType::ValueType(type);
|
178
|
-
|
179
|
-
|
194
|
+
auto key_type_support = TypeIsSupported(key_type);
|
195
|
+
if (key_type_support != CopyTypeSupport::SUPPORTED) {
|
196
|
+
return key_type_support;
|
180
197
|
}
|
181
|
-
|
182
|
-
|
198
|
+
auto value_type_support = TypeIsSupported(value_type);
|
199
|
+
if (value_type_support != CopyTypeSupport::SUPPORTED) {
|
200
|
+
return value_type_support;
|
183
201
|
}
|
184
|
-
return
|
202
|
+
return CopyTypeSupport::SUPPORTED;
|
185
203
|
}
|
186
204
|
return DuckDBTypeToParquetTypeInternal(type, unused);
|
187
205
|
}
|
@@ -5974,6 +5974,8 @@ const char* EnumUtil::ToChars<UnionInvalidReason>(UnionInvalidReason value) {
|
|
5974
5974
|
return "NO_MEMBERS";
|
5975
5975
|
case UnionInvalidReason::VALIDITY_OVERLAP:
|
5976
5976
|
return "VALIDITY_OVERLAP";
|
5977
|
+
case UnionInvalidReason::TAG_MISMATCH:
|
5978
|
+
return "TAG_MISMATCH";
|
5977
5979
|
default:
|
5978
5980
|
throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
|
5979
5981
|
}
|
@@ -5993,6 +5995,9 @@ UnionInvalidReason EnumUtil::FromString<UnionInvalidReason>(const char *value) {
|
|
5993
5995
|
if (StringUtil::Equals(value, "VALIDITY_OVERLAP")) {
|
5994
5996
|
return UnionInvalidReason::VALIDITY_OVERLAP;
|
5995
5997
|
}
|
5998
|
+
if (StringUtil::Equals(value, "TAG_MISMATCH")) {
|
5999
|
+
return UnionInvalidReason::TAG_MISMATCH;
|
6000
|
+
}
|
5996
6001
|
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
|
5997
6002
|
}
|
5998
6003
|
|
@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {
|
|
492
492
|
|
493
493
|
int64_t Date::ExtractJulianDay(date_t date) {
|
494
494
|
// Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
|
495
|
-
static const
|
495
|
+
static const int64_t JULIAN_EPOCH = -2440588;
|
496
496
|
return date.days - JULIAN_EPOCH;
|
497
497
|
}
|
498
498
|
|
@@ -2007,6 +2007,9 @@ UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector, idx_t count,
|
|
2007
2007
|
return UnionInvalidReason::VALIDITY_OVERLAP;
|
2008
2008
|
}
|
2009
2009
|
found_valid = true;
|
2010
|
+
if (tag != static_cast<union_tag_t>(member_idx)) {
|
2011
|
+
return UnionInvalidReason::TAG_MISMATCH;
|
2012
|
+
}
|
2010
2013
|
}
|
2011
2014
|
}
|
2012
2015
|
}
|
@@ -398,7 +398,7 @@ string LogicalType::ToString() const {
|
|
398
398
|
if (i > 0) {
|
399
399
|
ret += ", ";
|
400
400
|
}
|
401
|
-
ret +=
|
401
|
+
ret += KeywordHelper::WriteQuoted(EnumType::GetString(*this, i).GetString(), '\'');
|
402
402
|
}
|
403
403
|
ret += ")";
|
404
404
|
return ret;
|
@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {
|
|
148
148
|
|
149
149
|
uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
|
150
150
|
|
151
|
-
// this function calls Get() on the buffer, so the buffer must already be in memory
|
152
|
-
D_ASSERT(InMemory());
|
153
|
-
|
154
151
|
// get the bitmask data
|
155
152
|
auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
|
156
153
|
ValidityMask mask(bitmask_ptr);
|
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {
|
|
200
197
|
|
201
198
|
uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
|
202
199
|
|
203
|
-
// this function calls Get() on the buffer
|
200
|
+
// this function calls Get() on the buffer
|
204
201
|
D_ASSERT(InMemory());
|
205
202
|
|
206
203
|
// finds the maximum zero bit in a bitmask, and adds one to it,
|
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
|
|
259
256
|
}
|
260
257
|
|
261
258
|
// there are no allocations in this buffer
|
262
|
-
|
263
|
-
// FIXME: test_index_large_aborted_append.test with force_restart
|
264
|
-
// FIXME: test if we still have non-dirty buffer to serialize after fixing this
|
265
|
-
// throw InternalException("tried to serialize empty buffer");
|
266
|
-
return 0;
|
259
|
+
throw InternalException("tried to serialize empty buffer");
|
267
260
|
}
|
268
261
|
|
269
262
|
void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
|
270
263
|
const idx_t offset, const idx_t bitmask_offset) {
|
271
264
|
|
272
|
-
// this function calls Get() on the buffer
|
265
|
+
// this function calls Get() on the buffer
|
273
266
|
D_ASSERT(InMemory());
|
274
267
|
|
275
268
|
auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
|
@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
|
|
89
89
|
position_buffer++;
|
90
90
|
}
|
91
91
|
if (position_buffer > end_buffer) {
|
92
|
+
VerifyLineLength(position_buffer, buffer->batch_index);
|
92
93
|
return false;
|
93
94
|
}
|
94
95
|
SkipEmptyLines();
|
95
96
|
if (verification_positions.beginning_of_first_line == 0) {
|
96
97
|
verification_positions.beginning_of_first_line = position_buffer;
|
97
98
|
}
|
98
|
-
|
99
|
+
VerifyLineLength(position_buffer, buffer->batch_index);
|
99
100
|
verification_positions.end_of_last_line = position_buffer;
|
100
101
|
return true;
|
101
102
|
}
|
102
103
|
}
|
104
|
+
VerifyLineLength(position_buffer, buffer->batch_index);
|
103
105
|
return false;
|
104
106
|
}
|
105
107
|
SkipEmptyLines();
|
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
|
|
143
145
|
break;
|
144
146
|
}
|
145
147
|
|
146
|
-
|
148
|
+
auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
|
149
|
+
if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
|
147
150
|
break;
|
148
151
|
}
|
149
152
|
|
150
153
|
if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
|
151
|
-
(*buffer)[
|
154
|
+
(*buffer)[pos_check] == '\n') {
|
152
155
|
break;
|
153
156
|
}
|
154
157
|
idx_t position_set = position_buffer;
|
@@ -55,6 +55,9 @@ struct SniffDialect {
|
|
55
55
|
if (machine.state == CSVState::INVALID) {
|
56
56
|
return;
|
57
57
|
}
|
58
|
+
if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
|
59
|
+
sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
|
60
|
+
}
|
58
61
|
if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
|
59
62
|
sniffed_column_counts[machine.cur_rows++] = machine.column_count;
|
60
63
|
}
|
@@ -148,12 +148,18 @@ void CSVSniffer::DetectHeader() {
|
|
148
148
|
names.push_back(col_name);
|
149
149
|
name_collision_count[col_name] = 0;
|
150
150
|
}
|
151
|
+
if (best_header_row.size() < best_candidate->dialect_options.num_cols && options.null_padding) {
|
152
|
+
for (idx_t col = best_header_row.size(); col < best_candidate->dialect_options.num_cols; col++) {
|
153
|
+
names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
|
154
|
+
}
|
155
|
+
} else if (best_header_row.size() < best_candidate->dialect_options.num_cols) {
|
156
|
+
throw InternalException("Detected header has number of columns inferior to dialect detection");
|
157
|
+
}
|
151
158
|
|
152
159
|
} else {
|
153
160
|
best_candidate->dialect_options.header = false;
|
154
161
|
for (idx_t col = 0; col < best_candidate->dialect_options.num_cols; col++) {
|
155
|
-
|
156
|
-
names.push_back(column_name);
|
162
|
+
names.push_back(GenerateColumnName(best_candidate->dialect_options.num_cols, col));
|
157
163
|
}
|
158
164
|
}
|
159
165
|
|
@@ -183,6 +183,10 @@ struct SniffValue {
|
|
183
183
|
}
|
184
184
|
|
185
185
|
inline static void Finalize(CSVStateMachine &machine, vector<TupleSniffing> &sniffed_values) {
|
186
|
+
if (machine.cur_rows < sniffed_values.size() && machine.state == CSVState::DELIMITER) {
|
187
|
+
// Started a new empty value
|
188
|
+
sniffed_values[machine.cur_rows].values.push_back(Value(machine.value));
|
189
|
+
}
|
186
190
|
if (machine.cur_rows < sniffed_values.size() && machine.state != CSVState::EMPTY_LINE) {
|
187
191
|
machine.VerifyUTF8();
|
188
192
|
sniffed_values[machine.cur_rows].line_number = machine.rows_read;
|
@@ -300,7 +304,7 @@ void CSVSniffer::DetectTypes() {
|
|
300
304
|
|
301
305
|
// Potentially Skip Notes (I also find this dirty, but it is what the original code does)
|
302
306
|
while (true_start < tuples.size()) {
|
303
|
-
if (tuples[true_start].values.size() < max_columns_found) {
|
307
|
+
if (tuples[true_start].values.size() < max_columns_found && !options.null_padding) {
|
304
308
|
true_start = tuples[true_start].line_number;
|
305
309
|
values_start++;
|
306
310
|
} else {
|
@@ -0,0 +1,114 @@
|
|
1
|
+
#include "duckdb/function/cast/bound_cast_data.hpp"
|
2
|
+
|
3
|
+
namespace duckdb {
|
4
|
+
|
5
|
+
bool StructToUnionCast::AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target) {
|
6
|
+
if (source.id() != LogicalTypeId::STRUCT) {
|
7
|
+
return false;
|
8
|
+
}
|
9
|
+
auto target_fields = StructType::GetChildTypes(target);
|
10
|
+
auto fields = StructType::GetChildTypes(source);
|
11
|
+
if (target_fields.size() != fields.size()) {
|
12
|
+
// Struct should have the same amount of fields as the union
|
13
|
+
return false;
|
14
|
+
}
|
15
|
+
for (idx_t i = 0; i < target_fields.size(); i++) {
|
16
|
+
auto &target_field = target_fields[i].second;
|
17
|
+
auto &target_field_name = target_fields[i].first;
|
18
|
+
auto &field = fields[i].second;
|
19
|
+
auto &field_name = fields[i].first;
|
20
|
+
if (i == 0) {
|
21
|
+
// For the tag field we don't accept a type substitute as varchar
|
22
|
+
if (target_field != field) {
|
23
|
+
return false;
|
24
|
+
}
|
25
|
+
continue;
|
26
|
+
}
|
27
|
+
if (!StringUtil::CIEquals(target_field_name, field_name)) {
|
28
|
+
return false;
|
29
|
+
}
|
30
|
+
if (target_field != field && field != LogicalType::VARCHAR) {
|
31
|
+
// We allow the field to be VARCHAR, since unsupported types get cast to VARCHAR by EXPORT DATABASE (format
|
32
|
+
// PARQUET) i.e UNION(a BIT) becomes STRUCT(a VARCHAR)
|
33
|
+
return false;
|
34
|
+
}
|
35
|
+
}
|
36
|
+
return true;
|
37
|
+
}
|
38
|
+
|
39
|
+
// Physical Cast execution
|
40
|
+
|
41
|
+
bool StructToUnionCast::Cast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
|
42
|
+
auto &cast_data = parameters.cast_data->Cast<StructBoundCastData>();
|
43
|
+
auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
|
44
|
+
|
45
|
+
D_ASSERT(source.GetType().id() == LogicalTypeId::STRUCT);
|
46
|
+
D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
|
47
|
+
D_ASSERT(cast_data.target.id() == LogicalTypeId::UNION);
|
48
|
+
|
49
|
+
auto &source_children = StructVector::GetEntries(source);
|
50
|
+
auto &target_children = StructVector::GetEntries(result);
|
51
|
+
|
52
|
+
for (idx_t i = 0; i < source_children.size(); i++) {
|
53
|
+
auto &result_child_vector = *target_children[i];
|
54
|
+
auto &source_child_vector = *source_children[i];
|
55
|
+
CastParameters child_parameters(parameters, cast_data.child_cast_info[i].cast_data, lstate.local_states[i]);
|
56
|
+
auto converted =
|
57
|
+
cast_data.child_cast_info[i].function(source_child_vector, result_child_vector, count, child_parameters);
|
58
|
+
(void)converted;
|
59
|
+
D_ASSERT(converted);
|
60
|
+
}
|
61
|
+
|
62
|
+
auto check_tags = UnionVector::CheckUnionValidity(result, count);
|
63
|
+
switch (check_tags) {
|
64
|
+
case UnionInvalidReason::TAG_OUT_OF_RANGE:
|
65
|
+
throw ConversionException("One or more of the tags do not point to a valid union member");
|
66
|
+
case UnionInvalidReason::VALIDITY_OVERLAP:
|
67
|
+
throw ConversionException("One or more rows in the produced UNION have validity set for more than 1 member");
|
68
|
+
case UnionInvalidReason::TAG_MISMATCH:
|
69
|
+
throw ConversionException(
|
70
|
+
"One or more rows in the produced UNION have tags that don't point to the valid member");
|
71
|
+
case UnionInvalidReason::VALID:
|
72
|
+
break;
|
73
|
+
default:
|
74
|
+
throw InternalException("Struct to union cast failed for unknown reason");
|
75
|
+
}
|
76
|
+
|
77
|
+
if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
|
78
|
+
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
79
|
+
ConstantVector::SetNull(result, ConstantVector::IsNull(source));
|
80
|
+
} else {
|
81
|
+
source.Flatten(count);
|
82
|
+
FlatVector::Validity(result) = FlatVector::Validity(source);
|
83
|
+
}
|
84
|
+
result.Verify(count);
|
85
|
+
return true;
|
86
|
+
}
|
87
|
+
|
88
|
+
// Bind cast
|
89
|
+
|
90
|
+
unique_ptr<BoundCastData> StructToUnionCast::BindData(BindCastInput &input, const LogicalType &source,
|
91
|
+
const LogicalType &target) {
|
92
|
+
vector<BoundCastInfo> child_cast_info;
|
93
|
+
D_ASSERT(source.id() == LogicalTypeId::STRUCT);
|
94
|
+
D_ASSERT(target.id() == LogicalTypeId::UNION);
|
95
|
+
|
96
|
+
auto result_child_count = StructType::GetChildCount(target);
|
97
|
+
D_ASSERT(result_child_count == StructType::GetChildCount(source));
|
98
|
+
|
99
|
+
for (idx_t i = 0; i < result_child_count; i++) {
|
100
|
+
auto &source_child = StructType::GetChildType(source, i);
|
101
|
+
auto &target_child = StructType::GetChildType(target, i);
|
102
|
+
|
103
|
+
auto child_cast = input.GetCastFunction(source_child, target_child);
|
104
|
+
child_cast_info.push_back(std::move(child_cast));
|
105
|
+
}
|
106
|
+
return make_uniq<StructBoundCastData>(std::move(child_cast_info), target);
|
107
|
+
}
|
108
|
+
|
109
|
+
BoundCastInfo StructToUnionCast::Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
|
110
|
+
auto cast_data = StructToUnionCast::BindData(input, source, target);
|
111
|
+
return BoundCastInfo(&StructToUnionCast::Cast, std::move(cast_data), StructBoundCastData::InitStructCastLocalState);
|
112
|
+
}
|
113
|
+
|
114
|
+
} // namespace duckdb
|
@@ -11,33 +11,10 @@ namespace duckdb {
|
|
11
11
|
//--------------------------------------------------------------------------------------------------
|
12
12
|
// if the source can be implicitly cast to a member of the target union, the cast is valid
|
13
13
|
|
14
|
-
struct ToUnionBoundCastData : public BoundCastData {
|
15
|
-
ToUnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
|
16
|
-
BoundCastInfo member_cast_info)
|
17
|
-
: tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
|
18
|
-
member_cast_info(std::move(member_cast_info)) {
|
19
|
-
}
|
20
|
-
|
21
|
-
union_tag_t tag;
|
22
|
-
string name;
|
23
|
-
LogicalType type;
|
24
|
-
int64_t cost;
|
25
|
-
BoundCastInfo member_cast_info;
|
26
|
-
|
27
|
-
public:
|
28
|
-
unique_ptr<BoundCastData> Copy() const override {
|
29
|
-
return make_uniq<ToUnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
|
30
|
-
}
|
31
|
-
|
32
|
-
static bool SortByCostAscending(const ToUnionBoundCastData &left, const ToUnionBoundCastData &right) {
|
33
|
-
return left.cost < right.cost;
|
34
|
-
}
|
35
|
-
};
|
36
|
-
|
37
14
|
unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
|
38
15
|
D_ASSERT(target.id() == LogicalTypeId::UNION);
|
39
16
|
|
40
|
-
vector<
|
17
|
+
vector<UnionBoundCastData> candidates;
|
41
18
|
|
42
19
|
for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(target); member_idx++) {
|
43
20
|
auto member_type = UnionType::GetMemberType(target, member_idx);
|
@@ -68,7 +45,7 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
|
|
68
45
|
}
|
69
46
|
|
70
47
|
// sort the candidate casts by cost
|
71
|
-
std::sort(candidates.begin(), candidates.end(),
|
48
|
+
std::sort(candidates.begin(), candidates.end(), UnionBoundCastData::SortByCostAscending);
|
72
49
|
|
73
50
|
// select the lowest possible cost cast
|
74
51
|
auto &selected_cast = candidates[0];
|
@@ -95,11 +72,11 @@ unique_ptr<BoundCastData> BindToUnionCast(BindCastInput &input, const LogicalTyp
|
|
95
72
|
}
|
96
73
|
|
97
74
|
// otherwise, return the selected cast
|
98
|
-
return make_uniq<
|
75
|
+
return make_uniq<UnionBoundCastData>(std::move(selected_cast));
|
99
76
|
}
|
100
77
|
|
101
78
|
unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &parameters) {
|
102
|
-
auto &cast_data = parameters.cast_data->Cast<
|
79
|
+
auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
|
103
80
|
if (!cast_data.member_cast_info.init_local_state) {
|
104
81
|
return nullptr;
|
105
82
|
}
|
@@ -109,7 +86,7 @@ unique_ptr<FunctionLocalState> InitToUnionLocalState(CastLocalStateParameters &p
|
|
109
86
|
|
110
87
|
static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
|
111
88
|
D_ASSERT(result.GetType().id() == LogicalTypeId::UNION);
|
112
|
-
auto &cast_data = parameters.cast_data->Cast<
|
89
|
+
auto &cast_data = parameters.cast_data->Cast<UnionBoundCastData>();
|
113
90
|
auto &selected_member_vector = UnionVector::GetMember(result, cast_data.tag);
|
114
91
|
|
115
92
|
CastParameters child_parameters(parameters, cast_data.member_cast_info.cast_data, parameters.local_state);
|
@@ -127,7 +104,13 @@ static bool ToUnionCast(Vector &source, Vector &result, idx_t count, CastParamet
|
|
127
104
|
|
128
105
|
BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const LogicalType &source,
|
129
106
|
const LogicalType &target) {
|
130
|
-
|
107
|
+
|
108
|
+
D_ASSERT(target.id() == LogicalTypeId::UNION);
|
109
|
+
if (StructToUnionCast::AllowImplicitCastFromStruct(source, target)) {
|
110
|
+
return StructToUnionCast::Bind(input, source, target);
|
111
|
+
}
|
112
|
+
auto cast_data = BindToUnionCast(input, source, target);
|
113
|
+
return BoundCastInfo(&ToUnionCast, std::move(cast_data), InitToUnionLocalState);
|
131
114
|
}
|
132
115
|
|
133
116
|
//--------------------------------------------------------------------------------------------------
|
@@ -143,7 +126,7 @@ BoundCastInfo DefaultCasts::ImplicitToUnionCast(BindCastInput &input, const Logi
|
|
143
126
|
// INVALID: UNION(A, B) -> UNION(A, C) if B can't be implicitly cast to C
|
144
127
|
// INVALID: UNION(A, B, D) -> UNION(A, B, C)
|
145
128
|
|
146
|
-
struct
|
129
|
+
struct UnionUnionBoundCastData : public BoundCastData {
|
147
130
|
|
148
131
|
// mapping from source member index to target member index
|
149
132
|
// these are always the same size as the source member count
|
@@ -153,7 +136,7 @@ struct UnionToUnionBoundCastData : public BoundCastData {
|
|
153
136
|
|
154
137
|
LogicalType target_type;
|
155
138
|
|
156
|
-
|
139
|
+
UnionUnionBoundCastData(vector<idx_t> tag_map, vector<BoundCastInfo> member_casts, LogicalType target_type)
|
157
140
|
: tag_map(std::move(tag_map)), member_casts(std::move(member_casts)), target_type(std::move(target_type)) {
|
158
141
|
}
|
159
142
|
|
@@ -163,7 +146,7 @@ public:
|
|
163
146
|
for (auto &member_cast : member_casts) {
|
164
147
|
member_casts_copy.push_back(member_cast.Copy());
|
165
148
|
}
|
166
|
-
return make_uniq<
|
149
|
+
return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts_copy), target_type);
|
167
150
|
}
|
168
151
|
};
|
169
152
|
|
@@ -203,11 +186,11 @@ unique_ptr<BoundCastData> BindUnionToUnionCast(BindCastInput &input, const Logic
|
|
203
186
|
}
|
204
187
|
}
|
205
188
|
|
206
|
-
return make_uniq<
|
189
|
+
return make_uniq<UnionUnionBoundCastData>(tag_map, std::move(member_casts), target);
|
207
190
|
}
|
208
191
|
|
209
192
|
unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParameters &parameters) {
|
210
|
-
auto &cast_data = parameters.cast_data->Cast<
|
193
|
+
auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
|
211
194
|
auto result = make_uniq<StructCastLocalState>();
|
212
195
|
|
213
196
|
for (auto &entry : cast_data.member_casts) {
|
@@ -222,7 +205,7 @@ unique_ptr<FunctionLocalState> InitUnionToUnionLocalState(CastLocalStateParamete
|
|
222
205
|
}
|
223
206
|
|
224
207
|
static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
|
225
|
-
auto &cast_data = parameters.cast_data->Cast<
|
208
|
+
auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
|
226
209
|
auto &lstate = parameters.local_state->Cast<StructCastLocalState>();
|
227
210
|
|
228
211
|
auto source_member_count = UnionType::GetMemberCount(source.GetType());
|
@@ -313,7 +296,7 @@ static bool UnionToUnionCast(Vector &source, Vector &result, idx_t count, CastPa
|
|
313
296
|
static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
|
314
297
|
auto constant = source.GetVectorType() == VectorType::CONSTANT_VECTOR;
|
315
298
|
// first cast all union members to varchar
|
316
|
-
auto &cast_data = parameters.cast_data->Cast<
|
299
|
+
auto &cast_data = parameters.cast_data->Cast<UnionUnionBoundCastData>();
|
317
300
|
Vector varchar_union(cast_data.target_type, count);
|
318
301
|
|
319
302
|
UnionToUnionCast(source, varchar_union, count, parameters);
|
@@ -356,6 +339,7 @@ static bool UnionToVarcharCast(Vector &source, Vector &result, idx_t count, Cast
|
|
356
339
|
|
357
340
|
BoundCastInfo DefaultCasts::UnionCastSwitch(BindCastInput &input, const LogicalType &source,
|
358
341
|
const LogicalType &target) {
|
342
|
+
D_ASSERT(source.id() == LogicalTypeId::UNION);
|
359
343
|
switch (target.id()) {
|
360
344
|
case LogicalTypeId::VARCHAR: {
|
361
345
|
// bind a cast in which we convert all members to VARCHAR first
|
@@ -300,7 +300,7 @@ public:
|
|
300
300
|
const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
|
301
301
|
bool force_parallelism_p, vector<column_t> column_ids_p)
|
302
302
|
: buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
|
303
|
-
|
303
|
+
force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
|
304
304
|
line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
|
305
305
|
current_file_path = files_path_p[0];
|
306
306
|
CSVFileHandle *file_handle_ptr;
|
@@ -316,16 +316,6 @@ public:
|
|
316
316
|
first_file_size = file_size;
|
317
317
|
on_disk_file = file_handle_ptr->OnDiskFile();
|
318
318
|
bytes_read = 0;
|
319
|
-
if (buffer_size < file_size || file_size == 0) {
|
320
|
-
bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
|
321
|
-
} else {
|
322
|
-
bytes_per_local_state = file_size / MaxThreads();
|
323
|
-
}
|
324
|
-
if (bytes_per_local_state == 0) {
|
325
|
-
// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
|
326
|
-
// this boy needs to be at least one.
|
327
|
-
bytes_per_local_state = 1;
|
328
|
-
}
|
329
319
|
running_threads = MaxThreads();
|
330
320
|
|
331
321
|
// Initialize all the book-keeping variables
|
@@ -368,8 +358,6 @@ public:
|
|
368
358
|
|
369
359
|
void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);
|
370
360
|
|
371
|
-
void IncrementThread();
|
372
|
-
|
373
361
|
void DecrementThread();
|
374
362
|
|
375
363
|
bool Finished();
|
@@ -402,16 +390,12 @@ private:
|
|
402
390
|
mutex main_mutex;
|
403
391
|
//! Byte set from for last thread
|
404
392
|
idx_t next_byte = 0;
|
405
|
-
//! How many bytes we should execute per local state
|
406
|
-
idx_t bytes_per_local_state;
|
407
393
|
//! Size of first file
|
408
394
|
idx_t first_file_size = 0;
|
409
395
|
//! Whether or not this is an on-disk file
|
410
396
|
bool on_disk_file = true;
|
411
397
|
//! Basically max number of threads in DuckDB
|
412
398
|
idx_t system_threads;
|
413
|
-
//! Size of the buffers
|
414
|
-
idx_t buffer_size;
|
415
399
|
//! Current batch index
|
416
400
|
idx_t batch_index = 0;
|
417
401
|
idx_t local_batch_index = 0;
|
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
|
|
454
438
|
return system_threads;
|
455
439
|
}
|
456
440
|
|
457
|
-
void ParallelCSVGlobalState::IncrementThread() {
|
458
|
-
lock_guard<mutex> parallel_lock(main_mutex);
|
459
|
-
running_threads++;
|
460
|
-
}
|
461
|
-
|
462
441
|
void ParallelCSVGlobalState::DecrementThread() {
|
463
442
|
lock_guard<mutex> parallel_lock(main_mutex);
|
464
443
|
D_ASSERT(running_threads > 0);
|
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
572
551
|
}
|
573
552
|
// set up the current buffer
|
574
553
|
line_info.current_batches[file_index - 1].insert(local_batch_index);
|
554
|
+
idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
|
575
555
|
auto result = make_uniq<CSVBufferRead>(
|
576
556
|
buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
|
577
557
|
next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
|
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta
|
|
1135
1115
|
if (StringUtil::EndsWith(lower_name, ".gz")) {
|
1136
1116
|
lower_name = lower_name.substr(0, lower_name.size() - 3);
|
1137
1117
|
} else if (StringUtil::EndsWith(lower_name, ".zst")) {
|
1118
|
+
if (!Catalog::TryAutoLoad(context, "parquet")) {
|
1119
|
+
throw MissingExtensionException("parquet extension is required for reading zst compressed file");
|
1120
|
+
}
|
1138
1121
|
lower_name = lower_name.substr(0, lower_name.size() - 4);
|
1139
1122
|
}
|
1140
1123
|
if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.8.2-
|
2
|
+
#define DUCKDB_VERSION "0.8.2-dev4474"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "ba71015ee7"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -447,7 +447,7 @@ struct StructVector {
|
|
447
447
|
DUCKDB_API static vector<unique_ptr<Vector>> &GetEntries(Vector &vector);
|
448
448
|
};
|
449
449
|
|
450
|
-
enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP };
|
450
|
+
enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP, TAG_MISMATCH };
|
451
451
|
|
452
452
|
struct UnionVector {
|
453
453
|
// Unions are stored as structs, but the first child is always the "tag"
|
@@ -81,4 +81,36 @@ public:
|
|
81
81
|
unique_ptr<FunctionLocalState> value_state;
|
82
82
|
};
|
83
83
|
|
84
|
+
struct UnionBoundCastData : public BoundCastData {
|
85
|
+
UnionBoundCastData(union_tag_t member_idx, string name, LogicalType type, int64_t cost,
|
86
|
+
BoundCastInfo member_cast_info)
|
87
|
+
: tag(member_idx), name(std::move(name)), type(std::move(type)), cost(cost),
|
88
|
+
member_cast_info(std::move(member_cast_info)) {
|
89
|
+
}
|
90
|
+
|
91
|
+
union_tag_t tag;
|
92
|
+
string name;
|
93
|
+
LogicalType type;
|
94
|
+
int64_t cost;
|
95
|
+
BoundCastInfo member_cast_info;
|
96
|
+
|
97
|
+
public:
|
98
|
+
unique_ptr<BoundCastData> Copy() const override {
|
99
|
+
return make_uniq<UnionBoundCastData>(tag, name, type, cost, member_cast_info.Copy());
|
100
|
+
}
|
101
|
+
|
102
|
+
static bool SortByCostAscending(const UnionBoundCastData &left, const UnionBoundCastData &right) {
|
103
|
+
return left.cost < right.cost;
|
104
|
+
}
|
105
|
+
};
|
106
|
+
|
107
|
+
struct StructToUnionCast {
|
108
|
+
public:
|
109
|
+
static bool AllowImplicitCastFromStruct(const LogicalType &source, const LogicalType &target);
|
110
|
+
static bool Cast(Vector &source, Vector &result, idx_t count, CastParameters ¶meters);
|
111
|
+
static unique_ptr<BoundCastData> BindData(BindCastInput &input, const LogicalType &source,
|
112
|
+
const LogicalType &target);
|
113
|
+
static BoundCastInfo Bind(BindCastInput &input, const LogicalType &source, const LogicalType &target);
|
114
|
+
};
|
115
|
+
|
84
116
|
} // namespace duckdb
|
@@ -99,7 +99,9 @@ typedef void (*copy_flush_batch_t)(ClientContext &context, FunctionData &bind_da
|
|
99
99
|
PreparedBatchData &batch);
|
100
100
|
typedef idx_t (*copy_desired_batch_size_t)(ClientContext &context, FunctionData &bind_data);
|
101
101
|
|
102
|
-
|
102
|
+
enum class CopyTypeSupport { SUPPORTED, LOSSY, UNSUPPORTED };
|
103
|
+
|
104
|
+
typedef CopyTypeSupport (*copy_supports_type_t)(const LogicalType &type);
|
103
105
|
|
104
106
|
class CopyFunction : public Function {
|
105
107
|
public:
|
@@ -40,7 +40,7 @@ public:
|
|
40
40
|
vector<string> names;
|
41
41
|
|
42
42
|
public:
|
43
|
-
DUCKDB_API void ThrowError(const string &prepended_message = "") const;
|
43
|
+
[[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
|
44
44
|
DUCKDB_API void SetError(PreservedError error);
|
45
45
|
DUCKDB_API bool HasError() const;
|
46
46
|
DUCKDB_API const ExceptionType &GetErrorType() const;
|
@@ -36,7 +36,7 @@ string KeywordHelper::EscapeQuotes(const string &text, char quote) {
|
|
36
36
|
string KeywordHelper::WriteQuoted(const string &text, char quote) {
|
37
37
|
// 1. Escapes all occurences of 'quote' by doubling them (escape in SQL)
|
38
38
|
// 2. Adds quotes around the string
|
39
|
-
return string(1, quote) + EscapeQuotes(text) + string(1, quote);
|
39
|
+
return string(1, quote) + EscapeQuotes(text, quote) + string(1, quote);
|
40
40
|
}
|
41
41
|
|
42
42
|
string KeywordHelper::WriteOptionallyQuoted(const string &text, char quote, bool allow_caps) {
|
@@ -115,7 +115,95 @@ string CreateFileName(const string &id_suffix, TableCatalogEntry &table, const s
|
|
115
115
|
return StringUtil::Format("%s_%s%s.%s", schema, name, id_suffix, extension);
|
116
116
|
}
|
117
117
|
|
118
|
-
|
118
|
+
static bool IsSupported(CopyTypeSupport support_level) {
|
119
|
+
// For export purposes we don't want to lose information, so we only accept fully supported types
|
120
|
+
return support_level == CopyTypeSupport::SUPPORTED;
|
121
|
+
}
|
122
|
+
|
123
|
+
static LogicalType AlterLogicalType(const LogicalType &original, copy_supports_type_t type_check) {
|
124
|
+
D_ASSERT(type_check);
|
125
|
+
auto id = original.id();
|
126
|
+
switch (id) {
|
127
|
+
case LogicalTypeId::LIST: {
|
128
|
+
auto child = AlterLogicalType(ListType::GetChildType(original), type_check);
|
129
|
+
return LogicalType::LIST(child);
|
130
|
+
}
|
131
|
+
case LogicalTypeId::STRUCT: {
|
132
|
+
auto &original_children = StructType::GetChildTypes(original);
|
133
|
+
child_list_t<LogicalType> new_children;
|
134
|
+
for (auto &child : original_children) {
|
135
|
+
auto &child_name = child.first;
|
136
|
+
auto &child_type = child.second;
|
137
|
+
|
138
|
+
LogicalType new_type;
|
139
|
+
if (!IsSupported(type_check(child_type))) {
|
140
|
+
new_type = AlterLogicalType(child_type, type_check);
|
141
|
+
} else {
|
142
|
+
new_type = child_type;
|
143
|
+
}
|
144
|
+
new_children.push_back(std::make_pair(child_name, new_type));
|
145
|
+
}
|
146
|
+
return LogicalType::STRUCT(std::move(new_children));
|
147
|
+
}
|
148
|
+
case LogicalTypeId::UNION: {
|
149
|
+
auto member_count = UnionType::GetMemberCount(original);
|
150
|
+
child_list_t<LogicalType> new_children;
|
151
|
+
for (idx_t i = 0; i < member_count; i++) {
|
152
|
+
auto &child_name = UnionType::GetMemberName(original, i);
|
153
|
+
auto &child_type = UnionType::GetMemberType(original, i);
|
154
|
+
|
155
|
+
LogicalType new_type;
|
156
|
+
if (!IsSupported(type_check(child_type))) {
|
157
|
+
new_type = AlterLogicalType(child_type, type_check);
|
158
|
+
} else {
|
159
|
+
new_type = child_type;
|
160
|
+
}
|
161
|
+
|
162
|
+
new_children.push_back(std::make_pair(child_name, new_type));
|
163
|
+
}
|
164
|
+
return LogicalType::UNION(std::move(new_children));
|
165
|
+
}
|
166
|
+
case LogicalTypeId::MAP: {
|
167
|
+
auto &key_type = MapType::KeyType(original);
|
168
|
+
auto &value_type = MapType::ValueType(original);
|
169
|
+
|
170
|
+
LogicalType new_key_type;
|
171
|
+
LogicalType new_value_type;
|
172
|
+
if (!IsSupported(type_check(key_type))) {
|
173
|
+
new_key_type = AlterLogicalType(key_type, type_check);
|
174
|
+
} else {
|
175
|
+
new_key_type = key_type;
|
176
|
+
}
|
177
|
+
|
178
|
+
if (!IsSupported(type_check(value_type))) {
|
179
|
+
new_value_type = AlterLogicalType(value_type, type_check);
|
180
|
+
} else {
|
181
|
+
new_value_type = value_type;
|
182
|
+
}
|
183
|
+
return LogicalType::MAP(new_key_type, new_value_type);
|
184
|
+
}
|
185
|
+
default: {
|
186
|
+
D_ASSERT(!IsSupported(type_check(original)));
|
187
|
+
return LogicalType::VARCHAR;
|
188
|
+
}
|
189
|
+
}
|
190
|
+
}
|
191
|
+
|
192
|
+
static bool NeedsCast(LogicalType &type, copy_supports_type_t type_check) {
|
193
|
+
if (!type_check) {
|
194
|
+
return false;
|
195
|
+
}
|
196
|
+
if (IsSupported(type_check(type))) {
|
197
|
+
// The type is supported in it's entirety, no cast is required
|
198
|
+
return false;
|
199
|
+
}
|
200
|
+
// Change the type to something that is supported
|
201
|
+
type = AlterLogicalType(type, type_check);
|
202
|
+
return true;
|
203
|
+
}
|
204
|
+
|
205
|
+
static unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, child_list_t<LogicalType> &select_list,
|
206
|
+
copy_supports_type_t type_check) {
|
119
207
|
auto ref = make_uniq<BaseTableRef>();
|
120
208
|
ref->catalog_name = stmt.info->catalog;
|
121
209
|
ref->schema_name = stmt.info->schema;
|
@@ -123,7 +211,21 @@ unique_ptr<QueryNode> CreateSelectStatement(CopyStatement &stmt, vector<unique_p
|
|
123
211
|
|
124
212
|
auto statement = make_uniq<SelectNode>();
|
125
213
|
statement->from_table = std::move(ref);
|
126
|
-
|
214
|
+
|
215
|
+
vector<unique_ptr<ParsedExpression>> expressions;
|
216
|
+
for (auto &col : select_list) {
|
217
|
+
auto &name = col.first;
|
218
|
+
auto &type = col.second;
|
219
|
+
|
220
|
+
auto expression = make_uniq_base<ParsedExpression, ColumnRefExpression>(name);
|
221
|
+
if (NeedsCast(type, type_check)) {
|
222
|
+
// Add a cast to a type supported by the copy function
|
223
|
+
expression = make_uniq_base<ParsedExpression, CastExpression>(type, std::move(expression));
|
224
|
+
}
|
225
|
+
expressions.push_back(std::move(expression));
|
226
|
+
}
|
227
|
+
|
228
|
+
statement->select_list = std::move(expressions);
|
127
229
|
return std::move(statement);
|
128
230
|
}
|
129
231
|
|
@@ -194,16 +296,10 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
|
|
194
296
|
info->table = table.name;
|
195
297
|
|
196
298
|
// We can not export generated columns
|
197
|
-
|
299
|
+
child_list_t<LogicalType> select_list;
|
300
|
+
|
198
301
|
for (auto &col : table.GetColumns().Physical()) {
|
199
|
-
|
200
|
-
auto is_supported = copy_function.function.supports_type;
|
201
|
-
if (is_supported && !is_supported(col.Type())) {
|
202
|
-
expression =
|
203
|
-
make_uniq_base<ParsedExpression, CastExpression>(LogicalType::VARCHAR, std::move(expression));
|
204
|
-
}
|
205
|
-
expressions.push_back(std::move(expression));
|
206
|
-
info->select_list.push_back(col.GetName());
|
302
|
+
select_list.push_back(std::make_pair(col.Name(), col.Type()));
|
207
303
|
}
|
208
304
|
|
209
305
|
ExportedTableData exported_data;
|
@@ -220,7 +316,8 @@ BoundStatement Binder::Bind(ExportStatement &stmt) {
|
|
220
316
|
// generate the copy statement and bind it
|
221
317
|
CopyStatement copy_stmt;
|
222
318
|
copy_stmt.info = std::move(info);
|
223
|
-
copy_stmt.select_statement =
|
319
|
+
copy_stmt.select_statement =
|
320
|
+
CreateSelectStatement(copy_stmt, select_list, copy_function.function.supports_type);
|
224
321
|
|
225
322
|
auto copy_binder = Binder::CreateBinder(context, this);
|
226
323
|
auto bound_statement = copy_binder->Bind(copy_stmt);
|
@@ -363,63 +363,64 @@ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog, Serializer &
|
|
363
363
|
|
364
364
|
void CheckpointReader::ReadIndex(ClientContext &context, Deserializer &deserializer) {
|
365
365
|
|
366
|
-
//
|
367
|
-
auto
|
368
|
-
auto &
|
369
|
-
|
370
|
-
//
|
371
|
-
auto &
|
372
|
-
auto &
|
373
|
-
catalog.GetEntry(context, CatalogType::TABLE_ENTRY,
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
366
|
+
// deserialize the index create info
|
367
|
+
auto create_info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
|
368
|
+
auto &info = create_info->Cast<CreateIndexInfo>();
|
369
|
+
|
370
|
+
// create the index in the catalog
|
371
|
+
auto &schema = catalog.GetSchema(context, create_info->schema);
|
372
|
+
auto &table =
|
373
|
+
catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info->schema, info.table).Cast<DuckTableEntry>();
|
374
|
+
|
375
|
+
auto &index = schema.CreateIndex(context, info, table)->Cast<DuckIndexEntry>();
|
376
|
+
|
377
|
+
index.info = table.GetStorage().info;
|
378
|
+
// insert the parsed expressions into the stored index so that we correctly (de)serialize it during consecutive
|
379
|
+
// checkpoints
|
380
|
+
for (auto &parsed_expr : info.parsed_expressions) {
|
381
|
+
index.parsed_expressions.push_back(parsed_expr->Copy());
|
382
|
+
}
|
383
|
+
|
384
|
+
// we deserialize the index lazily, i.e., we do not need to load any node information
|
378
385
|
// except the root block pointer
|
379
|
-
auto
|
386
|
+
auto root_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
|
380
387
|
|
381
|
-
// obtain the expressions of the ART from the index metadata
|
382
|
-
vector<unique_ptr<Expression>> unbound_expressions;
|
388
|
+
// obtain the parsed expressions of the ART from the index metadata
|
383
389
|
vector<unique_ptr<ParsedExpression>> parsed_expressions;
|
384
|
-
for (auto &
|
385
|
-
parsed_expressions.push_back(
|
390
|
+
for (auto &parsed_expr : info.parsed_expressions) {
|
391
|
+
parsed_expressions.push_back(parsed_expr->Copy());
|
386
392
|
}
|
393
|
+
D_ASSERT(!parsed_expressions.empty());
|
387
394
|
|
388
|
-
// bind the parsed expressions
|
389
|
-
// add the table to the bind context
|
395
|
+
// add the table to the bind context to bind the parsed expressions
|
390
396
|
auto binder = Binder::CreateBinder(context);
|
391
397
|
vector<LogicalType> column_types;
|
392
398
|
vector<string> column_names;
|
393
|
-
for (auto &col :
|
399
|
+
for (auto &col : table.GetColumns().Logical()) {
|
394
400
|
column_types.push_back(col.Type());
|
395
401
|
column_names.push_back(col.Name());
|
396
402
|
}
|
403
|
+
|
404
|
+
// create a binder to bind the parsed expressions
|
397
405
|
vector<column_t> column_ids;
|
398
|
-
binder->bind_context.AddBaseTable(0,
|
406
|
+
binder->bind_context.AddBaseTable(0, info.table, column_names, column_types, column_ids, &table);
|
399
407
|
IndexBinder idx_binder(*binder, context);
|
408
|
+
|
409
|
+
// bind the parsed expressions to create unbound expressions
|
410
|
+
vector<unique_ptr<Expression>> unbound_expressions;
|
400
411
|
unbound_expressions.reserve(parsed_expressions.size());
|
401
412
|
for (auto &expr : parsed_expressions) {
|
402
413
|
unbound_expressions.push_back(idx_binder.Bind(expr));
|
403
414
|
}
|
404
415
|
|
405
|
-
if (parsed_expressions.empty()) {
|
406
|
-
// this is a PK/FK index: we create the necessary bound column ref expressions
|
407
|
-
unbound_expressions.reserve(index_info.column_ids.size());
|
408
|
-
for (idx_t key_nr = 0; key_nr < index_info.column_ids.size(); key_nr++) {
|
409
|
-
auto &col = table_catalog.GetColumn(LogicalIndex(index_info.column_ids[key_nr]));
|
410
|
-
unbound_expressions.push_back(
|
411
|
-
make_uniq<BoundColumnRefExpression>(col.GetName(), col.GetType(), ColumnBinding(0, key_nr)));
|
412
|
-
}
|
413
|
-
}
|
414
|
-
|
415
416
|
// create the index and add it to the storage
|
416
|
-
switch (
|
417
|
+
switch (info.index_type) {
|
417
418
|
case IndexType::ART: {
|
418
|
-
auto &storage =
|
419
|
-
auto art = make_uniq<ART>(
|
420
|
-
|
419
|
+
auto &storage = table.GetStorage();
|
420
|
+
auto art = make_uniq<ART>(info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
|
421
|
+
info.constraint_type, storage.db, nullptr, root_block_pointer);
|
421
422
|
|
422
|
-
|
423
|
+
index.index = art.get();
|
423
424
|
storage.info->indexes.AddIndex(std::move(art));
|
424
425
|
} break;
|
425
426
|
default:
|
@@ -292,7 +292,11 @@ void RLESkip(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count) {
|
|
292
292
|
scan_state.Skip(segment, skip_count);
|
293
293
|
}
|
294
294
|
|
295
|
+
template <bool ENTIRE_VECTOR>
|
295
296
|
static bool CanEmitConstantVector(idx_t position, idx_t run_length, idx_t scan_count) {
|
297
|
+
if (!ENTIRE_VECTOR) {
|
298
|
+
return false;
|
299
|
+
}
|
296
300
|
if (scan_count != STANDARD_VECTOR_SIZE) {
|
297
301
|
// Only when we can fill an entire Vector can we emit a ConstantVector, because subsequent scans require the
|
298
302
|
// input Vector to be flat
|
@@ -330,9 +334,9 @@ static void RLEScanConstant(RLEScanState<T> &scan_state, rle_count_t *index_poin
|
|
330
334
|
return;
|
331
335
|
}
|
332
336
|
|
333
|
-
template <class T>
|
334
|
-
void
|
335
|
-
|
337
|
+
template <class T, bool ENTIRE_VECTOR>
|
338
|
+
void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
|
339
|
+
idx_t result_offset) {
|
336
340
|
auto &scan_state = state.scan_state->Cast<RLEScanState<T>>();
|
337
341
|
|
338
342
|
auto data = scan_state.handle.Ptr() + segment.GetBlockOffset();
|
@@ -340,7 +344,8 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
|
|
340
344
|
auto index_pointer = reinterpret_cast<rle_count_t *>(data + scan_state.rle_count_offset);
|
341
345
|
|
342
346
|
// If we are scanning an entire Vector and it contains only a single run
|
343
|
-
if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos],
|
347
|
+
if (CanEmitConstantVector<ENTIRE_VECTOR>(scan_state.position_in_entry, index_pointer[scan_state.entry_pos],
|
348
|
+
scan_count)) {
|
344
349
|
RLEScanConstant<T>(scan_state, index_pointer, data_pointer, scan_count, result);
|
345
350
|
return;
|
346
351
|
}
|
@@ -357,9 +362,15 @@ void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_c
|
|
357
362
|
}
|
358
363
|
}
|
359
364
|
|
365
|
+
template <class T>
|
366
|
+
void RLEScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
|
367
|
+
idx_t result_offset) {
|
368
|
+
return RLEScanPartialInternal<T, false>(segment, state, scan_count, result, result_offset);
|
369
|
+
}
|
370
|
+
|
360
371
|
template <class T>
|
361
372
|
void RLEScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result) {
|
362
|
-
|
373
|
+
RLEScanPartialInternal<T, true>(segment, state, scan_count, result, 0);
|
363
374
|
}
|
364
375
|
|
365
376
|
//===--------------------------------------------------------------------===//
|
@@ -159,7 +159,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
|
|
159
159
|
AppendToIndexes(transaction, *row_groups, table.info->indexes, table.GetTypes(), append_state.current_row);
|
160
160
|
}
|
161
161
|
if (error) {
|
162
|
-
// need to revert
|
162
|
+
// need to revert all appended row ids
|
163
163
|
row_t current_row = append_state.row_start;
|
164
164
|
// remove the data from the indexes, if there are any indexes
|
165
165
|
row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
|
@@ -184,6 +184,13 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
|
|
184
184
|
if (append_to_table) {
|
185
185
|
table.RevertAppendInternal(append_state.row_start, append_count);
|
186
186
|
}
|
187
|
+
|
188
|
+
// we need to vacuum the indexes to remove any buffers that are now empty
|
189
|
+
// due to reverting the appends
|
190
|
+
table.info->indexes.Scan([&](Index &index) {
|
191
|
+
index.Vacuum();
|
192
|
+
return false;
|
193
|
+
});
|
187
194
|
error.Throw();
|
188
195
|
}
|
189
196
|
}
|
package/test/prepare.test.ts
CHANGED
@@ -652,7 +652,16 @@ describe('prepare', function() {
|
|
652
652
|
});
|
653
653
|
it("should aggregate kurtosis(num)", function (done) {
|
654
654
|
db.all("SELECT kurtosis(num) as kurtosis FROM foo", function (err: null | Error, res: TableData) {
|
655
|
-
|
655
|
+
// The `num` column of table `foo` contains each integer from 0 to 999,999 exactly once.
|
656
|
+
// This is a uniform distribution. The excess kurtosis for a uniform distribution is exactly -1.2.
|
657
|
+
// See https://en.wikipedia.org/wiki/Kurtosis#Other_well-known_distributions
|
658
|
+
const expected = -1.2;
|
659
|
+
|
660
|
+
// The calculated value can differ from the exact answer by small amounts on different platforms due
|
661
|
+
// to floating-point errors. This tolerance was determined experimentally.
|
662
|
+
const tolerance = Number.EPSILON * 10;
|
663
|
+
|
664
|
+
assert.ok(Math.abs(res[0].kurtosis - expected) < tolerance);
|
656
665
|
done(err);
|
657
666
|
});
|
658
667
|
});
|
@@ -90,7 +90,7 @@ const correct_answer_map: Record<string, any[]> = {
|
|
90
90
|
date_array: [
|
91
91
|
[],
|
92
92
|
[
|
93
|
-
new Date(1970, 0, 1),
|
93
|
+
new Date(Date.UTC(1970, 0, 1)),
|
94
94
|
null,
|
95
95
|
new Date("0001-01-01T00:00:00.000Z"),
|
96
96
|
new Date("9999-12-31T00:00:00.000Z"),
|
@@ -100,7 +100,7 @@ const correct_answer_map: Record<string, any[]> = {
|
|
100
100
|
timestamp_array: [
|
101
101
|
[],
|
102
102
|
[
|
103
|
-
new Date(1970, 0, 1),
|
103
|
+
new Date(Date.UTC(1970, 0, 1)),
|
104
104
|
null,
|
105
105
|
new Date("0001-01-01T00:00:00.000Z"),
|
106
106
|
new Date("9999-12-31T23:59:59.999Z"),
|
@@ -111,7 +111,7 @@ const correct_answer_map: Record<string, any[]> = {
|
|
111
111
|
timestamptz_array: [
|
112
112
|
[],
|
113
113
|
[
|
114
|
-
new Date(1970, 0, 1),
|
114
|
+
new Date(Date.UTC(1970, 0, 1)),
|
115
115
|
null,
|
116
116
|
new Date("0001-01-01T00:00:00.000Z"),
|
117
117
|
new Date("9999-12-31T23:59:59.999Z"),
|
@@ -171,7 +171,7 @@ const correct_answer_map: Record<string, any[]> = {
|
|
171
171
|
],
|
172
172
|
|
173
173
|
timestamp: [
|
174
|
-
new Date(
|
174
|
+
new Date(Date.UTC(1990, 0, 1)),
|
175
175
|
new Date("9999-12-31T23:59:59.000Z"),
|
176
176
|
null,
|
177
177
|
],
|