duckdb 0.8.2-dev4653.0 → 0.8.2-dev4871.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +0 -1
- package/binding.gyp.in +0 -1
- package/package.json +1 -1
- package/src/connection.cpp +10 -23
- package/src/data_chunk.cpp +1 -3
- package/src/database.cpp +4 -9
- package/src/duckdb/extension/icu/icu-datepart.cpp +12 -8
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +8 -6
- package/src/duckdb/extension/json/json_functions.cpp +4 -6
- package/src/duckdb/src/common/enum_util.cpp +10 -5
- package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
- package/src/duckdb/src/common/row_operations/row_matcher.cpp +408 -0
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +3 -3
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +35 -17
- package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +44 -43
- package/src/duckdb/src/common/vector_operations/vector_hash.cpp +1 -0
- package/src/duckdb/src/core_functions/function_list.cpp +1 -1
- package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +86 -50
- package/src/duckdb/src/core_functions/scalar/generic/hash.cpp +3 -0
- package/src/duckdb/src/core_functions/scalar/string/repeat.cpp +8 -5
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +5 -4
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +13 -0
- package/src/duckdb/src/execution/join_hashtable.cpp +71 -59
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
- package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
- package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +9 -4
- package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
- package/src/duckdb/src/execution/reservoir_sample.cpp +3 -9
- package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +8 -2
- package/src/duckdb/src/function/function_binder.cpp +10 -9
- package/src/duckdb/src/function/scalar/string/like.cpp +0 -3
- package/src/duckdb/src/function/table/read_csv.cpp +12 -9
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp +11 -3
- package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp +63 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +8 -2
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
- package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +14 -8
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/relation.hpp +4 -0
- package/src/duckdb/src/main/config.cpp +1 -1
- package/src/duckdb/src/main/query_result.cpp +16 -10
- package/src/duckdb/src/main/relation.cpp +10 -0
- package/src/duckdb/src/optimizer/rule/date_part_simplification.cpp +0 -3
- package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +12 -4
- package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -3
- package/src/duckdb/src/storage/data_table.cpp +10 -0
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
- package/src/duckdb/ub_src_common_row_operations.cpp +1 -1
- package/src/statement.cpp +2 -4
- package/test/database_fail.test.ts +6 -0
- package/src/duckdb/src/common/row_operations/row_match.cpp +0 -359
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
@@ -46,7 +46,8 @@ struct Parse {
 validity_mask.SetInvalid(machine.cur_rows);
 }
 }
-if (machine.state == CSVState::STANDARD
+if (machine.state == CSVState::STANDARD ||
+(machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
 machine.value += current_char;
 }
 machine.cur_rows +=
@@ -57,7 +58,7 @@ struct Parse {
 machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
 machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
 
-if (machine.cur_rows >=
+if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
 // We sniffed enough rows
 return true;
 }
@@ -65,11 +66,22 @@ struct Parse {
 }
 
 inline static void Finalize(CSVStateMachine &machine, DataChunk &parse_chunk) {
-if (machine.cur_rows <
+if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
 machine.VerifyUTF8();
 auto &v = parse_chunk.data[machine.column_count++];
 auto parse_data = FlatVector::GetData<string_t>(v);
-
+if (machine.value.empty()) {
+auto &validity_mask = FlatVector::Validity(v);
+validity_mask.SetInvalid(machine.cur_rows);
+} else {
+parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+}
+while (machine.column_count < parse_chunk.ColumnCount()) {
+auto &v_pad = parse_chunk.data[machine.column_count++];
+auto &validity_mask = FlatVector::Validity(v_pad);
+validity_mask.SetInvalid(machine.cur_rows);
+}
+machine.cur_rows++;
 }
 parse_chunk.SetCardinality(machine.cur_rows);
 }
@@ -104,8 +116,8 @@ void CSVSniffer::RefineTypes() {
 return;
 }
 DataChunk parse_chunk;
-parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types,
-for (idx_t i = 1; i < best_candidate->options.
+parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, STANDARD_VECTOR_SIZE);
+for (idx_t i = 1; i < best_candidate->options.sample_size_chunks; i++) {
 bool finished_file = best_candidate->csv_buffer_iterator.Finished();
 if (finished_file) {
 // we finished the file: stop
@@ -124,6 +136,7 @@ void CSVSniffer::RefineTypes() {
 best_candidate->csv_buffer_iterator.Process<Parse>(*best_candidate, parse_chunk);
 for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
 vector<LogicalType> &col_type_candidates = best_sql_types_candidates_per_column_idx[col];
+bool is_bool_type = col_type_candidates.back() == LogicalType::BOOLEAN;
 while (col_type_candidates.size() > 1) {
 const auto &sql_type = col_type_candidates.back();
 // narrow down the date formats
@@ -154,6 +167,14 @@ void CSVSniffer::RefineTypes() {
 if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
 break;
 } else {
+if (col_type_candidates.back() == LogicalType::BOOLEAN && is_bool_type) {
+// If we thought this was a boolean value (i.e., T,F, True, False) and it is not, we
+// immediately pop to varchar.
+while (col_type_candidates.back() != LogicalType::VARCHAR) {
+col_type_candidates.pop_back();
+}
+break;
+}
 col_type_candidates.pop_back();
 }
 }
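As a rough illustration of the boolean fallback introduced in RefineTypes() above, the following standalone sketch models the per-column type-candidate stack with plain strings; CastsCleanly() is a made-up stand-in for TryCastVector(), not DuckDB code.

// Standalone sketch of the candidate-popping rule added in RefineTypes() above.
// Types are modeled as plain strings; CastsCleanly() is a hypothetical stand-in
// for TryCastVector().
#include <iostream>
#include <string>
#include <vector>

static bool CastsCleanly(const std::string &type, const std::string &value) {
    if (type == "BOOLEAN") {
        return value == "T" || value == "F" || value == "True" || value == "False";
    }
    return type == "VARCHAR"; // everything casts to VARCHAR
}

int main() {
    // Candidates ordered from most general (bottom) to most specific (top).
    std::vector<std::string> candidates = {"VARCHAR", "DOUBLE", "BIGINT", "BOOLEAN"};
    const bool is_bool_type = candidates.back() == "BOOLEAN";
    const std::string value = "TRUE_NORTH"; // looked boolean at first, but is not

    while (candidates.size() > 1) {
        if (CastsCleanly(candidates.back(), value)) {
            break;
        }
        if (candidates.back() == "BOOLEAN" && is_bool_type) {
            // A failed boolean guess falls straight to VARCHAR instead of trying
            // DOUBLE/BIGINT, which could not hold T/F values anyway.
            while (candidates.back() != "VARCHAR") {
                candidates.pop_back();
            }
            break;
        }
        candidates.pop_back();
    }
    std::cout << "refined type: " << candidates.back() << "\n"; // prints VARCHAR
}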
package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp
@@ -420,6 +420,8 @@ public:
 }
 
 DataChunk join_keys;
+TupleDataChunkState join_key_state;
+
 ExpressionExecutor probe_executor;
 unique_ptr<JoinHashTable::ScanStructure> scan_structure;
 unique_ptr<OperatorState> perfect_hash_join_state;
@@ -446,6 +448,7 @@ unique_ptr<OperatorState> PhysicalHashJoin::GetOperatorState(ExecutionContext &c
 for (auto &cond : conditions) {
 state->probe_executor.AddExpression(*cond.left);
 }
+TupleDataCollection::InitializeChunkState(state->join_key_state, condition_types);
 }
 if (sink.external) {
 state->spill_chunk.Initialize(allocator, sink.probe_types);
@@ -502,10 +505,10 @@ OperatorResultType PhysicalHashJoin::ExecuteInternal(ExecutionContext &context,
 
 // perform the actual probe
 if (sink.external) {
-state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys,
-state.spill_state, state.spill_chunk);
+state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, state.join_key_state, input,
+*sink.probe_spill, state.spill_state, state.spill_chunk);
 } else {
-state.scan_structure = sink.hash_table->Probe(state.join_keys);
+state.scan_structure = sink.hash_table->Probe(state.join_keys, state.join_key_state);
 }
 state.scan_structure->Next(state.join_keys, input, chunk);
 return OperatorResultType::HAVE_MORE_OUTPUT;
@@ -605,6 +608,7 @@ public:
 DataChunk probe_chunk;
 DataChunk join_keys;
 DataChunk payload;
+TupleDataChunkState join_key_state;
 //! Column indices to easily reference the join keys/payload columns in probe_chunk
 vector<idx_t> join_key_indices;
 vector<idx_t> payload_indices;
@@ -782,6 +786,7 @@ HashJoinLocalSourceState::HashJoinLocalSourceState(const PhysicalHashJoin &op, A
 probe_chunk.Initialize(allocator, sink.probe_types);
 join_keys.Initialize(allocator, op.condition_types);
 payload.Initialize(allocator, op.children[0]->types);
+TupleDataCollection::InitializeChunkState(join_key_state, op.condition_types);
 
 // Store the indices of the columns to reference them easily
 idx_t col_idx = 0;
@@ -871,7 +876,7 @@ void HashJoinLocalSourceState::ExternalProbe(HashJoinGlobalSinkState &sink, Hash
 }
 
 // Perform the probe
-scan_structure = sink.hash_table->Probe(join_keys, precomputed_hashes);
+scan_structure = sink.hash_table->Probe(join_keys, join_key_state, precomputed_hashes);
 scan_structure->Next(join_keys, payload, chunk);
 }
 
package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp
@@ -254,7 +254,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
 }
 
 bool has_equality = false;
-// bool has_inequality = false;
 size_t has_range = 0;
 for (size_t c = 0; c < op.conditions.size(); ++c) {
 auto &cond = op.conditions[c];
@@ -271,7 +270,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
 break;
 case ExpressionType::COMPARE_NOTEQUAL:
 case ExpressionType::COMPARE_DISTINCT_FROM:
-// has_inequality = true;
 break;
 default:
 throw NotImplementedException("Unimplemented comparison join");
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
@@ -474,14 +474,9 @@ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstat
 //===--------------------------------------------------------------------===//
 // Source
 //===--------------------------------------------------------------------===//
-idx_t RadixPartitionedHashTable::
-const auto count = CountInternal(sink_p);
-return count == 0 && grouping_set.empty() ? 1 : count;
-}
-
-idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
+idx_t RadixPartitionedHashTable::NumberOfPartitions(GlobalSinkState &sink_p) const {
 auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
-return sink.
+return sink.partitions.size();
 }
 
 void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
@@ -570,8 +565,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
 D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);
 
 const auto n_partitions = sink.partitions.size();
-if (
-finished = true;
+if (finished) {
 return false;
 }
 // We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
@@ -595,6 +589,11 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
 return true;
 }
 
+// We didn't assign a Scan task
+if (sink.finalize_idx >= n_partitions) {
+return false; // No finalize tasks left
+}
+
 // We can just increment the atomic here, much simpler than assigning the scan task
 lstate.task_idx = sink.finalize_idx++;
 if (lstate.task_idx < n_partitions) {
@@ -603,7 +602,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
 return true;
 }
 
-// We didn't manage to assign a
+// We didn't manage to assign a Finalize task
 return false;
 }
 
@@ -693,15 +692,18 @@ void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSo
 
 if (!data_collection.Scan(scan_state, scan_chunk)) {
 scan_status = RadixHTScanStatus::DONE;
-if (++gstate.scan_done == sink.partitions.size()) {
-gstate.finished = true;
-}
 if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
 data_collection.Reset();
 }
 return;
 }
 
+if (data_collection.ScanComplete(scan_state)) {
+if (++gstate.scan_done == sink.partitions.size()) {
+gstate.finished = true;
+}
+}
+
 RowOperationsState row_state(aggregate_allocator);
 const auto group_cols = layout.ColumnCount() - 1;
 RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
@@ -758,36 +760,38 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
 return SourceResultType::FINISHED;
 }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-aggr.function.destructor
+if (sink.count_before_combining == 0) {
+if (grouping_set.empty()) {
+// Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
+// For each column in the aggregates, set to initial state
+chunk.SetCardinality(1);
+for (auto null_group : null_groups) {
+chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+ConstantVector::SetNull(chunk.data[null_group], true);
+}
+ArenaAllocator allocator(BufferAllocator::Get(context.client));
+for (idx_t i = 0; i < op.aggregates.size(); i++) {
+D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
+auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
+auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
+aggr.function.initialize(aggr_state.get());
+
+AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
+Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
+aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
+if (aggr.function.destructor) {
+aggr.function.destructor(state_vector, aggr_input_data, 1);
+}
+}
+// Place the grouping values (all the groups of the grouping_set condensed into a single value)
+// Behind the null groups + aggregates
+for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
+chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
 }
-}
-// Place the grouping values (all the groups of the grouping_set condensed into a single value)
-// Behind the null groups + aggregates
-for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
-chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
 }
 gstate.finished = true;
-return SourceResultType::
+return SourceResultType::FINISHED;
 }
 
 while (!gstate.finished && chunk.size() == 0) {
@@ -796,7 +800,11 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
 }
 }
 
-
+if (chunk.size() != 0) {
+return SourceResultType::HAVE_MORE_OUTPUT;
+} else {
+return SourceResultType::FINISHED;
+}
 }
 
 } // namespace duckdb
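The Finalize-task hand-out above avoids locks by bumping a single atomic counter; the sketch below reproduces that pattern with invented names (SinkState, AssignFinalizeTask) rather than DuckDB's classes.

// Minimal sketch of handing out one Finalize task per partition with an atomic
// counter, as in AssignTask() above. Names are illustrative, not DuckDB's.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct SinkState {
    std::size_t n_partitions = 8;
    std::atomic<std::size_t> finalize_idx {0};
};

static bool AssignFinalizeTask(SinkState &sink, std::size_t &task_idx_out) {
    if (sink.finalize_idx.load() >= sink.n_partitions) {
        return false; // no Finalize tasks left
    }
    // Just increment the atomic: each thread that draws an index < n_partitions
    // owns exactly that partition, no lock needed.
    const auto task_idx = sink.finalize_idx.fetch_add(1);
    if (task_idx < sink.n_partitions) {
        task_idx_out = task_idx;
        return true;
    }
    return false; // another thread took the last task between the check and the increment
}

int main() {
    SinkState sink;
    std::vector<std::thread> workers;
    for (int t = 0; t < 4; t++) {
        workers.emplace_back([&sink, t] {
            std::size_t task;
            while (AssignFinalizeTask(sink, task)) {
                std::printf("worker %d finalizes partition %zu\n", t, task);
            }
        });
    }
    for (auto &w : workers) {
        w.join();
    }
}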
package/src/duckdb/src/execution/reservoir_sample.cpp
@@ -107,25 +107,19 @@ void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) {
 if (append_to_next_sample > 0) {
 // we need to also add to the next sample
 DataChunk new_chunk;
-new_chunk.
-
-for (idx_t r = 0; r < append_to_current_sample_count; r++) {
-sel.set_index(r, r);
-}
-new_chunk.Slice(sel, append_to_current_sample_count);
+new_chunk.InitializeEmpty(input.GetTypes());
+new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count);
 new_chunk.Flatten();
-
 current_sample->AddToReservoir(new_chunk);
 } else {
 input.Flatten();
-
 input.SetCardinality(append_to_current_sample_count);
 current_sample->AddToReservoir(input);
 }
 }
 if (append_to_next_sample > 0) {
 // slice the input for the remainder
-SelectionVector sel(
+SelectionVector sel(append_to_next_sample);
 for (idx_t i = 0; i < append_to_next_sample; i++) {
 sel.set_index(i, append_to_current_sample_count + i);
 }
package/src/duckdb/src/function/cast/vector_cast_helpers.cpp
@@ -20,10 +20,16 @@ inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) {
 static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) {
 char quote = buf[pos];
 pos++;
+bool escaped = false;
 
 while (pos < len) {
-if (buf[pos] ==
-
+if (buf[pos] == '\\') {
+escaped = !escaped;
+} else {
+if (buf[pos] == quote && !escaped) {
+return true;
+}
+escaped = false;
 }
 pos++;
 }
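The escape-aware quote skipping added above is easy to exercise on its own; this sketch lifts the same loop onto a plain C string, with size_t standing in for idx_t and the surrounding cast code omitted.

// Standalone version of the backslash-escape handling added above: a quote
// character only closes the string when it is not preceded by an odd number
// of backslashes.
#include <cstdio>
#include <cstring>

static bool SkipToCloseQuotes(size_t &pos, const char *buf, size_t len) {
    char quote = buf[pos];
    pos++;
    bool escaped = false;
    while (pos < len) {
        if (buf[pos] == '\\') {
            escaped = !escaped;
        } else {
            if (buf[pos] == quote && !escaped) {
                return true;
            }
            escaped = false;
        }
        pos++;
    }
    return false; // unterminated string
}

int main() {
    const char *input = "\"a \\\" still inside\" rest";
    size_t pos = 0;
    if (SkipToCloseQuotes(pos, input, std::strlen(input))) {
        std::printf("closing quote at offset %zu\n", pos); // offset 18, not 4
    }
}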
package/src/duckdb/src/function/function_binder.cpp
@@ -1,16 +1,16 @@
 #include "duckdb/function/function_binder.hpp"
-#include "duckdb/common/limits.hpp"
 
-#include "duckdb/
-#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
-#include "duckdb/planner/expression/bound_function_expression.hpp"
-#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/catalog/catalog.hpp"
 #include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
-
-#include "duckdb/
+#include "duckdb/common/limits.hpp"
+#include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/function/aggregate_function.hpp"
 #include "duckdb/function/cast_rules.hpp"
-#include "duckdb/
+#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
+#include "duckdb/planner/expression/bound_cast_expression.hpp"
+#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/planner/expression/bound_function_expression.hpp"
+#include "duckdb/planner/expression_binder.hpp"
 
 namespace duckdb {
 
@@ -268,7 +268,8 @@ unique_ptr<Expression> FunctionBinder::BindScalarFunction(ScalarFunctionCatalogE
 
 if (bound_function.null_handling == FunctionNullHandling::DEFAULT_NULL_HANDLING) {
 for (auto &child : children) {
-if (child->return_type == LogicalTypeId::SQLNULL
+if (child->return_type == LogicalTypeId::SQLNULL ||
+(child->IsFoldable() && ExpressionExecutor::EvaluateScalar(context, *child).IsNull())) {
 return make_uniq<BoundConstantExpression>(Value(LogicalType::SQLNULL));
 }
 }
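The extended check above folds a call to a NULL constant when any argument is foldable and evaluates to NULL, not only when it is literally typed SQLNULL. Below is a toy constant-folding sketch of that rule, with invented Expr/BindAdditionCall names and no DuckDB types.

// Toy constant-folding sketch of the rule extended above: under default NULL
// handling, a scalar call folds to NULL when any argument is a NULL literal or
// a foldable expression that evaluates to NULL. Types and names are invented.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Expr {
    bool foldable = false;
    std::optional<int> constant_value; // nullopt == evaluates to NULL
};

static bool ArgumentFoldsToNull(const Expr &e) {
    // The old behavior only caught literals already typed as NULL; the new check
    // also evaluates foldable arguments whose result is NULL.
    return e.foldable && !e.constant_value.has_value();
}

static std::optional<int> BindAdditionCall(const std::vector<Expr> &children) {
    for (auto &child : children) {
        if (ArgumentFoldsToNull(child)) {
            return std::nullopt; // fold the whole call to a NULL constant
        }
    }
    int sum = 0;
    for (auto &child : children) {
        sum += child.constant_value.value_or(0);
    }
    return sum;
}

int main() {
    std::vector<Expr> args = {{true, 40}, {true, std::nullopt}}; // 40 + NULL
    auto result = BindAdditionCall(args);
    std::cout << (result ? std::to_string(*result) : std::string("NULL")) << "\n"; // NULL
}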
package/src/duckdb/src/function/scalar/string/like.cpp
@@ -196,9 +196,6 @@ static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarF
 D_ASSERT(arguments.size() == 2 || arguments.size() == 3);
 if (arguments[1]->IsFoldable()) {
 Value pattern_str = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
-if (pattern_str.IsNull()) {
-return nullptr;
-}
 return LikeMatcher::CreateLikeMatcher(pattern_str.ToString());
 }
 return nullptr;
package/src/duckdb/src/function/table/read_csv.cpp
@@ -107,11 +107,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
 // Initialize Buffer Manager and Sniffer
 auto file_handle = BaseCSVReader::OpenCSV(context, options);
 result->buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
-CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache);
+CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache, explicitly_set_columns);
 auto sniffer_result = sniffer.SniffCSV();
-return_types = sniffer_result.return_types;
 if (names.empty()) {
 names = sniffer_result.names;
+return_types = sniffer_result.return_types;
 } else {
 if (explicitly_set_columns) {
 // The user has influenced the names, can't assume they are valid anymore
@@ -195,6 +195,7 @@ public:
 auto file_count = files_path_p.size();
 line_info.current_batches.resize(file_count);
 line_info.lines_read.resize(file_count);
+line_info.lines_errored.resize(file_count);
 tuple_start.resize(file_count);
 tuple_end.resize(file_count);
 tuple_end_to_batch.resize(file_count);
@@ -509,6 +510,11 @@ bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
 return false;
 }
 
+void LineInfo::Increment(idx_t file_idx, idx_t batch_idx) {
+auto parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
+lines_errored[file_idx][batch_idx]++;
+}
+
 // Returns the 1-indexed line number
 idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify,
 bool stop_at_first) {
@@ -520,12 +526,11 @@ idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t
 
 if (!stop_at_first) {
 // Figure out the amount of lines read in the current file
-
-
-
-break;
+for (idx_t cur_batch_idx = 0; cur_batch_idx <= batch_idx; cur_batch_idx++) {
+if (cur_batch_idx < batch_idx) {
+line_count += lines_errored[file_idx][cur_batch_idx];
 }
-line_count += lines_read[file_idx][
+line_count += lines_read[file_idx][cur_batch_idx];
 }
 return line_count + line_error + 1;
 }
@@ -880,8 +885,6 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
 table_function.named_parameters["header"] = LogicalType::BOOLEAN;
 table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
 table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
-table_function.named_parameters["sample_chunk_size"] = LogicalType::BIGINT;
-table_function.named_parameters["sample_chunks"] = LogicalType::BIGINT;
 table_function.named_parameters["all_varchar"] = LogicalType::BOOLEAN;
 table_function.named_parameters["dateformat"] = LogicalType::VARCHAR;
 table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
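The lines_errored bookkeeping above feeds into the accumulation loop in LineInfo::GetLine; this sketch repeats that arithmetic on plain vectors for a single file, with an invented signature, to show how a global 1-indexed line number comes out.

// Sketch of the accumulation loop added to LineInfo::GetLine above: sum
// lines_read for every batch up to and including the erroring batch, add
// lines_errored for the earlier batches, then add the error offset + 1.
#include <cstdio>
#include <vector>

static size_t GetLine(const std::vector<size_t> &lines_read, const std::vector<size_t> &lines_errored,
                      size_t batch_idx, size_t line_error) {
    size_t line_count = 0;
    for (size_t cur_batch_idx = 0; cur_batch_idx <= batch_idx; cur_batch_idx++) {
        if (cur_batch_idx < batch_idx) {
            line_count += lines_errored[cur_batch_idx];
        }
        line_count += lines_read[cur_batch_idx];
    }
    return line_count + line_error + 1;
}

int main() {
    std::vector<size_t> lines_read = {100, 100, 50}; // accepted lines per batch
    std::vector<size_t> lines_errored = {2, 0, 1};   // rejected lines per batch
    // Error at offset 2 of batch 2: 2 + 100 + 0 + 100 + 50 + 2 + 1 = 255.
    std::printf("error on line %zu\n", GetLine(lines_read, lines_errored, 2, 2));
}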
package/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-
+#define DUCKDB_VERSION "0.8.2-dev4871"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "5a29c99891"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp
@@ -25,7 +25,6 @@ enum class DatePartSpecifier : uint8_t {
 SECOND,
 MINUTE,
 HOUR,
-EPOCH,
 DOW,
 ISODOW,
 WEEK,
@@ -39,11 +38,20 @@ enum class DatePartSpecifier : uint8_t {
 TIMEZONE_MINUTE,
 
 // DOUBLE values
-
+EPOCH,
+JULIAN_DAY,
+
+// Invalid
+INVALID,
+
+// Type ranges
+BEGIN_BIGINT = YEAR,
+BEGIN_DOUBLE = EPOCH,
+BEGIN_INVALID = INVALID,
 };
 
 inline bool IsBigintDatepart(DatePartSpecifier part_code) {
-return size_t(part_code) < size_t(DatePartSpecifier::
+return size_t(part_code) < size_t(DatePartSpecifier::BEGIN_DOUBLE);
 }
 
 DUCKDB_API bool TryGetDatePartSpecifier(const string &specifier, DatePartSpecifier &result);
package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+// DuckDB
+//
+// duckdb/common/row_operations/row_matcher.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "duckdb/common/enums/expression_type.hpp"
+#include "duckdb/common/types.hpp"
+
+namespace duckdb {
+
+class Vector;
+class DataChunk;
+class TupleDataLayout;
+struct TupleDataVectorFormat;
+struct SelectionVector;
+struct MatchFunction;
+
+typedef idx_t (*match_function_t)(Vector &lhs_vector, const TupleDataVectorFormat &lhs_format, SelectionVector &sel,
+const idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+const idx_t col_idx, const vector<MatchFunction> &child_functions,
+SelectionVector *no_match_sel, idx_t &no_match_count);
+
+struct MatchFunction {
+match_function_t function;
+vector<MatchFunction> child_functions;
+};
+
+struct RowMatcher {
+public:
+using Predicates = vector<ExpressionType>;
+
+//! Initializes the RowMatcher, filling match_functions using layout and predicates
+void Initialize(const bool no_match_sel, const TupleDataLayout &layout, const Predicates &predicates);
+//! Given a DataChunk on the LHS, on which we've called TupleDataCollection::ToUnifiedFormat,
+//! we match it with rows on the RHS, according to the given layout and locations.
+//! Initially, 'sel' has 'count' entries which point to what needs to be compared.
+//! After matching is done, this returns how many matching entries there are, which 'sel' is modified to point to
+idx_t Match(DataChunk &lhs, const vector<TupleDataVectorFormat> &lhs_formats, SelectionVector &sel, idx_t count,
+const TupleDataLayout &rhs_layout, Vector &rhs_row_locations, SelectionVector *no_match_sel,
+idx_t &no_match_count);
+
+private:
+//! Gets the templated match function for a given column
+MatchFunction GetMatchFunction(const bool no_match_sel, const LogicalType &type, const ExpressionType predicate);
+template <bool NO_MATCH_SEL>
+MatchFunction GetMatchFunction(const LogicalType &type, const ExpressionType predicate);
+template <bool NO_MATCH_SEL, class T>
+MatchFunction GetMatchFunction(const ExpressionType predicate);
+template <bool NO_MATCH_SEL>
+MatchFunction GetStructMatchFunction(const LogicalType &type, const ExpressionType predicate);
+template <bool NO_MATCH_SEL>
+MatchFunction GetListMatchFunction(const ExpressionType predicate);
+
+private:
+vector<MatchFunction> match_functions;
+};
+
+} // namespace duckdb
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp
@@ -21,7 +21,7 @@ struct RowOperationsState;
 
 typedef void (*tuple_data_scatter_function_t)(const Vector &source, const TupleDataVectorFormat &source_format,
 const SelectionVector &append_sel, const idx_t append_count,
-const TupleDataLayout &layout, Vector &row_locations,
+const TupleDataLayout &layout, const Vector &row_locations,
 Vector &heap_locations, const idx_t col_idx,
 const UnifiedVectorFormat &list_format,
 const vector<TupleDataScatterFunction> &child_functions);
@@ -84,7 +84,11 @@ public:
 TupleDataPinProperties = TupleDataPinProperties::UNPIN_AFTER_DONE);
 //! Initializes the Chunk state of an Append state
 //! - Useful for optimizing many appends made to the same tuple data collection
-void
+void InitializeChunkState(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+//! Initializes the Chunk state of an Append state
+//! - Useful for optimizing many appends made to the same tuple data collection
+static void InitializeChunkState(TupleDataChunkState &chunk_state, const vector<LogicalType> &types,
+vector<column_t> column_ids = {});
 //! Append a DataChunk directly to this TupleDataCollection - calls InitializeAppend and Append internally
 void Append(DataChunk &new_chunk, const SelectionVector &append_sel = *FlatVector::IncrementalSelectionVector(),
 idx_t append_count = DConstants::INVALID_INDEX);
@@ -159,6 +163,8 @@ public:
 bool Scan(TupleDataScanState &state, DataChunk &result);
 //! Scans a DataChunk from the TupleDataCollection
 bool Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result);
+//! Whether the last scan has been completed on this TupleDataCollection
+bool ScanComplete(const TupleDataScanState &state) const;
 
 //! Gathers a DataChunk from the TupleDataCollection, given the specific row locations (requires full pin)
 void Gather(Vector &row_locations, const SelectionVector &scan_sel, const idx_t scan_count, DataChunk &result,
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp
@@ -42,8 +42,8 @@ struct TupleDataVectorFormat {
 const SelectionVector *original_sel;
 SelectionVector original_owned_sel;
 
-UnifiedVectorFormat
-vector<TupleDataVectorFormat>
+UnifiedVectorFormat unified;
+vector<TupleDataVectorFormat> children;
 unique_ptr<CombinedListData> combined_list_data;
 };
 
package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp
@@ -148,6 +148,9 @@ public:
 if (!validity_mask) {
 return ValidityBuffer::MAX_ENTRY;
 }
+return GetValidityEntryUnsafe(entry_idx);
+}
+inline V &GetValidityEntryUnsafe(idx_t entry_idx) const {
 return validity_mask[entry_idx];
 }
 static inline bool AllValid(V entry) {
@@ -156,7 +159,7 @@ public:
 static inline bool NoneValid(V entry) {
 return entry == 0;
 }
-static inline bool RowIsValid(V entry, idx_t idx_in_entry) {
+static inline bool RowIsValid(const V &entry, const idx_t &idx_in_entry) {
 return entry & (V(1) << V(idx_in_entry));
 }
 static inline void GetEntryIndex(idx_t row_idx, idx_t &entry_idx, idx_t &idx_in_entry) {
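RowIsValid above is a single bit test against a 64-bit validity entry; the sketch below shows the same entry/bit indexing on a raw uint64_t array outside of DuckDB.

// Standalone sketch of the validity-entry bit test used by RowIsValid() above:
// one uint64_t entry covers 64 rows, and a row is valid when its bit is set.
#include <cstdint>
#include <cstdio>

static constexpr uint64_t BITS_PER_ENTRY = 64;

static void GetEntryIndex(uint64_t row_idx, uint64_t &entry_idx, uint64_t &idx_in_entry) {
    entry_idx = row_idx / BITS_PER_ENTRY;
    idx_in_entry = row_idx % BITS_PER_ENTRY;
}

static bool RowIsValid(const uint64_t &entry, const uint64_t &idx_in_entry) {
    return entry & (uint64_t(1) << idx_in_entry);
}

int main() {
    uint64_t mask[2] = {~uint64_t(0), ~uint64_t(0)}; // 128 rows, all valid initially
    uint64_t entry_idx, idx_in_entry;

    // Mark row 70 invalid: clear bit 6 of entry 1.
    GetEntryIndex(70, entry_idx, idx_in_entry);
    mask[entry_idx] &= ~(uint64_t(1) << idx_in_entry);

    const uint64_t rows[] = {69, 70, 71};
    for (uint64_t row : rows) {
        GetEntryIndex(row, entry_idx, idx_in_entry);
        std::printf("row %llu valid: %d\n", (unsigned long long)row,
                    (int)RowIsValid(mask[entry_idx], idx_in_entry));
    }
}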
package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp
@@ -285,7 +285,7 @@ struct RepeatFun {
 static constexpr const char *Description = "Repeats the string count number of times";
 static constexpr const char *Example = "repeat('A', 5)";
 
-static
+static ScalarFunctionSet GetFunctions();
 };
 
 struct ReplaceFun {
package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "duckdb/common/row_operations/row_matcher.hpp"
 #include "duckdb/common/types/row/partitioned_tuple_data.hpp"
 #include "duckdb/execution/base_aggregate_hashtable.hpp"
 #include "duckdb/storage/arena_allocator.hpp"
@@ -143,6 +144,9 @@ public:
 void UnpinData();
 
 private:
+//! Efficiently matches groups
+RowMatcher row_matcher;
+
 //! Append state
 struct AggregateHTAppendState {
 AggregateHTAppendState();