duckdb 0.8.2-dev4653.0 → 0.8.2-dev4871.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/binding.gyp +0 -1
  2. package/binding.gyp.in +0 -1
  3. package/package.json +1 -1
  4. package/src/connection.cpp +10 -23
  5. package/src/data_chunk.cpp +1 -3
  6. package/src/database.cpp +4 -9
  7. package/src/duckdb/extension/icu/icu-datepart.cpp +12 -8
  8. package/src/duckdb/extension/json/json_functions/json_transform.cpp +8 -6
  9. package/src/duckdb/extension/json/json_functions.cpp +4 -6
  10. package/src/duckdb/src/common/enum_util.cpp +10 -5
  11. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  12. package/src/duckdb/src/common/row_operations/row_matcher.cpp +408 -0
  13. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +3 -3
  14. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +35 -17
  15. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +44 -43
  16. package/src/duckdb/src/common/vector_operations/vector_hash.cpp +1 -0
  17. package/src/duckdb/src/core_functions/function_list.cpp +1 -1
  18. package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +86 -50
  19. package/src/duckdb/src/core_functions/scalar/generic/hash.cpp +3 -0
  20. package/src/duckdb/src/core_functions/scalar/string/repeat.cpp +8 -5
  21. package/src/duckdb/src/execution/aggregate_hashtable.cpp +5 -4
  22. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +13 -0
  23. package/src/duckdb/src/execution/join_hashtable.cpp +71 -59
  24. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
  25. package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
  26. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
  27. package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
  28. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
  29. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
  30. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
  31. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
  32. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
  33. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
  34. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +9 -4
  35. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -2
  36. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
  37. package/src/duckdb/src/execution/reservoir_sample.cpp +3 -9
  38. package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +8 -2
  39. package/src/duckdb/src/function/function_binder.cpp +10 -9
  40. package/src/duckdb/src/function/scalar/string/like.cpp +0 -3
  41. package/src/duckdb/src/function/table/read_csv.cpp +12 -9
  42. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  43. package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp +11 -3
  44. package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp +63 -0
  45. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +8 -2
  46. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +2 -2
  47. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
  48. package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +1 -1
  49. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +4 -0
  50. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +14 -8
  51. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
  52. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
  53. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
  54. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
  55. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
  56. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
  57. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
  58. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
  59. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
  60. package/src/duckdb/src/include/duckdb/main/relation.hpp +4 -0
  61. package/src/duckdb/src/main/config.cpp +1 -1
  62. package/src/duckdb/src/main/query_result.cpp +16 -10
  63. package/src/duckdb/src/main/relation.cpp +10 -0
  64. package/src/duckdb/src/optimizer/rule/date_part_simplification.cpp +0 -3
  65. package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +12 -4
  66. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -3
  67. package/src/duckdb/src/storage/data_table.cpp +10 -0
  68. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
  69. package/src/duckdb/ub_src_common_row_operations.cpp +1 -1
  70. package/src/statement.cpp +2 -4
  71. package/test/database_fail.test.ts +6 -0
  72. package/src/duckdb/src/common/row_operations/row_match.cpp +0 -359
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
@@ -46,7 +46,8 @@ struct Parse {
                 validity_mask.SetInvalid(machine.cur_rows);
             }
         }
-        if (machine.state == CSVState::STANDARD) {
+        if (machine.state == CSVState::STANDARD ||
+            (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
             machine.value += current_char;
         }
         machine.cur_rows +=
@@ -57,7 +58,7 @@ struct Parse {
         machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
         machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);

-        if (machine.cur_rows >= machine.options.sample_chunk_size) {
+        if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
             // We sniffed enough rows
             return true;
         }
@@ -65,11 +66,22 @@ struct Parse {
     }

     inline static void Finalize(CSVStateMachine &machine, DataChunk &parse_chunk) {
-        if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
+        if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
             machine.VerifyUTF8();
             auto &v = parse_chunk.data[machine.column_count++];
             auto parse_data = FlatVector::GetData<string_t>(v);
-            parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+            if (machine.value.empty()) {
+                auto &validity_mask = FlatVector::Validity(v);
+                validity_mask.SetInvalid(machine.cur_rows);
+            } else {
+                parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
+            }
+            while (machine.column_count < parse_chunk.ColumnCount()) {
+                auto &v_pad = parse_chunk.data[machine.column_count++];
+                auto &validity_mask = FlatVector::Validity(v_pad);
+                validity_mask.SetInvalid(machine.cur_rows);
+            }
+            machine.cur_rows++;
         }
         parse_chunk.SetCardinality(machine.cur_rows);
     }
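The Finalize change handles a truncated last row in two ways: an empty trailing value becomes NULL rather than an empty string, and columns the row never reached are padded with NULLs so the chunk stays rectangular. A standalone sketch of that behavior (stand-in types, not the actual DuckDB code):

    #include <cstddef>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    using Row = std::vector<std::optional<std::string>>; // nullopt models a NULL cell

    Row FinalizeRow(std::string last_value, std::size_t filled_columns, std::size_t total_columns) {
        Row row(total_columns); // every cell starts as NULL, like the padding loop above
        if (filled_columns == 0 || filled_columns > total_columns) {
            return row; // nothing parsed on this row
        }
        for (std::size_t i = 0; i + 1 < filled_columns; i++) {
            row[i] = "..."; // values parsed earlier in the row (elided in this sketch)
        }
        if (!last_value.empty()) { // empty value stays NULL, mirroring SetInvalid
            row[filled_columns - 1] = std::move(last_value);
        }
        return row;
    }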
@@ -104,8 +116,8 @@ void CSVSniffer::RefineTypes() {
         return;
     }
     DataChunk parse_chunk;
-    parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, options.sample_chunk_size);
-    for (idx_t i = 1; i < best_candidate->options.sample_chunks; i++) {
+    parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, STANDARD_VECTOR_SIZE);
+    for (idx_t i = 1; i < best_candidate->options.sample_size_chunks; i++) {
         bool finished_file = best_candidate->csv_buffer_iterator.Finished();
         if (finished_file) {
             // we finished the file: stop
@@ -124,6 +136,7 @@ void CSVSniffer::RefineTypes() {
         best_candidate->csv_buffer_iterator.Process<Parse>(*best_candidate, parse_chunk);
         for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
             vector<LogicalType> &col_type_candidates = best_sql_types_candidates_per_column_idx[col];
+            bool is_bool_type = col_type_candidates.back() == LogicalType::BOOLEAN;
             while (col_type_candidates.size() > 1) {
                 const auto &sql_type = col_type_candidates.back();
                 // narrow down the date formats
@@ -154,6 +167,14 @@ void CSVSniffer::RefineTypes() {
                 if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
                     break;
                 } else {
+                    if (col_type_candidates.back() == LogicalType::BOOLEAN && is_bool_type) {
+                        // If we thought this was a boolean value (i.e., T,F, True, False) and it is not, we
+                        // immediately pop to varchar.
+                        while (col_type_candidates.back() != LogicalType::VARCHAR) {
+                            col_type_candidates.pop_back();
+                        }
+                        break;
+                    }
                    col_type_candidates.pop_back();
                 }
             }
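A standalone sketch of the narrowing strategy (an assumed simplification, using strings in place of LogicalType): candidates sit on a stack with the most general type (VARCHAR) at the bottom, and a failed BOOLEAN guess skips the numeric candidates and falls straight through to VARCHAR:

    #include <string>
    #include <vector>

    void NarrowCandidates(std::vector<std::string> &candidates, bool cast_failed, bool was_bool) {
        if (!cast_failed) {
            return; // the current candidate still fits all sampled values
        }
        if (candidates.back() == "BOOLEAN" && was_bool) {
            // e.g. a column of T/F values followed by "TX": no numeric candidate
            // can apply either, so pop down to VARCHAR in one step
            while (candidates.back() != "VARCHAR") {
                candidates.pop_back();
            }
            return;
        }
        candidates.pop_back(); // otherwise try the next, more general candidate
    }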
package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp
@@ -420,6 +420,8 @@ public:
     }

     DataChunk join_keys;
+    TupleDataChunkState join_key_state;
+
     ExpressionExecutor probe_executor;
     unique_ptr<JoinHashTable::ScanStructure> scan_structure;
     unique_ptr<OperatorState> perfect_hash_join_state;
@@ -446,6 +448,7 @@ unique_ptr<OperatorState> PhysicalHashJoin::GetOperatorState(ExecutionContext &c
         for (auto &cond : conditions) {
             state->probe_executor.AddExpression(*cond.left);
         }
+        TupleDataCollection::InitializeChunkState(state->join_key_state, condition_types);
     }
     if (sink.external) {
         state->spill_chunk.Initialize(allocator, sink.probe_types);
@@ -502,10 +505,10 @@ OperatorResultType PhysicalHashJoin::ExecuteInternal(ExecutionContext &context,

     // perform the actual probe
     if (sink.external) {
-        state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, input, *sink.probe_spill,
-                                                              state.spill_state, state.spill_chunk);
+        state.scan_structure = sink.hash_table->ProbeAndSpill(state.join_keys, state.join_key_state, input,
+                                                              *sink.probe_spill, state.spill_state, state.spill_chunk);
     } else {
-        state.scan_structure = sink.hash_table->Probe(state.join_keys);
+        state.scan_structure = sink.hash_table->Probe(state.join_keys, state.join_key_state);
     }
     state.scan_structure->Next(state.join_keys, input, chunk);
     return OperatorResultType::HAVE_MORE_OUTPUT;
@@ -605,6 +608,7 @@ public:
     DataChunk probe_chunk;
     DataChunk join_keys;
     DataChunk payload;
+    TupleDataChunkState join_key_state;
     //! Column indices to easily reference the join keys/payload columns in probe_chunk
     vector<idx_t> join_key_indices;
     vector<idx_t> payload_indices;
@@ -782,6 +786,7 @@ HashJoinLocalSourceState::HashJoinLocalSourceState(const PhysicalHashJoin &op, A
     probe_chunk.Initialize(allocator, sink.probe_types);
     join_keys.Initialize(allocator, op.condition_types);
     payload.Initialize(allocator, op.children[0]->types);
+    TupleDataCollection::InitializeChunkState(join_key_state, op.condition_types);

     // Store the indices of the columns to reference them easily
     idx_t col_idx = 0;
@@ -871,7 +876,7 @@ void HashJoinLocalSourceState::ExternalProbe(HashJoinGlobalSinkState &sink, Hash
     }

     // Perform the probe
-    scan_structure = sink.hash_table->Probe(join_keys, precomputed_hashes);
+    scan_structure = sink.hash_table->Probe(join_keys, join_key_state, precomputed_hashes);
     scan_structure->Next(join_keys, payload, chunk);
 }

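These probe hunks thread a caller-owned TupleDataChunkState through Probe and ProbeAndSpill, so per-probe scratch lives in the operator state and is initialized once rather than rebuilt for every input chunk. A generic sketch of that pattern (hypothetical names, not DuckDB's API):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct ProbeState {
        std::vector<uint64_t> hashes;            // reused across calls
        std::vector<const void *> row_locations; // grows once, then stays
    };

    struct HashTable {
        std::size_t Probe(const std::vector<int64_t> &keys, ProbeState &state) {
            // resize is a no-op after the first full-size chunk
            state.hashes.resize(keys.size());
            state.row_locations.resize(keys.size());
            // ... hash the keys and collect candidate row locations ...
            return 0; // number of candidate matches (elided)
        }
    };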
package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp
@@ -254,7 +254,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
     }

     bool has_equality = false;
-    // bool has_inequality = false;
     size_t has_range = 0;
     for (size_t c = 0; c < op.conditions.size(); ++c) {
         auto &cond = op.conditions[c];
@@ -271,7 +270,6 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::PlanComparisonJoin(LogicalCo
             break;
         case ExpressionType::COMPARE_NOTEQUAL:
         case ExpressionType::COMPARE_DISTINCT_FROM:
-            // has_inequality = true;
             break;
         default:
             throw NotImplementedException("Unimplemented comparison join");
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
@@ -474,14 +474,9 @@ void RadixPartitionedHashTable::Finalize(ClientContext &, GlobalSinkState &gstat
 //===--------------------------------------------------------------------===//
 // Source
 //===--------------------------------------------------------------------===//
-idx_t RadixPartitionedHashTable::Count(GlobalSinkState &sink_p) const {
-    const auto count = CountInternal(sink_p);
-    return count == 0 && grouping_set.empty() ? 1 : count;
-}
-
-idx_t RadixPartitionedHashTable::CountInternal(GlobalSinkState &sink_p) const {
+idx_t RadixPartitionedHashTable::NumberOfPartitions(GlobalSinkState &sink_p) const {
     auto &sink = sink_p.Cast<RadixHTGlobalSinkState>();
-    return sink.count_before_combining;
+    return sink.partitions.size();
 }

 void RadixPartitionedHashTable::SetMultiScan(GlobalSinkState &sink_p) {
@@ -570,8 +565,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
     D_ASSERT(lstate.scan_status != RadixHTScanStatus::IN_PROGRESS);

     const auto n_partitions = sink.partitions.size();
-    if (scan_done == n_partitions) {
-        finished = true;
+    if (finished) {
         return false;
     }
     // We first try to assign a Scan task, then a Finalize task if that didn't work, without using any locks
@@ -595,6 +589,11 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
         return true;
     }

+    // We didn't assign a Scan task
+    if (sink.finalize_idx >= n_partitions) {
+        return false; // No finalize tasks left
+    }
+
     // We can just increment the atomic here, much simpler than assigning the scan task
     lstate.task_idx = sink.finalize_idx++;
     if (lstate.task_idx < n_partitions) {
@@ -603,7 +602,7 @@ bool RadixHTGlobalSourceState::AssignTask(RadixHTGlobalSinkState &sink, RadixHTL
         return true;
     }

-    // We didn't manage to assign a finalize task
+    // We didn't manage to assign a Finalize task
     return false;
 }

@@ -693,15 +692,18 @@ void RadixHTLocalSourceState::Scan(RadixHTGlobalSinkState &sink, RadixHTGlobalSo

     if (!data_collection.Scan(scan_state, scan_chunk)) {
         scan_status = RadixHTScanStatus::DONE;
-        if (++gstate.scan_done == sink.partitions.size()) {
-            gstate.finished = true;
-        }
         if (sink.scan_pin_properties == TupleDataPinProperties::DESTROY_AFTER_DONE) {
             data_collection.Reset();
         }
         return;
     }

+    if (data_collection.ScanComplete(scan_state)) {
+        if (++gstate.scan_done == sink.partitions.size()) {
+            gstate.finished = true;
+        }
+    }
+
     RowOperationsState row_state(aggregate_allocator);
     const auto group_cols = layout.ColumnCount() - 1;
     RowOperations::FinalizeStates(row_state, layout, scan_state.chunk_state.row_locations, scan_chunk, group_cols);
@@ -758,36 +760,38 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
         return SourceResultType::FINISHED;
     }

-    // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
-    if (CountInternal(sink_p) == 0 && grouping_set.empty()) {
-        D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
-        // For each column in the aggregates, set to initial state
-        chunk.SetCardinality(1);
-        for (auto null_group : null_groups) {
-            chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
-            ConstantVector::SetNull(chunk.data[null_group], true);
-        }
-        ArenaAllocator allocator(BufferAllocator::Get(context.client));
-        for (idx_t i = 0; i < op.aggregates.size(); i++) {
-            D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
-            auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
-            auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
-            aggr.function.initialize(aggr_state.get());
-
-            AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
-            Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
-            aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
-            if (aggr.function.destructor) {
-                aggr.function.destructor(state_vector, aggr_input_data, 1);
+    if (sink.count_before_combining == 0) {
+        if (grouping_set.empty()) {
+            // Special case hack to sort out aggregating from empty intermediates for aggregations without groups
+            D_ASSERT(chunk.ColumnCount() == null_groups.size() + op.aggregates.size() + op.grouping_functions.size());
+            // For each column in the aggregates, set to initial state
+            chunk.SetCardinality(1);
+            for (auto null_group : null_groups) {
+                chunk.data[null_group].SetVectorType(VectorType::CONSTANT_VECTOR);
+                ConstantVector::SetNull(chunk.data[null_group], true);
+            }
+            ArenaAllocator allocator(BufferAllocator::Get(context.client));
+            for (idx_t i = 0; i < op.aggregates.size(); i++) {
+                D_ASSERT(op.aggregates[i]->GetExpressionClass() == ExpressionClass::BOUND_AGGREGATE);
+                auto &aggr = op.aggregates[i]->Cast<BoundAggregateExpression>();
+                auto aggr_state = make_unsafe_uniq_array<data_t>(aggr.function.state_size());
+                aggr.function.initialize(aggr_state.get());
+
+                AggregateInputData aggr_input_data(aggr.bind_info.get(), allocator);
+                Vector state_vector(Value::POINTER(CastPointerToValue(aggr_state.get())));
+                aggr.function.finalize(state_vector, aggr_input_data, chunk.data[null_groups.size() + i], 1, 0);
+                if (aggr.function.destructor) {
+                    aggr.function.destructor(state_vector, aggr_input_data, 1);
+                }
+            }
+            // Place the grouping values (all the groups of the grouping_set condensed into a single value)
+            // Behind the null groups + aggregates
+            for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
+                chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
             }
-        }
-        // Place the grouping values (all the groups of the grouping_set condensed into a single value)
-        // Behind the null groups + aggregates
-        for (idx_t i = 0; i < op.grouping_functions.size(); i++) {
-            chunk.data[null_groups.size() + op.aggregates.size() + i].Reference(grouping_values[i]);
         }
         gstate.finished = true;
-        return SourceResultType::HAVE_MORE_OUTPUT;
+        return SourceResultType::FINISHED;
     }

     while (!gstate.finished && chunk.size() == 0) {
@@ -796,7 +800,11 @@ SourceResultType RadixPartitionedHashTable::GetData(ExecutionContext &context, D
         }
     }

-    return SourceResultType::HAVE_MORE_OUTPUT;
+    if (chunk.size() != 0) {
+        return SourceResultType::HAVE_MORE_OUTPUT;
+    } else {
+        return SourceResultType::FINISHED;
+    }
 }

 } // namespace duckdb
package/src/duckdb/src/execution/reservoir_sample.cpp
@@ -107,25 +107,19 @@ void ReservoirSamplePercentage::AddToReservoir(DataChunk &input) {
         if (append_to_next_sample > 0) {
             // we need to also add to the next sample
             DataChunk new_chunk;
-            new_chunk.Initialize(allocator, input.GetTypes());
-            SelectionVector sel(append_to_current_sample_count);
-            for (idx_t r = 0; r < append_to_current_sample_count; r++) {
-                sel.set_index(r, r);
-            }
-            new_chunk.Slice(sel, append_to_current_sample_count);
+            new_chunk.InitializeEmpty(input.GetTypes());
+            new_chunk.Slice(input, *FlatVector::IncrementalSelectionVector(), append_to_current_sample_count);
             new_chunk.Flatten();
-
             current_sample->AddToReservoir(new_chunk);
         } else {
             input.Flatten();
-
             input.SetCardinality(append_to_current_sample_count);
             current_sample->AddToReservoir(input);
         }
     }
     if (append_to_next_sample > 0) {
         // slice the input for the remainder
-        SelectionVector sel(STANDARD_VECTOR_SIZE);
+        SelectionVector sel(append_to_next_sample);
         for (idx_t i = 0; i < append_to_next_sample; i++) {
             sel.set_index(i, append_to_current_sample_count + i);
         }
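The rewrite appears to fix two things: the old path sliced the freshly initialized new_chunk instead of the input, and the remainder's SelectionVector was sized to STANDARD_VECTOR_SIZE rather than to the slice it addresses. The split itself as a standalone sketch (plain index vectors standing in for SelectionVector):

    #include <cstddef>
    #include <vector>

    struct Split {
        std::vector<std::size_t> current; // indices 0 .. current_count-1
        std::vector<std::size_t> next;    // indices current_count .. current_count+next_count-1
    };

    Split SplitRows(std::size_t current_count, std::size_t next_count) {
        Split s;
        s.current.reserve(current_count);
        for (std::size_t r = 0; r < current_count; r++) {
            s.current.push_back(r);
        }
        // sized exactly to the remainder, mirroring SelectionVector sel(append_to_next_sample)
        s.next.reserve(next_count);
        for (std::size_t i = 0; i < next_count; i++) {
            s.next.push_back(current_count + i);
        }
        return s;
    }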
package/src/duckdb/src/function/cast/vector_cast_helpers.cpp
@@ -20,10 +20,16 @@ inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) {
 static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) {
     char quote = buf[pos];
     pos++;
+    bool escaped = false;

     while (pos < len) {
-        if (buf[pos] == quote) {
-            return true;
+        if (buf[pos] == '\\') {
+            escaped = !escaped;
+        } else {
+            if (buf[pos] == quote && !escaped) {
+                return true;
+            }
+            escaped = false;
         }
         pos++;
     }
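The same escape handling as a self-contained program (simplified signature; the real function also takes the buffer length by reference): a backslash toggles the escaped flag, so an escaped quote or escaped backslash no longer terminates the quoted section early.

    #include <cassert>
    #include <cstddef>
    #include <string>

    static bool SkipToCloseQuotes(std::size_t &pos, const std::string &buf) {
        char quote = buf[pos++];
        bool escaped = false;
        while (pos < buf.size()) {
            if (buf[pos] == '\\') {
                escaped = !escaped; // two backslashes cancel out
            } else {
                if (buf[pos] == quote && !escaped) {
                    return true; // found the unescaped closing quote
                }
                escaped = false;
            }
            pos++;
        }
        return false; // unterminated quoted section
    }

    int main() {
        std::string s = R"("a\"b" rest)";
        std::size_t pos = 0;
        assert(SkipToCloseQuotes(pos, s) && pos == 5); // stops at the real quote, not the escaped one
        return 0;
    }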
package/src/duckdb/src/function/function_binder.cpp
@@ -1,16 +1,16 @@
 #include "duckdb/function/function_binder.hpp"
-#include "duckdb/common/limits.hpp"

-#include "duckdb/planner/expression/bound_cast_expression.hpp"
-#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
-#include "duckdb/planner/expression/bound_function_expression.hpp"
-#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/catalog/catalog.hpp"
 #include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
-
-#include "duckdb/planner/expression_binder.hpp"
+#include "duckdb/common/limits.hpp"
+#include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/function/aggregate_function.hpp"
 #include "duckdb/function/cast_rules.hpp"
-#include "duckdb/catalog/catalog.hpp"
+#include "duckdb/planner/expression/bound_aggregate_expression.hpp"
+#include "duckdb/planner/expression/bound_cast_expression.hpp"
+#include "duckdb/planner/expression/bound_constant_expression.hpp"
+#include "duckdb/planner/expression/bound_function_expression.hpp"
+#include "duckdb/planner/expression_binder.hpp"

 namespace duckdb {

@@ -268,7 +268,8 @@ unique_ptr<Expression> FunctionBinder::BindScalarFunction(ScalarFunctionCatalogE

     if (bound_function.null_handling == FunctionNullHandling::DEFAULT_NULL_HANDLING) {
         for (auto &child : children) {
-            if (child->return_type == LogicalTypeId::SQLNULL) {
+            if (child->return_type == LogicalTypeId::SQLNULL ||
+                (child->IsFoldable() && ExpressionExecutor::EvaluateScalar(context, *child).IsNull())) {
                 return make_uniq<BoundConstantExpression>(Value(LogicalType::SQLNULL));
             }
         }
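With this, a call under default NULL handling folds to a NULL constant when any argument is a foldable expression that evaluates to NULL, not only when it is statically typed as SQLNULL; that is presumably also why the like.cpp hunk below can drop its pattern IsNull early-return. A simplified standalone sketch (stand-in Expr type, not the binder's real classes):

    #include <memory>
    #include <optional>
    #include <vector>

    struct Expr {
        bool is_sql_null = false;        // statically typed as SQLNULL
        bool foldable = false;           // constant-foldable at bind time
        std::optional<int> folded_value; // nullopt models a NULL result
        std::optional<int> Evaluate() const {
            return folded_value;
        }
    };

    // Returns a NULL constant if the whole call folds away, nullptr otherwise
    std::unique_ptr<Expr> TryFoldToNull(const std::vector<Expr> &children) {
        for (const auto &child : children) {
            if (child.is_sql_null || (child.foldable && !child.Evaluate().has_value())) {
                auto null_const = std::make_unique<Expr>();
                null_const->is_sql_null = null_const->foldable = true;
                return null_const;
            }
        }
        return nullptr; // no fold; bind the function normally
    }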
package/src/duckdb/src/function/scalar/string/like.cpp
@@ -196,9 +196,6 @@ static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarF
     D_ASSERT(arguments.size() == 2 || arguments.size() == 3);
     if (arguments[1]->IsFoldable()) {
         Value pattern_str = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
-        if (pattern_str.IsNull()) {
-            return nullptr;
-        }
         return LikeMatcher::CreateLikeMatcher(pattern_str.ToString());
     }
     return nullptr;
package/src/duckdb/src/function/table/read_csv.cpp
@@ -107,11 +107,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
     // Initialize Buffer Manager and Sniffer
     auto file_handle = BaseCSVReader::OpenCSV(context, options);
     result->buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
-    CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache);
+    CSVSniffer sniffer(options, result->buffer_manager, result->state_machine_cache, explicitly_set_columns);
     auto sniffer_result = sniffer.SniffCSV();
-    return_types = sniffer_result.return_types;
     if (names.empty()) {
         names = sniffer_result.names;
+        return_types = sniffer_result.return_types;
     } else {
         if (explicitly_set_columns) {
             // The user has influenced the names, can't assume they are valid anymore
@@ -195,6 +195,7 @@ public:
     auto file_count = files_path_p.size();
     line_info.current_batches.resize(file_count);
     line_info.lines_read.resize(file_count);
+    line_info.lines_errored.resize(file_count);
     tuple_start.resize(file_count);
     tuple_end.resize(file_count);
     tuple_end_to_batch.resize(file_count);
@@ -509,6 +510,11 @@ bool LineInfo::CanItGetLine(idx_t file_idx, idx_t batch_idx) {
     return false;
 }

+void LineInfo::Increment(idx_t file_idx, idx_t batch_idx) {
+    auto parallel_lock = duckdb::make_uniq<lock_guard<mutex>>(main_mutex);
+    lines_errored[file_idx][batch_idx]++;
+}
+
 // Returns the 1-indexed line number
 idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t cur_start, bool verify,
                         bool stop_at_first) {
@@ -520,12 +526,11 @@ idx_t LineInfo::GetLine(idx_t batch_idx, idx_t line_error, idx_t file_idx, idx_t

     if (!stop_at_first) {
         // Figure out the amount of lines read in the current file
-        auto &file_batches = current_batches[file_idx];
-        for (auto &batch : file_batches) {
-            if (batch > batch_idx) {
-                break;
+        for (idx_t cur_batch_idx = 0; cur_batch_idx <= batch_idx; cur_batch_idx++) {
+            if (cur_batch_idx < batch_idx) {
+                line_count += lines_errored[file_idx][cur_batch_idx];
             }
-            line_count += lines_read[file_idx][batch];
+            line_count += lines_read[file_idx][cur_batch_idx];
         }
         return line_count + line_error + 1;
     }
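A standalone version of the computation above (an assumed simplification for a single file): lines read are summed per batch up to and including the error's batch, and lines that errored in earlier batches are now counted too, since they still occupied a line in the file.

    #include <cstddef>
    #include <vector>

    std::size_t GlobalLineNumber(const std::vector<std::size_t> &lines_read,
                                 const std::vector<std::size_t> &lines_errored,
                                 std::size_t batch_idx, std::size_t line_error) {
        std::size_t line_count = 0;
        for (std::size_t cur = 0; cur <= batch_idx; cur++) {
            if (cur < batch_idx) {
                line_count += lines_errored[cur]; // errored lines still count toward position
            }
            line_count += lines_read[cur];
        }
        return line_count + line_error + 1; // 1-indexed
    }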
@@ -880,8 +885,6 @@ static void ReadCSVAddNamedParameters(TableFunction &table_function) {
     table_function.named_parameters["header"] = LogicalType::BOOLEAN;
     table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
     table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
-    table_function.named_parameters["sample_chunk_size"] = LogicalType::BIGINT;
-    table_function.named_parameters["sample_chunks"] = LogicalType::BIGINT;
     table_function.named_parameters["all_varchar"] = LogicalType::BOOLEAN;
     table_function.named_parameters["dateformat"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
package/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev4653"
+#define DUCKDB_VERSION "0.8.2-dev4871"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "bb287d4b22"
+#define DUCKDB_SOURCE_ID "5a29c99891"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp
@@ -25,7 +25,6 @@ enum class DatePartSpecifier : uint8_t {
     SECOND,
     MINUTE,
     HOUR,
-    EPOCH,
     DOW,
     ISODOW,
     WEEK,
@@ -39,11 +38,20 @@ enum class DatePartSpecifier : uint8_t {
     TIMEZONE_MINUTE,

     // DOUBLE values
-    JULIAN_DAY
+    EPOCH,
+    JULIAN_DAY,
+
+    // Invalid
+    INVALID,
+
+    // Type ranges
+    BEGIN_BIGINT = YEAR,
+    BEGIN_DOUBLE = EPOCH,
+    BEGIN_INVALID = INVALID,
 };

 inline bool IsBigintDatepart(DatePartSpecifier part_code) {
-    return size_t(part_code) < size_t(DatePartSpecifier::JULIAN_DAY);
+    return size_t(part_code) < size_t(DatePartSpecifier::BEGIN_DOUBLE);
 }

 DUCKDB_API bool TryGetDatePartSpecifier(const string &specifier, DatePartSpecifier &result);
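The range-sentinel pattern the reordered enum enables, in isolation: members are grouped by result type and BEGIN_* aliases mark where each group starts, so classification is a single comparison instead of a per-member switch.

    #include <cstdint>

    enum class PartKind : uint8_t {
        YEAR, MONTH, DAY,  // ... BIGINT results
        EPOCH, JULIAN_DAY, // ... DOUBLE results
        INVALID,           // sentinel for unrecognized specifiers
        BEGIN_BIGINT = YEAR,
        BEGIN_DOUBLE = EPOCH,
        BEGIN_INVALID = INVALID,
    };

    inline bool IsBigintPart(PartKind p) {
        return static_cast<uint8_t>(p) < static_cast<uint8_t>(PartKind::BEGIN_DOUBLE);
    }

    inline bool IsDoublePart(PartKind p) {
        return static_cast<uint8_t>(p) >= static_cast<uint8_t>(PartKind::BEGIN_DOUBLE) &&
               static_cast<uint8_t>(p) < static_cast<uint8_t>(PartKind::BEGIN_INVALID);
    }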
package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp (new file)
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// duckdb/common/row_operations/row_matcher.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "duckdb/common/enums/expression_type.hpp"
+#include "duckdb/common/types.hpp"
+
+namespace duckdb {
+
+class Vector;
+class DataChunk;
+class TupleDataLayout;
+struct TupleDataVectorFormat;
+struct SelectionVector;
+struct MatchFunction;
+
+typedef idx_t (*match_function_t)(Vector &lhs_vector, const TupleDataVectorFormat &lhs_format, SelectionVector &sel,
+                                  const idx_t count, const TupleDataLayout &rhs_layout, Vector &rhs_row_locations,
+                                  const idx_t col_idx, const vector<MatchFunction> &child_functions,
+                                  SelectionVector *no_match_sel, idx_t &no_match_count);
+
+struct MatchFunction {
+    match_function_t function;
+    vector<MatchFunction> child_functions;
+};
+
+struct RowMatcher {
+public:
+    using Predicates = vector<ExpressionType>;
+
+    //! Initializes the RowMatcher, filling match_functions using layout and predicates
+    void Initialize(const bool no_match_sel, const TupleDataLayout &layout, const Predicates &predicates);
+    //! Given a DataChunk on the LHS, on which we've called TupleDataCollection::ToUnifiedFormat,
+    //! we match it with rows on the RHS, according to the given layout and locations.
+    //! Initially, 'sel' has 'count' entries which point to what needs to be compared.
+    //! After matching is done, this returns how many matching entries there are, which 'sel' is modified to point to
+    idx_t Match(DataChunk &lhs, const vector<TupleDataVectorFormat> &lhs_formats, SelectionVector &sel, idx_t count,
+                const TupleDataLayout &rhs_layout, Vector &rhs_row_locations, SelectionVector *no_match_sel,
+                idx_t &no_match_count);
+
+private:
+    //! Gets the templated match function for a given column
+    MatchFunction GetMatchFunction(const bool no_match_sel, const LogicalType &type, const ExpressionType predicate);
+    template <bool NO_MATCH_SEL>
+    MatchFunction GetMatchFunction(const LogicalType &type, const ExpressionType predicate);
+    template <bool NO_MATCH_SEL, class T>
+    MatchFunction GetMatchFunction(const ExpressionType predicate);
+    template <bool NO_MATCH_SEL>
+    MatchFunction GetStructMatchFunction(const LogicalType &type, const ExpressionType predicate);
+    template <bool NO_MATCH_SEL>
+    MatchFunction GetListMatchFunction(const ExpressionType predicate);
+
+private:
+    vector<MatchFunction> match_functions;
+};
+
+} // namespace duckdb
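This header appears to replace the monolithic row_match.cpp removed at the end of the file list with per-column match functions resolved once from the layout and predicates. The dispatch shape, as a simplified sketch with stand-in types (not the real signatures):

    #include <cstddef>
    #include <vector>

    struct Column {}; // stand-in for a probe-side vector in unified format
    struct Rows {};   // stand-in for row-layout (RHS) data

    using match_fn = std::size_t (*)(const Column &lhs, const Rows &rhs,
                                     std::vector<std::size_t> &sel, std::size_t count);

    struct Matcher {
        std::vector<match_fn> fns; // one resolved function per column

        std::size_t Match(const std::vector<Column> &lhs, const Rows &rhs,
                          std::vector<std::size_t> &sel, std::size_t count) const {
            // each column function keeps only still-matching row indices in 'sel';
            // the surviving count feeds the next column's comparison
            for (std::size_t col = 0; col < fns.size() && count > 0; col++) {
                count = fns[col](lhs[col], rhs, sel, count);
            }
            return count;
        }
    };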
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp
@@ -21,7 +21,7 @@ struct RowOperationsState;

 typedef void (*tuple_data_scatter_function_t)(const Vector &source, const TupleDataVectorFormat &source_format,
                                               const SelectionVector &append_sel, const idx_t append_count,
-                                              const TupleDataLayout &layout, Vector &row_locations,
+                                              const TupleDataLayout &layout, const Vector &row_locations,
                                               Vector &heap_locations, const idx_t col_idx,
                                               const UnifiedVectorFormat &list_format,
                                               const vector<TupleDataScatterFunction> &child_functions);
@@ -84,7 +84,11 @@ public:
                           TupleDataPinProperties = TupleDataPinProperties::UNPIN_AFTER_DONE);
     //! Initializes the Chunk state of an Append state
     //! - Useful for optimizing many appends made to the same tuple data collection
-    void InitializeAppend(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+    void InitializeChunkState(TupleDataChunkState &chunk_state, vector<column_t> column_ids = {});
+    //! Initializes the Chunk state of an Append state
+    //! - Useful for optimizing many appends made to the same tuple data collection
+    static void InitializeChunkState(TupleDataChunkState &chunk_state, const vector<LogicalType> &types,
+                                     vector<column_t> column_ids = {});
     //! Append a DataChunk directly to this TupleDataCollection - calls InitializeAppend and Append internally
     void Append(DataChunk &new_chunk, const SelectionVector &append_sel = *FlatVector::IncrementalSelectionVector(),
                 idx_t append_count = DConstants::INVALID_INDEX);
@@ -159,6 +163,8 @@ public:
     bool Scan(TupleDataScanState &state, DataChunk &result);
     //! Scans a DataChunk from the TupleDataCollection
     bool Scan(TupleDataParallelScanState &gstate, TupleDataLocalScanState &lstate, DataChunk &result);
+    //! Whether the last scan has been completed on this TupleDataCollection
+    bool ScanComplete(const TupleDataScanState &state) const;

     //! Gathers a DataChunk from the TupleDataCollection, given the specific row locations (requires full pin)
     void Gather(Vector &row_locations, const SelectionVector &scan_sel, const idx_t scan_count, DataChunk &result,
package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp
@@ -42,8 +42,8 @@ struct TupleDataVectorFormat {
     const SelectionVector *original_sel;
     SelectionVector original_owned_sel;

-    UnifiedVectorFormat data;
-    vector<TupleDataVectorFormat> child_formats;
+    UnifiedVectorFormat unified;
+    vector<TupleDataVectorFormat> children;
     unique_ptr<CombinedListData> combined_list_data;
 };

package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp
@@ -148,6 +148,9 @@ public:
         if (!validity_mask) {
             return ValidityBuffer::MAX_ENTRY;
         }
+        return GetValidityEntryUnsafe(entry_idx);
+    }
+    inline V &GetValidityEntryUnsafe(idx_t entry_idx) const {
         return validity_mask[entry_idx];
     }
     static inline bool AllValid(V entry) {
@@ -156,7 +159,7 @@ public:
     static inline bool NoneValid(V entry) {
         return entry == 0;
     }
-    static inline bool RowIsValid(V entry, idx_t idx_in_entry) {
+    static inline bool RowIsValid(const V &entry, const idx_t &idx_in_entry) {
         return entry & (V(1) << V(idx_in_entry));
     }
     static inline void GetEntryIndex(idx_t row_idx, idx_t &entry_idx, idx_t &idx_in_entry) {
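For orientation, the bit layout RowIsValid tests, in isolation (a standalone simplification): a validity mask packs one bit per row into 64-bit entries, so row i lives at bit (i % 64) of entry (i / 64).

    #include <cassert>
    #include <cstdint>

    static inline bool RowIsValid(uint64_t entry, uint64_t idx_in_entry) {
        return entry & (uint64_t(1) << idx_in_entry);
    }

    int main() {
        uint64_t entry = ~uint64_t(0) & ~(uint64_t(1) << 3); // all rows valid except row 3
        assert(RowIsValid(entry, 2));
        assert(!RowIsValid(entry, 3));
        return 0;
    }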
package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp
@@ -285,7 +285,7 @@ struct RepeatFun {
     static constexpr const char *Description = "Repeats the string count number of times";
     static constexpr const char *Example = "repeat('A', 5)";

-    static ScalarFunction GetFunction();
+    static ScalarFunctionSet GetFunctions();
 };

 struct ReplaceFun {
package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp
@@ -8,6 +8,7 @@

 #pragma once

+#include "duckdb/common/row_operations/row_matcher.hpp"
 #include "duckdb/common/types/row/partitioned_tuple_data.hpp"
 #include "duckdb/execution/base_aggregate_hashtable.hpp"
 #include "duckdb/storage/arena_allocator.hpp"
@@ -143,6 +144,9 @@ public:
     void UnpinData();

 private:
+    //! Efficiently matches groups
+    RowMatcher row_matcher;
+
     //! Append state
     struct AggregateHTAppendState {
         AggregateHTAppendState();