duckdb 0.8.2-dev4653.0 → 0.8.2-dev4871.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/binding.gyp +0 -1
  2. package/binding.gyp.in +0 -1
  3. package/package.json +1 -1
  4. package/src/connection.cpp +10 -23
  5. package/src/data_chunk.cpp +1 -3
  6. package/src/database.cpp +4 -9
  7. package/src/duckdb/extension/icu/icu-datepart.cpp +12 -8
  8. package/src/duckdb/extension/json/json_functions/json_transform.cpp +8 -6
  9. package/src/duckdb/extension/json/json_functions.cpp +4 -6
  10. package/src/duckdb/src/common/enum_util.cpp +10 -5
  11. package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
  12. package/src/duckdb/src/common/row_operations/row_matcher.cpp +408 -0
  13. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +3 -3
  14. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +35 -17
  15. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +44 -43
  16. package/src/duckdb/src/common/vector_operations/vector_hash.cpp +1 -0
  17. package/src/duckdb/src/core_functions/function_list.cpp +1 -1
  18. package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +86 -50
  19. package/src/duckdb/src/core_functions/scalar/generic/hash.cpp +3 -0
  20. package/src/duckdb/src/core_functions/scalar/string/repeat.cpp +8 -5
  21. package/src/duckdb/src/execution/aggregate_hashtable.cpp +5 -4
  22. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +13 -0
  23. package/src/duckdb/src/execution/join_hashtable.cpp +71 -59
  24. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
  25. package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
  26. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
  27. package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
  28. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
  29. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
  30. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
  31. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
  32. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
  33. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
  34. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +9 -4
  35. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -2
  36. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
  37. package/src/duckdb/src/execution/reservoir_sample.cpp +3 -9
  38. package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +8 -2
  39. package/src/duckdb/src/function/function_binder.cpp +10 -9
  40. package/src/duckdb/src/function/scalar/string/like.cpp +0 -3
  41. package/src/duckdb/src/function/table/read_csv.cpp +12 -9
  42. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  43. package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp +11 -3
  44. package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp +63 -0
  45. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +8 -2
  46. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +2 -2
  47. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
  48. package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +1 -1
  49. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +4 -0
  50. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +14 -8
  51. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
  52. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
  53. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
  54. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
  55. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
  56. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
  57. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
  58. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
  59. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
  60. package/src/duckdb/src/include/duckdb/main/relation.hpp +4 -0
  61. package/src/duckdb/src/main/config.cpp +1 -1
  62. package/src/duckdb/src/main/query_result.cpp +16 -10
  63. package/src/duckdb/src/main/relation.cpp +10 -0
  64. package/src/duckdb/src/optimizer/rule/date_part_simplification.cpp +0 -3
  65. package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +12 -4
  66. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -3
  67. package/src/duckdb/src/storage/data_table.cpp +10 -0
  68. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
  69. package/src/duckdb/ub_src_common_row_operations.cpp +1 -1
  70. package/src/statement.cpp +2 -4
  71. package/test/database_fail.test.ts +6 -0
  72. package/src/duckdb/src/common/row_operations/row_match.cpp +0 -359
@@ -45,6 +45,7 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
  // Append hash column to the end and initialise the row layout
  group_types_p.emplace_back(LogicalType::HASH);
  layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
+
  hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];

  // Partitioned data and pointer table
@@ -52,7 +53,8 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
  Resize(initial_capacity);

  // Predicates
- predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_EQUAL);
+ predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_NOT_DISTINCT_FROM);
+ row_matcher.Initialize(true, layout, predicates);
  }

  void GroupedAggregateHashTable::InitializePartitionedData() {
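
Note on the predicate change above: COMPARE_NOT_DISTINCT_FROM carries SQL's IS NOT DISTINCT FROM semantics, under which two NULLs compare as equal, which is what grouping needs so that all NULL keys land in a single group. A minimal standalone sketch of the distinction, using std::optional to stand in for a nullable value (not DuckDB's actual comparator):

    #include <cassert>
    #include <optional>

    // SQL '=' semantics: any NULL operand means no match.
    static bool CompareEqual(const std::optional<int> &a, const std::optional<int> &b) {
        if (!a.has_value() || !b.has_value()) {
            return false;
        }
        return *a == *b;
    }

    // SQL 'IS NOT DISTINCT FROM' semantics: two NULLs match each other.
    static bool CompareNotDistinctFrom(const std::optional<int> &a, const std::optional<int> &b) {
        if (a.has_value() != b.has_value()) {
            return false;
        }
        if (!a.has_value()) {
            return true; // both NULL
        }
        return *a == *b;
    }

    int main() {
        std::optional<int> null_value; // an empty optional models SQL NULL
        assert(!CompareEqual(null_value, null_value));          // '=' never matches NULLs
        assert(CompareNotDistinctFrom(null_value, null_value)); // NULL keys form one group
        return 0;
    }
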
@@ -414,9 +416,8 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
  }

  // Perform group comparisons
- RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses_v, predicates,
- state.group_compare_vector, need_compare_count, &state.no_match_vector,
- no_match_count);
+ row_matcher.Match(state.group_chunk, chunk_state.vector_data, state.group_compare_vector,
+ need_compare_count, layout, addresses_v, &state.no_match_vector, no_match_count);
  }

  // Linear probing: each of the entries that do not match move to the next entry in the HT
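
Note: the linear-probing comment above is the collision strategy of the aggregate hash table: entries whose keys do not match the probed slot advance to the next slot, wrapping around a power-of-two table. A toy open-addressing insert showing just that step (not DuckDB's actual table layout):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        const uint64_t capacity = 8; // power of two, so (slot & (capacity - 1)) wraps cheaply
        std::vector<int64_t> table(capacity, -1); // -1 marks an empty slot

        auto insert = [&](int64_t key) {
            uint64_t slot = static_cast<uint64_t>(key) & (capacity - 1);
            while (table[slot] != -1 && table[slot] != key) {
                slot = (slot + 1) & (capacity - 1); // linear probing: move to the next entry
            }
            table[slot] = key;
            return slot;
        };

        insert(1);
        std::cout << "slot of 9: " << insert(9) << '\n'; // 9 & 7 == 1 collides, so 9 lands in slot 2
        return 0;
    }
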
@@ -173,6 +173,19 @@ bool FixedSizeAllocator::InitializeVacuum() {
  return false;
  }

+ // remove all empty buffers
+ auto buffer_it = buffers.begin();
+ while (buffer_it != buffers.end()) {
+ if (!buffer_it->second.segment_count) {
+ buffers_with_free_space.erase(buffer_it->first);
+ buffer_it->second.Destroy();
+ buffer_it = buffers.erase(buffer_it);
+ } else {
+ buffer_it++;
+ }
+ }
+
+ // determine if a vacuum is necessary
  multimap<idx_t, idx_t> temporary_vacuum_buffers;
  D_ASSERT(vacuum_buffers.empty());
  idx_t available_segments_in_memory = 0;
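
Note: the empty-buffer sweep added above uses the standard erase-while-iterating idiom for associative containers, where erase() invalidates only the erased iterator and returns the next valid one. The same idiom on a plain std::map (the buffer bookkeeping here is a stand-in for DuckDB's types):

    #include <iostream>
    #include <map>

    int main() {
        // Keys are buffer ids, values are segment counts; drop every empty entry.
        std::map<int, int> buffers = {{0, 3}, {1, 0}, {2, 5}, {3, 0}};

        auto it = buffers.begin();
        while (it != buffers.end()) {
            if (it->second == 0) {
                it = buffers.erase(it); // erase() hands back the next valid iterator
            } else {
                ++it;
            }
        }

        for (const auto &entry : buffers) {
            std::cout << entry.first << " -> " << entry.second << '\n'; // 0 -> 3, 2 -> 5
        }
        return 0;
    }
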
@@ -19,15 +19,15 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
  : buffer_manager(buffer_manager_p), conditions(conditions_p), build_types(std::move(btypes)), entry_size(0),
  tuple_size(0), vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
  external(false), radix_bits(4), partition_start(0), partition_end(0) {
+
  for (auto &condition : conditions) {
  D_ASSERT(condition.left->return_type == condition.right->return_type);
  auto type = condition.left->return_type;
  if (condition.comparison == ExpressionType::COMPARE_EQUAL ||
- condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM ||
- condition.comparison == ExpressionType::COMPARE_DISTINCT_FROM) {
- // all equality conditions should be at the front
- // all other conditions at the back
- // this assert checks that
+ condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) {
+
+ // ensure that all equality conditions are at the front,
+ // and that all other conditions are at the back
  D_ASSERT(equality_types.size() == condition_types.size());
  equality_types.push_back(type);
  }
@@ -51,6 +51,8 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
  }
  layout_types.emplace_back(LogicalType::HASH);
  layout.Initialize(layout_types, false);
+ row_matcher.Initialize(false, layout, predicates);
+ row_matcher_no_match_sel.Initialize(true, layout, predicates);

  const auto &offsets = layout.GetOffsets();
  tuple_size = offsets[condition_types.size() + build_types.size()];
@@ -142,30 +144,6 @@ static idx_t FilterNullValues(UnifiedVectorFormat &vdata, const SelectionVector
  return result_count;
  }

- idx_t JoinHashTable::PrepareKeys(DataChunk &keys, unsafe_unique_array<UnifiedVectorFormat> &key_data,
- const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
- key_data = keys.ToUnifiedFormat();
-
- // figure out which keys are NULL, and create a selection vector out of them
- current_sel = FlatVector::IncrementalSelectionVector();
- idx_t added_count = keys.size();
- if (build_side && IsRightOuterJoin(join_type)) {
- // in case of a right or full outer join, we cannot remove NULL keys from the build side
- return added_count;
- }
- for (idx_t i = 0; i < keys.ColumnCount(); i++) {
- if (!null_values_are_equal[i]) {
- if (key_data[i].validity.AllValid()) {
- continue;
- }
- added_count = FilterNullValues(key_data[i], *current_sel, added_count, sel);
- // null values are NOT equal for this column, filter them out
- current_sel = &sel;
- }
- }
- return added_count;
- }
-
  void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChunk &keys, DataChunk &payload) {
  D_ASSERT(!finalized);
  D_ASSERT(keys.size() == payload.size());
@@ -194,23 +172,6 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
  info.correlated_counts->AddChunk(info.group_chunk, info.correlated_payload, AggregateType::NON_DISTINCT);
  }

- // prepare the keys for processing
- unsafe_unique_array<UnifiedVectorFormat> key_data;
- const SelectionVector *current_sel;
- SelectionVector sel(STANDARD_VECTOR_SIZE);
- idx_t added_count = PrepareKeys(keys, key_data, current_sel, sel, true);
- if (added_count < keys.size()) {
- has_null = true;
- }
- if (added_count == 0) {
- return;
- }
-
- // hash the keys and obtain an entry in the list
- // note that we only hash the keys used in the equality comparison
- Vector hash_values(LogicalType::HASH);
- Hash(keys, *current_sel, added_count, hash_values);
-
  // build a chunk to append to the data collection [keys, payload, (optional "found" boolean), hash]
  DataChunk source_chunk;
  source_chunk.InitializeEmpty(layout.GetTypes());
@@ -228,13 +189,58 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
  source_chunk.data[col_offset].Reference(vfound);
  col_offset++;
  }
+ Vector hash_values(LogicalType::HASH);
  source_chunk.data[col_offset].Reference(hash_values);
  source_chunk.SetCardinality(keys);

+ // ToUnifiedFormat the source chunk
+ TupleDataCollection::ToUnifiedFormat(append_state.chunk_state, source_chunk);
+
+ // prepare the keys for processing
+ const SelectionVector *current_sel;
+ SelectionVector sel(STANDARD_VECTOR_SIZE);
+ idx_t added_count = PrepareKeys(keys, append_state.chunk_state.vector_data, current_sel, sel, true);
  if (added_count < keys.size()) {
- source_chunk.Slice(*current_sel, added_count);
+ has_null = true;
+ }
+ if (added_count == 0) {
+ return;
  }
- sink_collection->Append(append_state, source_chunk);
+
+ // hash the keys and obtain an entry in the list
+ // note that we only hash the keys used in the equality comparison
+ Hash(keys, *current_sel, added_count, hash_values);
+
+ // Re-reference and ToUnifiedFormat the hash column after computing it
+ source_chunk.data[col_offset].Reference(hash_values);
+ hash_values.ToUnifiedFormat(source_chunk.size(), append_state.chunk_state.vector_data.back().unified);
+
+ // We already called TupleDataCollection::ToUnifiedFormat, so we can AppendUnified here
+ sink_collection->AppendUnified(append_state, source_chunk, *current_sel, added_count);
+ }
+
+ idx_t JoinHashTable::PrepareKeys(DataChunk &keys, vector<TupleDataVectorFormat> &vector_data,
+ const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
+ // figure out which keys are NULL, and create a selection vector out of them
+ current_sel = FlatVector::IncrementalSelectionVector();
+ idx_t added_count = keys.size();
+ if (build_side && IsRightOuterJoin(join_type)) {
+ // in case of a right or full outer join, we cannot remove NULL keys from the build side
+ return added_count;
+ }
+
+ for (idx_t col_idx = 0; col_idx < keys.ColumnCount(); col_idx++) {
+ if (!null_values_are_equal[col_idx]) {
+ auto &col_key_data = vector_data[col_idx].unified;
+ if (col_key_data.validity.AllValid()) {
+ continue;
+ }
+ added_count = FilterNullValues(col_key_data, *current_sel, added_count, sel);
+ // null values are NOT equal for this column, filter them out
+ current_sel = &sel;
+ }
+ }
+ return added_count;
  }

  template <bool PARALLEL>
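
Note: the rewritten Build above converts the source chunk to unified format once, filters NULL keys into a selection vector, and appends through that selection (AppendUnified) instead of slicing the chunk first. A toy version of reading through a selection vector rather than materializing a filtered copy (plain std::vector, not DuckDB's Vector):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        std::vector<int> keys = {7, -1, 3, -1, 9}; // -1 stands in for a NULL key
        std::vector<size_t> sel;                   // selection vector over 'keys'

        for (size_t i = 0; i < keys.size(); i++) {
            if (keys[i] != -1) {
                sel.push_back(i); // keep only the non-NULL rows
            }
        }

        // Downstream code reads through the selection instead of a sliced copy.
        for (size_t out = 0; out < sel.size(); out++) {
            std::cout << keys[sel[out]] << '\n'; // prints 7, 3, 9
        }
        return 0;
    }
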
@@ -322,12 +328,13 @@ void JoinHashTable::Finalize(idx_t chunk_idx_from, idx_t chunk_idx_to, bool para
  } while (iterator.Next());
  }

- unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, const SelectionVector *&current_sel) {
+ unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, TupleDataChunkState &key_state,
+ const SelectionVector *&current_sel) {
  D_ASSERT(Count() > 0); // should be handled before
  D_ASSERT(finalized);

  // set up the scan structure
- auto ss = make_uniq<ScanStructure>(*this);
+ auto ss = make_uniq<ScanStructure>(*this, key_state);

  if (join_type != JoinType::INNER) {
  ss->found_match = make_unsafe_uniq_array<bool>(STANDARD_VECTOR_SIZE);
@@ -335,13 +342,15 @@ unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys
  }

  // first prepare the keys for probing
- ss->count = PrepareKeys(keys, ss->key_data, current_sel, ss->sel_vector, false);
+ TupleDataCollection::ToUnifiedFormat(key_state, keys);
+ ss->count = PrepareKeys(keys, key_state.vector_data, current_sel, ss->sel_vector, false);
  return ss;
  }

- unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precomputed_hashes) {
+ unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, TupleDataChunkState &key_state,
+ Vector *precomputed_hashes) {
  const SelectionVector *current_sel;
- auto ss = InitializeScanStructure(keys, current_sel);
+ auto ss = InitializeScanStructure(keys, key_state, current_sel);
  if (ss->count == 0) {
  return ss;
  }
@@ -363,8 +372,9 @@ unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precompu
  return ss;
  }

- ScanStructure::ScanStructure(JoinHashTable &ht)
- : pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht), finished(false) {
+ ScanStructure::ScanStructure(JoinHashTable &ht_p, TupleDataChunkState &key_state_p)
+ : key_state(key_state_p), pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht_p),
+ finished(false) {
  }

  void ScanStructure::Next(DataChunk &keys, DataChunk &left, DataChunk &result) {
@@ -404,8 +414,9 @@ idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_s
  }
  idx_t no_match_count = 0;

- return RowOperations::Match(keys, key_data.get(), ht.layout, pointers, ht.predicates, match_sel, this->count,
- no_match_sel, no_match_count);
+ auto &matcher = no_match_sel ? ht.row_matcher_no_match_sel : ht.row_matcher;
+ return matcher.Match(keys, key_state.vector_data, match_sel, this->count, ht.layout, pointers, no_match_sel,
+ no_match_count);
  }

  idx_t ScanStructure::ScanInnerJoin(DataChunk &keys, SelectionVector &result_vector) {
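
Note: ResolvePredicates above now just selects between two matchers that were initialized once in the JoinHashTable constructor, one of which additionally records a no-match selection vector. A sketch of that idea, branching once per call to dispatch to a precompiled specialization (the matcher body is invented for illustration):

    #include <iostream>

    // Two specializations are compiled up front; a flag picks one per call,
    // keeping the "track no-matches?" branch out of the per-row loop.
    template <bool TRACK_NO_MATCH>
    static int CountMatches(const int *rows, int count, int key, int *no_match, int &no_match_count) {
        int matches = 0;
        for (int i = 0; i < count; i++) {
            if (rows[i] == key) {
                matches++;
            } else if (TRACK_NO_MATCH) {
                no_match[no_match_count++] = i; // record the non-matching row
            }
        }
        return matches;
    }

    int main() {
        int rows[] = {1, 2, 1, 3};
        int no_match[4];
        int no_match_count = 0;

        bool want_no_match = true; // e.g. outer joins need the non-matching rows too
        int matches = want_no_match ? CountMatches<true>(rows, 4, 1, no_match, no_match_count)
                                    : CountMatches<false>(rows, 4, 1, no_match, no_match_count);
        std::cout << matches << " matches, " << no_match_count << " non-matches\n";
        return 0;
    }
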
@@ -990,7 +1001,8 @@ static void CreateSpillChunk(DataChunk &spill_chunk, DataChunk &keys, DataChunk
  spill_chunk.data[spill_col_idx].Reference(hashes);
  }

- unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChunk &payload, ProbeSpill &probe_spill,
+ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, TupleDataChunkState &key_state,
+ DataChunk &payload, ProbeSpill &probe_spill,
  ProbeSpillLocalAppendState &spill_state,
  DataChunk &spill_chunk) {
  // hash all the keys
@@ -1019,7 +1031,7 @@ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChun
  payload.Slice(true_sel, true_count);

  const SelectionVector *current_sel;
- auto ss = InitializeScanStructure(keys, current_sel);
+ auto ss = InitializeScanStructure(keys, key_state, current_sel);
  if (ss->count == 0) {
  return ss;
  }
@@ -782,13 +782,13 @@ public:
  }

  auto &ht_state = op.sink_state->Cast<HashAggregateGlobalSinkState>();
- idx_t count = 0;
+ idx_t partitions = 0;
  for (size_t sidx = 0; sidx < op.groupings.size(); ++sidx) {
  auto &grouping = op.groupings[sidx];
  auto &grouping_gstate = ht_state.grouping_states[sidx];
- count += grouping.table_data.Count(*grouping_gstate.table_state);
+ partitions += grouping.table_data.NumberOfPartitions(*grouping_gstate.table_state);
  }
- return MaxValue<idx_t>(1, count / STANDARD_VECTOR_SIZE);
+ return MaxValue<idx_t>(1, partitions);
  }
  };

@@ -263,7 +263,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
  return true;
  }

- if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
+ if (mode == ParserMode::SNIFFING_DATATYPES) {
  return true;
  }

@@ -480,6 +480,10 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad

  bool was_already_null = FlatVector::IsNull(parse_vector, row_idx);
  if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) {
+ Increment(buffer_idx);
+ auto bla = GetLineError(global_row_idx, buffer_idx, false);
+ row_idx += bla;
+ row_idx -= bla;
  row_failed = true;
  failed_cells.emplace_back(row_idx, col_idx, row_line);
  }
@@ -8,10 +8,14 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
  : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
  AllocateBuffer(buffer_size_p);
  auto buffer = Ptr();
- file_size = file_handle.Read(buffer, buffer_size_p);
+ actual_buffer_size = file_handle.Read(buffer, buffer_size_p);
+ while (actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) {
+ // We keep reading until this block is full
+ actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size_p - actual_buffer_size);
+ }
  global_csv_start = global_csv_current_position;
  // BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
- if (file_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
+ if (actual_buffer_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
  start_position += 3;
  }
  last_buffer = file_handle.FinishedReading();
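
Note on the BOM check above: a UTF-8 byte order mark is the three-byte sequence 0xEF 0xBB 0xBF at the very start of a file, and CSV readers typically skip it rather than parse it as data. A self-contained check of the same shape:

    #include <cassert>
    #include <cstddef>

    // Returns how many leading bytes to skip if the buffer starts with a UTF-8 BOM.
    static size_t Utf8BomLength(const unsigned char *buffer, size_t size) {
        if (size >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
            return 3;
        }
        return 0;
    }

    int main() {
        const unsigned char with_bom[] = {0xEF, 0xBB, 0xBF, 'a'};
        const unsigned char without_bom[] = {'a', 'b', 'c'};
        assert(Utf8BomLength(with_bom, sizeof(with_bom)) == 3);
        assert(Utf8BomLength(without_bom, sizeof(without_bom)) == 0);
        return 0;
    }
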
@@ -22,13 +26,18 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
  : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
  can_seek(file_handle.CanSeek()) {
  AllocateBuffer(buffer_size);
- file_size = file_handle.Read(handle.Ptr(), buffer_size);
+ auto buffer = handle.Ptr();
+ actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size);
+ while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) {
+ // We keep reading until this block is full
+ actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size - actual_buffer_size);
+ }
  last_buffer = file_handle.FinishedReading();
  }

  shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
  auto next_csv_buffer =
- make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + file_size, file_number_p);
+ make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p);
  if (next_csv_buffer->GetBufferSize() == 0) {
  // We are done reading
  return nullptr;
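
Note: both CSVBuffer constructors now loop until the buffer is full or the file is exhausted, because a single Read call may legally return fewer bytes than requested (a short read). The same pattern with plain stdio, using a hypothetical input file name:

    #include <cstdio>

    // Read repeatedly until 'buffer' is full or the stream ends; a single
    // fread() call may return fewer bytes than requested.
    static size_t ReadFully(FILE *file, char *buffer, size_t buffer_size) {
        size_t total = 0;
        while (total < buffer_size && !feof(file) && !ferror(file)) {
            total += fread(buffer + total, 1, buffer_size - total, file);
        }
        return total; // the "actual buffer size": smaller than buffer_size only at EOF
    }

    int main() {
        FILE *file = fopen("data.csv", "rb"); // hypothetical input file
        if (!file) {
            return 1;
        }
        char buffer[4096];
        size_t actual_buffer_size = ReadFully(file, buffer, sizeof(buffer));
        printf("read %zu bytes\n", actual_buffer_size);
        fclose(file);
        return 0;
    }
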
@@ -43,13 +52,13 @@ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
  }

  idx_t CSVBuffer::GetBufferSize() {
- return file_size;
+ return actual_buffer_size;
  }

  void CSVBuffer::Reload(CSVFileHandle &file_handle) {
- AllocateBuffer(file_size);
+ AllocateBuffer(actual_buffer_size);
  file_handle.Seek(global_csv_start);
- file_handle.Read(handle.Ptr(), file_size);
+ file_handle.Read(handle.Ptr(), actual_buffer_size);
  }

  unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
@@ -59,8 +68,8 @@ unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
  block = nullptr;
  Reload(file_handle);
  }
- return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), file_size, first_buffer, last_buffer, global_csv_start,
- start_position, file_number);
+ return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), actual_buffer_size, first_buffer, last_buffer,
+ global_csv_start, start_position, file_number);
  }

  void CSVBuffer::Unpin() {
@@ -168,38 +168,24 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
  if (loption == "auto_detect") {
  auto_detect = ParseBoolean(value, loption);
  } else if (loption == "sample_size") {
- int64_t sample_size = ParseInteger(value, loption);
- if (sample_size < 1 && sample_size != -1) {
+ int64_t sample_size_option = ParseInteger(value, loption);
+ if (sample_size_option < 1 && sample_size_option != -1) {
  throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
  }
- if (sample_size == -1) {
- sample_chunks = std::numeric_limits<uint64_t>::max();
- sample_chunk_size = STANDARD_VECTOR_SIZE;
- } else if (sample_size <= STANDARD_VECTOR_SIZE) {
- sample_chunk_size = sample_size;
- sample_chunks = 1;
+ if (sample_size_option == -1) {
+ // If -1, we basically read the whole thing
+ sample_size_chunks = NumericLimits<idx_t>().Maximum();
  } else {
- sample_chunk_size = STANDARD_VECTOR_SIZE;
- sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
+ sample_size_chunks = sample_size_option / STANDARD_VECTOR_SIZE;
+ if (sample_size_option % STANDARD_VECTOR_SIZE != 0) {
+ sample_size_chunks++;
+ }
  }
+
  } else if (loption == "skip") {
  SetSkipRows(ParseInteger(value, loption));
  } else if (loption == "max_line_size" || loption == "maximum_line_size") {
  maximum_line_size = ParseInteger(value, loption);
- } else if (loption == "sample_chunk_size") {
- sample_chunk_size = ParseInteger(value, loption);
- if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
- throw BinderException(
- "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
- STANDARD_VECTOR_SIZE);
- } else if (sample_chunk_size < 1) {
- throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
- }
- } else if (loption == "sample_chunks") {
- sample_chunks = ParseInteger(value, loption);
- if (sample_chunks < 1) {
- throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
- }
  } else if (loption == "force_not_null") {
  force_not_null = ParseColumnList(value, expected_names, loption);
  } else if (loption == "date_format" || loption == "dateformat") {
@@ -322,7 +308,7 @@ string CSVReaderOptions::ToString() const {
  (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
  "\n header=" + std::to_string(dialect_options.header) +
  (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
- "\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
+ "\n sample_size=" + std::to_string(sample_size_chunks * STANDARD_VECTOR_SIZE) +
  "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
  }

@@ -489,8 +475,6 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
  if (skip_rows_set) {
  named_params["skip"] = Value::BIGINT(GetSkipRows());
  }
- named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
- named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
  named_params["null_padding"] = Value::BOOLEAN(null_padding);
  if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
  named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
@@ -29,8 +29,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
  InitializeTransitionArray(transition_array[i], quoted_state);
  break;
  case unquoted_state:
- InitializeTransitionArray(transition_array[i], invalid_state);
- break;
+ case invalid_state:
  case escape_state:
  InitializeTransitionArray(transition_array[i], invalid_state);
  break;
@@ -647,6 +647,10 @@ idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool s
  }
  }

+ void ParallelCSVReader::Increment(idx_t buffer_idx) {
+ return buffer->line_info->Increment(file_idx, buffer_idx);
+ }
+
  bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
  DataChunk dummy_chunk;
  string error_message;
@@ -3,8 +3,9 @@
  namespace duckdb {

  CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
- CSVStateMachineCache &state_machine_cache_p)
- : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
+ CSVStateMachineCache &state_machine_cache_p, bool explicit_set_columns_p)
+ : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)),
+ explicit_set_columns(explicit_set_columns_p) {

  // Check if any type is BLOB
  for (auto &type : options.sql_type_list) {
@@ -24,6 +25,14 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
  SnifferResult CSVSniffer::SniffCSV() {
  // 1. Dialect Detection
  DetectDialect();
+ if (explicit_set_columns) {
+ if (!candidates.empty()) {
+ options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
+ options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
+ }
+ // We do not need to run type and header detection as these were defined by the user
+ return SnifferResult(detected_types, names);
+ }
  // 2. Type Detection
  DetectTypes();
  // 3. Header Detection
@@ -15,7 +15,7 @@ struct SniffDialect {
  inline static bool Process(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts, char current_char,
  idx_t current_pos) {

- D_ASSERT(sniffed_column_counts.size() == machine.options.sample_chunk_size);
+ D_ASSERT(sniffed_column_counts.size() == STANDARD_VECTOR_SIZE);

  if (machine.state == CSVState::INVALID) {
  sniffed_column_counts.clear();
@@ -45,7 +45,7 @@ struct SniffDialect {
  machine.single_record_separator = ((machine.state != CSVState::RECORD_SEPARATOR && carriage_return) ||
  (machine.state == CSVState::RECORD_SEPARATOR && !carriage_return)) ||
  machine.single_record_separator;
- if (machine.cur_rows >= machine.options.sample_chunk_size) {
+ if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
  // We sniffed enough rows
  return true;
  }
@@ -55,10 +55,10 @@ struct SniffDialect {
  if (machine.state == CSVState::INVALID) {
  return;
  }
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state == CSVState::DELIMITER) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state == CSVState::DELIMITER) {
  sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
  }
- if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
+ if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
  sniffed_column_counts[machine.cur_rows++] = machine.column_count;
  }
  NewLineIdentifier suggested_newline;
@@ -145,7 +145,7 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachi
  void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machine, idx_t &rows_read,
  idx_t &best_consistent_rows, idx_t &prev_padding_count) {
  // The sniffed_column_counts variable keeps track of the number of columns found for each row
- vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
+ vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);

  state_machine->csv_buffer_iterator.Process<SniffDialect>(*state_machine, sniffed_column_counts);
  idx_t start_row = options.dialect_options.skip_rows;
@@ -244,7 +244,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machi
  }

  bool CSVSniffer::RefineCandidateNextChunk(CSVStateMachine &candidate) {
- vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
+ vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
  candidate.csv_buffer_iterator.Process<SniffDialect>(candidate, sniffed_column_counts);
  bool allow_padding = options.null_padding;

@@ -268,9 +268,9 @@ void CSVSniffer::RefineCandidates() {
  return;
  }
  for (auto &cur_candidate : candidates) {
- for (idx_t i = 1; i <= options.sample_chunks; i++) {
+ for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
  bool finished_file = cur_candidate->csv_buffer_iterator.Finished();
- if (finished_file || i == options.sample_chunks) {
+ if (finished_file || i == options.sample_size_chunks) {
  // we finished the file or our chunk sample successfully: stop
  auto successful_candidate = std::move(cur_candidate);
  candidates.clear();
@@ -283,11 +283,7 @@ void CSVSniffer::DetectTypes() {
  candidate->Reset();

  // Parse chunk and read csv with info candidate
- idx_t sample_size = options.sample_chunk_size;
- if (options.sample_chunk_size == 1) {
- sample_size++;
- }
- vector<TupleSniffing> tuples(sample_size);
+ vector<TupleSniffing> tuples(STANDARD_VECTOR_SIZE);
  candidate->csv_buffer_iterator.Process<SniffValue>(*candidate, tuples);
  // Potentially Skip empty rows (I find this dirty, but it is what the original code does)
  idx_t true_start = 0;
@@ -311,8 +307,10 @@ void CSVSniffer::DetectTypes() {
  break;
  }
  }
+ if (values_start > 0) {
+ tuples.erase(tuples.begin(), tuples.begin() + values_start);
+ }

- tuples.erase(tuples.begin(), tuples.begin() + values_start);
  idx_t row_idx = 0;
  if (tuples.size() > 1 && (!options.has_header || (options.has_header && options.dialect_options.header))) {
  // This means we have more than one row, hence we can use the first row to detect if we have a header
@@ -327,6 +325,9 @@ void CSVSniffer::DetectTypes() {
  for (; row_idx < tuples.size(); row_idx++) {
  for (idx_t col = 0; col < tuples[row_idx].values.size(); col++) {
  auto &col_type_candidates = info_sql_types_candidates[col];
+ // col_type_candidates can't be empty since anything in a CSV file should at least be a string
+ // and we validate utf-8 compatibility when creating the type
+ D_ASSERT(!col_type_candidates.empty());
  auto cur_top_candidate = col_type_candidates.back();
  auto dummy_val = tuples[row_idx].values[col];
  // try cast from string to sql_type