duckdb 0.9.1-dev0.0 → 0.9.1-dev143.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/column_reader.cpp +26 -1
  3. package/src/duckdb/extension/parquet/include/column_reader.hpp +2 -0
  4. package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +49 -0
  5. package/src/duckdb/src/common/enum_util.cpp +1 -1
  6. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +4 -2
  7. package/src/duckdb/src/common/types/data_chunk.cpp +1 -1
  8. package/src/duckdb/src/core_functions/scalar/map/map.cpp +66 -32
  9. package/src/duckdb/src/execution/expression_executor/execute_reference.cpp +1 -1
  10. package/src/duckdb/src/execution/expression_executor_state.cpp +8 -2
  11. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +41 -48
  12. package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +13 -9
  13. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +22 -24
  14. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +6 -11
  15. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +8 -3
  16. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +5 -9
  17. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +8 -13
  18. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +2 -2
  19. package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +1 -4
  20. package/src/duckdb/src/execution/operator/helper/physical_set.cpp +2 -4
  21. package/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp +4 -6
  22. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +1 -1
  23. package/src/duckdb/src/function/table/read_csv.cpp +1 -1
  24. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  25. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +1 -0
  26. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
  27. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +12 -10
  28. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp +28 -0
  29. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +9 -14
  30. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +20 -6
  31. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -1
  32. package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
  33. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +2 -2
  34. package/src/duckdb/src/include/duckdb.h +5 -5
  35. package/src/duckdb/src/main/config.cpp +14 -0
  36. package/src/duckdb/src/main/extension/extension_helper.cpp +7 -0
  37. package/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp +2 -2
  38. package/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp +5 -7
  39. package/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp +4 -2
  40. package/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +17 -14
  41. package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +5 -12
  42. package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +3 -0
  43. package/src/duckdb/src/transaction/duck_transaction_manager.cpp +13 -9
  44. package/src/duckdb/third_party/parquet/parquet_types.h +2 -1
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
- "version": "0.9.1-dev0.0",
+ "version": "0.9.1-dev143.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
package/src/duckdb/extension/parquet/column_reader.cpp CHANGED
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
  void ColumnReader::PrepareRead(parquet_filter_t &filter) {
      dict_decoder.reset();
      defined_decoder.reset();
+     bss_decoder.reset();
      block.reset();
      PageHeader page_hdr;
      page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
          PrepareDeltaByteArray(*block);
          break;
      }
+     case Encoding::BYTE_STREAM_SPLIT: {
+         // Subtract 1 from length as the block is allocated with 1 extra byte,
+         // but the byte stream split encoder needs to know the correct data size.
+         bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
+         block->inc(block->len);
+         break;
+     }
      case Encoding::PLAIN:
          // nothing to do here, will be read directly below
          break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr

      idx_t null_count = 0;

-     if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
+     if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
          // we need the null count because the dictionary offsets have no entries for nulls
          for (idx_t i = 0; i < read_now; i++) {
              if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
      } else if (byte_array_data) {
          // DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
          DeltaByteArray(define_out, read_now, filter, result_offset, result);
+     } else if (bss_decoder) {
+         auto read_buf = make_shared<ResizeableBuffer>();
+
+         switch (schema.type) {
+         case duckdb_parquet::format::Type::FLOAT:
+             read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
+             bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
+             break;
+         case duckdb_parquet::format::Type::DOUBLE:
+             read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
+             bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
+             break;
+         default:
+             throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
+         }
+
+         Plain(read_buf, define_out, read_now, filter, result_offset, result);
      } else {
          PlainReference(block, result);
          Plain(block, define_out, read_now, filter, result_offset, result);
package/src/duckdb/extension/parquet/include/column_reader.hpp CHANGED
@@ -9,6 +9,7 @@
  #pragma once

  #include "duckdb.hpp"
+ #include "parquet_bss_decoder.hpp"
  #include "parquet_dbp_decoder.hpp"
  #include "parquet_rle_bp_decoder.hpp"
  #include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ private:
      unique_ptr<RleBpDecoder> repeated_decoder;
      unique_ptr<DbpDecoder> dbp_decoder;
      unique_ptr<RleBpDecoder> rle_decoder;
+     unique_ptr<BssDecoder> bss_decoder;

      // dummies for Skip()
      parquet_filter_t none_filter;
package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp ADDED
@@ -0,0 +1,49 @@
+ //===----------------------------------------------------------------------===//
+ // DuckDB
+ //
+ // parquet_bss_decoder.hpp
+ //
+ //
+ //===----------------------------------------------------------------------===//
+
+ #pragma once
+ #include "parquet_types.h"
+ #include "resizable_buffer.hpp"
+
+ namespace duckdb {
+
+ /// Decoder for the Byte Stream Split encoding
+ class BssDecoder {
+ public:
+     /// Create a decoder object. buffer/buffer_len is the encoded data.
+     BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
+     }
+
+ public:
+     template <typename T>
+     void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
+         if (buffer_.len % sizeof(T) != 0) {
+             std::stringstream error;
+             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
+                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
+             throw std::runtime_error(error.str());
+         }
+         uint32_t num_buffer_values = buffer_.len / sizeof(T);
+
+         buffer_.available((value_offset_ + batch_size) * sizeof(T));
+
+         for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
+             data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
+             for (uint32_t i = 0; i < batch_size; ++i) {
+                 values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
+             }
+         }
+         value_offset_ += batch_size;
+     }
+
+ private:
+     ByteBuffer buffer_;
+     uint32_t value_offset_;
+ };
+
+ } // namespace duckdb
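
Note on the encoding (illustrative, not part of the package diff): BYTE_STREAM_SPLIT stores the k-th byte of every value contiguously, one byte "stream" per byte position, and GetBatch above reassembles values by gathering one byte from each stream. A minimal standalone sketch of the same deinterleaving for floats, with hypothetical names chosen for this example only:

#include <cstddef>
#include <cstdint>
#include <vector>

// Decode num_values floats from a BYTE_STREAM_SPLIT buffer laid out as
// [byte 0 of all values][byte 1 of all values][byte 2 ...][byte 3 ...].
static std::vector<float> DecodeByteStreamSplit(const uint8_t *encoded, size_t num_values) {
    std::vector<float> out(num_values);
    auto out_bytes = reinterpret_cast<uint8_t *>(out.data());
    for (size_t byte_offset = 0; byte_offset < sizeof(float); ++byte_offset) {
        const uint8_t *stream = encoded + byte_offset * num_values; // one stream per byte position
        for (size_t i = 0; i < num_values; ++i) {
            out_bytes[i * sizeof(float) + byte_offset] = stream[i];
        }
    }
    return out;
}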
package/src/duckdb/src/common/enum_util.cpp CHANGED
@@ -68,7 +68,7 @@
  #include "duckdb/execution/index/art/node.hpp"
  #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
  #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
- #include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
+ #include "duckdb/execution/operator/scan/csv/csv_state.hpp"
  #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
  #include "duckdb/function/aggregate_state.hpp"
  #include "duckdb/function/function.hpp"
package/src/duckdb/src/common/serializer/binary_deserializer.cpp CHANGED
@@ -8,7 +8,8 @@ namespace duckdb {
  void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) {
      auto field = NextField();
      if (field != field_id) {
-         throw InternalException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id, field);
+         throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id,
+                                      field);
      }
  }

@@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() {
  void BinaryDeserializer::OnObjectEnd() {
      auto next_field = NextField();
      if (next_field != MESSAGE_TERMINATOR_FIELD_ID) {
-         throw InternalException("Failed to deserialize: expected end of object, but found field id: %d", next_field);
+         throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d",
+                                      next_field);
      }
      nesting_level--;
  }
package/src/duckdb/src/common/types/data_chunk.cpp CHANGED
@@ -64,7 +64,7 @@ void DataChunk::InitializeEmpty(vector<LogicalType>::const_iterator begin, vecto
  }

  void DataChunk::Reset() {
-     if (data.empty()) {
+     if (data.empty() || vector_caches.empty()) {
          return;
      }
      if (vector_caches.size() != data.size()) {
package/src/duckdb/src/core_functions/scalar/map/map.cpp CHANGED
@@ -87,11 +87,24 @@ static bool ListEntriesEqual(Vector &keys, Vector &values, idx_t count) {
      return true;
  }

+ static list_entry_t *GetBiggestList(Vector &key, Vector &value, idx_t &size) {
+     auto key_size = ListVector::GetListSize(key);
+     auto value_size = ListVector::GetListSize(value);
+     if (key_size > value_size) {
+         size = key_size;
+         return ListVector::GetData(key);
+     }
+     size = value_size;
+     return ListVector::GetData(value);
+ }
+
  static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) {
      D_ASSERT(result.GetType().id() == LogicalTypeId::MAP);

-     auto &key_vector = MapVector::GetKeys(result);
-     auto &value_vector = MapVector::GetValues(result);
+     auto count = args.size();
+
+     auto &map_key_vector = MapVector::GetKeys(result);
+     auto &map_value_vector = MapVector::GetValues(result);
      auto result_data = ListVector::GetData(result);

      result.SetVectorType(VectorType::CONSTANT_VECTOR);
@@ -99,52 +112,73 @@ static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result)
          ListVector::SetListSize(result, 0);
          result_data->offset = 0;
          result_data->length = 0;
-         result.Verify(args.size());
+         result.Verify(count);
          return;
      }

-     bool keys_are_const = args.data[0].GetVectorType() == VectorType::CONSTANT_VECTOR;
-     bool values_are_const = args.data[1].GetVectorType() == VectorType::CONSTANT_VECTOR;
-     if (!keys_are_const || !values_are_const) {
-         result.SetVectorType(VectorType::FLAT_VECTOR);
+     D_ASSERT(args.ColumnCount() == 2);
+     auto &key_vector = args.data[0];
+     auto &value_vector = args.data[1];
+
+     if (args.AllConstant()) {
+         auto key_data = ListVector::GetData(key_vector);
+         auto value_data = ListVector::GetData(value_vector);
+         auto key_entry = key_data[0];
+         auto value_entry = value_data[0];
+         if (key_entry != value_entry) {
+             throw BinderException("Key and value list sizes don't match");
+         }
+         result_data[0] = key_entry;
+         ListVector::SetListSize(result, ListVector::GetListSize(key_vector));
+         map_key_vector.Reference(ListVector::GetEntry(key_vector));
+         map_value_vector.Reference(ListVector::GetEntry(value_vector));
+         MapVector::MapConversionVerify(result, count);
+         result.Verify(count);
+         return;
      }

-     auto key_count = ListVector::GetListSize(args.data[0]);
-     auto value_count = ListVector::GetListSize(args.data[1]);
-     auto key_data = ListVector::GetData(args.data[0]);
-     auto value_data = ListVector::GetData(args.data[1]);
-     auto src_data = key_data;
-
-     if (keys_are_const && !values_are_const) {
-         AlignVectorToReference(args.data[0], args.data[1], args.size(), key_vector);
-         src_data = value_data;
-     } else if (values_are_const && !keys_are_const) {
-         AlignVectorToReference(args.data[1], args.data[0], args.size(), value_vector);
+     result.SetVectorType(VectorType::FLAT_VECTOR);
+
+     if (key_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+         D_ASSERT(value_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
+         Vector expanded_const(ListType::GetChildType(key_vector.GetType()), count);
+         AlignVectorToReference(key_vector, value_vector, count, expanded_const);
+         map_key_vector.Reference(expanded_const);
+
+         value_vector.Flatten(count);
+         map_value_vector.Reference(ListVector::GetEntry(value_vector));
+     } else if (value_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+         D_ASSERT(key_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
+         Vector expanded_const(ListType::GetChildType(value_vector.GetType()), count);
+         AlignVectorToReference(value_vector, key_vector, count, expanded_const);
+         map_value_vector.Reference(expanded_const);
+
+         key_vector.Flatten(count);
+         map_key_vector.Reference(ListVector::GetEntry(key_vector));
      } else {
-         if (!ListEntriesEqual(args.data[0], args.data[1], args.size())) {
+         key_vector.Flatten(count);
+         value_vector.Flatten(count);
+
+         if (!ListEntriesEqual(key_vector, value_vector, count)) {
              throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different "
                                          "size or incompatible structure");
          }
+
+         map_value_vector.Reference(ListVector::GetEntry(value_vector));
+         map_key_vector.Reference(ListVector::GetEntry(key_vector));
      }

-     ListVector::SetListSize(result, MaxValue(key_count, value_count));
+     idx_t list_size;
+     auto src_data = GetBiggestList(key_vector, value_vector, list_size);
+     ListVector::SetListSize(result, list_size);

      result_data = ListVector::GetData(result);
-     for (idx_t i = 0; i < args.size(); i++) {
+     for (idx_t i = 0; i < count; i++) {
          result_data[i] = src_data[i];
      }

-     // check whether one of the vectors has already been referenced to an expanded vector in the case of const/non-const
-     // combination. If not, then referencing is still necessary
-     if (!(keys_are_const && !values_are_const)) {
-         key_vector.Reference(ListVector::GetEntry(args.data[0]));
-     }
-     if (!(values_are_const && !keys_are_const)) {
-         value_vector.Reference(ListVector::GetEntry(args.data[1]));
-     }
-
-     MapVector::MapConversionVerify(result, args.size());
-     result.Verify(args.size());
+     MapVector::MapConversionVerify(result, count);
+     result.Verify(count);
  }

  static unique_ptr<FunctionData> MapBind(ClientContext &context, ScalarFunction &bound_function,
package/src/duckdb/src/execution/expression_executor/execute_reference.cpp CHANGED
@@ -6,7 +6,7 @@ namespace duckdb {
  unique_ptr<ExpressionState> ExpressionExecutor::InitializeState(const BoundReferenceExpression &expr,
                                                                  ExpressionExecutorState &root) {
      auto result = make_uniq<ExpressionState>(expr, root);
-     result->Finalize();
+     result->Finalize(true);
      return result;
  }

package/src/duckdb/src/execution/expression_executor_state.cpp CHANGED
@@ -1,4 +1,5 @@
  #include "duckdb/execution/expression_executor_state.hpp"
+
  #include "duckdb/execution/expression_executor.hpp"
  #include "duckdb/planner/expression.hpp"
  #include "duckdb/planner/expression/bound_function_expression.hpp"
@@ -10,8 +11,13 @@ void ExpressionState::AddChild(Expression *expr) {
      child_states.push_back(ExpressionExecutor::InitializeState(*expr, root));
  }

- void ExpressionState::Finalize() {
-     if (!types.empty()) {
+ void ExpressionState::Finalize(bool empty) {
+     if (types.empty()) {
+         return;
+     }
+     if (empty) {
+         intermediate_chunk.InitializeEmpty(types);
+     } else {
          intermediate_chunk.Initialize(GetAllocator(), types);
      }
  }
package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp CHANGED
@@ -3,8 +3,8 @@

  namespace duckdb {

- void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) {
-     for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
+ void InitializeTransitionArray(CSVState *transition_array, const CSVState state) {
+     for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) {
          transition_array[i] = state;
      }
  }
@@ -13,72 +13,65 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
      D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
      // Initialize transition array with default values to the Standard option
      auto &transition_array = state_machine_cache[state_machine_options];
-     const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
-     const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
-     const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
-     const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
-     const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
-     const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
-     const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
-     const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
-     const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);

-     for (uint32_t i = 0; i < NUM_STATES; i++) {
-         switch (i) {
-         case quoted_state:
-             InitializeTransitionArray(transition_array[i], quoted_state);
+     for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
+         CSVState cur_state = CSVState(i);
+         switch (cur_state) {
+         case CSVState::QUOTED:
+             InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED);
              break;
-         case unquoted_state:
-         case invalid_state:
-         case escape_state:
-             InitializeTransitionArray(transition_array[i], invalid_state);
+         case CSVState::UNQUOTED:
+         case CSVState::INVALID:
+         case CSVState::ESCAPE:
+             InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID);
              break;
          default:
-             InitializeTransitionArray(transition_array[i], standard_state);
+             InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD);
              break;
          }
      }

      // Now set values depending on configuration
      // 1) Standard State
-     transition_array[standard_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
-     transition_array[standard_state][static_cast<uint8_t>('\n')] = record_separator_state;
-     transition_array[standard_state][static_cast<uint8_t>('\r')] = carriage_return_state;
-     transition_array[standard_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+     transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+     transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+     transition_array[CSVState::STANDARD][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+     transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
      // 2) Field Separator State
-     transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
-         field_separator_state;
-     transition_array[field_separator_state][static_cast<uint8_t>('\n')] = record_separator_state;
-     transition_array[field_separator_state][static_cast<uint8_t>('\r')] = carriage_return_state;
-     transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+     transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+     transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+     transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+     transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
      // 3) Record Separator State
-     transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
-         field_separator_state;
-     transition_array[record_separator_state][static_cast<uint8_t>('\n')] = empty_line_state;
-     transition_array[record_separator_state][static_cast<uint8_t>('\r')] = empty_line_state;
-     transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+     transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.delimiter)] =
+         CSVState::DELIMITER;
+     transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
+     transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+     transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
      // 4) Carriage Return State
-     transition_array[carriage_return_state][static_cast<uint8_t>('\n')] = record_separator_state;
-     transition_array[carriage_return_state][static_cast<uint8_t>('\r')] = empty_line_state;
-     transition_array[carriage_return_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
+     transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+     transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+     transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
      // 5) Quoted State
-     transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.quote)] = unquoted_state;
+     transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::UNQUOTED;
      if (state_machine_options.quote != state_machine_options.escape) {
-         transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
+         transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
      }
      // 6) Unquoted State
-     transition_array[unquoted_state][static_cast<uint8_t>('\n')] = record_separator_state;
-     transition_array[unquoted_state][static_cast<uint8_t>('\r')] = carriage_return_state;
-     transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
+     transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+     transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+     transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
      if (state_machine_options.quote == state_machine_options.escape) {
-         transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
+         transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
      }
      // 7) Escaped State
-     transition_array[escape_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
-     transition_array[escape_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
+     transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
+     transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
      // 8) Empty Line State
-     transition_array[empty_line_state][static_cast<uint8_t>('\r')] = empty_line_state;
-     transition_array[empty_line_state][static_cast<uint8_t>('\n')] = empty_line_state;
+     transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+     transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
+     transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+     transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
  }

  CSVStateMachineCache::CSVStateMachineCache() {
@@ -95,7 +88,7 @@ CSVStateMachineCache::CSVStateMachineCache() {
      }
  }

- const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
+ const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
      //! Custom State Machine, we need to create it and cache it first
      if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
          Insert(state_machine_options);
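
For orientation (illustrative, not part of the package diff): the cached object is a transition table indexed by [current state][next input byte], and the scanners consume it one lookup per byte, as the machine.Transition(current_char) call further down in this diff does. A rough sketch of that lookup loop, using placeholder states and a plain 2D array rather than DuckDB's StateMachine type:

#include <cstdint>
#include <string>

enum class State : uint8_t { STANDARD = 0, DELIMITER, RECORD_SEPARATOR, QUOTED, NUM_STATES };
constexpr int NUM_TRANSITIONS = 256;

// Count delimiter-separated fields in one line by walking a prebuilt transition table.
static int CountColumns(const std::string &line, const State table[][NUM_TRANSITIONS]) {
    State state = State::STANDARD;
    int columns = 1;
    for (unsigned char c : line) {
        state = table[static_cast<uint8_t>(state)][c]; // one table lookup per input byte
        columns += (state == State::DELIMITER) ? 1 : 0;
    }
    return columns;
}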
package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp CHANGED
@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
      return (carry && carry_followed_by_nl) || (!carry && first_char);
  }

- void ParallelCSVReader::SkipEmptyLines() {
+ bool ParallelCSVReader::SkipEmptyLines() {
+     const idx_t initial_position_buffer = position_buffer;
      idx_t new_pos_buffer = position_buffer;
      if (parse_chunk.data.size() == 1) {
          // Empty lines are null data.
-         return;
+         return initial_position_buffer != position_buffer;
      }
      for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
          if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
                  position_buffer++;
              }
              if (new_pos_buffer > end_buffer) {
-                 return;
+                 return initial_position_buffer != position_buffer;
              }
              position_buffer = new_pos_buffer;
          } else if ((*buffer)[new_pos_buffer] != ' ') {
-             return;
+             return initial_position_buffer != position_buffer;
          }
      }
+     return initial_position_buffer != position_buffer;
  }

  bool ParallelCSVReader::SetPosition() {
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
      }
      // Ensure that parse_chunk has no gunk when trying to figure new line
      parse_chunk.Reset();
-
      verification_positions.end_of_last_line = position_buffer;
      finished = false;
      return successfully_read_first_line;
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
      idx_t column = 0;
      idx_t offset = 0;
      bool has_quotes = false;
-
+     bool last_line_empty = false;
      vector<idx_t> escape_positions;
      if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
          // First time reading this buffer piece
@@ -454,7 +455,10 @@ add_row : {
      if (!BufferRemainder()) {
          goto final_state;
      }
-     SkipEmptyLines();
+     if (SkipEmptyLines() && reached_remainder_state) {
+         last_line_empty = true;
+         goto final_state;
+     }
      if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
          error_message = "Line does not fit in one buffer. Increase the buffer size.";
          return false;
@@ -583,8 +587,8 @@ final_state : {
          return true;
      }
      // If this is the last buffer, we have to read the last value
-     if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
-         (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
+     if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
+                              (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
          if (column > 0 || start_buffer != position_buffer || try_add_line ||
              (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
              // remaining values to be added to the chunk
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp CHANGED
@@ -22,30 +22,9 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
      }
  }

- SnifferResult CSVSniffer::SniffCSV() {
-     // 1. Dialect Detection
-     DetectDialect();
-     if (explicit_set_columns) {
-         if (!candidates.empty()) {
-             options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
-             options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
-         }
-         // We do not need to run type and header detection as these were defined by the user
-         return SnifferResult(detected_types, names);
-     }
-     // 2. Type Detection
-     DetectTypes();
-     // 3. Header Detection
-     DetectHeader();
-     D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
-     // 4. Type Replacement
-     ReplaceTypes();
-     // 5. Type Refinement
-     RefineTypes();
-     // We are done, construct and return the result.
-
-     // Set the CSV Options in the reference
+ void CSVSniffer::SetResultOptions() {
      options.dialect_options = best_candidate->dialect_options;
+     options.dialect_options.new_line = best_candidate->dialect_options.new_line;
      options.has_header = best_candidate->dialect_options.header;
      options.skip_rows_set = options.dialect_options.skip_rows > 0;
      if (options.has_header) {
@@ -53,8 +32,27 @@ SnifferResult CSVSniffer::SniffCSV() {
      } else {
          options.dialect_options.true_start = best_start_without_header;
      }
+ }

-     // Return the types and names
+ SnifferResult CSVSniffer::SniffCSV() {
+     // 1. Dialect Detection
+     DetectDialect();
+     // 2. Type Detection
+     DetectTypes();
+     // 3. Type Refinement
+     RefineTypes();
+     // 4. Header Detection
+     DetectHeader();
+     if (explicit_set_columns) {
+         SetResultOptions();
+         // We do not need to run type refinement, since the types have been given by the user
+         return SnifferResult({}, {});
+     }
+     // 5. Type Replacement
+     ReplaceTypes();
+     D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
+     // We are done, Set the CSV Options in the reference. Construct and return the result.
+     SetResultOptions();
      return SnifferResult(detected_types, names);
  }

package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp CHANGED
@@ -5,9 +5,9 @@ namespace duckdb {

  struct SniffDialect {
      inline static void Initialize(CSVStateMachine &machine) {
-         machine.state = CSVState::STANDARD;
-         machine.previous_state = CSVState::STANDARD;
-         machine.pre_previous_state = CSVState::STANDARD;
+         machine.state = CSVState::EMPTY_LINE;
+         machine.previous_state = CSVState::EMPTY_LINE;
+         machine.pre_previous_state = CSVState::EMPTY_LINE;
          machine.cur_rows = 0;
          machine.column_count = 1;
      }
@@ -21,17 +21,12 @@ struct SniffDialect {
              sniffed_column_counts.clear();
              return true;
          }
-         machine.pre_previous_state = machine.previous_state;
-         machine.previous_state = machine.state;
-
-         machine.state = static_cast<CSVState>(
-             machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
+         machine.Transition(current_char);

          bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
          machine.column_count += machine.previous_state == CSVState::DELIMITER;
          sniffed_column_counts[machine.cur_rows] = machine.column_count;
-         machine.cur_rows +=
-             machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+         machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
          machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);

          // It means our carriage return is actually a record separator
@@ -304,7 +299,7 @@ void CSVSniffer::DetectDialect() {
      unordered_map<uint8_t, vector<char>> quote_candidates_map;
      // Candidates for the escape option
      unordered_map<uint8_t, vector<char>> escape_candidates_map;
-     escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\0', '\"', '\''};
+     escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\"', '\'', '\0'};
      escape_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\\'};
      escape_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'};
      // Number of rows read