duckdb 0.9.1-dev0.0 → 0.9.1-dev143.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +26 -1
- package/src/duckdb/extension/parquet/include/column_reader.hpp +2 -0
- package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +49 -0
- package/src/duckdb/src/common/enum_util.cpp +1 -1
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +4 -2
- package/src/duckdb/src/common/types/data_chunk.cpp +1 -1
- package/src/duckdb/src/core_functions/scalar/map/map.cpp +66 -32
- package/src/duckdb/src/execution/expression_executor/execute_reference.cpp +1 -1
- package/src/duckdb/src/execution/expression_executor_state.cpp +8 -2
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +41 -48
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +13 -9
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +22 -24
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +6 -11
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +8 -3
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +5 -9
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +8 -13
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +2 -2
- package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +1 -4
- package/src/duckdb/src/execution/operator/helper/physical_set.cpp +2 -4
- package/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp +4 -6
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +1 -1
- package/src/duckdb/src/function/table/read_csv.cpp +1 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +12 -10
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp +28 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +9 -14
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +20 -6
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +2 -2
- package/src/duckdb/src/include/duckdb.h +5 -5
- package/src/duckdb/src/main/config.cpp +14 -0
- package/src/duckdb/src/main/extension/extension_helper.cpp +7 -0
- package/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp +2 -2
- package/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp +5 -7
- package/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp +4 -2
- package/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +17 -14
- package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +5 -12
- package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +3 -0
- package/src/duckdb/src/transaction/duck_transaction_manager.cpp +13 -9
- package/src/duckdb/third_party/parquet/parquet_types.h +2 -1
package/package.json
CHANGED
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
|
|
243
243
|
void ColumnReader::PrepareRead(parquet_filter_t &filter) {
|
244
244
|
dict_decoder.reset();
|
245
245
|
defined_decoder.reset();
|
246
|
+
bss_decoder.reset();
|
246
247
|
block.reset();
|
247
248
|
PageHeader page_hdr;
|
248
249
|
page_hdr.read(protocol);
|
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
|
|
443
444
|
PrepareDeltaByteArray(*block);
|
444
445
|
break;
|
445
446
|
}
|
447
|
+
case Encoding::BYTE_STREAM_SPLIT: {
|
448
|
+
// Subtract 1 from length as the block is allocated with 1 extra byte,
|
449
|
+
// but the byte stream split encoder needs to know the correct data size.
|
450
|
+
bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
|
451
|
+
block->inc(block->len);
|
452
|
+
break;
|
453
|
+
}
|
446
454
|
case Encoding::PLAIN:
|
447
455
|
// nothing to do here, will be read directly below
|
448
456
|
break;
|
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
|
|
488
496
|
|
489
497
|
idx_t null_count = 0;
|
490
498
|
|
491
|
-
if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
|
499
|
+
if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
|
492
500
|
// we need the null count because the dictionary offsets have no entries for nulls
|
493
501
|
for (idx_t i = 0; i < read_now; i++) {
|
494
502
|
if (define_out[i + result_offset] != max_define) {
|
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
|
|
534
542
|
} else if (byte_array_data) {
|
535
543
|
// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
|
536
544
|
DeltaByteArray(define_out, read_now, filter, result_offset, result);
|
545
|
+
} else if (bss_decoder) {
|
546
|
+
auto read_buf = make_shared<ResizeableBuffer>();
|
547
|
+
|
548
|
+
switch (schema.type) {
|
549
|
+
case duckdb_parquet::format::Type::FLOAT:
|
550
|
+
read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
|
551
|
+
bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
|
552
|
+
break;
|
553
|
+
case duckdb_parquet::format::Type::DOUBLE:
|
554
|
+
read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
|
555
|
+
bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
|
556
|
+
break;
|
557
|
+
default:
|
558
|
+
throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
|
559
|
+
}
|
560
|
+
|
561
|
+
Plain(read_buf, define_out, read_now, filter, result_offset, result);
|
537
562
|
} else {
|
538
563
|
PlainReference(block, result);
|
539
564
|
Plain(block, define_out, read_now, filter, result_offset, result);
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#pragma once
|
10
10
|
|
11
11
|
#include "duckdb.hpp"
|
12
|
+
#include "parquet_bss_decoder.hpp"
|
12
13
|
#include "parquet_dbp_decoder.hpp"
|
13
14
|
#include "parquet_rle_bp_decoder.hpp"
|
14
15
|
#include "parquet_statistics.hpp"
|
@@ -161,6 +162,7 @@ private:
|
|
161
162
|
unique_ptr<RleBpDecoder> repeated_decoder;
|
162
163
|
unique_ptr<DbpDecoder> dbp_decoder;
|
163
164
|
unique_ptr<RleBpDecoder> rle_decoder;
|
165
|
+
unique_ptr<BssDecoder> bss_decoder;
|
164
166
|
|
165
167
|
// dummies for Skip()
|
166
168
|
parquet_filter_t none_filter;
|
@@ -0,0 +1,49 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// parquet_bss_decoder.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
#include "parquet_types.h"
|
11
|
+
#include "resizable_buffer.hpp"
|
12
|
+
|
13
|
+
namespace duckdb {
|
14
|
+
|
15
|
+
/// Decoder for the Byte Stream Split encoding
|
16
|
+
class BssDecoder {
|
17
|
+
public:
|
18
|
+
/// Create a decoder object. buffer/buffer_len is the encoded data.
|
19
|
+
BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
|
20
|
+
}
|
21
|
+
|
22
|
+
public:
|
23
|
+
template <typename T>
|
24
|
+
void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
|
25
|
+
if (buffer_.len % sizeof(T) != 0) {
|
26
|
+
std::stringstream error;
|
27
|
+
error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
|
28
|
+
<< ") should be a multiple of the type size (" << sizeof(T) << ")";
|
29
|
+
throw std::runtime_error(error.str());
|
30
|
+
}
|
31
|
+
uint32_t num_buffer_values = buffer_.len / sizeof(T);
|
32
|
+
|
33
|
+
buffer_.available((value_offset_ + batch_size) * sizeof(T));
|
34
|
+
|
35
|
+
for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
|
36
|
+
data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
|
37
|
+
for (uint32_t i = 0; i < batch_size; ++i) {
|
38
|
+
values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
|
39
|
+
}
|
40
|
+
}
|
41
|
+
value_offset_ += batch_size;
|
42
|
+
}
|
43
|
+
|
44
|
+
private:
|
45
|
+
ByteBuffer buffer_;
|
46
|
+
uint32_t value_offset_;
|
47
|
+
};
|
48
|
+
|
49
|
+
} // namespace duckdb
|
@@ -68,7 +68,7 @@
|
|
68
68
|
#include "duckdb/execution/index/art/node.hpp"
|
69
69
|
#include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
|
70
70
|
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
71
|
-
#include "duckdb/execution/operator/scan/csv/
|
71
|
+
#include "duckdb/execution/operator/scan/csv/csv_state.hpp"
|
72
72
|
#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
|
73
73
|
#include "duckdb/function/aggregate_state.hpp"
|
74
74
|
#include "duckdb/function/function.hpp"
|
@@ -8,7 +8,8 @@ namespace duckdb {
|
|
8
8
|
void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) {
|
9
9
|
auto field = NextField();
|
10
10
|
if (field != field_id) {
|
11
|
-
throw
|
11
|
+
throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id,
|
12
|
+
field);
|
12
13
|
}
|
13
14
|
}
|
14
15
|
|
@@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() {
|
|
34
35
|
void BinaryDeserializer::OnObjectEnd() {
|
35
36
|
auto next_field = NextField();
|
36
37
|
if (next_field != MESSAGE_TERMINATOR_FIELD_ID) {
|
37
|
-
throw
|
38
|
+
throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d",
|
39
|
+
next_field);
|
38
40
|
}
|
39
41
|
nesting_level--;
|
40
42
|
}
|
@@ -87,11 +87,24 @@ static bool ListEntriesEqual(Vector &keys, Vector &values, idx_t count) {
|
|
87
87
|
return true;
|
88
88
|
}
|
89
89
|
|
90
|
+
static list_entry_t *GetBiggestList(Vector &key, Vector &value, idx_t &size) {
|
91
|
+
auto key_size = ListVector::GetListSize(key);
|
92
|
+
auto value_size = ListVector::GetListSize(value);
|
93
|
+
if (key_size > value_size) {
|
94
|
+
size = key_size;
|
95
|
+
return ListVector::GetData(key);
|
96
|
+
}
|
97
|
+
size = value_size;
|
98
|
+
return ListVector::GetData(value);
|
99
|
+
}
|
100
|
+
|
90
101
|
static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
91
102
|
D_ASSERT(result.GetType().id() == LogicalTypeId::MAP);
|
92
103
|
|
93
|
-
auto
|
94
|
-
|
104
|
+
auto count = args.size();
|
105
|
+
|
106
|
+
auto &map_key_vector = MapVector::GetKeys(result);
|
107
|
+
auto &map_value_vector = MapVector::GetValues(result);
|
95
108
|
auto result_data = ListVector::GetData(result);
|
96
109
|
|
97
110
|
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
@@ -99,52 +112,73 @@ static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result)
|
|
99
112
|
ListVector::SetListSize(result, 0);
|
100
113
|
result_data->offset = 0;
|
101
114
|
result_data->length = 0;
|
102
|
-
result.Verify(
|
115
|
+
result.Verify(count);
|
103
116
|
return;
|
104
117
|
}
|
105
118
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
119
|
+
D_ASSERT(args.ColumnCount() == 2);
|
120
|
+
auto &key_vector = args.data[0];
|
121
|
+
auto &value_vector = args.data[1];
|
122
|
+
|
123
|
+
if (args.AllConstant()) {
|
124
|
+
auto key_data = ListVector::GetData(key_vector);
|
125
|
+
auto value_data = ListVector::GetData(value_vector);
|
126
|
+
auto key_entry = key_data[0];
|
127
|
+
auto value_entry = value_data[0];
|
128
|
+
if (key_entry != value_entry) {
|
129
|
+
throw BinderException("Key and value list sizes don't match");
|
130
|
+
}
|
131
|
+
result_data[0] = key_entry;
|
132
|
+
ListVector::SetListSize(result, ListVector::GetListSize(key_vector));
|
133
|
+
map_key_vector.Reference(ListVector::GetEntry(key_vector));
|
134
|
+
map_value_vector.Reference(ListVector::GetEntry(value_vector));
|
135
|
+
MapVector::MapConversionVerify(result, count);
|
136
|
+
result.Verify(count);
|
137
|
+
return;
|
110
138
|
}
|
111
139
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
140
|
+
result.SetVectorType(VectorType::FLAT_VECTOR);
|
141
|
+
|
142
|
+
if (key_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
|
143
|
+
D_ASSERT(value_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
|
144
|
+
Vector expanded_const(ListType::GetChildType(key_vector.GetType()), count);
|
145
|
+
AlignVectorToReference(key_vector, value_vector, count, expanded_const);
|
146
|
+
map_key_vector.Reference(expanded_const);
|
147
|
+
|
148
|
+
value_vector.Flatten(count);
|
149
|
+
map_value_vector.Reference(ListVector::GetEntry(value_vector));
|
150
|
+
} else if (value_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
|
151
|
+
D_ASSERT(key_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
|
152
|
+
Vector expanded_const(ListType::GetChildType(value_vector.GetType()), count);
|
153
|
+
AlignVectorToReference(value_vector, key_vector, count, expanded_const);
|
154
|
+
map_value_vector.Reference(expanded_const);
|
155
|
+
|
156
|
+
key_vector.Flatten(count);
|
157
|
+
map_key_vector.Reference(ListVector::GetEntry(key_vector));
|
123
158
|
} else {
|
124
|
-
|
159
|
+
key_vector.Flatten(count);
|
160
|
+
value_vector.Flatten(count);
|
161
|
+
|
162
|
+
if (!ListEntriesEqual(key_vector, value_vector, count)) {
|
125
163
|
throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different "
|
126
164
|
"size or incompatible structure");
|
127
165
|
}
|
166
|
+
|
167
|
+
map_value_vector.Reference(ListVector::GetEntry(value_vector));
|
168
|
+
map_key_vector.Reference(ListVector::GetEntry(key_vector));
|
128
169
|
}
|
129
170
|
|
130
|
-
|
171
|
+
idx_t list_size;
|
172
|
+
auto src_data = GetBiggestList(key_vector, value_vector, list_size);
|
173
|
+
ListVector::SetListSize(result, list_size);
|
131
174
|
|
132
175
|
result_data = ListVector::GetData(result);
|
133
|
-
for (idx_t i = 0; i <
|
176
|
+
for (idx_t i = 0; i < count; i++) {
|
134
177
|
result_data[i] = src_data[i];
|
135
178
|
}
|
136
179
|
|
137
|
-
|
138
|
-
|
139
|
-
if (!(keys_are_const && !values_are_const)) {
|
140
|
-
key_vector.Reference(ListVector::GetEntry(args.data[0]));
|
141
|
-
}
|
142
|
-
if (!(values_are_const && !keys_are_const)) {
|
143
|
-
value_vector.Reference(ListVector::GetEntry(args.data[1]));
|
144
|
-
}
|
145
|
-
|
146
|
-
MapVector::MapConversionVerify(result, args.size());
|
147
|
-
result.Verify(args.size());
|
180
|
+
MapVector::MapConversionVerify(result, count);
|
181
|
+
result.Verify(count);
|
148
182
|
}
|
149
183
|
|
150
184
|
static unique_ptr<FunctionData> MapBind(ClientContext &context, ScalarFunction &bound_function,
|
@@ -6,7 +6,7 @@ namespace duckdb {
|
|
6
6
|
unique_ptr<ExpressionState> ExpressionExecutor::InitializeState(const BoundReferenceExpression &expr,
|
7
7
|
ExpressionExecutorState &root) {
|
8
8
|
auto result = make_uniq<ExpressionState>(expr, root);
|
9
|
-
result->Finalize();
|
9
|
+
result->Finalize(true);
|
10
10
|
return result;
|
11
11
|
}
|
12
12
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#include "duckdb/execution/expression_executor_state.hpp"
|
2
|
+
|
2
3
|
#include "duckdb/execution/expression_executor.hpp"
|
3
4
|
#include "duckdb/planner/expression.hpp"
|
4
5
|
#include "duckdb/planner/expression/bound_function_expression.hpp"
|
@@ -10,8 +11,13 @@ void ExpressionState::AddChild(Expression *expr) {
|
|
10
11
|
child_states.push_back(ExpressionExecutor::InitializeState(*expr, root));
|
11
12
|
}
|
12
13
|
|
13
|
-
void ExpressionState::Finalize() {
|
14
|
-
if (
|
14
|
+
void ExpressionState::Finalize(bool empty) {
|
15
|
+
if (types.empty()) {
|
16
|
+
return;
|
17
|
+
}
|
18
|
+
if (empty) {
|
19
|
+
intermediate_chunk.InitializeEmpty(types);
|
20
|
+
} else {
|
15
21
|
intermediate_chunk.Initialize(GetAllocator(), types);
|
16
22
|
}
|
17
23
|
}
|
@@ -3,8 +3,8 @@
|
|
3
3
|
|
4
4
|
namespace duckdb {
|
5
5
|
|
6
|
-
void InitializeTransitionArray(
|
7
|
-
for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
|
6
|
+
void InitializeTransitionArray(CSVState *transition_array, const CSVState state) {
|
7
|
+
for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) {
|
8
8
|
transition_array[i] = state;
|
9
9
|
}
|
10
10
|
}
|
@@ -13,72 +13,65 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
|
|
13
13
|
D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
|
14
14
|
// Initialize transition array with default values to the Standard option
|
15
15
|
auto &transition_array = state_machine_cache[state_machine_options];
|
16
|
-
const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
|
17
|
-
const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
|
18
|
-
const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
|
19
|
-
const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
|
20
|
-
const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
|
21
|
-
const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
|
22
|
-
const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
|
23
|
-
const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
|
24
|
-
const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
|
25
16
|
|
26
|
-
for (uint32_t i = 0; i < NUM_STATES; i++) {
|
27
|
-
|
28
|
-
|
29
|
-
|
17
|
+
for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
|
18
|
+
CSVState cur_state = CSVState(i);
|
19
|
+
switch (cur_state) {
|
20
|
+
case CSVState::QUOTED:
|
21
|
+
InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED);
|
30
22
|
break;
|
31
|
-
case
|
32
|
-
case
|
33
|
-
case
|
34
|
-
InitializeTransitionArray(transition_array[
|
23
|
+
case CSVState::UNQUOTED:
|
24
|
+
case CSVState::INVALID:
|
25
|
+
case CSVState::ESCAPE:
|
26
|
+
InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID);
|
35
27
|
break;
|
36
28
|
default:
|
37
|
-
InitializeTransitionArray(transition_array[
|
29
|
+
InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD);
|
38
30
|
break;
|
39
31
|
}
|
40
32
|
}
|
41
33
|
|
42
34
|
// Now set values depending on configuration
|
43
35
|
// 1) Standard State
|
44
|
-
transition_array[
|
45
|
-
transition_array[
|
46
|
-
transition_array[
|
47
|
-
transition_array[
|
36
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
37
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
38
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
|
39
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
48
40
|
// 2) Field Separator State
|
49
|
-
transition_array[
|
50
|
-
|
51
|
-
transition_array[
|
52
|
-
transition_array[
|
53
|
-
transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
|
41
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
42
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
43
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
|
44
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
54
45
|
// 3) Record Separator State
|
55
|
-
transition_array[
|
56
|
-
|
57
|
-
transition_array[
|
58
|
-
transition_array[
|
59
|
-
transition_array[
|
46
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.delimiter)] =
|
47
|
+
CSVState::DELIMITER;
|
48
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
|
49
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
|
50
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
60
51
|
// 4) Carriage Return State
|
61
|
-
transition_array[
|
62
|
-
transition_array[
|
63
|
-
transition_array[
|
52
|
+
transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
53
|
+
transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
|
54
|
+
transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
|
64
55
|
// 5) Quoted State
|
65
|
-
transition_array[
|
56
|
+
transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::UNQUOTED;
|
66
57
|
if (state_machine_options.quote != state_machine_options.escape) {
|
67
|
-
transition_array[
|
58
|
+
transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
|
68
59
|
}
|
69
60
|
// 6) Unquoted State
|
70
|
-
transition_array[
|
71
|
-
transition_array[
|
72
|
-
transition_array[
|
61
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
62
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
|
63
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
73
64
|
if (state_machine_options.quote == state_machine_options.escape) {
|
74
|
-
transition_array[
|
65
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
|
75
66
|
}
|
76
67
|
// 7) Escaped State
|
77
|
-
transition_array[
|
78
|
-
transition_array[
|
68
|
+
transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
69
|
+
transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
|
79
70
|
// 8) Empty Line State
|
80
|
-
transition_array[
|
81
|
-
transition_array[
|
71
|
+
transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
|
72
|
+
transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
|
73
|
+
transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
74
|
+
transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
82
75
|
}
|
83
76
|
|
84
77
|
CSVStateMachineCache::CSVStateMachineCache() {
|
@@ -95,7 +88,7 @@ CSVStateMachineCache::CSVStateMachineCache() {
|
|
95
88
|
}
|
96
89
|
}
|
97
90
|
|
98
|
-
const
|
91
|
+
const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
|
99
92
|
//! Custom State Machine, we need to create it and cache it first
|
100
93
|
if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
|
101
94
|
Insert(state_machine_options);
|
@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
|
|
49
49
|
return (carry && carry_followed_by_nl) || (!carry && first_char);
|
50
50
|
}
|
51
51
|
|
52
|
-
|
52
|
+
bool ParallelCSVReader::SkipEmptyLines() {
|
53
|
+
const idx_t initial_position_buffer = position_buffer;
|
53
54
|
idx_t new_pos_buffer = position_buffer;
|
54
55
|
if (parse_chunk.data.size() == 1) {
|
55
56
|
// Empty lines are null data.
|
56
|
-
return;
|
57
|
+
return initial_position_buffer != position_buffer;
|
57
58
|
}
|
58
59
|
for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
|
59
60
|
if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
|
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
|
|
63
64
|
position_buffer++;
|
64
65
|
}
|
65
66
|
if (new_pos_buffer > end_buffer) {
|
66
|
-
return;
|
67
|
+
return initial_position_buffer != position_buffer;
|
67
68
|
}
|
68
69
|
position_buffer = new_pos_buffer;
|
69
70
|
} else if ((*buffer)[new_pos_buffer] != ' ') {
|
70
|
-
return;
|
71
|
+
return initial_position_buffer != position_buffer;
|
71
72
|
}
|
72
73
|
}
|
74
|
+
return initial_position_buffer != position_buffer;
|
73
75
|
}
|
74
76
|
|
75
77
|
bool ParallelCSVReader::SetPosition() {
|
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
|
|
185
187
|
}
|
186
188
|
// Ensure that parse_chunk has no gunk when trying to figure new line
|
187
189
|
parse_chunk.Reset();
|
188
|
-
|
189
190
|
verification_positions.end_of_last_line = position_buffer;
|
190
191
|
finished = false;
|
191
192
|
return successfully_read_first_line;
|
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
|
|
288
289
|
idx_t column = 0;
|
289
290
|
idx_t offset = 0;
|
290
291
|
bool has_quotes = false;
|
291
|
-
|
292
|
+
bool last_line_empty = false;
|
292
293
|
vector<idx_t> escape_positions;
|
293
294
|
if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
|
294
295
|
// First time reading this buffer piece
|
@@ -454,7 +455,10 @@ add_row : {
|
|
454
455
|
if (!BufferRemainder()) {
|
455
456
|
goto final_state;
|
456
457
|
}
|
457
|
-
SkipEmptyLines()
|
458
|
+
if (SkipEmptyLines() && reached_remainder_state) {
|
459
|
+
last_line_empty = true;
|
460
|
+
goto final_state;
|
461
|
+
}
|
458
462
|
if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
|
459
463
|
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
460
464
|
return false;
|
@@ -583,8 +587,8 @@ final_state : {
|
|
583
587
|
return true;
|
584
588
|
}
|
585
589
|
// If this is the last buffer, we have to read the last value
|
586
|
-
if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
|
587
|
-
|
590
|
+
if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
|
591
|
+
(buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
|
588
592
|
if (column > 0 || start_buffer != position_buffer || try_add_line ||
|
589
593
|
(insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
|
590
594
|
// remaining values to be added to the chunk
|
@@ -22,30 +22,9 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
|
|
22
22
|
}
|
23
23
|
}
|
24
24
|
|
25
|
-
|
26
|
-
// 1. Dialect Detection
|
27
|
-
DetectDialect();
|
28
|
-
if (explicit_set_columns) {
|
29
|
-
if (!candidates.empty()) {
|
30
|
-
options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
|
31
|
-
options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
|
32
|
-
}
|
33
|
-
// We do not need to run type and header detection as these were defined by the user
|
34
|
-
return SnifferResult(detected_types, names);
|
35
|
-
}
|
36
|
-
// 2. Type Detection
|
37
|
-
DetectTypes();
|
38
|
-
// 3. Header Detection
|
39
|
-
DetectHeader();
|
40
|
-
D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
|
41
|
-
// 4. Type Replacement
|
42
|
-
ReplaceTypes();
|
43
|
-
// 5. Type Refinement
|
44
|
-
RefineTypes();
|
45
|
-
// We are done, construct and return the result.
|
46
|
-
|
47
|
-
// Set the CSV Options in the reference
|
25
|
+
void CSVSniffer::SetResultOptions() {
|
48
26
|
options.dialect_options = best_candidate->dialect_options;
|
27
|
+
options.dialect_options.new_line = best_candidate->dialect_options.new_line;
|
49
28
|
options.has_header = best_candidate->dialect_options.header;
|
50
29
|
options.skip_rows_set = options.dialect_options.skip_rows > 0;
|
51
30
|
if (options.has_header) {
|
@@ -53,8 +32,27 @@ SnifferResult CSVSniffer::SniffCSV() {
|
|
53
32
|
} else {
|
54
33
|
options.dialect_options.true_start = best_start_without_header;
|
55
34
|
}
|
35
|
+
}
|
56
36
|
|
57
|
-
|
37
|
+
SnifferResult CSVSniffer::SniffCSV() {
|
38
|
+
// 1. Dialect Detection
|
39
|
+
DetectDialect();
|
40
|
+
// 2. Type Detection
|
41
|
+
DetectTypes();
|
42
|
+
// 3. Type Refinement
|
43
|
+
RefineTypes();
|
44
|
+
// 4. Header Detection
|
45
|
+
DetectHeader();
|
46
|
+
if (explicit_set_columns) {
|
47
|
+
SetResultOptions();
|
48
|
+
// We do not need to run type refinement, since the types have been given by the user
|
49
|
+
return SnifferResult({}, {});
|
50
|
+
}
|
51
|
+
// 5. Type Replacement
|
52
|
+
ReplaceTypes();
|
53
|
+
D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
|
54
|
+
// We are done, Set the CSV Options in the reference. Construct and return the result.
|
55
|
+
SetResultOptions();
|
58
56
|
return SnifferResult(detected_types, names);
|
59
57
|
}
|
60
58
|
|
@@ -5,9 +5,9 @@ namespace duckdb {
|
|
5
5
|
|
6
6
|
struct SniffDialect {
|
7
7
|
inline static void Initialize(CSVStateMachine &machine) {
|
8
|
-
machine.state = CSVState::
|
9
|
-
machine.previous_state = CSVState::
|
10
|
-
machine.pre_previous_state = CSVState::
|
8
|
+
machine.state = CSVState::EMPTY_LINE;
|
9
|
+
machine.previous_state = CSVState::EMPTY_LINE;
|
10
|
+
machine.pre_previous_state = CSVState::EMPTY_LINE;
|
11
11
|
machine.cur_rows = 0;
|
12
12
|
machine.column_count = 1;
|
13
13
|
}
|
@@ -21,17 +21,12 @@ struct SniffDialect {
|
|
21
21
|
sniffed_column_counts.clear();
|
22
22
|
return true;
|
23
23
|
}
|
24
|
-
machine.
|
25
|
-
machine.previous_state = machine.state;
|
26
|
-
|
27
|
-
machine.state = static_cast<CSVState>(
|
28
|
-
machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
|
24
|
+
machine.Transition(current_char);
|
29
25
|
|
30
26
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
31
27
|
machine.column_count += machine.previous_state == CSVState::DELIMITER;
|
32
28
|
sniffed_column_counts[machine.cur_rows] = machine.column_count;
|
33
|
-
machine.cur_rows +=
|
34
|
-
machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
|
29
|
+
machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
|
35
30
|
machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
|
36
31
|
|
37
32
|
// It means our carriage return is actually a record separator
|
@@ -304,7 +299,7 @@ void CSVSniffer::DetectDialect() {
|
|
304
299
|
unordered_map<uint8_t, vector<char>> quote_candidates_map;
|
305
300
|
// Candidates for the escape option
|
306
301
|
unordered_map<uint8_t, vector<char>> escape_candidates_map;
|
307
|
-
escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\
|
302
|
+
escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\"', '\'', '\0'};
|
308
303
|
escape_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\\'};
|
309
304
|
escape_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'};
|
310
305
|
// Number of rows read
|