duckdb 0.9.0 → 0.9.1-dev120.0
This diff covers the published contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
- package/package.json +1 -1
- package/src/duckdb/src/common/enum_util.cpp +1 -1
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +4 -2
- package/src/duckdb/src/common/types/data_chunk.cpp +1 -1
- package/src/duckdb/src/core_functions/scalar/map/map.cpp +66 -32
- package/src/duckdb/src/execution/expression_executor/execute_reference.cpp +1 -1
- package/src/duckdb/src/execution/expression_executor_state.cpp +8 -2
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +41 -48
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +13 -9
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +22 -24
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +6 -11
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +8 -3
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +5 -9
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +8 -13
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +2 -2
- package/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp +4 -6
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +1 -1
- package/src/duckdb/src/function/table/read_csv.cpp +1 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +12 -10
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp +28 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +9 -14
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +20 -6
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -1
- package/src/duckdb/src/include/duckdb.h +5 -5
- package/src/duckdb/src/main/extension/extension_helper.cpp +7 -0
- package/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp +2 -2
- package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +3 -0
package/package.json
CHANGED

package/src/duckdb/src/common/enum_util.cpp
@@ -68,7 +68,7 @@
 #include "duckdb/execution/index/art/node.hpp"
 #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
 #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
-#include "duckdb/execution/operator/scan/csv/
+#include "duckdb/execution/operator/scan/csv/csv_state.hpp"
 #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
 #include "duckdb/function/aggregate_state.hpp"
 #include "duckdb/function/function.hpp"
package/src/duckdb/src/common/serializer/binary_deserializer.cpp
@@ -8,7 +8,8 @@ namespace duckdb {
 void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) {
     auto field = NextField();
     if (field != field_id) {
-        throw
+        throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id,
+                                      field);
     }
 }
 
@@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() {
 void BinaryDeserializer::OnObjectEnd() {
     auto next_field = NextField();
     if (next_field != MESSAGE_TERMINATOR_FIELD_ID) {
-        throw
+        throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d",
+                                      next_field);
     }
     nesting_level--;
 }
package/src/duckdb/src/core_functions/scalar/map/map.cpp
@@ -87,11 +87,24 @@ static bool ListEntriesEqual(Vector &keys, Vector &values, idx_t count) {
     return true;
 }
 
+static list_entry_t *GetBiggestList(Vector &key, Vector &value, idx_t &size) {
+    auto key_size = ListVector::GetListSize(key);
+    auto value_size = ListVector::GetListSize(value);
+    if (key_size > value_size) {
+        size = key_size;
+        return ListVector::GetData(key);
+    }
+    size = value_size;
+    return ListVector::GetData(value);
+}
+
 static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) {
     D_ASSERT(result.GetType().id() == LogicalTypeId::MAP);
 
-    auto
-
+    auto count = args.size();
+
+    auto &map_key_vector = MapVector::GetKeys(result);
+    auto &map_value_vector = MapVector::GetValues(result);
     auto result_data = ListVector::GetData(result);
 
     result.SetVectorType(VectorType::CONSTANT_VECTOR);
@@ -99,52 +112,73 @@ static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result)
         ListVector::SetListSize(result, 0);
         result_data->offset = 0;
         result_data->length = 0;
-        result.Verify(
+        result.Verify(count);
         return;
     }
 
-
-
-
-
+    D_ASSERT(args.ColumnCount() == 2);
+    auto &key_vector = args.data[0];
+    auto &value_vector = args.data[1];
+
+    if (args.AllConstant()) {
+        auto key_data = ListVector::GetData(key_vector);
+        auto value_data = ListVector::GetData(value_vector);
+        auto key_entry = key_data[0];
+        auto value_entry = value_data[0];
+        if (key_entry != value_entry) {
+            throw BinderException("Key and value list sizes don't match");
+        }
+        result_data[0] = key_entry;
+        ListVector::SetListSize(result, ListVector::GetListSize(key_vector));
+        map_key_vector.Reference(ListVector::GetEntry(key_vector));
+        map_value_vector.Reference(ListVector::GetEntry(value_vector));
+        MapVector::MapConversionVerify(result, count);
+        result.Verify(count);
+        return;
     }
 
-
-
-
-
-
-
-
-
-
-
-
+    result.SetVectorType(VectorType::FLAT_VECTOR);
+
+    if (key_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+        D_ASSERT(value_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
+        Vector expanded_const(ListType::GetChildType(key_vector.GetType()), count);
+        AlignVectorToReference(key_vector, value_vector, count, expanded_const);
+        map_key_vector.Reference(expanded_const);
+
+        value_vector.Flatten(count);
+        map_value_vector.Reference(ListVector::GetEntry(value_vector));
+    } else if (value_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) {
+        D_ASSERT(key_vector.GetVectorType() != VectorType::CONSTANT_VECTOR);
+        Vector expanded_const(ListType::GetChildType(value_vector.GetType()), count);
+        AlignVectorToReference(value_vector, key_vector, count, expanded_const);
+        map_value_vector.Reference(expanded_const);
+
+        key_vector.Flatten(count);
+        map_key_vector.Reference(ListVector::GetEntry(key_vector));
     } else {
-
+        key_vector.Flatten(count);
+        value_vector.Flatten(count);
+
+        if (!ListEntriesEqual(key_vector, value_vector, count)) {
             throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different "
                                         "size or incompatible structure");
         }
+
+        map_value_vector.Reference(ListVector::GetEntry(value_vector));
+        map_key_vector.Reference(ListVector::GetEntry(key_vector));
     }
 
-
+    idx_t list_size;
+    auto src_data = GetBiggestList(key_vector, value_vector, list_size);
+    ListVector::SetListSize(result, list_size);
 
     result_data = ListVector::GetData(result);
-    for (idx_t i = 0; i <
+    for (idx_t i = 0; i < count; i++) {
         result_data[i] = src_data[i];
     }
 
-
-
-    if (!(keys_are_const && !values_are_const)) {
-        key_vector.Reference(ListVector::GetEntry(args.data[0]));
-    }
-    if (!(values_are_const && !keys_are_const)) {
-        value_vector.Reference(ListVector::GetEntry(args.data[1]));
-    }
-
-    MapVector::MapConversionVerify(result, args.size());
-    result.Verify(args.size());
+    MapVector::MapConversionVerify(result, count);
+    result.Verify(count);
 }
 
 static unique_ptr<FunctionData> MapBind(ClientContext &context, ScalarFunction &bound_function,
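For context on the flat-vector branch above: per row, the key list entry and the value list entry must describe the same number of elements. The body of ListEntriesEqual is elided in this diff, so the following is only an illustrative sketch of that invariant, not the actual implementation:

// Illustrative sketch only (assumed helper, not DuckDB's ListEntriesEqual):
// each row's key entry and value entry must have matching lengths,
// e.g. map(['a', 'b'], [1]) has to be rejected.
static bool EntriesAlign(const list_entry_t *keys, const list_entry_t *values, idx_t count) {
    for (idx_t row = 0; row < count; row++) {
        if (keys[row].length != values[row].length) {
            return false;
        }
    }
    return true;
}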
package/src/duckdb/src/execution/expression_executor/execute_reference.cpp
@@ -6,7 +6,7 @@ namespace duckdb {
 unique_ptr<ExpressionState> ExpressionExecutor::InitializeState(const BoundReferenceExpression &expr,
                                                                 ExpressionExecutorState &root) {
     auto result = make_uniq<ExpressionState>(expr, root);
-    result->Finalize();
+    result->Finalize(true);
     return result;
 }
 
package/src/duckdb/src/execution/expression_executor_state.cpp
@@ -1,4 +1,5 @@
 #include "duckdb/execution/expression_executor_state.hpp"
+
 #include "duckdb/execution/expression_executor.hpp"
 #include "duckdb/planner/expression.hpp"
 #include "duckdb/planner/expression/bound_function_expression.hpp"
@@ -10,8 +11,13 @@ void ExpressionState::AddChild(Expression *expr) {
     child_states.push_back(ExpressionExecutor::InitializeState(*expr, root));
 }
 
-void ExpressionState::Finalize() {
-    if (
+void ExpressionState::Finalize(bool empty) {
+    if (types.empty()) {
+        return;
+    }
+    if (empty) {
+        intermediate_chunk.InitializeEmpty(types);
+    } else {
         intermediate_chunk.Initialize(GetAllocator(), types);
     }
 }
package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp
@@ -3,8 +3,8 @@
 
 namespace duckdb {
 
-void InitializeTransitionArray(
-    for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
+void InitializeTransitionArray(CSVState *transition_array, const CSVState state) {
+    for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) {
         transition_array[i] = state;
     }
 }
@@ -13,72 +13,65 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
     D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
     // Initialize transition array with default values to the Standard option
     auto &transition_array = state_machine_cache[state_machine_options];
-    const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
-    const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
-    const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
-    const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
-    const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
-    const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
-    const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
-    const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
-    const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
 
-    for (uint32_t i = 0; i < NUM_STATES; i++) {
-
-
-
+    for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
+        CSVState cur_state = CSVState(i);
+        switch (cur_state) {
+        case CSVState::QUOTED:
+            InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED);
             break;
-        case
-        case
-        case
-            InitializeTransitionArray(transition_array[
+        case CSVState::UNQUOTED:
+        case CSVState::INVALID:
+        case CSVState::ESCAPE:
+            InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID);
             break;
         default:
-            InitializeTransitionArray(transition_array[
+            InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD);
             break;
         }
     }
 
     // Now set values depending on configuration
     // 1) Standard State
-    transition_array[
-    transition_array[
-    transition_array[
-    transition_array[
+    transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+    transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+    transition_array[CSVState::STANDARD][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+    transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
     // 2) Field Separator State
-    transition_array[
-
-    transition_array[
-    transition_array[
-    transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
+    transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+    transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+    transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+    transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
    // 3) Record Separator State
-    transition_array[
-
-    transition_array[
-    transition_array[
-    transition_array[
+    transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.delimiter)] =
+        CSVState::DELIMITER;
+    transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
+    transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+    transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
     // 4) Carriage Return State
-    transition_array[
-    transition_array[
-    transition_array[
+    transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+    transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+    transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
     // 5) Quoted State
-    transition_array[
+    transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::UNQUOTED;
     if (state_machine_options.quote != state_machine_options.escape) {
-        transition_array[
+        transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
     }
     // 6) Unquoted State
-    transition_array[
-    transition_array[
-    transition_array[
+    transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
+    transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
+    transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
     if (state_machine_options.quote == state_machine_options.escape) {
-        transition_array[
+        transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
     }
     // 7) Escaped State
-    transition_array[
-    transition_array[
+    transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
+    transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
     // 8) Empty Line State
-    transition_array[
-    transition_array[
+    transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
+    transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
+    transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
+    transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
 }
 
 CSVStateMachineCache::CSVStateMachineCache() {
@@ -95,7 +88,7 @@ CSVStateMachineCache::CSVStateMachineCache() {
     }
 }
 
-const
+const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
     //! Custom State Machine, we need to create it and cache it first
     if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
         Insert(state_machine_options);
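As a rough illustration of what Insert builds, the populated table encodes per-character lookups such as the following; this is a sketch of expected entries for an assumed default dialect (delimiter ',', quote '"', escape '"'), not code from the package:

// Sketch of expected entries, assuming delimiter ',', quote '"', escape '"':
// transition_array[CSVState::STANDARD][static_cast<uint8_t>(',')]  == CSVState::DELIMITER
// transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] == CSVState::RECORD_SEPARATOR
// transition_array[CSVState::STANDARD][static_cast<uint8_t>('"')]  == CSVState::QUOTED
// transition_array[CSVState::QUOTED][static_cast<uint8_t>('"')]    == CSVState::UNQUOTED   (quote == escape here)
// characters not explicitly configured keep the machine in STANDARD.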
package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp
@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
     return (carry && carry_followed_by_nl) || (!carry && first_char);
 }
 
-
+bool ParallelCSVReader::SkipEmptyLines() {
+    const idx_t initial_position_buffer = position_buffer;
     idx_t new_pos_buffer = position_buffer;
     if (parse_chunk.data.size() == 1) {
         // Empty lines are null data.
-        return;
+        return initial_position_buffer != position_buffer;
     }
     for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
         if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
                 position_buffer++;
             }
             if (new_pos_buffer > end_buffer) {
-                return;
+                return initial_position_buffer != position_buffer;
             }
             position_buffer = new_pos_buffer;
         } else if ((*buffer)[new_pos_buffer] != ' ') {
-            return;
+            return initial_position_buffer != position_buffer;
         }
     }
+    return initial_position_buffer != position_buffer;
 }
 
 bool ParallelCSVReader::SetPosition() {
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
     }
     // Ensure that parse_chunk has no gunk when trying to figure new line
     parse_chunk.Reset();
-
     verification_positions.end_of_last_line = position_buffer;
     finished = false;
     return successfully_read_first_line;
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
     idx_t column = 0;
     idx_t offset = 0;
     bool has_quotes = false;
-
+    bool last_line_empty = false;
     vector<idx_t> escape_positions;
     if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
         // First time reading this buffer piece
@@ -454,7 +455,10 @@ add_row : {
     if (!BufferRemainder()) {
         goto final_state;
     }
-    SkipEmptyLines()
+    if (SkipEmptyLines() && reached_remainder_state) {
+        last_line_empty = true;
+        goto final_state;
+    }
     if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
         error_message = "Line does not fit in one buffer. Increase the buffer size.";
         return false;
@@ -583,8 +587,8 @@ final_state : {
         return true;
     }
     // If this is the last buffer, we have to read the last value
-    if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
-
+    if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
+                             (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
         if (column > 0 || start_buffer != position_buffer || try_add_line ||
             (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
             // remaining values to be added to the chunk
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp
@@ -22,30 +22,9 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
     }
 }
 
-
-    // 1. Dialect Detection
-    DetectDialect();
-    if (explicit_set_columns) {
-        if (!candidates.empty()) {
-            options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
-            options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
-        }
-        // We do not need to run type and header detection as these were defined by the user
-        return SnifferResult(detected_types, names);
-    }
-    // 2. Type Detection
-    DetectTypes();
-    // 3. Header Detection
-    DetectHeader();
-    D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
-    // 4. Type Replacement
-    ReplaceTypes();
-    // 5. Type Refinement
-    RefineTypes();
-    // We are done, construct and return the result.
-
-    // Set the CSV Options in the reference
+void CSVSniffer::SetResultOptions() {
     options.dialect_options = best_candidate->dialect_options;
+    options.dialect_options.new_line = best_candidate->dialect_options.new_line;
     options.has_header = best_candidate->dialect_options.header;
     options.skip_rows_set = options.dialect_options.skip_rows > 0;
     if (options.has_header) {
@@ -53,8 +32,27 @@ SnifferResult CSVSniffer::SniffCSV() {
     } else {
         options.dialect_options.true_start = best_start_without_header;
     }
+}
 
-
+SnifferResult CSVSniffer::SniffCSV() {
+    // 1. Dialect Detection
+    DetectDialect();
+    // 2. Type Detection
+    DetectTypes();
+    // 3. Type Refinement
+    RefineTypes();
+    // 4. Header Detection
+    DetectHeader();
+    if (explicit_set_columns) {
+        SetResultOptions();
+        // We do not need to run type refinement, since the types have been given by the user
+        return SnifferResult({}, {});
+    }
+    // 5. Type Replacement
+    ReplaceTypes();
+    D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
+    // We are done, Set the CSV Options in the reference. Construct and return the result.
+    SetResultOptions();
     return SnifferResult(detected_types, names);
 }
 
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp
@@ -5,9 +5,9 @@ namespace duckdb {
 
 struct SniffDialect {
     inline static void Initialize(CSVStateMachine &machine) {
-        machine.state = CSVState::
-        machine.previous_state = CSVState::
-        machine.pre_previous_state = CSVState::
+        machine.state = CSVState::EMPTY_LINE;
+        machine.previous_state = CSVState::EMPTY_LINE;
+        machine.pre_previous_state = CSVState::EMPTY_LINE;
         machine.cur_rows = 0;
         machine.column_count = 1;
     }
@@ -21,17 +21,12 @@ struct SniffDialect {
             sniffed_column_counts.clear();
             return true;
         }
-        machine.
-        machine.previous_state = machine.state;
-
-        machine.state = static_cast<CSVState>(
-            machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
+        machine.Transition(current_char);
 
         bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
         machine.column_count += machine.previous_state == CSVState::DELIMITER;
         sniffed_column_counts[machine.cur_rows] = machine.column_count;
-        machine.cur_rows +=
-            machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+        machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
         machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
 
         // It means our carriage return is actually a record separator
@@ -304,7 +299,7 @@ void CSVSniffer::DetectDialect() {
     unordered_map<uint8_t, vector<char>> quote_candidates_map;
     // Candidates for the escape option
     unordered_map<uint8_t, vector<char>> escape_candidates_map;
-    escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\
+    escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\"', '\'', '\0'};
     escape_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\\'};
     escape_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'};
     // Number of rows read
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp
@@ -97,9 +97,14 @@ void CSVSniffer::DetectHeader() {
     bool first_row_consistent = true;
     // check if header row is all null and/or consistent with detected column data types
     bool first_row_nulls = true;
-    //
-
-
+    // If null-padding is not allowed and there is a mismatch between our header candidate and the number of columns
+    // We can't detect the dialect/type options properly
+    if (!best_candidate->options.null_padding &&
+        best_sql_types_candidates_per_column_idx.size() != best_header_row.size()) {
+        throw InvalidInputException(
+            "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.",
+            options.file_path);
+    }
     for (idx_t col = 0; col < best_header_row.size(); col++) {
         auto dummy_val = best_header_row[col];
         if (!dummy_val.IsNull()) {
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp
@@ -143,20 +143,17 @@ struct SniffValue {
             machine.rows_read++;
         }
 
-        if ((machine.previous_state == CSVState::RECORD_SEPARATOR
+        if ((machine.previous_state == CSVState::RECORD_SEPARATOR) ||
             (machine.state != CSVState::RECORD_SEPARATOR && machine.previous_state == CSVState::CARRIAGE_RETURN)) {
             sniffed_values[machine.cur_rows].position = machine.line_start_pos;
             sniffed_values[machine.cur_rows].set = true;
             machine.line_start_pos = current_pos;
         }
-
-        machine.
-        machine.state = static_cast<CSVState>(
-            machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
+
+        machine.Transition(current_char);
 
         bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
-        if (machine.previous_state == CSVState::DELIMITER ||
-            (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
+        if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
             (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
             // Started a new value
             // Check if it's UTF-8
@@ -175,8 +172,7 @@ struct SniffValue {
             (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
             machine.value += current_char;
         }
-        machine.cur_rows +=
-            machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+        machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
         // It means our carriage return is actually a record separator
         machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
         if (machine.cur_rows >= sniffed_values.size()) {
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp
@@ -3,9 +3,9 @@
 namespace duckdb {
 struct Parse {
     inline static void Initialize(CSVStateMachine &machine) {
-        machine.state = CSVState::
-        machine.previous_state = CSVState::
-        machine.pre_previous_state = CSVState::
+        machine.state = CSVState::EMPTY_LINE;
+        machine.previous_state = CSVState::EMPTY_LINE;
+        machine.pre_previous_state = CSVState::EMPTY_LINE;
 
         machine.cur_rows = 0;
         machine.column_count = 0;
@@ -14,22 +14,18 @@ struct Parse {
 
     inline static bool Process(CSVStateMachine &machine, DataChunk &parse_chunk, char current_char, idx_t current_pos) {
 
-        machine.
-        machine.previous_state = machine.state;
-        machine.state = static_cast<CSVState>(
-            machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
+        machine.Transition(current_char);
 
         bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
-        if (machine.previous_state == CSVState::DELIMITER ||
-            (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
+        if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
             (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
             // Started a new value
             // Check if it's UTF-8 (Or not?)
             machine.VerifyUTF8();
             auto &v = parse_chunk.data[machine.column_count++];
             auto parse_data = FlatVector::GetData<string_t>(v);
-            auto &validity_mask = FlatVector::Validity(v);
             if (machine.value.empty()) {
+                auto &validity_mask = FlatVector::Validity(v);
                 validity_mask.SetInvalid(machine.cur_rows);
             } else {
                 parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
@@ -50,12 +46,11 @@ struct Parse {
             (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
             machine.value += current_char;
         }
-        machine.cur_rows +=
-            machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
+        machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR && machine.column_count > 0;
         machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR);
 
         // It means our carriage return is actually a record separator
-        machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
+        machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return && machine.column_count > 0;
         machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
 
         if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp
@@ -14,7 +14,7 @@ void CSVSniffer::ReplaceTypes() {
     for (idx_t i = 0; i < names.size(); i++) {
         auto it = best_candidate->options.sql_types_per_column.find(names[i]);
         if (it != best_candidate->options.sql_types_per_column.end()) {
-
+            detected_types[i] = best_candidate->options.sql_type_list[it->second];
             found++;
         }
     }
@@ -33,7 +33,7 @@ void CSVSniffer::ReplaceTypes() {
                                           best_candidate->options.sql_type_list.size(), names.size());
     }
     for (idx_t i = 0; i < best_candidate->options.sql_type_list.size(); i++) {
-
+        detected_types[i] = best_candidate->options.sql_type_list[i];
     }
 }
 } // namespace duckdb
package/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp
@@ -298,12 +298,10 @@ void PerfectAggregateHashTable::Destroy() {
     RowOperationsState row_state(*aggregate_allocator);
     data_ptr_t payload_ptr = data;
     for (idx_t i = 0; i < total_groups; i++) {
-
-
-
-
-            count = 0;
-        }
+        data_pointers[count++] = payload_ptr;
+        if (count == STANDARD_VECTOR_SIZE) {
+            RowOperations::DestroyStates(row_state, layout, addresses, count);
+            count = 0;
         }
         payload_ptr += tuple_size;
     }
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
@@ -261,7 +261,7 @@ idx_t RadixHTConfig::ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p) {
 idx_t RadixHTConfig::SinkCapacity(ClientContext &context) {
     // Get active and maximum number of threads
     const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
-    const auto max_threads = DBConfig::
+    const auto max_threads = DBConfig::GetConfig(context).options.maximum_threads;
 
     // Compute cache size per active thread (assuming cache is shared)
     const auto total_shared_cache_size = max_threads * L3_CACHE_SIZE;
package/src/duckdb/src/function/table/read_csv.cpp
@@ -38,7 +38,7 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
     auto number_of_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
     //! If we have many csv files, we run single-threaded on each file and parallelize on the number of files
     bool many_csv_files = files.size() > 1 && int64_t(files.size() * 2) >= number_of_threads;
-    if (options.parallel_mode != ParallelMode::PARALLEL && many_csv_files) {
+    if (options.parallel_mode != ParallelMode::PARALLEL && (many_csv_files || number_of_threads == 1)) {
         single_threaded = true;
     }
     if (options.parallel_mode == ParallelMode::SINGLE_THREADED || not_supported_options ||
package/src/duckdb/src/function/table/version/pragma_version.cpp
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "
+#define DUCKDB_VERSION "v0.9.1-dev120"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "af666ad8ba"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp
@@ -34,9 +34,9 @@ public:
     //! CSV Sniffing consists of five steps:
     //! 1. Dialect Detection: Generate the CSV Options (delimiter, quote, escape, etc.)
     //! 2. Type Detection: Figures out the types of the columns (For one chunk)
-    //! 3.
-    //! 4.
-    //! 5. Type
+    //! 3. Type Refinement: Refines the types of the columns for the remaining chunks
+    //! 4. Header Detection: Figures out if the CSV file has a header and produces the names of the columns
+    //! 5. Type Replacement: Replaces the types of the columns if the user specified them
     SnifferResult SniffCSV();
 
 private:
@@ -50,6 +50,8 @@ private:
     CSVReaderOptions &options;
     //! Buffer being used on sniffer
     shared_ptr<CSVBufferManager> buffer_manager;
+    //! Sets the result options
+    void SetResultOptions();
 
     //! ------------------------------------------------------//
     //! ----------------- Dialect Detection ----------------- //
@@ -105,6 +107,13 @@ private:
     idx_t best_start_without_header = 0;
     vector<Value> best_header_row;
 
+    //! ------------------------------------------------------//
+    //! ------------------ Type Refinement ------------------ //
+    //! ------------------------------------------------------//
+    void RefineTypes();
+    bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
+    vector<LogicalType> detected_types;
+
     //! ------------------------------------------------------//
     //! ------------------ Header Detection ----------------- //
     //! ------------------------------------------------------//
@@ -117,13 +126,6 @@ private:
     //! ------------------ Type Replacement ----------------- //
     //! ------------------------------------------------------//
     void ReplaceTypes();
-
-    //! ------------------------------------------------------//
-    //! ------------------ Type Refinement ------------------ //
-    //! ------------------------------------------------------//
-    void RefineTypes();
-    bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
-    vector<LogicalType> detected_types;
 };
 
 } // namespace duckdb
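The five-step comment above summarizes the sniffer's public flow. A rough usage sketch follows; here `options` is a CSVReaderOptions and `buffer_manager` a shared_ptr<CSVBufferManager> prepared elsewhere, and the exact constructor argument list is truncated in this diff, so the state-machine-cache argument is an assumption:

// Rough sketch only; constructor arguments are partly assumed.
CSVStateMachineCache state_machine_cache;
CSVSniffer sniffer(options, buffer_manager, state_machine_cache);
SnifferResult result = sniffer.SniffCSV(); // runs the five steps listed above
// result carries the detected column types and names
// (returned empty when the columns were explicitly set by the user).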
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+// DuckDB
+//
+// duckdb/execution/operator/scan/csv/csv_state.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+
+namespace duckdb {
+
+//! All States of CSV Parsing
+enum class CSVState : uint8_t {
+	STANDARD = 0,         //! Regular unquoted field state
+	DELIMITER = 1,        //! State after encountering a field separator (e.g., ;)
+	RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
+	CARRIAGE_RETURN = 3,  //! State after encountering a carriage return(i.e., \r)
+	QUOTED = 4,           //! State when inside a quoted field
+	UNQUOTED = 5,         //! State when leaving a quoted field
+	ESCAPE = 6,           //! State when encountering an escape character (e.g., \)
+	EMPTY_LINE = 7,       //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
+	INVALID = 8           //! Got to an Invalid State, this should error.
+};
+
+} // namespace duckdb
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp
@@ -14,19 +14,6 @@
 
 namespace duckdb {
 
-//! All States of CSV Parsing
-enum class CSVState : uint8_t {
-	STANDARD = 0,         //! Regular unquoted field state
-	DELIMITER = 1,        //! State after encountering a field separator (e.g., ;)
-	RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
-	CARRIAGE_RETURN = 3,  //! State after encountering a carriage return(i.e., \r)
-	QUOTED = 4,           //! State when inside a quoted field
-	UNQUOTED = 5,         //! State when leaving a quoted field
-	ESCAPE = 6,           //! State when encountering an escape character (e.g., \)
-	EMPTY_LINE = 7,       //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
-	INVALID = 8           //! Got to an Invalid State, this should error.
-};
-
 //! The CSV State Machine comprises a state transition array (STA).
 //! The STA indicates the current state of parsing based on both the current and preceding characters.
 //! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth.
@@ -38,6 +25,14 @@ public:
 	explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
 	                         shared_ptr<CSVBufferManager> buffer_manager_p,
 	                         CSVStateMachineCache &csv_state_machine_cache_p);
+
+	//! Transition all states to next state, that depends on the current char
+	inline void Transition(char current_char) {
+		pre_previous_state = previous_state;
+		previous_state = state;
+		state = transition_array[state][static_cast<uint8_t>(current_char)];
+	}
+
 	//! Resets the state machine, so it can be used again
 	void Reset();
 
@@ -52,7 +47,7 @@ public:
 	idx_t start_row = 0;
 	//! The Transition Array is a Finite State Machine
 	//! It holds the transitions of all states, on all 256 possible different characters
-	const
+	const StateMachine &transition_array;
 
 	//! Both these variables are used for new line identifier detection
 	bool single_record_separator = false;
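The Transition helper above is what the sniffing passes (SniffDialect, SniffValue, Parse) now call once per character. A minimal sketch of driving it over a buffer, modeled on those Process functions; `machine` is assumed to be an initialized CSVStateMachine for the current dialect, and `buffer`/`buffer_size` are hypothetical inputs:

// Minimal sketch only.
idx_t rows = 0;
idx_t columns = 1;
for (idx_t pos = 0; pos < buffer_size; pos++) {
    machine.Transition(buffer[pos]);
    // after Transition, previous_state is the state reached by the preceding character
    columns += machine.previous_state == CSVState::DELIMITER;
    rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
}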
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp
CHANGED
@@ -8,14 +8,28 @@
 
 #pragma once
 
-#include "duckdb/execution/operator/scan/csv/
+#include "duckdb/execution/operator/scan/csv/csv_state.hpp"
 #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
+#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
 #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
 
 namespace duckdb {
-
-
-
+
+//! Class to wrap the state machine matrix
+class StateMachine {
+public:
+	static constexpr uint32_t NUM_STATES = 9;
+	static constexpr uint32_t NUM_TRANSITIONS = 256;
+	CSVState state_machine[NUM_STATES][NUM_TRANSITIONS];
+
+	const CSVState *operator[](CSVState state) const {
+		return state_machine[static_cast<uint8_t>(state)];
+	}
+
+	CSVState *operator[](CSVState state) {
+		return state_machine[static_cast<uint8_t>(state)];
+	}
+};
 
 //! Hash function used in out state machine cache, it hashes and combines all options used to generate a state machine
 struct HashCSVStateMachineConfig {
@@ -36,12 +50,12 @@ public:
 	~CSVStateMachineCache() {};
 	//! Gets a state machine from the cache, if it's not from one the default options
 	//! It first caches it, then returns it.
-	const
+	const StateMachine &Get(const CSVStateMachineOptions &state_machine_options);
 
 private:
 	void Insert(const CSVStateMachineOptions &state_machine_options);
 	//! Cache on delimiter|quote|escape
-	unordered_map<CSVStateMachineOptions,
+	unordered_map<CSVStateMachineOptions, StateMachine, HashCSVStateMachineConfig> state_machine_cache;
 	//! Default value for options used to intialize CSV State Machine Cache
 	const vector<char> default_delimiter = {',', '|', ';', '\t'};
 	const vector<vector<char>> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}};
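A short sketch of how a consumer would use the cache and the StateMachine wrapper declared above; the CSVStateMachineOptions constructor arguments are an assumption based on the delimiter/quote/escape options used elsewhere in this diff:

// Sketch only; assumes CSVStateMachineOptions can be built from delimiter, quote and escape.
CSVStateMachineCache cache;
const StateMachine &machine = cache.Get(CSVStateMachineOptions(',', '"', '"'));
CSVState next = machine[CSVState::STANDARD][static_cast<uint8_t>(',')];
// with these options, 'next' is expected to be CSVState::DELIMITER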
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp
@@ -148,7 +148,7 @@ private:
 	//! Sets Position depending on the byte_start of this thread
 	bool SetPosition();
 	//! Called when scanning the 1st buffer, skips empty lines
-
+	bool SkipEmptyLines();
 	//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
 	//! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
 	//! when changing the buffer end the first time.
package/src/duckdb/src/include/duckdb.h
@@ -317,7 +317,7 @@ typedef enum {
 //===--------------------------------------------------------------------===//
 
 /*!
-Creates a new database or opens an existing database file stored at the
+Creates a new database or opens an existing database file stored at the given path.
 If no path is given a new in-memory database is created instead.
 The instantiated database should be closed with 'duckdb_close'
 
@@ -328,7 +328,7 @@ The instantiated database should be closed with 'duckdb_close'
 DUCKDB_API duckdb_state duckdb_open(const char *path, duckdb_database *out_database);
 
 /*!
-Extended version of duckdb_open. Creates a new database or opens an existing database file stored at the
+Extended version of duckdb_open. Creates a new database or opens an existing database file stored at the given path.
 
 * path: Path to the database file on disk, or `nullptr` or `:memory:` to open an in-memory database.
 * out_database: The result database object.
@@ -1009,7 +1009,7 @@ Binds an int64_t value to the prepared statement at the specified index.
 DUCKDB_API duckdb_state duckdb_bind_int64(duckdb_prepared_statement prepared_statement, idx_t param_idx, int64_t val);
 
 /*!
-Binds
+Binds a duckdb_hugeint value to the prepared statement at the specified index.
 */
 DUCKDB_API duckdb_state duckdb_bind_hugeint(duckdb_prepared_statement prepared_statement, idx_t param_idx,
                                             duckdb_hugeint val);
@@ -1040,12 +1040,12 @@ Binds an uint64_t value to the prepared statement at the specified index.
 DUCKDB_API duckdb_state duckdb_bind_uint64(duckdb_prepared_statement prepared_statement, idx_t param_idx, uint64_t val);
 
 /*!
-Binds
+Binds a float value to the prepared statement at the specified index.
 */
 DUCKDB_API duckdb_state duckdb_bind_float(duckdb_prepared_statement prepared_statement, idx_t param_idx, float val);
 
 /*!
-Binds
+Binds a double value to the prepared statement at the specified index.
 */
 DUCKDB_API duckdb_state duckdb_bind_double(duckdb_prepared_statement prepared_statement, idx_t param_idx, double val);
 
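The corrected doc comments above belong to the C API's open and bind functions. A small usage sketch follows (error handling elided; duckdb_connect, duckdb_prepare, duckdb_execute_prepared and the destroy/close calls are part of the same C API but are not shown in this diff):

duckdb_database db;
duckdb_connection con;
duckdb_prepared_statement stmt;
duckdb_result result;

duckdb_open(NULL, &db);                  // NULL or ":memory:" opens an in-memory database
duckdb_connect(db, &con);
duckdb_prepare(con, "SELECT ?::DOUBLE + ?::BIGINT", &stmt);
duckdb_bind_double(stmt, 1, 3.5);        // parameter indexes are 1-based
duckdb_bind_int64(stmt, 2, 7);
duckdb_execute_prepared(stmt, &result);

duckdb_destroy_result(&result);
duckdb_destroy_prepare(&stmt);
duckdb_disconnect(&con);
duckdb_close(&db);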
package/src/duckdb/src/main/extension/extension_helper.cpp
@@ -196,6 +196,9 @@ string ExtensionHelper::AddExtensionInstallHintToErrorMsg(ClientContext &context
 }
 
 bool ExtensionHelper::TryAutoLoadExtension(ClientContext &context, const string &extension_name) noexcept {
+	if (context.db->ExtensionIsLoaded(extension_name)) {
+		return true;
+	}
 	auto &dbconfig = DBConfig::GetConfig(context);
 	try {
 		if (dbconfig.options.autoinstall_known_extensions) {
@@ -211,6 +214,10 @@ bool ExtensionHelper::TryAutoLoadExtension(ClientContext &context, const string
 }
 
 void ExtensionHelper::AutoLoadExtension(ClientContext &context, const string &extension_name) {
+	if (context.db->ExtensionIsLoaded(extension_name)) {
+		// Avoid downloading again
+		return;
+	}
 	auto &dbconfig = DBConfig::GetConfig(context);
 	try {
 #ifndef DUCKDB_WASM
package/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp
@@ -38,8 +38,8 @@ void CommonAggregateOptimizer::ExtractCommonAggregates(LogicalAggregate &aggr) {
 			// aggregate does not exist yet: add it to the map
 			aggregate_remap[*aggr.expressions[i]] = i;
 			if (i != original_index) {
-				// this aggregate is not erased, however an
-				// so we need to remap this
+				// this aggregate is not erased, however an aggregate BEFORE it has been erased
+				// so we need to remap this aggregate
 				ColumnBinding original_binding(aggr.aggregate_index, original_index);
 				ColumnBinding new_binding(aggr.aggregate_index, i);
 				aggregate_map[original_binding] = new_binding;
package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp
@@ -135,6 +135,9 @@ unique_ptr<LogicalOperator> LogicalComparisonJoin::CreateJoin(ClientContext &con
 	bool need_to_consider_arbitrary_expressions = true;
 	switch (reftype) {
 	case JoinRefType::ASOF: {
+		if (!arbitrary_expressions.empty()) {
+			throw BinderException("Invalid ASOF JOIN condition");
+		}
 		need_to_consider_arbitrary_expressions = false;
 		auto asof_idx = conditions.size();
 		for (size_t c = 0; c < conditions.size(); ++c) {