duckdb 0.9.1-dev0.0 → 0.9.1-dev19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/common/enum_util.cpp +1 -1
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +4 -2
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +39 -48
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +1 -5
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +8 -3
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +2 -4
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -4
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +1 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp +28 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +9 -14
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +20 -6
package/package.json
CHANGED
@@ -68,7 +68,7 @@
|
|
68
68
|
#include "duckdb/execution/index/art/node.hpp"
|
69
69
|
#include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
|
70
70
|
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
71
|
-
#include "duckdb/execution/operator/scan/csv/
|
71
|
+
#include "duckdb/execution/operator/scan/csv/csv_state.hpp"
|
72
72
|
#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
|
73
73
|
#include "duckdb/function/aggregate_state.hpp"
|
74
74
|
#include "duckdb/function/function.hpp"
|
@@ -8,7 +8,8 @@ namespace duckdb {
|
|
8
8
|
void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) {
|
9
9
|
auto field = NextField();
|
10
10
|
if (field != field_id) {
|
11
|
-
throw
|
11
|
+
throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id,
|
12
|
+
field);
|
12
13
|
}
|
13
14
|
}
|
14
15
|
|
@@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() {
|
|
34
35
|
void BinaryDeserializer::OnObjectEnd() {
|
35
36
|
auto next_field = NextField();
|
36
37
|
if (next_field != MESSAGE_TERMINATOR_FIELD_ID) {
|
37
|
-
throw
|
38
|
+
throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d",
|
39
|
+
next_field);
|
38
40
|
}
|
39
41
|
nesting_level--;
|
40
42
|
}
|
@@ -3,8 +3,8 @@
|
|
3
3
|
|
4
4
|
namespace duckdb {
|
5
5
|
|
6
|
-
void InitializeTransitionArray(
|
7
|
-
for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
|
6
|
+
void InitializeTransitionArray(CSVState *transition_array, const CSVState state) {
|
7
|
+
for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) {
|
8
8
|
transition_array[i] = state;
|
9
9
|
}
|
10
10
|
}
|
@@ -13,72 +13,63 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
|
|
13
13
|
D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
|
14
14
|
// Initialize transition array with default values to the Standard option
|
15
15
|
auto &transition_array = state_machine_cache[state_machine_options];
|
16
|
-
const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
|
17
|
-
const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
|
18
|
-
const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
|
19
|
-
const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
|
20
|
-
const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
|
21
|
-
const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
|
22
|
-
const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
|
23
|
-
const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
|
24
|
-
const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
|
25
16
|
|
26
|
-
for (uint32_t i = 0; i < NUM_STATES; i++) {
|
27
|
-
|
28
|
-
|
29
|
-
|
17
|
+
for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
|
18
|
+
CSVState cur_state = CSVState(i);
|
19
|
+
switch (cur_state) {
|
20
|
+
case CSVState::QUOTED:
|
21
|
+
InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED);
|
30
22
|
break;
|
31
|
-
case
|
32
|
-
case
|
33
|
-
case
|
34
|
-
InitializeTransitionArray(transition_array[
|
23
|
+
case CSVState::UNQUOTED:
|
24
|
+
case CSVState::INVALID:
|
25
|
+
case CSVState::ESCAPE:
|
26
|
+
InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID);
|
35
27
|
break;
|
36
28
|
default:
|
37
|
-
InitializeTransitionArray(transition_array[
|
29
|
+
InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD);
|
38
30
|
break;
|
39
31
|
}
|
40
32
|
}
|
41
33
|
|
42
34
|
// Now set values depending on configuration
|
43
35
|
// 1) Standard State
|
44
|
-
transition_array[
|
45
|
-
transition_array[
|
46
|
-
transition_array[
|
47
|
-
transition_array[
|
36
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
37
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
38
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
|
39
|
+
transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
48
40
|
// 2) Field Separator State
|
49
|
-
transition_array[
|
50
|
-
|
51
|
-
transition_array[
|
52
|
-
transition_array[
|
53
|
-
transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
|
41
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
42
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
43
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
|
44
|
+
transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
54
45
|
// 3) Record Separator State
|
55
|
-
transition_array[
|
56
|
-
|
57
|
-
transition_array[
|
58
|
-
transition_array[
|
59
|
-
transition_array[
|
46
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.delimiter)] =
|
47
|
+
CSVState::DELIMITER;
|
48
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
|
49
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
|
50
|
+
transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
60
51
|
// 4) Carriage Return State
|
61
|
-
transition_array[
|
62
|
-
transition_array[
|
63
|
-
transition_array[
|
52
|
+
transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
53
|
+
transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
|
54
|
+
transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
|
64
55
|
// 5) Quoted State
|
65
|
-
transition_array[
|
56
|
+
transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::UNQUOTED;
|
66
57
|
if (state_machine_options.quote != state_machine_options.escape) {
|
67
|
-
transition_array[
|
58
|
+
transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
|
68
59
|
}
|
69
60
|
// 6) Unquoted State
|
70
|
-
transition_array[
|
71
|
-
transition_array[
|
72
|
-
transition_array[
|
61
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
|
62
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
|
63
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
|
73
64
|
if (state_machine_options.quote == state_machine_options.escape) {
|
74
|
-
transition_array[
|
65
|
+
transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
|
75
66
|
}
|
76
67
|
// 7) Escaped State
|
77
|
-
transition_array[
|
78
|
-
transition_array[
|
68
|
+
transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
|
69
|
+
transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
|
79
70
|
// 8) Empty Line State
|
80
|
-
transition_array[
|
81
|
-
transition_array[
|
71
|
+
transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
|
72
|
+
transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
|
82
73
|
}
|
83
74
|
|
84
75
|
CSVStateMachineCache::CSVStateMachineCache() {
|
@@ -95,7 +86,7 @@ CSVStateMachineCache::CSVStateMachineCache() {
|
|
95
86
|
}
|
96
87
|
}
|
97
88
|
|
98
|
-
const
|
89
|
+
const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
|
99
90
|
//! Custom State Machine, we need to create it and cache it first
|
100
91
|
if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
|
101
92
|
Insert(state_machine_options);
|
@@ -21,11 +21,7 @@ struct SniffDialect {
|
|
21
21
|
sniffed_column_counts.clear();
|
22
22
|
return true;
|
23
23
|
}
|
24
|
-
machine.
|
25
|
-
machine.previous_state = machine.state;
|
26
|
-
|
27
|
-
machine.state = static_cast<CSVState>(
|
28
|
-
machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
|
24
|
+
machine.Transition(current_char);
|
29
25
|
|
30
26
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
31
27
|
machine.column_count += machine.previous_state == CSVState::DELIMITER;
|
@@ -97,9 +97,14 @@ void CSVSniffer::DetectHeader() {
|
|
97
97
|
bool first_row_consistent = true;
|
98
98
|
// check if header row is all null and/or consistent with detected column data types
|
99
99
|
bool first_row_nulls = true;
|
100
|
-
//
|
101
|
-
|
102
|
-
|
100
|
+
// If null-padding is not allowed and there is a mismatch between our header candidate and the number of columns
|
101
|
+
// We can't detect the dialect/type options properly
|
102
|
+
if (!best_candidate->options.null_padding &&
|
103
|
+
best_sql_types_candidates_per_column_idx.size() != best_header_row.size()) {
|
104
|
+
throw InvalidInputException(
|
105
|
+
"Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.",
|
106
|
+
options.file_path);
|
107
|
+
}
|
103
108
|
for (idx_t col = 0; col < best_header_row.size(); col++) {
|
104
109
|
auto dummy_val = best_header_row[col];
|
105
110
|
if (!dummy_val.IsNull()) {
|
@@ -149,10 +149,8 @@ struct SniffValue {
|
|
149
149
|
sniffed_values[machine.cur_rows].set = true;
|
150
150
|
machine.line_start_pos = current_pos;
|
151
151
|
}
|
152
|
-
|
153
|
-
machine.
|
154
|
-
machine.state = static_cast<CSVState>(
|
155
|
-
machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
|
152
|
+
|
153
|
+
machine.Transition(current_char);
|
156
154
|
|
157
155
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
158
156
|
if (machine.previous_state == CSVState::DELIMITER ||
|
@@ -14,10 +14,7 @@ struct Parse {
|
|
14
14
|
|
15
15
|
inline static bool Process(CSVStateMachine &machine, DataChunk &parse_chunk, char current_char, idx_t current_pos) {
|
16
16
|
|
17
|
-
machine.
|
18
|
-
machine.previous_state = machine.state;
|
19
|
-
machine.state = static_cast<CSVState>(
|
20
|
-
machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
|
17
|
+
machine.Transition(current_char);
|
21
18
|
|
22
19
|
bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
|
23
20
|
if (machine.previous_state == CSVState::DELIMITER ||
|
@@ -261,7 +261,7 @@ idx_t RadixHTConfig::ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p) {
|
|
261
261
|
idx_t RadixHTConfig::SinkCapacity(ClientContext &context) {
|
262
262
|
// Get active and maximum number of threads
|
263
263
|
const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
|
264
|
-
const auto max_threads = DBConfig::
|
264
|
+
const auto max_threads = DBConfig::GetConfig(context).options.maximum_threads;
|
265
265
|
|
266
266
|
// Compute cache size per active thread (assuming cache is shared)
|
267
267
|
const auto total_shared_cache_size = max_threads * L3_CACHE_SIZE;
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.9.
|
2
|
+
#define DUCKDB_VERSION "0.9.1-dev19"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "1ea87567af"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -0,0 +1,28 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// duckdb/execution/operator/scan/csv/csv_state.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
|
11
|
+
#include <cstdint>
|
12
|
+
|
13
|
+
namespace duckdb {
|
14
|
+
|
15
|
+
//! All States of CSV Parsing
|
16
|
+
enum class CSVState : uint8_t {
|
17
|
+
STANDARD = 0, //! Regular unquoted field state
|
18
|
+
DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
|
19
|
+
RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
|
20
|
+
CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
|
21
|
+
QUOTED = 4, //! State when inside a quoted field
|
22
|
+
UNQUOTED = 5, //! State when leaving a quoted field
|
23
|
+
ESCAPE = 6, //! State when encountering an escape character (e.g., \)
|
24
|
+
EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
|
25
|
+
INVALID = 8 //! Got to an Invalid State, this should error.
|
26
|
+
};
|
27
|
+
|
28
|
+
} // namespace duckdb
|
@@ -14,19 +14,6 @@
|
|
14
14
|
|
15
15
|
namespace duckdb {
|
16
16
|
|
17
|
-
//! All States of CSV Parsing
|
18
|
-
enum class CSVState : uint8_t {
|
19
|
-
STANDARD = 0, //! Regular unquoted field state
|
20
|
-
DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
|
21
|
-
RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
|
22
|
-
CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
|
23
|
-
QUOTED = 4, //! State when inside a quoted field
|
24
|
-
UNQUOTED = 5, //! State when leaving a quoted field
|
25
|
-
ESCAPE = 6, //! State when encountering an escape character (e.g., \)
|
26
|
-
EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
|
27
|
-
INVALID = 8 //! Got to an Invalid State, this should error.
|
28
|
-
};
|
29
|
-
|
30
17
|
//! The CSV State Machine comprises a state transition array (STA).
|
31
18
|
//! The STA indicates the current state of parsing based on both the current and preceding characters.
|
32
19
|
//! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth.
|
@@ -38,6 +25,14 @@ public:
|
|
38
25
|
explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
|
39
26
|
shared_ptr<CSVBufferManager> buffer_manager_p,
|
40
27
|
CSVStateMachineCache &csv_state_machine_cache_p);
|
28
|
+
|
29
|
+
//! Shifts the tracked states (state -> previous_state -> pre_previous_state) and moves to the next state, which depends on the current char
|
30
|
+
inline void Transition(char current_char) {
|
31
|
+
pre_previous_state = previous_state;
|
32
|
+
previous_state = state;
|
33
|
+
state = transition_array[state][static_cast<uint8_t>(current_char)];
|
34
|
+
}
|
35
|
+
|
41
36
|
//! Resets the state machine, so it can be used again
|
42
37
|
void Reset();
|
43
38
|
|
@@ -52,7 +47,7 @@ public:
|
|
52
47
|
idx_t start_row = 0;
|
53
48
|
//! The Transition Array is a Finite State Machine
|
54
49
|
//! It holds the transitions of all states, on all 256 possible different characters
|
55
|
-
const
|
50
|
+
const StateMachine &transition_array;
|
56
51
|
|
57
52
|
//! Both these variables are used for new line identifier detection
|
58
53
|
bool single_record_separator = false;
|
package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp
CHANGED
@@ -8,14 +8,28 @@
|
|
8
8
|
|
9
9
|
#pragma once
|
10
10
|
|
11
|
-
#include "duckdb/execution/operator/scan/csv/
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_state.hpp"
|
12
12
|
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
13
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
13
14
|
#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
|
14
15
|
|
15
16
|
namespace duckdb {
|
16
|
-
|
17
|
-
|
18
|
-
|
17
|
+
|
18
|
+
//! Class to wrap the state machine matrix
|
19
|
+
class StateMachine {
|
20
|
+
public:
|
21
|
+
static constexpr uint32_t NUM_STATES = 9;
|
22
|
+
static constexpr uint32_t NUM_TRANSITIONS = 256;
|
23
|
+
CSVState state_machine[NUM_STATES][NUM_TRANSITIONS];
|
24
|
+
|
25
|
+
const CSVState *operator[](CSVState state) const {
|
26
|
+
return state_machine[static_cast<uint8_t>(state)];
|
27
|
+
}
|
28
|
+
|
29
|
+
CSVState *operator[](CSVState state) {
|
30
|
+
return state_machine[static_cast<uint8_t>(state)];
|
31
|
+
}
|
32
|
+
};
|
19
33
|
|
20
34
|
//! Hash function used in our state machine cache; it hashes and combines all options used to generate a state machine
|
21
35
|
struct HashCSVStateMachineConfig {
|
@@ -36,12 +50,12 @@ public:
|
|
36
50
|
~CSVStateMachineCache() {};
|
37
51
|
//! Gets a state machine from the cache, if it's not from one of the default options
|
38
52
|
//! It first caches it, then returns it.
|
39
|
-
const
|
53
|
+
const StateMachine &Get(const CSVStateMachineOptions &state_machine_options);
|
40
54
|
|
41
55
|
private:
|
42
56
|
void Insert(const CSVStateMachineOptions &state_machine_options);
|
43
57
|
//! Cache on delimiter|quote|escape
|
44
|
-
unordered_map<CSVStateMachineOptions,
|
58
|
+
unordered_map<CSVStateMachineOptions, StateMachine, HashCSVStateMachineConfig> state_machine_cache;
|
45
59
|
//! Default value for options used to initialize CSV State Machine Cache
|
46
60
|
const vector<char> default_delimiter = {',', '|', ';', '\t'};
|
47
61
|
const vector<vector<char>> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}};
|