duckdb 0.9.1-dev0.0 → 0.9.1-dev43.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.9.1-dev0.0",
5
+ "version": "0.9.1-dev43.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -68,7 +68,7 @@
68
68
  #include "duckdb/execution/index/art/node.hpp"
69
69
  #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
70
70
  #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
71
- #include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
71
+ #include "duckdb/execution/operator/scan/csv/csv_state.hpp"
72
72
  #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
73
73
  #include "duckdb/function/aggregate_state.hpp"
74
74
  #include "duckdb/function/function.hpp"
@@ -8,7 +8,8 @@ namespace duckdb {
8
8
  void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) {
9
9
  auto field = NextField();
10
10
  if (field != field_id) {
11
- throw InternalException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id, field);
11
+ throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id,
12
+ field);
12
13
  }
13
14
  }
14
15
 
@@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() {
34
35
  void BinaryDeserializer::OnObjectEnd() {
35
36
  auto next_field = NextField();
36
37
  if (next_field != MESSAGE_TERMINATOR_FIELD_ID) {
37
- throw InternalException("Failed to deserialize: expected end of object, but found field id: %d", next_field);
38
+ throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d",
39
+ next_field);
38
40
  }
39
41
  nesting_level--;
40
42
  }
@@ -3,8 +3,8 @@
3
3
 
4
4
  namespace duckdb {
5
5
 
6
- void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) {
7
- for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
6
+ void InitializeTransitionArray(CSVState *transition_array, const CSVState state) {
7
+ for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) {
8
8
  transition_array[i] = state;
9
9
  }
10
10
  }
@@ -13,72 +13,63 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
13
13
  D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
14
14
  // Initialize transition array with default values to the Standard option
15
15
  auto &transition_array = state_machine_cache[state_machine_options];
16
- const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
17
- const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
18
- const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
19
- const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
20
- const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
21
- const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
22
- const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
23
- const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
24
- const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
25
16
 
26
- for (uint32_t i = 0; i < NUM_STATES; i++) {
27
- switch (i) {
28
- case quoted_state:
29
- InitializeTransitionArray(transition_array[i], quoted_state);
17
+ for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) {
18
+ CSVState cur_state = CSVState(i);
19
+ switch (cur_state) {
20
+ case CSVState::QUOTED:
21
+ InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED);
30
22
  break;
31
- case unquoted_state:
32
- case invalid_state:
33
- case escape_state:
34
- InitializeTransitionArray(transition_array[i], invalid_state);
23
+ case CSVState::UNQUOTED:
24
+ case CSVState::INVALID:
25
+ case CSVState::ESCAPE:
26
+ InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID);
35
27
  break;
36
28
  default:
37
- InitializeTransitionArray(transition_array[i], standard_state);
29
+ InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD);
38
30
  break;
39
31
  }
40
32
  }
41
33
 
42
34
  // Now set values depending on configuration
43
35
  // 1) Standard State
44
- transition_array[standard_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
45
- transition_array[standard_state][static_cast<uint8_t>('\n')] = record_separator_state;
46
- transition_array[standard_state][static_cast<uint8_t>('\r')] = carriage_return_state;
47
- transition_array[standard_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
36
+ transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
37
+ transition_array[CSVState::STANDARD][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
38
+ transition_array[CSVState::STANDARD][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
39
+ transition_array[CSVState::STANDARD][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
48
40
  // 2) Field Separator State
49
- transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
50
- field_separator_state;
51
- transition_array[field_separator_state][static_cast<uint8_t>('\n')] = record_separator_state;
52
- transition_array[field_separator_state][static_cast<uint8_t>('\r')] = carriage_return_state;
53
- transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
41
+ transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
42
+ transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
43
+ transition_array[CSVState::DELIMITER][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
44
+ transition_array[CSVState::DELIMITER][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
54
45
  // 3) Record Separator State
55
- transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
56
- field_separator_state;
57
- transition_array[record_separator_state][static_cast<uint8_t>('\n')] = empty_line_state;
58
- transition_array[record_separator_state][static_cast<uint8_t>('\r')] = empty_line_state;
59
- transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
46
+ transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.delimiter)] =
47
+ CSVState::DELIMITER;
48
+ transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
49
+ transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
50
+ transition_array[CSVState::RECORD_SEPARATOR][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
60
51
  // 4) Carriage Return State
61
- transition_array[carriage_return_state][static_cast<uint8_t>('\n')] = record_separator_state;
62
- transition_array[carriage_return_state][static_cast<uint8_t>('\r')] = empty_line_state;
63
- transition_array[carriage_return_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
52
+ transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
53
+ transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
54
+ transition_array[CSVState::CARRIAGE_RETURN][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
64
55
  // 5) Quoted State
65
- transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.quote)] = unquoted_state;
56
+ transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::UNQUOTED;
66
57
  if (state_machine_options.quote != state_machine_options.escape) {
67
- transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
58
+ transition_array[CSVState::QUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::ESCAPE;
68
59
  }
69
60
  // 6) Unquoted State
70
- transition_array[unquoted_state][static_cast<uint8_t>('\n')] = record_separator_state;
71
- transition_array[unquoted_state][static_cast<uint8_t>('\r')] = carriage_return_state;
72
- transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
61
+ transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\n')] = CSVState::RECORD_SEPARATOR;
62
+ transition_array[CSVState::UNQUOTED][static_cast<uint8_t>('\r')] = CSVState::CARRIAGE_RETURN;
63
+ transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
73
64
  if (state_machine_options.quote == state_machine_options.escape) {
74
- transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
65
+ transition_array[CSVState::UNQUOTED][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
75
66
  }
76
67
  // 7) Escaped State
77
- transition_array[escape_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
78
- transition_array[escape_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
68
+ transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
69
+ transition_array[CSVState::ESCAPE][static_cast<uint8_t>(state_machine_options.escape)] = CSVState::QUOTED;
79
70
  // 8) Empty Line State
80
- transition_array[empty_line_state][static_cast<uint8_t>('\r')] = empty_line_state;
81
- transition_array[empty_line_state][static_cast<uint8_t>('\n')] = empty_line_state;
71
+ transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
72
+ transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
82
73
  }
83
74
 
84
75
  CSVStateMachineCache::CSVStateMachineCache() {
@@ -95,7 +86,7 @@ CSVStateMachineCache::CSVStateMachineCache() {
95
86
  }
96
87
  }
97
88
 
98
- const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
89
+ const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
99
90
  //! Custom State Machine, we need to create it and cache it first
100
91
  if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
101
92
  Insert(state_machine_options);
@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
49
49
  return (carry && carry_followed_by_nl) || (!carry && first_char);
50
50
  }
51
51
 
52
- void ParallelCSVReader::SkipEmptyLines() {
52
+ bool ParallelCSVReader::SkipEmptyLines() {
53
+ const idx_t initial_position_buffer = position_buffer;
53
54
  idx_t new_pos_buffer = position_buffer;
54
55
  if (parse_chunk.data.size() == 1) {
55
56
  // Empty lines are null data.
56
- return;
57
+ return initial_position_buffer != position_buffer;
57
58
  }
58
59
  for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
59
60
  if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
63
64
  position_buffer++;
64
65
  }
65
66
  if (new_pos_buffer > end_buffer) {
66
- return;
67
+ return initial_position_buffer != position_buffer;
67
68
  }
68
69
  position_buffer = new_pos_buffer;
69
70
  } else if ((*buffer)[new_pos_buffer] != ' ') {
70
- return;
71
+ return initial_position_buffer != position_buffer;
71
72
  }
72
73
  }
74
+ return initial_position_buffer != position_buffer;
73
75
  }
74
76
 
75
77
  bool ParallelCSVReader::SetPosition() {
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
185
187
  }
186
188
  // Ensure that parse_chunk has no gunk when trying to figure new line
187
189
  parse_chunk.Reset();
188
-
189
190
  verification_positions.end_of_last_line = position_buffer;
190
191
  finished = false;
191
192
  return successfully_read_first_line;
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
288
289
  idx_t column = 0;
289
290
  idx_t offset = 0;
290
291
  bool has_quotes = false;
291
-
292
+ bool last_line_empty = false;
292
293
  vector<idx_t> escape_positions;
293
294
  if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
294
295
  // First time reading this buffer piece
@@ -454,7 +455,10 @@ add_row : {
454
455
  if (!BufferRemainder()) {
455
456
  goto final_state;
456
457
  }
457
- SkipEmptyLines();
458
+ if (SkipEmptyLines() && reached_remainder_state) {
459
+ last_line_empty = true;
460
+ goto final_state;
461
+ }
458
462
  if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
459
463
  error_message = "Line does not fit in one buffer. Increase the buffer size.";
460
464
  return false;
@@ -583,8 +587,8 @@ final_state : {
583
587
  return true;
584
588
  }
585
589
  // If this is the last buffer, we have to read the last value
586
- if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
587
- (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
590
+ if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
591
+ (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
588
592
  if (column > 0 || start_buffer != position_buffer || try_add_line ||
589
593
  (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
590
594
  // remaining values to be added to the chunk
@@ -21,17 +21,12 @@ struct SniffDialect {
21
21
  sniffed_column_counts.clear();
22
22
  return true;
23
23
  }
24
- machine.pre_previous_state = machine.previous_state;
25
- machine.previous_state = machine.state;
26
-
27
- machine.state = static_cast<CSVState>(
28
- machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
24
+ machine.Transition(current_char);
29
25
 
30
26
  bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
31
27
  machine.column_count += machine.previous_state == CSVState::DELIMITER;
32
28
  sniffed_column_counts[machine.cur_rows] = machine.column_count;
33
- machine.cur_rows +=
34
- machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
29
+ machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
35
30
  machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
36
31
 
37
32
  // It means our carriage return is actually a record separator
@@ -97,9 +97,14 @@ void CSVSniffer::DetectHeader() {
97
97
  bool first_row_consistent = true;
98
98
  // check if header row is all null and/or consistent with detected column data types
99
99
  bool first_row_nulls = true;
100
- // This case will fail in dialect detection, so we assert here just for sanity
101
- D_ASSERT(best_candidate->options.null_padding ||
102
- best_sql_types_candidates_per_column_idx.size() == best_header_row.size());
100
+ // If null-padding is not allowed and there is a mismatch between our header candidate and the number of columns
101
+ // We can't detect the dialect/type options properly
102
+ if (!best_candidate->options.null_padding &&
103
+ best_sql_types_candidates_per_column_idx.size() != best_header_row.size()) {
104
+ throw InvalidInputException(
105
+ "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.",
106
+ options.file_path);
107
+ }
103
108
  for (idx_t col = 0; col < best_header_row.size(); col++) {
104
109
  auto dummy_val = best_header_row[col];
105
110
  if (!dummy_val.IsNull()) {
@@ -143,20 +143,17 @@ struct SniffValue {
143
143
  machine.rows_read++;
144
144
  }
145
145
 
146
- if ((machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
146
+ if ((machine.previous_state == CSVState::RECORD_SEPARATOR) ||
147
147
  (machine.state != CSVState::RECORD_SEPARATOR && machine.previous_state == CSVState::CARRIAGE_RETURN)) {
148
148
  sniffed_values[machine.cur_rows].position = machine.line_start_pos;
149
149
  sniffed_values[machine.cur_rows].set = true;
150
150
  machine.line_start_pos = current_pos;
151
151
  }
152
- machine.pre_previous_state = machine.previous_state;
153
- machine.previous_state = machine.state;
154
- machine.state = static_cast<CSVState>(
155
- machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
152
+
153
+ machine.Transition(current_char);
156
154
 
157
155
  bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
158
- if (machine.previous_state == CSVState::DELIMITER ||
159
- (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
156
+ if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
160
157
  (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
161
158
  // Started a new value
162
159
  // Check if it's UTF-8
@@ -175,8 +172,7 @@ struct SniffValue {
175
172
  (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
176
173
  machine.value += current_char;
177
174
  }
178
- machine.cur_rows +=
179
- machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
175
+ machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
180
176
  // It means our carriage return is actually a record separator
181
177
  machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
182
178
  if (machine.cur_rows >= sniffed_values.size()) {
@@ -3,9 +3,9 @@
3
3
  namespace duckdb {
4
4
  struct Parse {
5
5
  inline static void Initialize(CSVStateMachine &machine) {
6
- machine.state = CSVState::STANDARD;
7
- machine.previous_state = CSVState::STANDARD;
8
- machine.pre_previous_state = CSVState::STANDARD;
6
+ machine.state = CSVState::EMPTY_LINE;
7
+ machine.previous_state = CSVState::EMPTY_LINE;
8
+ machine.pre_previous_state = CSVState::EMPTY_LINE;
9
9
 
10
10
  machine.cur_rows = 0;
11
11
  machine.column_count = 0;
@@ -14,22 +14,18 @@ struct Parse {
14
14
 
15
15
  inline static bool Process(CSVStateMachine &machine, DataChunk &parse_chunk, char current_char, idx_t current_pos) {
16
16
 
17
- machine.pre_previous_state = machine.previous_state;
18
- machine.previous_state = machine.state;
19
- machine.state = static_cast<CSVState>(
20
- machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
17
+ machine.Transition(current_char);
21
18
 
22
19
  bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
23
- if (machine.previous_state == CSVState::DELIMITER ||
24
- (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
20
+ if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
25
21
  (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
26
22
  // Started a new value
27
23
  // Check if it's UTF-8 (Or not?)
28
24
  machine.VerifyUTF8();
29
25
  auto &v = parse_chunk.data[machine.column_count++];
30
26
  auto parse_data = FlatVector::GetData<string_t>(v);
31
- auto &validity_mask = FlatVector::Validity(v);
32
27
  if (machine.value.empty()) {
28
+ auto &validity_mask = FlatVector::Validity(v);
33
29
  validity_mask.SetInvalid(machine.cur_rows);
34
30
  } else {
35
31
  parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
@@ -50,12 +46,11 @@ struct Parse {
50
46
  (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
51
47
  machine.value += current_char;
52
48
  }
53
- machine.cur_rows +=
54
- machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
49
+ machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR && machine.column_count > 0;
55
50
  machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR);
56
51
 
57
52
  // It means our carriage return is actually a record separator
58
- machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
53
+ machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return && machine.column_count > 0;
59
54
  machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
60
55
 
61
56
  if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
@@ -261,7 +261,7 @@ idx_t RadixHTConfig::ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p) {
261
261
  idx_t RadixHTConfig::SinkCapacity(ClientContext &context) {
262
262
  // Get active and maximum number of threads
263
263
  const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
264
- const auto max_threads = DBConfig::GetSystemMaxThreads(FileSystem::GetFileSystem(context));
264
+ const auto max_threads = DBConfig::GetConfig(context).options.maximum_threads;
265
265
 
266
266
  // Compute cache size per active thread (assuming cache is shared)
267
267
  const auto total_shared_cache_size = max_threads * L3_CACHE_SIZE;
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.9.0"
2
+ #define DUCKDB_VERSION "v0.9.1-dev43"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "0d84ccf478"
5
+ #define DUCKDB_SOURCE_ID "709ea292d3"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -1,6 +1,7 @@
1
1
  #pragma once
2
2
  #include <type_traits>
3
3
  #include <cstdint>
4
+ #include <atomic>
4
5
 
5
6
  #include "duckdb/common/vector.hpp"
6
7
  #include "duckdb/common/unordered_map.hpp"
@@ -0,0 +1,28 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/execution/operator/scan/csv/csv_state.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ #include <cstdint>
12
+
13
+ namespace duckdb {
14
+
15
+ //! All States of CSV Parsing
16
+ enum class CSVState : uint8_t {
17
+ STANDARD = 0, //! Regular unquoted field state
18
+ DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
19
+ RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
20
+ CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
21
+ QUOTED = 4, //! State when inside a quoted field
22
+ UNQUOTED = 5, //! State when leaving a quoted field
23
+ ESCAPE = 6, //! State when encountering an escape character (e.g., \)
24
+ EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
25
+ INVALID = 8 //! Got to an Invalid State, this should error.
26
+ };
27
+
28
+ } // namespace duckdb
@@ -14,19 +14,6 @@
14
14
 
15
15
  namespace duckdb {
16
16
 
17
- //! All States of CSV Parsing
18
- enum class CSVState : uint8_t {
19
- STANDARD = 0, //! Regular unquoted field state
20
- DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
21
- RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
22
- CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
23
- QUOTED = 4, //! State when inside a quoted field
24
- UNQUOTED = 5, //! State when leaving a quoted field
25
- ESCAPE = 6, //! State when encountering an escape character (e.g., \)
26
- EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
27
- INVALID = 8 //! Got to an Invalid State, this should error.
28
- };
29
-
30
17
  //! The CSV State Machine comprises a state transition array (STA).
31
18
  //! The STA indicates the current state of parsing based on both the current and preceding characters.
32
19
  //! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth.
@@ -38,6 +25,14 @@ public:
38
25
  explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
39
26
  shared_ptr<CSVBufferManager> buffer_manager_p,
40
27
  CSVStateMachineCache &csv_state_machine_cache_p);
28
+
29
+ //! Transition all states to next state, that depends on the current char
30
+ inline void Transition(char current_char) {
31
+ pre_previous_state = previous_state;
32
+ previous_state = state;
33
+ state = transition_array[state][static_cast<uint8_t>(current_char)];
34
+ }
35
+
41
36
  //! Resets the state machine, so it can be used again
42
37
  void Reset();
43
38
 
@@ -52,7 +47,7 @@ public:
52
47
  idx_t start_row = 0;
53
48
  //! The Transition Array is a Finite State Machine
54
49
  //! It holds the transitions of all states, on all 256 possible different characters
55
- const state_machine_t &transition_array;
50
+ const StateMachine &transition_array;
56
51
 
57
52
  //! Both these variables are used for new line identifier detection
58
53
  bool single_record_separator = false;
@@ -8,14 +8,28 @@
8
8
 
9
9
  #pragma once
10
10
 
11
- #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
11
+ #include "duckdb/execution/operator/scan/csv/csv_state.hpp"
12
12
  #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
13
+ #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
13
14
  #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
14
15
 
15
16
  namespace duckdb {
16
- static constexpr uint32_t NUM_STATES = 9;
17
- static constexpr uint32_t NUM_TRANSITIONS = 256;
18
- typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS];
17
+
18
+ //! Class to wrap the state machine matrix
19
+ class StateMachine {
20
+ public:
21
+ static constexpr uint32_t NUM_STATES = 9;
22
+ static constexpr uint32_t NUM_TRANSITIONS = 256;
23
+ CSVState state_machine[NUM_STATES][NUM_TRANSITIONS];
24
+
25
+ const CSVState *operator[](CSVState state) const {
26
+ return state_machine[static_cast<uint8_t>(state)];
27
+ }
28
+
29
+ CSVState *operator[](CSVState state) {
30
+ return state_machine[static_cast<uint8_t>(state)];
31
+ }
32
+ };
19
33
 
20
34
  //! Hash function used in our state machine cache, it hashes and combines all options used to generate a state machine
21
35
  struct HashCSVStateMachineConfig {
@@ -36,12 +50,12 @@ public:
36
50
  ~CSVStateMachineCache() {};
37
51
  //! Gets a state machine from the cache, if it's not from one of the default options
38
52
  //! It first caches it, then returns it.
39
- const state_machine_t &Get(const CSVStateMachineOptions &state_machine_options);
53
+ const StateMachine &Get(const CSVStateMachineOptions &state_machine_options);
40
54
 
41
55
  private:
42
56
  void Insert(const CSVStateMachineOptions &state_machine_options);
43
57
  //! Cache on delimiter|quote|escape
44
- unordered_map<CSVStateMachineOptions, state_machine_t, HashCSVStateMachineConfig> state_machine_cache;
58
+ unordered_map<CSVStateMachineOptions, StateMachine, HashCSVStateMachineConfig> state_machine_cache;
45
59
  //! Default value for options used to initialize CSV State Machine Cache
46
60
  const vector<char> default_delimiter = {',', '|', ';', '\t'};
47
61
  const vector<vector<char>> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}};
@@ -148,7 +148,7 @@ private:
148
148
  //! Sets Position depending on the byte_start of this thread
149
149
  bool SetPosition();
150
150
  //! Called when scanning the 1st buffer, skips empty lines
151
- void SkipEmptyLines();
151
+ bool SkipEmptyLines();
152
152
  //! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
153
153
  //! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
154
154
  //! when changing the buffer end the first time.