duckdb 0.9.1-dev19.0 → 0.9.1-dev67.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.9.1-dev19.0",
5
+ "version": "0.9.1-dev67.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl,
49
49
  return (carry && carry_followed_by_nl) || (!carry && first_char);
50
50
  }
51
51
 
52
- void ParallelCSVReader::SkipEmptyLines() {
52
+ bool ParallelCSVReader::SkipEmptyLines() {
53
+ const idx_t initial_position_buffer = position_buffer;
53
54
  idx_t new_pos_buffer = position_buffer;
54
55
  if (parse_chunk.data.size() == 1) {
55
56
  // Empty lines are null data.
56
- return;
57
+ return initial_position_buffer != position_buffer;
57
58
  }
58
59
  for (; new_pos_buffer < end_buffer; new_pos_buffer++) {
59
60
  if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) {
@@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() {
63
64
  position_buffer++;
64
65
  }
65
66
  if (new_pos_buffer > end_buffer) {
66
- return;
67
+ return initial_position_buffer != position_buffer;
67
68
  }
68
69
  position_buffer = new_pos_buffer;
69
70
  } else if ((*buffer)[new_pos_buffer] != ' ') {
70
- return;
71
+ return initial_position_buffer != position_buffer;
71
72
  }
72
73
  }
74
+ return initial_position_buffer != position_buffer;
73
75
  }
74
76
 
75
77
  bool ParallelCSVReader::SetPosition() {
@@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() {
185
187
  }
186
188
  // Ensure that parse_chunk has no gunk when trying to figure new line
187
189
  parse_chunk.Reset();
188
-
189
190
  verification_positions.end_of_last_line = position_buffer;
190
191
  finished = false;
191
192
  return successfully_read_first_line;
@@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
288
289
  idx_t column = 0;
289
290
  idx_t offset = 0;
290
291
  bool has_quotes = false;
291
-
292
+ bool last_line_empty = false;
292
293
  vector<idx_t> escape_positions;
293
294
  if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) {
294
295
  // First time reading this buffer piece
@@ -454,7 +455,10 @@ add_row : {
454
455
  if (!BufferRemainder()) {
455
456
  goto final_state;
456
457
  }
457
- SkipEmptyLines();
458
+ if (SkipEmptyLines() && reached_remainder_state) {
459
+ last_line_empty = true;
460
+ goto final_state;
461
+ }
458
462
  if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
459
463
  error_message = "Line does not fit in one buffer. Increase the buffer size.";
460
464
  return false;
@@ -583,8 +587,8 @@ final_state : {
583
587
  return true;
584
588
  }
585
589
  // If this is the last buffer, we have to read the last value
586
- if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
587
- (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
590
+ if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
591
+ (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) {
588
592
  if (column > 0 || start_buffer != position_buffer || try_add_line ||
589
593
  (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
590
594
  // remaining values to be added to the chunk
@@ -26,8 +26,7 @@ struct SniffDialect {
26
26
  bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
27
27
  machine.column_count += machine.previous_state == CSVState::DELIMITER;
28
28
  sniffed_column_counts[machine.cur_rows] = machine.column_count;
29
- machine.cur_rows +=
30
- machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
29
+ machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
31
30
  machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
32
31
 
33
32
  // It means our carriage return is actually a record separator
@@ -143,7 +143,7 @@ struct SniffValue {
143
143
  machine.rows_read++;
144
144
  }
145
145
 
146
- if ((machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
146
+ if ((machine.previous_state == CSVState::RECORD_SEPARATOR) ||
147
147
  (machine.state != CSVState::RECORD_SEPARATOR && machine.previous_state == CSVState::CARRIAGE_RETURN)) {
148
148
  sniffed_values[machine.cur_rows].position = machine.line_start_pos;
149
149
  sniffed_values[machine.cur_rows].set = true;
@@ -153,8 +153,7 @@ struct SniffValue {
153
153
  machine.Transition(current_char);
154
154
 
155
155
  bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
156
- if (machine.previous_state == CSVState::DELIMITER ||
157
- (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
156
+ if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
158
157
  (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
159
158
  // Started a new value
160
159
  // Check if it's UTF-8
@@ -173,8 +172,7 @@ struct SniffValue {
173
172
  (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
174
173
  machine.value += current_char;
175
174
  }
176
- machine.cur_rows +=
177
- machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
175
+ machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR;
178
176
  // It means our carriage return is actually a record separator
179
177
  machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
180
178
  if (machine.cur_rows >= sniffed_values.size()) {
@@ -3,9 +3,9 @@
3
3
  namespace duckdb {
4
4
  struct Parse {
5
5
  inline static void Initialize(CSVStateMachine &machine) {
6
- machine.state = CSVState::STANDARD;
7
- machine.previous_state = CSVState::STANDARD;
8
- machine.pre_previous_state = CSVState::STANDARD;
6
+ machine.state = CSVState::EMPTY_LINE;
7
+ machine.previous_state = CSVState::EMPTY_LINE;
8
+ machine.pre_previous_state = CSVState::EMPTY_LINE;
9
9
 
10
10
  machine.cur_rows = 0;
11
11
  machine.column_count = 0;
@@ -17,16 +17,15 @@ struct Parse {
17
17
  machine.Transition(current_char);
18
18
 
19
19
  bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
20
- if (machine.previous_state == CSVState::DELIMITER ||
21
- (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
20
+ if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) ||
22
21
  (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
23
22
  // Started a new value
24
23
  // Check if it's UTF-8 (Or not?)
25
24
  machine.VerifyUTF8();
26
25
  auto &v = parse_chunk.data[machine.column_count++];
27
26
  auto parse_data = FlatVector::GetData<string_t>(v);
28
- auto &validity_mask = FlatVector::Validity(v);
29
27
  if (machine.value.empty()) {
28
+ auto &validity_mask = FlatVector::Validity(v);
30
29
  validity_mask.SetInvalid(machine.cur_rows);
31
30
  } else {
32
31
  parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
@@ -47,12 +46,11 @@ struct Parse {
47
46
  (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
48
47
  machine.value += current_char;
49
48
  }
50
- machine.cur_rows +=
51
- machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
49
+ machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR && machine.column_count > 0;
52
50
  machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR);
53
51
 
54
52
  // It means our carriage return is actually a record separator
55
- machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
53
+ machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return && machine.column_count > 0;
56
54
  machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
57
55
 
58
56
  if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
@@ -38,7 +38,7 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
38
38
  auto number_of_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
39
39
  //! If we have many csv files, we run single-threaded on each file and parallelize on the number of files
40
40
  bool many_csv_files = files.size() > 1 && int64_t(files.size() * 2) >= number_of_threads;
41
- if (options.parallel_mode != ParallelMode::PARALLEL && many_csv_files) {
41
+ if (options.parallel_mode != ParallelMode::PARALLEL && (many_csv_files || number_of_threads == 1)) {
42
42
  single_threaded = true;
43
43
  }
44
44
  if (options.parallel_mode == ParallelMode::SINGLE_THREADED || not_supported_options ||
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.9.1-dev19"
2
+ #define DUCKDB_VERSION "v0.9.1-dev67"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "1ea87567af"
5
+ #define DUCKDB_SOURCE_ID "7512d7ff4f"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -148,7 +148,7 @@ private:
148
148
  //! Sets Position depending on the byte_start of this thread
149
149
  bool SetPosition();
150
150
  //! Called when scanning the 1st buffer, skips empty lines
151
- void SkipEmptyLines();
151
+ bool SkipEmptyLines();
152
152
  //! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
153
153
  //! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
154
154
  //! when changing the buffer end the first time.
@@ -38,8 +38,8 @@ void CommonAggregateOptimizer::ExtractCommonAggregates(LogicalAggregate &aggr) {
38
38
  // aggregate does not exist yet: add it to the map
39
39
  aggregate_remap[*aggr.expressions[i]] = i;
40
40
  if (i != original_index) {
41
- // this aggregate is not erased, however an agregate BEFORE it has been erased
42
- // so we need to remap this aggregaet
41
+ // this aggregate is not erased, however an aggregate BEFORE it has been erased
42
+ // so we need to remap this aggregate
43
43
  ColumnBinding original_binding(aggr.aggregate_index, original_index);
44
44
  ColumnBinding new_binding(aggr.aggregate_index, i);
45
45
  aggregate_map[original_binding] = new_binding;