duckdb 0.5.2-dev2181.0 → 0.5.2-dev2189.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.5.2-dev2181.0",
5
+ "version": "0.5.2-dev2189.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -81381,17 +81381,21 @@ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_ch
81381
81381
 
81382
81382
  namespace duckdb {
81383
81383
 
81384
- CSVBuffer::CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle) : first_buffer(true) {
81385
- buffer = unique_ptr<char[]>(new char[buffer_size_p]);
81386
- actual_size = file_handle.Read(buffer.get(), buffer_size_p);
81384
+ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle)
81385
+ : context(context), first_buffer(true) {
81386
+ this->handle = AllocateBuffer(buffer_size_p);
81387
+
81388
+ auto buffer = Ptr();
81389
+ actual_size = file_handle.Read(buffer, buffer_size_p);
81387
81390
  if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
81388
81391
  start_position += 3;
81389
81392
  }
81390
81393
  last_buffer = file_handle.FinishedReading();
81391
81394
  }
81392
81395
 
81393
- CSVBuffer::CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer)
81394
- : buffer(move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
81396
+ CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
81397
+ bool final_buffer)
81398
+ : context(context), handle(move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
81395
81399
  }
81396
81400
 
81397
81401
  unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buffer_size) {
@@ -81400,14 +81404,18 @@ unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buff
81400
81404
  return nullptr;
81401
81405
  }
81402
81406
 
81403
- auto next_buffer = unique_ptr<char[]>(new char[set_buffer_size]);
81404
-
81405
- idx_t next_buffer_actual_size = file_handle.Read(next_buffer.get(), set_buffer_size);
81407
+ auto next_buffer = AllocateBuffer(set_buffer_size);
81408
+ idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), set_buffer_size);
81406
81409
 
81407
- return make_unique<CSVBuffer>(move(next_buffer), set_buffer_size, next_buffer_actual_size,
81410
+ return make_unique<CSVBuffer>(context, move(next_buffer), set_buffer_size, next_buffer_actual_size,
81408
81411
  file_handle.FinishedReading());
81409
81412
  }
81410
81413
 
81414
+ BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
81415
+ auto &buffer_manager = BufferManager::GetBufferManager(context);
81416
+ return buffer_manager.Allocate(MaxValue<idx_t>(Storage::BLOCK_SIZE, buffer_size));
81417
+ }
81418
+
81411
81419
  idx_t CSVBuffer::GetBufferSize() {
81412
81420
  return actual_size;
81413
81421
  }
@@ -81458,6 +81466,9 @@ static bool ParseBoolean(const Value &value, const string &loption) {
81458
81466
  }
81459
81467
 
81460
81468
  static string ParseString(const Value &value, const string &loption) {
81469
+ if (value.IsNull()) {
81470
+ return string();
81471
+ }
81461
81472
  if (value.type().id() == LogicalTypeId::LIST) {
81462
81473
  auto &children = ListValue::GetChildren(value);
81463
81474
  if (children.size() != 1) {
@@ -81612,6 +81623,11 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
81612
81623
  ignore_errors = ParseBoolean(value, loption);
81613
81624
  } else if (loption == "union_by_name") {
81614
81625
  union_by_name = ParseBoolean(value, loption);
81626
+ } else if (loption == "buffer_size") {
81627
+ buffer_size = ParseInteger(value, loption);
81628
+ if (buffer_size == 0) {
81629
+ throw InvalidInputException("Buffer Size option must be higher than 0");
81630
+ }
81615
81631
  } else {
81616
81632
  throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
81617
81633
  }
@@ -81725,34 +81741,38 @@ struct CSVBufferRead {
81725
81741
 
81726
81742
  const char &operator[](size_t i) const {
81727
81743
  if (i < buffer->GetBufferSize()) {
81728
- return buffer->buffer[i];
81744
+ auto buffer_ptr = buffer->Ptr();
81745
+ return buffer_ptr[i];
81729
81746
  }
81730
- return next_buffer->buffer[i - buffer->GetBufferSize()];
81747
+ auto next_ptr = next_buffer->Ptr();
81748
+ return next_ptr[i - buffer->GetBufferSize()];
81731
81749
  }
81732
81750
 
81733
81751
  string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
81734
81752
  idx_t length = position_buffer - start_buffer - offset;
81735
81753
  // 1) It's all in the current buffer
81736
81754
  if (start_buffer + length <= buffer->GetBufferSize()) {
81737
- auto buffer_ptr = buffer->buffer.get();
81755
+ auto buffer_ptr = buffer->Ptr();
81738
81756
  return string_t(buffer_ptr + start_buffer, length);
81739
81757
  } else if (start_buffer >= buffer->GetBufferSize()) {
81740
81758
  // 2) It's all in the next buffer
81741
81759
  D_ASSERT(next_buffer);
81742
81760
  D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize()));
81743
- auto buffer_ptr = next_buffer->buffer.get();
81761
+ auto buffer_ptr = next_buffer->Ptr();
81744
81762
  return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length);
81745
81763
  } else {
81746
81764
  // 3) It starts in the current buffer and ends in the next buffer
81747
81765
  D_ASSERT(next_buffer);
81748
81766
  auto intersection = unique_ptr<char[]>(new char[length]);
81749
81767
  idx_t cur_pos = 0;
81768
+ auto buffer_ptr = buffer->Ptr();
81750
81769
  for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) {
81751
- intersection[cur_pos++] = buffer->buffer[i];
81770
+ intersection[cur_pos++] = buffer_ptr[i];
81752
81771
  }
81753
81772
  idx_t nxt_buffer_pos = 0;
81773
+ auto next_buffer_ptr = next_buffer->Ptr();
81754
81774
  for (; cur_pos < length; cur_pos++) {
81755
- intersection[cur_pos] = next_buffer->buffer[nxt_buffer_pos++];
81775
+ intersection[cur_pos] = next_buffer_ptr[nxt_buffer_pos++];
81756
81776
  }
81757
81777
  intersections.emplace_back(move(intersection));
81758
81778
  return string_t(intersections.back().get(), length);
@@ -82065,10 +82085,11 @@ normal : {
82065
82085
  /* state: normal parsing state */
82066
82086
  // this state parses the remainder of a non-quoted value until we reach a delimiter or newline
82067
82087
  for (; position_buffer < end_buffer; position_buffer++) {
82068
- if ((*buffer)[position_buffer] == options.delimiter[0]) {
82088
+ auto c = (*buffer)[position_buffer];
82089
+ if (c == options.delimiter[0]) {
82069
82090
  // delimiter: end the value and add it to the chunk
82070
82091
  goto add_value;
82071
- } else if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
82092
+ } else if (StringUtil::CharacterIsNewline(c)) {
82072
82093
  // newline: add row
82073
82094
  D_ASSERT(try_add_line || column == insert_chunk.ColumnCount() - 1);
82074
82095
  goto add_row;
@@ -82138,10 +82159,11 @@ in_quotes:
82138
82159
  has_quotes = true;
82139
82160
  position_buffer++;
82140
82161
  for (; position_buffer < end_buffer; position_buffer++) {
82141
- if ((*buffer)[position_buffer] == options.quote[0]) {
82162
+ auto c = (*buffer)[position_buffer];
82163
+ if (c == options.quote[0]) {
82142
82164
  // quote: move to unquoted state
82143
82165
  goto unquote;
82144
- } else if ((*buffer)[position_buffer] == options.escape[0]) {
82166
+ } else if (c == options.escape[0]) {
82145
82167
  // escape: store the escaped position and move to handle_escape state
82146
82168
  escape_positions.push_back(position_buffer - start_buffer);
82147
82169
  goto handle_escape;
@@ -82163,7 +82185,7 @@ in_quotes:
82163
82185
  goto in_quotes;
82164
82186
  }
82165
82187
 
82166
- unquote:
82188
+ unquote : {
82167
82189
  /* state: unquote: this state handles the state directly after we unquote*/
82168
82190
  //
82169
82191
  // in this state we expect either another quote (entering the quoted state again, and escaping the quote)
@@ -82173,16 +82195,16 @@ unquote:
82173
82195
  offset = 1;
82174
82196
  goto final_state;
82175
82197
  }
82176
- if ((*buffer)[position_buffer] == options.quote[0] &&
82177
- (options.escape.empty() || options.escape[0] == options.quote[0])) {
82198
+ auto c = (*buffer)[position_buffer];
82199
+ if (c == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) {
82178
82200
  // escaped quote, return to quoted state and store escape position
82179
82201
  escape_positions.push_back(position_buffer - start_buffer);
82180
82202
  goto in_quotes;
82181
- } else if ((*buffer)[position_buffer] == options.delimiter[0]) {
82203
+ } else if (c == options.delimiter[0]) {
82182
82204
  // delimiter, add value
82183
82205
  offset = 1;
82184
82206
  goto add_value;
82185
- } else if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
82207
+ } else if (StringUtil::CharacterIsNewline(c)) {
82186
82208
  offset = 1;
82187
82209
  D_ASSERT(column == insert_chunk.ColumnCount() - 1);
82188
82210
  goto add_row;
@@ -82197,6 +82219,7 @@ unquote:
82197
82219
  options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
82198
82220
  return false;
82199
82221
  }
82222
+ }
82200
82223
  handle_escape : {
82201
82224
  /* state: handle_escape */
82202
82225
  // escape should be followed by a quote or another escape character
@@ -125104,7 +125127,7 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
125104
125127
  if (str_1.empty() || str_2.empty()) {
125105
125128
  return;
125106
125129
  }
125107
- if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos) && str_1 != "NULL") {
125130
+ if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos)) {
125108
125131
  throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
125109
125132
  }
125110
125133
  }
@@ -125197,6 +125220,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
125197
125220
  options.force_not_null.resize(expected_types.size(), false);
125198
125221
  }
125199
125222
  bind_data->FinalizeRead(context);
125223
+ if (!bind_data->single_threaded && options.auto_detect) {
125224
+ options.file_path = bind_data->files[0];
125225
+ auto initial_reader = make_unique<BufferedCSVReader>(context, options);
125226
+ options = initial_reader->options;
125227
+ }
125200
125228
  return move(bind_data);
125201
125229
  }
125202
125230
 
@@ -126339,11 +126367,6 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
126339
126367
  options.include_file_name = BooleanValue::Get(kv.second);
126340
126368
  } else if (loption == "hive_partitioning") {
126341
126369
  options.include_parsed_hive_partitions = BooleanValue::Get(kv.second);
126342
- } else if (loption == "buffer_size") {
126343
- options.buffer_size = kv.second.GetValue<uint64_t>();
126344
- if (options.buffer_size == 0) {
126345
- throw InvalidInputException("Buffer Size option must be higher than 0");
126346
- }
126347
126370
  } else {
126348
126371
  options.SetReadOption(loption, kv.second, names);
126349
126372
  }
@@ -126362,7 +126385,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
126362
126385
  } else {
126363
126386
  D_ASSERT(return_types.size() == names.size());
126364
126387
  }
126365
- options = result->options;
126388
+ options = initial_reader->options;
126366
126389
  result->sql_types = initial_reader->sql_types;
126367
126390
  result->initial_reader = move(initial_reader);
126368
126391
  } else {
@@ -126460,8 +126483,9 @@ static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFun
126460
126483
  //===--------------------------------------------------------------------===//
126461
126484
  struct ParallelCSVGlobalState : public GlobalTableFunctionState {
126462
126485
  public:
126463
- ParallelCSVGlobalState(unique_ptr<CSVFileHandle> file_handle_p, vector<string> &files_path_p,
126464
- idx_t system_threads_p, idx_t buffer_size_p, idx_t rows_to_skip)
126486
+ ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
126487
+ vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
126488
+ idx_t rows_to_skip)
126465
126489
  : file_handle(move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p) {
126466
126490
  for (idx_t i = 0; i < rows_to_skip; i++) {
126467
126491
  file_handle->ReadLine();
@@ -126475,7 +126499,7 @@ public:
126475
126499
  } else {
126476
126500
  bytes_per_local_state = file_size / MaxThreads();
126477
126501
  }
126478
- current_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
126502
+ current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
126479
126503
  next_buffer = current_buffer->Next(*file_handle, buffer_size);
126480
126504
  }
126481
126505
  ParallelCSVGlobalState() {
@@ -126562,7 +126586,7 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
126562
126586
  if (file_index < bind_data.files.size()) {
126563
126587
  bind_data.options.file_path = bind_data.files[file_index++];
126564
126588
  file_handle = ReadCSV::OpenCSV(bind_data.options, context);
126565
- next_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
126589
+ next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
126566
126590
  }
126567
126591
  }
126568
126592
  return result;
@@ -126583,8 +126607,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
126583
126607
  file_handle = ReadCSV::OpenCSV(bind_data.options, context);
126584
126608
  }
126585
126609
  idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
126586
- return make_unique<ParallelCSVGlobalState>(move(file_handle), bind_data.files, context.db->NumberOfThreads(),
126587
- bind_data.options.buffer_size, rows_to_skip);
126610
+ return make_unique<ParallelCSVGlobalState>(context, move(file_handle), bind_data.files,
126611
+ context.db->NumberOfThreads(), bind_data.options.buffer_size,
126612
+ rows_to_skip);
126588
126613
  }
126589
126614
 
126590
126615
  //===--------------------------------------------------------------------===//
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
11
11
  #pragma once
12
12
  #define DUCKDB_AMALGAMATION 1
13
13
  #define DUCKDB_AMALGAMATION_EXTENDED 1
14
- #define DUCKDB_SOURCE_ID "fc9fe05841"
15
- #define DUCKDB_VERSION "v0.5.2-dev2181"
14
+ #define DUCKDB_SOURCE_ID "85077a1a4b"
15
+ #define DUCKDB_VERSION "v0.5.2-dev2189"
16
16
  //===----------------------------------------------------------------------===//
17
17
  // DuckDB
18
18
  //
@@ -6367,9 +6367,6 @@ public:
6367
6367
  DUCKDB_API static bool CharacterIsNewline(char c) {
6368
6368
  return c == '\n' || c == '\r';
6369
6369
  }
6370
- DUCKDB_API static bool CharacterIsNullTerminator(char c) {
6371
- return c == '\0';
6372
- }
6373
6370
  DUCKDB_API static bool CharacterIsDigit(char c) {
6374
6371
  return c >= '0' && c <= '9';
6375
6372
  }
@@ -28261,6 +28258,7 @@ private:
28261
28258
  } // namespace duckdb
28262
28259
 
28263
28260
 
28261
+
28264
28262
  namespace duckdb {
28265
28263
 
28266
28264
  class CSVBuffer {
@@ -28269,10 +28267,10 @@ public:
28269
28267
  static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
28270
28268
 
28271
28269
  //! Constructor for Initial Buffer
28272
- CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle);
28270
+ CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle);
28273
28271
 
28274
28272
  //! Constructor for `Next()` Buffers
28275
- CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
28273
+ CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
28276
28274
 
28277
28275
  //! Creates a new buffer with the next part of the CSV File
28278
28276
  unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
@@ -28288,10 +28286,17 @@ public:
28288
28286
 
28289
28287
  //! If this buffer is the first buffer of the CSV File
28290
28288
  bool IsCSVFileFirstBuffer();
28291
- //! The actual buffer
28292
- unique_ptr<char[]> buffer;
28289
+
28290
+ BufferHandle AllocateBuffer(idx_t buffer_size);
28291
+
28292
+ char *Ptr() {
28293
+ return (char *)handle.Ptr();
28294
+ }
28293
28295
 
28294
28296
  private:
28297
+ ClientContext &context;
28298
+
28299
+ BufferHandle handle;
28295
28300
  //! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
28296
28301
  idx_t actual_size;
28297
28302
  //! We need to check for Byte Order Mark, to define the start position of this buffer