duckdb 0.5.2-dev2181.0 → 0.5.2-dev2196.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +62 -37
- package/src/duckdb.hpp +14 -9
- package/src/parquet-amalgamation.cpp +37741 -37741
package/package.json
CHANGED
package/src/duckdb.cpp
CHANGED
|
@@ -81381,17 +81381,21 @@ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_ch
|
|
|
81381
81381
|
|
|
81382
81382
|
namespace duckdb {
|
|
81383
81383
|
|
|
81384
|
-
CSVBuffer::CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle)
|
|
81385
|
-
|
|
81386
|
-
|
|
81384
|
+
CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle)
|
|
81385
|
+
: context(context), first_buffer(true) {
|
|
81386
|
+
this->handle = AllocateBuffer(buffer_size_p);
|
|
81387
|
+
|
|
81388
|
+
auto buffer = Ptr();
|
|
81389
|
+
actual_size = file_handle.Read(buffer, buffer_size_p);
|
|
81387
81390
|
if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
|
|
81388
81391
|
start_position += 3;
|
|
81389
81392
|
}
|
|
81390
81393
|
last_buffer = file_handle.FinishedReading();
|
|
81391
81394
|
}
|
|
81392
81395
|
|
|
81393
|
-
CSVBuffer::CSVBuffer(
|
|
81394
|
-
|
|
81396
|
+
CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
|
|
81397
|
+
bool final_buffer)
|
|
81398
|
+
: context(context), handle(move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer) {
|
|
81395
81399
|
}
|
|
81396
81400
|
|
|
81397
81401
|
unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buffer_size) {
|
|
@@ -81400,14 +81404,18 @@ unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t set_buff
|
|
|
81400
81404
|
return nullptr;
|
|
81401
81405
|
}
|
|
81402
81406
|
|
|
81403
|
-
auto next_buffer =
|
|
81404
|
-
|
|
81405
|
-
idx_t next_buffer_actual_size = file_handle.Read(next_buffer.get(), set_buffer_size);
|
|
81407
|
+
auto next_buffer = AllocateBuffer(set_buffer_size);
|
|
81408
|
+
idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), set_buffer_size);
|
|
81406
81409
|
|
|
81407
|
-
return make_unique<CSVBuffer>(move(next_buffer), set_buffer_size, next_buffer_actual_size,
|
|
81410
|
+
return make_unique<CSVBuffer>(context, move(next_buffer), set_buffer_size, next_buffer_actual_size,
|
|
81408
81411
|
file_handle.FinishedReading());
|
|
81409
81412
|
}
|
|
81410
81413
|
|
|
81414
|
+
BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
|
|
81415
|
+
auto &buffer_manager = BufferManager::GetBufferManager(context);
|
|
81416
|
+
return buffer_manager.Allocate(MaxValue<idx_t>(Storage::BLOCK_SIZE, buffer_size));
|
|
81417
|
+
}
|
|
81418
|
+
|
|
81411
81419
|
idx_t CSVBuffer::GetBufferSize() {
|
|
81412
81420
|
return actual_size;
|
|
81413
81421
|
}
|
|
@@ -81458,6 +81466,9 @@ static bool ParseBoolean(const Value &value, const string &loption) {
|
|
|
81458
81466
|
}
|
|
81459
81467
|
|
|
81460
81468
|
static string ParseString(const Value &value, const string &loption) {
|
|
81469
|
+
if (value.IsNull()) {
|
|
81470
|
+
return string();
|
|
81471
|
+
}
|
|
81461
81472
|
if (value.type().id() == LogicalTypeId::LIST) {
|
|
81462
81473
|
auto &children = ListValue::GetChildren(value);
|
|
81463
81474
|
if (children.size() != 1) {
|
|
@@ -81612,6 +81623,11 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
|
|
|
81612
81623
|
ignore_errors = ParseBoolean(value, loption);
|
|
81613
81624
|
} else if (loption == "union_by_name") {
|
|
81614
81625
|
union_by_name = ParseBoolean(value, loption);
|
|
81626
|
+
} else if (loption == "buffer_size") {
|
|
81627
|
+
buffer_size = ParseInteger(value, loption);
|
|
81628
|
+
if (buffer_size == 0) {
|
|
81629
|
+
throw InvalidInputException("Buffer Size option must be higher than 0");
|
|
81630
|
+
}
|
|
81615
81631
|
} else {
|
|
81616
81632
|
throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
|
|
81617
81633
|
}
|
|
@@ -81725,34 +81741,38 @@ struct CSVBufferRead {
|
|
|
81725
81741
|
|
|
81726
81742
|
const char &operator[](size_t i) const {
|
|
81727
81743
|
if (i < buffer->GetBufferSize()) {
|
|
81728
|
-
|
|
81744
|
+
auto buffer_ptr = buffer->Ptr();
|
|
81745
|
+
return buffer_ptr[i];
|
|
81729
81746
|
}
|
|
81730
|
-
|
|
81747
|
+
auto next_ptr = next_buffer->Ptr();
|
|
81748
|
+
return next_ptr[i - buffer->GetBufferSize()];
|
|
81731
81749
|
}
|
|
81732
81750
|
|
|
81733
81751
|
string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
|
|
81734
81752
|
idx_t length = position_buffer - start_buffer - offset;
|
|
81735
81753
|
// 1) It's all in the current buffer
|
|
81736
81754
|
if (start_buffer + length <= buffer->GetBufferSize()) {
|
|
81737
|
-
auto buffer_ptr = buffer->
|
|
81755
|
+
auto buffer_ptr = buffer->Ptr();
|
|
81738
81756
|
return string_t(buffer_ptr + start_buffer, length);
|
|
81739
81757
|
} else if (start_buffer >= buffer->GetBufferSize()) {
|
|
81740
81758
|
// 2) It's all in the next buffer
|
|
81741
81759
|
D_ASSERT(next_buffer);
|
|
81742
81760
|
D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize()));
|
|
81743
|
-
auto buffer_ptr = next_buffer->
|
|
81761
|
+
auto buffer_ptr = next_buffer->Ptr();
|
|
81744
81762
|
return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length);
|
|
81745
81763
|
} else {
|
|
81746
81764
|
// 3) It starts in the current buffer and ends in the next buffer
|
|
81747
81765
|
D_ASSERT(next_buffer);
|
|
81748
81766
|
auto intersection = unique_ptr<char[]>(new char[length]);
|
|
81749
81767
|
idx_t cur_pos = 0;
|
|
81768
|
+
auto buffer_ptr = buffer->Ptr();
|
|
81750
81769
|
for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) {
|
|
81751
|
-
intersection[cur_pos++] =
|
|
81770
|
+
intersection[cur_pos++] = buffer_ptr[i];
|
|
81752
81771
|
}
|
|
81753
81772
|
idx_t nxt_buffer_pos = 0;
|
|
81773
|
+
auto next_buffer_ptr = next_buffer->Ptr();
|
|
81754
81774
|
for (; cur_pos < length; cur_pos++) {
|
|
81755
|
-
intersection[cur_pos] =
|
|
81775
|
+
intersection[cur_pos] = next_buffer_ptr[nxt_buffer_pos++];
|
|
81756
81776
|
}
|
|
81757
81777
|
intersections.emplace_back(move(intersection));
|
|
81758
81778
|
return string_t(intersections.back().get(), length);
|
|
@@ -82065,10 +82085,11 @@ normal : {
|
|
|
82065
82085
|
/* state: normal parsing state */
|
|
82066
82086
|
// this state parses the remainder of a non-quoted value until we reach a delimiter or newline
|
|
82067
82087
|
for (; position_buffer < end_buffer; position_buffer++) {
|
|
82068
|
-
|
|
82088
|
+
auto c = (*buffer)[position_buffer];
|
|
82089
|
+
if (c == options.delimiter[0]) {
|
|
82069
82090
|
// delimiter: end the value and add it to the chunk
|
|
82070
82091
|
goto add_value;
|
|
82071
|
-
} else if (StringUtil::CharacterIsNewline(
|
|
82092
|
+
} else if (StringUtil::CharacterIsNewline(c)) {
|
|
82072
82093
|
// newline: add row
|
|
82073
82094
|
D_ASSERT(try_add_line || column == insert_chunk.ColumnCount() - 1);
|
|
82074
82095
|
goto add_row;
|
|
@@ -82138,10 +82159,11 @@ in_quotes:
|
|
|
82138
82159
|
has_quotes = true;
|
|
82139
82160
|
position_buffer++;
|
|
82140
82161
|
for (; position_buffer < end_buffer; position_buffer++) {
|
|
82141
|
-
|
|
82162
|
+
auto c = (*buffer)[position_buffer];
|
|
82163
|
+
if (c == options.quote[0]) {
|
|
82142
82164
|
// quote: move to unquoted state
|
|
82143
82165
|
goto unquote;
|
|
82144
|
-
} else if (
|
|
82166
|
+
} else if (c == options.escape[0]) {
|
|
82145
82167
|
// escape: store the escaped position and move to handle_escape state
|
|
82146
82168
|
escape_positions.push_back(position_buffer - start_buffer);
|
|
82147
82169
|
goto handle_escape;
|
|
@@ -82163,7 +82185,7 @@ in_quotes:
|
|
|
82163
82185
|
goto in_quotes;
|
|
82164
82186
|
}
|
|
82165
82187
|
|
|
82166
|
-
unquote:
|
|
82188
|
+
unquote : {
|
|
82167
82189
|
/* state: unquote: this state handles the state directly after we unquote*/
|
|
82168
82190
|
//
|
|
82169
82191
|
// in this state we expect either another quote (entering the quoted state again, and escaping the quote)
|
|
@@ -82173,16 +82195,16 @@ unquote:
|
|
|
82173
82195
|
offset = 1;
|
|
82174
82196
|
goto final_state;
|
|
82175
82197
|
}
|
|
82176
|
-
|
|
82177
|
-
|
|
82198
|
+
auto c = (*buffer)[position_buffer];
|
|
82199
|
+
if (c == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) {
|
|
82178
82200
|
// escaped quote, return to quoted state and store escape position
|
|
82179
82201
|
escape_positions.push_back(position_buffer - start_buffer);
|
|
82180
82202
|
goto in_quotes;
|
|
82181
|
-
} else if (
|
|
82203
|
+
} else if (c == options.delimiter[0]) {
|
|
82182
82204
|
// delimiter, add value
|
|
82183
82205
|
offset = 1;
|
|
82184
82206
|
goto add_value;
|
|
82185
|
-
} else if (StringUtil::CharacterIsNewline(
|
|
82207
|
+
} else if (StringUtil::CharacterIsNewline(c)) {
|
|
82186
82208
|
offset = 1;
|
|
82187
82209
|
D_ASSERT(column == insert_chunk.ColumnCount() - 1);
|
|
82188
82210
|
goto add_row;
|
|
@@ -82197,6 +82219,7 @@ unquote:
|
|
|
82197
82219
|
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
|
82198
82220
|
return false;
|
|
82199
82221
|
}
|
|
82222
|
+
}
|
|
82200
82223
|
handle_escape : {
|
|
82201
82224
|
/* state: handle_escape */
|
|
82202
82225
|
// escape should be followed by a quote or another escape character
|
|
@@ -125104,7 +125127,7 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
|
|
|
125104
125127
|
if (str_1.empty() || str_2.empty()) {
|
|
125105
125128
|
return;
|
|
125106
125129
|
}
|
|
125107
|
-
if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos)
|
|
125130
|
+
if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos)) {
|
|
125108
125131
|
throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
|
|
125109
125132
|
}
|
|
125110
125133
|
}
|
|
@@ -125197,6 +125220,11 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
|
|
|
125197
125220
|
options.force_not_null.resize(expected_types.size(), false);
|
|
125198
125221
|
}
|
|
125199
125222
|
bind_data->FinalizeRead(context);
|
|
125223
|
+
if (!bind_data->single_threaded && options.auto_detect) {
|
|
125224
|
+
options.file_path = bind_data->files[0];
|
|
125225
|
+
auto initial_reader = make_unique<BufferedCSVReader>(context, options);
|
|
125226
|
+
options = initial_reader->options;
|
|
125227
|
+
}
|
|
125200
125228
|
return move(bind_data);
|
|
125201
125229
|
}
|
|
125202
125230
|
|
|
@@ -126339,11 +126367,6 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
126339
126367
|
options.include_file_name = BooleanValue::Get(kv.second);
|
|
126340
126368
|
} else if (loption == "hive_partitioning") {
|
|
126341
126369
|
options.include_parsed_hive_partitions = BooleanValue::Get(kv.second);
|
|
126342
|
-
} else if (loption == "buffer_size") {
|
|
126343
|
-
options.buffer_size = kv.second.GetValue<uint64_t>();
|
|
126344
|
-
if (options.buffer_size == 0) {
|
|
126345
|
-
throw InvalidInputException("Buffer Size option must be higher than 0");
|
|
126346
|
-
}
|
|
126347
126370
|
} else {
|
|
126348
126371
|
options.SetReadOption(loption, kv.second, names);
|
|
126349
126372
|
}
|
|
@@ -126362,7 +126385,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
|
126362
126385
|
} else {
|
|
126363
126386
|
D_ASSERT(return_types.size() == names.size());
|
|
126364
126387
|
}
|
|
126365
|
-
options =
|
|
126388
|
+
options = initial_reader->options;
|
|
126366
126389
|
result->sql_types = initial_reader->sql_types;
|
|
126367
126390
|
result->initial_reader = move(initial_reader);
|
|
126368
126391
|
} else {
|
|
@@ -126460,8 +126483,9 @@ static unique_ptr<FunctionData> ReadCSVAutoBind(ClientContext &context, TableFun
|
|
|
126460
126483
|
//===--------------------------------------------------------------------===//
|
|
126461
126484
|
struct ParallelCSVGlobalState : public GlobalTableFunctionState {
|
|
126462
126485
|
public:
|
|
126463
|
-
ParallelCSVGlobalState(unique_ptr<CSVFileHandle> file_handle_p,
|
|
126464
|
-
idx_t system_threads_p, idx_t buffer_size_p,
|
|
126486
|
+
ParallelCSVGlobalState(ClientContext &context, unique_ptr<CSVFileHandle> file_handle_p,
|
|
126487
|
+
vector<string> &files_path_p, idx_t system_threads_p, idx_t buffer_size_p,
|
|
126488
|
+
idx_t rows_to_skip)
|
|
126465
126489
|
: file_handle(move(file_handle_p)), system_threads(system_threads_p), buffer_size(buffer_size_p) {
|
|
126466
126490
|
for (idx_t i = 0; i < rows_to_skip; i++) {
|
|
126467
126491
|
file_handle->ReadLine();
|
|
@@ -126475,7 +126499,7 @@ public:
|
|
|
126475
126499
|
} else {
|
|
126476
126500
|
bytes_per_local_state = file_size / MaxThreads();
|
|
126477
126501
|
}
|
|
126478
|
-
current_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
|
|
126502
|
+
current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
|
|
126479
126503
|
next_buffer = current_buffer->Next(*file_handle, buffer_size);
|
|
126480
126504
|
}
|
|
126481
126505
|
ParallelCSVGlobalState() {
|
|
@@ -126562,7 +126586,7 @@ unique_ptr<CSVBufferRead> ParallelCSVGlobalState::Next(ClientContext &context, R
|
|
|
126562
126586
|
if (file_index < bind_data.files.size()) {
|
|
126563
126587
|
bind_data.options.file_path = bind_data.files[file_index++];
|
|
126564
126588
|
file_handle = ReadCSV::OpenCSV(bind_data.options, context);
|
|
126565
|
-
next_buffer = make_shared<CSVBuffer>(buffer_size, *file_handle);
|
|
126589
|
+
next_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle);
|
|
126566
126590
|
}
|
|
126567
126591
|
}
|
|
126568
126592
|
return result;
|
|
@@ -126583,8 +126607,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
|
|
|
126583
126607
|
file_handle = ReadCSV::OpenCSV(bind_data.options, context);
|
|
126584
126608
|
}
|
|
126585
126609
|
idx_t rows_to_skip = bind_data.options.skip_rows + (bind_data.options.has_header ? 1 : 0);
|
|
126586
|
-
return make_unique<ParallelCSVGlobalState>(move(file_handle), bind_data.files,
|
|
126587
|
-
bind_data.options.buffer_size,
|
|
126610
|
+
return make_unique<ParallelCSVGlobalState>(context, move(file_handle), bind_data.files,
|
|
126611
|
+
context.db->NumberOfThreads(), bind_data.options.buffer_size,
|
|
126612
|
+
rows_to_skip);
|
|
126588
126613
|
}
|
|
126589
126614
|
|
|
126590
126615
|
//===--------------------------------------------------------------------===//
|
package/src/duckdb.hpp
CHANGED
|
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
|
|
|
11
11
|
#pragma once
|
|
12
12
|
#define DUCKDB_AMALGAMATION 1
|
|
13
13
|
#define DUCKDB_AMALGAMATION_EXTENDED 1
|
|
14
|
-
#define DUCKDB_SOURCE_ID "
|
|
15
|
-
#define DUCKDB_VERSION "v0.5.2-
|
|
14
|
+
#define DUCKDB_SOURCE_ID "0ac5e8ee35"
|
|
15
|
+
#define DUCKDB_VERSION "v0.5.2-dev2196"
|
|
16
16
|
//===----------------------------------------------------------------------===//
|
|
17
17
|
// DuckDB
|
|
18
18
|
//
|
|
@@ -6367,9 +6367,6 @@ public:
|
|
|
6367
6367
|
DUCKDB_API static bool CharacterIsNewline(char c) {
|
|
6368
6368
|
return c == '\n' || c == '\r';
|
|
6369
6369
|
}
|
|
6370
|
-
DUCKDB_API static bool CharacterIsNullTerminator(char c) {
|
|
6371
|
-
return c == '\0';
|
|
6372
|
-
}
|
|
6373
6370
|
DUCKDB_API static bool CharacterIsDigit(char c) {
|
|
6374
6371
|
return c >= '0' && c <= '9';
|
|
6375
6372
|
}
|
|
@@ -28261,6 +28258,7 @@ private:
|
|
|
28261
28258
|
} // namespace duckdb
|
|
28262
28259
|
|
|
28263
28260
|
|
|
28261
|
+
|
|
28264
28262
|
namespace duckdb {
|
|
28265
28263
|
|
|
28266
28264
|
class CSVBuffer {
|
|
@@ -28269,10 +28267,10 @@ public:
|
|
|
28269
28267
|
static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
|
|
28270
28268
|
|
|
28271
28269
|
//! Constructor for Initial Buffer
|
|
28272
|
-
CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle);
|
|
28270
|
+
CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle);
|
|
28273
28271
|
|
|
28274
28272
|
//! Constructor for `Next()` Buffers
|
|
28275
|
-
CSVBuffer(
|
|
28273
|
+
CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
|
|
28276
28274
|
|
|
28277
28275
|
//! Creates a new buffer with the next part of the CSV File
|
|
28278
28276
|
unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
|
|
@@ -28288,10 +28286,17 @@ public:
|
|
|
28288
28286
|
|
|
28289
28287
|
//! If this buffer is the first buffer of the CSV File
|
|
28290
28288
|
bool IsCSVFileFirstBuffer();
|
|
28291
|
-
|
|
28292
|
-
|
|
28289
|
+
|
|
28290
|
+
BufferHandle AllocateBuffer(idx_t buffer_size);
|
|
28291
|
+
|
|
28292
|
+
char *Ptr() {
|
|
28293
|
+
return (char *)handle.Ptr();
|
|
28294
|
+
}
|
|
28293
28295
|
|
|
28294
28296
|
private:
|
|
28297
|
+
ClientContext &context;
|
|
28298
|
+
|
|
28299
|
+
BufferHandle handle;
|
|
28295
28300
|
//! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
|
|
28296
28301
|
idx_t actual_size;
|
|
28297
28302
|
//! We need to check for Byte Order Mark, to define the start position of this buffer
|