duckdb 0.6.2-dev761.0 → 0.6.2-dev768.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_writer.cpp +10 -9
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +13 -5
- package/src/duckdb/extension/parquet/parquet-extension.cpp +12 -0
- package/src/duckdb/extension/parquet/parquet_writer.cpp +10 -3
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +1 -1
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +3 -0
- package/src/duckdb/src/function/table/copy_csv.cpp +13 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +2 -1
- package/src/duckdb/src/include/duckdb/function/copy_function.hpp +4 -2
- package/src/duckdb/src/main/extension/extension_load.cpp +2 -2
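Taken together, these changes add opt-in parallel writing for COPY TO: CopyFunction gains a parallel callback, the Parquet and CSV writers report themselves parallel whenever preserve_insertion_order is disabled, and PhysicalCopyToFile::ParallelSink() honors the result. To make concurrent flushes safe, ParquetWriter trades its friend declarations for accessor methods and ParquetWriter::Flush narrows its critical section to the actual file append. The remaining changes are the version bump and an extension-loading tweak that skips signature verification when unsigned extensions are allowed.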
package/package.json
CHANGED

package/src/duckdb/extension/parquet/column_writer.cpp
CHANGED

@@ -185,7 +185,7 @@ ColumnWriterState::~ColumnWriterState() {
 
 void ColumnWriter::CompressPage(BufferedSerializer &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
                                 unique_ptr<data_t[]> &compressed_buf) {
-    switch (writer.codec) {
+    switch (writer.GetCodec()) {
     case CompressionCodec::UNCOMPRESSED:
        compressed_size = temp_writer.blob.size;
        compressed_data = temp_writer.blob.data.get();

@@ -396,10 +396,10 @@ unique_ptr<ColumnWriterState> BasicColumnWriter::InitializeWriteState(duckdb_par
 void BasicColumnWriter::RegisterToRowGroup(duckdb_parquet::format::RowGroup &row_group) {
     format::ColumnChunk column_chunk;
     column_chunk.__isset.meta_data = true;
-    column_chunk.meta_data.codec = writer.codec;
+    column_chunk.meta_data.codec = writer.GetCodec();
     column_chunk.meta_data.path_in_schema = schema_path;
     column_chunk.meta_data.num_values = 0;
-    column_chunk.meta_data.type = writer.file_meta_data.schema[schema_idx].type;
+    column_chunk.meta_data.type = writer.GetType(schema_idx);
     row_group.columns.push_back(move(column_chunk));
 }
 

@@ -645,7 +645,8 @@ void BasicColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
     // flush the last page (if any remains)
     FlushPage(state);
 
-    auto start_offset = writer.writer->GetTotalWritten();
+    auto &column_writer = writer.GetWriter();
+    auto start_offset = column_writer.GetTotalWritten();
     auto page_offset = start_offset;
     // flush the dictionary
     if (HasDictionary(state)) {

@@ -665,14 +666,14 @@ void BasicColumnWriter::FinalizeWrite(ColumnWriterState &state_p) {
     idx_t total_uncompressed_size = 0;
     for (auto &write_info : state.write_info) {
        D_ASSERT(write_info.page_header.uncompressed_page_size > 0);
-       auto header_start_offset = writer.writer->GetTotalWritten();
-       write_info.page_header.write(writer.protocol.get());
+       auto header_start_offset = column_writer.GetTotalWritten();
+       write_info.page_header.write(writer.GetProtocol());
        // total uncompressed size in the column chunk includes the header size (!)
-       total_uncompressed_size += writer.writer->GetTotalWritten() - header_start_offset;
+       total_uncompressed_size += column_writer.GetTotalWritten() - header_start_offset;
        total_uncompressed_size += write_info.page_header.uncompressed_page_size;
-       writer.writer->WriteData(write_info.compressed_data, write_info.compressed_size);
+       column_writer.WriteData(write_info.compressed_data, write_info.compressed_size);
     }
-    column_chunk.meta_data.total_compressed_size = writer.writer->GetTotalWritten() - start_offset;
+    column_chunk.meta_data.total_compressed_size = column_writer.GetTotalWritten() - start_offset;
     column_chunk.meta_data.total_uncompressed_size = total_uncompressed_size;
 }
 
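In column_writer.cpp every direct touch of ParquetWriter's private state (the codec, the Thrift protocol, the schema metadata, and the underlying BufferedFileWriter) now goes through the accessors introduced below; FinalizeWrite additionally caches the BufferedFileWriter reference once instead of fetching it per page.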
package/src/duckdb/extension/parquet/include/parquet_writer.hpp
CHANGED

@@ -26,11 +26,6 @@ class FileSystem;
 class FileOpener;
 
 class ParquetWriter {
-    friend class ColumnWriter;
-    friend class BasicColumnWriter;
-    friend class ListColumnWriter;
-    friend class StructColumnWriter;
-
 public:
     ParquetWriter(FileSystem &fs, string file_name, FileOpener *file_opener, vector<LogicalType> types,
                   vector<string> names, duckdb_parquet::format::CompressionCodec::type codec);

@@ -42,6 +37,19 @@ public:
     static duckdb_parquet::format::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
     static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::format::SchemaElement &schema_ele);
 
+    duckdb_apache::thrift::protocol::TProtocol *GetProtocol() {
+       return protocol.get();
+    }
+    duckdb_parquet::format::CompressionCodec::type GetCodec() {
+       return codec;
+    }
+    duckdb_parquet::format::Type::type GetType(idx_t schema_idx) {
+       return file_meta_data.schema[schema_idx].type;
+    }
+    BufferedFileWriter &GetWriter() {
+       return *writer;
+    }
+
 private:
     string file_name;
     vector<LogicalType> sql_types;
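The four friend class declarations are gone: column writers now reach the codec, protocol, schema type, and file writer only through these inline getters, so they no longer depend on ParquetWriter's private layout.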
package/src/duckdb/extension/parquet/parquet-extension.cpp
CHANGED

@@ -733,6 +733,17 @@ unique_ptr<LocalFunctionData> ParquetWriteInitializeLocal(ExecutionContext &cont
     return make_unique<ParquetWriteLocalState>(context.client, bind_data.sql_types);
 }
 
+//===--------------------------------------------------------------------===//
+// Parallel
+//===--------------------------------------------------------------------===//
+bool ParquetWriteIsParallel(ClientContext &context, FunctionData &bind_data) {
+    auto &config = DBConfig::GetConfig(context);
+    if (config.options.preserve_insertion_order) {
+       return false;
+    }
+    return true;
+}
+
 unique_ptr<TableFunctionRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
                                                     ReplacementScanData *data) {
     auto lower_name = StringUtil::Lower(table_name);

@@ -769,6 +780,7 @@ void ParquetExtension::Load(DuckDB &db) {
     function.copy_to_sink = ParquetWriteSink;
     function.copy_to_combine = ParquetWriteCombine;
     function.copy_to_finalize = ParquetWriteFinalize;
+    function.parallel = ParquetWriteIsParallel;
     function.copy_from_bind = ParquetScanFunction::ParquetReadBind;
     function.copy_from_function = scan_fun.functions[0];
 
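ParquetWriteIsParallel is the policy hook: parallel sinks can reorder row groups, so the Parquet copy only declares itself parallel when insertion-order preservation is off. A minimal usage sketch against the DuckDB C++ API (the table and file name are illustrative, not taken from this diff):

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        // Allow DuckDB to reorder rows; with this set, ParquetWriteIsParallel
        // returns true and the COPY below may sink from multiple threads.
        con.Query("SET preserve_insertion_order = false;");
        con.Query("CREATE TABLE t AS SELECT range AS i FROM range(1000000);");
        auto result = con.Query("COPY t TO 'out.parquet' (FORMAT PARQUET);");
        return result->HasError() ? 1 : 0;
    }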
package/src/duckdb/extension/parquet/parquet_writer.cpp
CHANGED

@@ -249,18 +249,17 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
     if (buffer.Count() == 0) {
        return;
     }
-    lock_guard<mutex> glock(lock);
 
     // set up a new row group for this chunk collection
     ParquetRowGroup row_group;
     row_group.num_rows = buffer.Count();
-    row_group.file_offset = writer->GetTotalWritten();
     row_group.__isset.file_offset = true;
 
+    vector<unique_ptr<ColumnWriterState>> states;
     // iterate over each of the columns of the chunk collection and write them
     D_ASSERT(buffer.ColumnCount() == column_writers.size());
     for (idx_t col_idx = 0; col_idx < buffer.ColumnCount(); col_idx++) {
-       const
+       const auto &col_writer = column_writers[col_idx];
        auto write_state = col_writer->InitializeWriteState(row_group, buffer.GetAllocator());
        if (col_writer->HasAnalyze()) {
            for (auto &chunk : buffer.Chunks()) {

@@ -275,6 +274,14 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
        for (auto &chunk : buffer.Chunks()) {
            col_writer->Write(*write_state, chunk.data[col_idx], chunk.size());
        }
+       states.push_back(move(write_state));
+    }
+
+    lock_guard<mutex> glock(lock);
+    row_group.file_offset = writer->GetTotalWritten();
+    for (idx_t col_idx = 0; col_idx < buffer.ColumnCount(); col_idx++) {
+       const auto &col_writer = column_writers[col_idx];
+       auto write_state = move(states[col_idx]);
        col_writer->FinalizeWrite(*write_state);
     }
 
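The Flush rewrite is the heart of the change: analyzing, writing, and compressing each column happens before the lock is taken, and only the order-sensitive part (recording file_offset and finalizing the column chunks into the shared file) runs under lock. A generic sketch of that prepare-then-commit pattern, with stand-in types rather than the DuckDB ones:

    #include <cstdio>
    #include <mutex>
    #include <vector>

    // Stand-in for the per-column write state that Flush now builds up front.
    struct ColumnState {
        std::vector<char> pages;
    };

    std::mutex file_lock; // plays the role of ParquetWriter::lock

    void FlushRowGroup(const std::vector<std::vector<char>> &columns) {
        // 1) expensive, thread-local work: encode and compress every column
        std::vector<ColumnState> states;
        states.reserve(columns.size());
        for (auto &col : columns) {
            states.push_back(ColumnState{col}); // imagine real compression here
        }
        // 2) order-sensitive work: appending decides the file offsets, so it
        //    is the only part that needs the shared lock
        std::lock_guard<std::mutex> guard(file_lock);
        for (auto &state : states) {
            std::fwrite(state.pages.data(), 1, state.pages.size(), stdout);
        }
    }

    int main() {
        FlushRowGroup({{'a', 'b'}, {'c', 'd'}});
        return 0;
    }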
package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp
CHANGED

@@ -40,7 +40,7 @@ void MoveTmpFile(ClientContext &context, const string &tmp_file_path) {
 PhysicalCopyToFile::PhysicalCopyToFile(vector<LogicalType> types, CopyFunction function_p,
                                        unique_ptr<FunctionData> bind_data, idx_t estimated_cardinality)
     : PhysicalOperator(PhysicalOperatorType::COPY_TO_FILE, move(types), estimated_cardinality),
-      function(move(function_p)), bind_data(move(bind_data)) {
+      function(move(function_p)), bind_data(move(bind_data)), parallel(false) {
 }
 
 SinkResultType PhysicalCopyToFile::Sink(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate,
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp
CHANGED

@@ -18,6 +18,9 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
     copy->file_path = op.file_path;
     copy->use_tmp_file = use_tmp_file;
     copy->per_thread_output = op.per_thread_output;
+    if (op.function.parallel) {
+       copy->parallel = op.function.parallel(context, *copy->bind_data);
+    }
 
     copy->children.push_back(move(plan));
     return move(copy);
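The planner evaluates the callback once at plan time; copy functions that do not provide one keep the parallel(false) default from the PhysicalCopyToFile constructor above.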
package/src/duckdb/src/function/table/copy_csv.cpp
CHANGED

@@ -8,6 +8,7 @@
 #include "duckdb/common/types/string_type.hpp"
 #include "duckdb/common/vector_operations/vector_operations.hpp"
 #include "duckdb/function/scalar/string_functions.hpp"
+#include "duckdb/main/config.hpp"
 #include <limits>
 
 namespace duckdb {

@@ -390,6 +391,17 @@ void WriteCSVFinalize(ClientContext &context, FunctionData &bind_data, GlobalFun
     global_state.handle.reset();
 }
 
+//===--------------------------------------------------------------------===//
+// Parallel
+//===--------------------------------------------------------------------===//
+bool WriteCSVIsParallel(ClientContext &context, FunctionData &bind_data) {
+    auto &config = DBConfig::GetConfig(context);
+    if (config.options.preserve_insertion_order) {
+       return false;
+    }
+    return true;
+}
+
 void CSVCopyFunction::RegisterFunction(BuiltinFunctions &set) {
     CopyFunction info("csv");
     info.copy_to_bind = WriteCSVBind;

@@ -398,6 +410,7 @@ void CSVCopyFunction::RegisterFunction(BuiltinFunctions &set) {
     info.copy_to_sink = WriteCSVSink;
     info.copy_to_combine = WriteCSVCombine;
     info.copy_to_finalize = WriteCSVFinalize;
+    info.parallel = WriteCSVIsParallel;
 
     info.copy_from_bind = ReadCSVBind;
     info.copy_from_function = ReadCSVTableFunction::GetFunction();
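CSV gets the identical treatment: WriteCSVIsParallel applies the same preserve_insertion_order gate (hence the new duckdb/main/config.hpp include) and is registered on the copy function.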
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.6.2-dev761"
+#define DUCKDB_VERSION "0.6.2-dev768"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "bade062078"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp
CHANGED

@@ -24,6 +24,7 @@ public:
     unique_ptr<FunctionData> bind_data;
     string file_path;
     bool use_tmp_file;
+    bool parallel;
     bool per_thread_output;
 
 public:

@@ -51,7 +52,7 @@ public:
     }
 
     bool ParallelSink() const override {
-       return per_thread_output;
+       return per_thread_output || parallel;
     }
 };
 } // namespace duckdb
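With the new flag, the sink is parallel either when every thread writes its own file (per_thread_output) or when the format has declared concurrent writes to a single file safe.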
package/src/duckdb/src/include/duckdb/function/copy_function.hpp
CHANGED

@@ -44,13 +44,14 @@ typedef unique_ptr<FunctionData> (*copy_to_deserialize_t)(ClientContext &context
 typedef unique_ptr<FunctionData> (*copy_from_bind_t)(ClientContext &context, CopyInfo &info,
                                                      vector<string> &expected_names,
                                                      vector<LogicalType> &expected_types);
+typedef bool (*copy_to_is_parallel_t)(ClientContext &context, FunctionData &bind_data);
 
 class CopyFunction : public Function {
 public:
     explicit CopyFunction(string name)
         : Function(name), copy_to_bind(nullptr), copy_to_initialize_local(nullptr), copy_to_initialize_global(nullptr),
-          copy_to_sink(nullptr), copy_to_combine(nullptr), copy_to_finalize(nullptr),
-          deserialize(nullptr), copy_from_bind(nullptr) {
+          copy_to_sink(nullptr), copy_to_combine(nullptr), copy_to_finalize(nullptr), parallel(nullptr),
+          serialize(nullptr), deserialize(nullptr), copy_from_bind(nullptr) {
     }
 
     copy_to_bind_t copy_to_bind;

@@ -59,6 +60,7 @@ public:
     copy_to_sink_t copy_to_sink;
     copy_to_combine_t copy_to_combine;
     copy_to_finalize_t copy_to_finalize;
+    copy_to_is_parallel_t parallel;
 
     copy_to_serialize_t serialize;
     copy_to_deserialize_t deserialize;
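Besides adding the copy_to_is_parallel_t member, the constructor now also initializes serialize, which the old initializer list skipped. For a third-party format, opting in would look roughly like this (the MyIsParallel helper and format name are hypothetical):

    #include "duckdb.hpp"
    using namespace duckdb;

    // Hypothetical policy hook, mirroring ParquetWriteIsParallel in this diff.
    static bool MyIsParallel(ClientContext &context, FunctionData &bind_data) {
        // Parallel sinks may reorder rows, so only opt in when the user has
        // not asked DuckDB to preserve insertion order.
        return !DBConfig::GetConfig(context).options.preserve_insertion_order;
    }

    CopyFunction MakeMyCopyFunction() {
        CopyFunction function("myformat"); // hypothetical format name
        function.parallel = MyIsParallel;
        // copy_to_bind / copy_to_sink / copy_to_combine / copy_to_finalize
        // would be wired up here as the CSV and Parquet writers do above
        return function;
    }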
package/src/duckdb/src/main/extension/extension_load.cpp
CHANGED

@@ -49,7 +49,7 @@ ExtensionInitResult ExtensionHelper::InitialLoad(DBConfig &config, FileOpener *o
        }
        throw IOException("Extension \"%s\" not found.\n%s", filename, message);
     }
-    {
+    if (!config.options.allow_unsigned_extensions) {
        auto handle = fs.OpenFile(filename, FileFlags::FILE_FLAGS_READ);
 
        // signature is the last 265 bytes of the file

@@ -75,7 +75,7 @@ ExtensionInitResult ExtensionHelper::InitialLoad(DBConfig &config, FileOpener *o
            break;
        }
     }
-    if (!any_valid && !config.options.allow_unsigned_extensions) {
+    if (!any_valid) {
        throw IOException(config.error_manager->FormatException(ErrorType::UNSIGNED_EXTENSION, filename));
     }
 }
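Finally, the loader no longer reads the trailing 265-byte signature just to ignore it: when allow_unsigned_extensions is set the whole verification block is skipped, and inside the block the !any_valid check alone triggers the UNSIGNED_EXTENSION error.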