duckdb 0.8.2-dev1764.0 → 0.8.2-dev1791.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -0
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +1 -0
- package/src/duckdb/extension/parquet/parquet_extension.cpp +38 -22
- package/src/duckdb/extension/parquet/parquet_reader.cpp +1 -4
- package/src/duckdb/src/common/sort/sort_state.cpp +1 -1
- package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +8 -0
- package/src/duckdb/src/common/types/column/column_data_collection_segment.cpp +5 -0
- package/src/duckdb/src/common/types/string_heap.cpp +4 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_allocator.hpp +10 -0
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection_segment.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/string_heap.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/arena_allocator.hpp +1 -0
- package/src/duckdb/src/parser/parser.cpp +43 -38
- package/src/duckdb/src/storage/arena_allocator.cpp +12 -0
package/README.md
CHANGED
@@ -100,6 +100,13 @@ var stmt = con.prepare('select ?::INTEGER as fortytwo', function(err, stmt) {
 });
 ```
 
+## Supported Node versions
+We actively support only LTS and in-support Node versions; as of July 2023, these are Node 16, Node 18 and Node 20.
+The release schedule for Node.js can be checked here: https://github.com/nodejs/release#release-schedule.
+
+We currently bundle and test DuckDB also for Node 10, 12, 14, 17 and 19, and we plan on doing so going forward as long as the tooling supports it.
+As of July 2023, Node 15 has been removed from the supported versions.
+
 ## Development
 
 ### First install:
package/package.json
CHANGED
[… diff not captured in this view …]
package/src/duckdb/extension/parquet/include/parquet_reader.hpp
CHANGED
@@ -93,6 +93,7 @@ public:
     shared_ptr<ParquetFileMetadataCache> metadata;
     ParquetOptions parquet_options;
     MultiFileReaderData reader_data;
+    unique_ptr<ColumnReader> root_reader;
 
 public:
     void InitializeScan(ParquetReaderScanState &state, vector<idx_t> groups_to_read);
package/src/duckdb/extension/parquet/parquet_extension.cpp
CHANGED
@@ -116,6 +116,11 @@ struct ParquetWriteBindData : public TableFunctionData {
     vector<string> column_names;
     duckdb_parquet::format::CompressionCodec::type codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
     idx_t row_group_size = RowGroup::ROW_GROUP_SIZE;
+
+    //! If row_group_size_bytes is not set, we default to row_group_size * BYTES_PER_ROW
+    static constexpr const idx_t BYTES_PER_ROW = 1024;
+    idx_t row_group_size_bytes;
+
     ChildFieldIDs field_ids;
 };
 
@@ -741,33 +746,39 @@ static void GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids,
 unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyInfo &info, vector<string> &names,
                                           vector<LogicalType> &sql_types) {
     D_ASSERT(names.size() == sql_types.size());
+    bool row_group_size_bytes_set = false;
     auto bind_data = make_uniq<ParquetWriteBindData>();
     for (auto &option : info.options) {
-        auto loption = StringUtil::Lower(option.first);
+        const auto loption = StringUtil::Lower(option.first);
+        if (option.second.size() != 1) {
+            // All parquet write options require exactly one argument
+            throw BinderException("%s requires exactly one argument", StringUtil::Upper(loption));
+        }
         if (loption == "row_group_size" || loption == "chunk_size") {
             bind_data->row_group_size = option.second[0].GetValue<uint64_t>();
+        } else if (loption == "row_group_size_bytes") {
+            auto roption = option.second[0];
+            if (roption.GetTypeMutable().id() == LogicalTypeId::VARCHAR) {
+                bind_data->row_group_size_bytes = DBConfig::ParseMemoryLimit(roption.ToString());
+            } else {
+                bind_data->row_group_size_bytes = option.second[0].GetValue<uint64_t>();
+            }
+            row_group_size_bytes_set = true;
         } else if (loption == "compression" || loption == "codec") {
-            [… 12 removed lines not captured in this view …]
-            bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD;
-            continue;
-            }
+            const auto roption = StringUtil::Lower(option.second[0].ToString());
+            if (roption == "uncompressed") {
+                bind_data->codec = duckdb_parquet::format::CompressionCodec::UNCOMPRESSED;
+            } else if (roption == "snappy") {
+                bind_data->codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
+            } else if (roption == "gzip") {
+                bind_data->codec = duckdb_parquet::format::CompressionCodec::GZIP;
+            } else if (roption == "zstd") {
+                bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD;
+            } else {
+                throw BinderException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]",
+                                      loption);
             }
-            throw BinderException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]", loption);
         } else if (loption == "field_ids") {
-            if (option.second.size() != 1) {
-                throw BinderException("FIELD_IDS requires exactly one argument");
-            }
             if (option.second[0].type().id() == LogicalTypeId::VARCHAR &&
                 StringUtil::Lower(StringValue::Get(option.second[0])) == "auto") {
                 idx_t field_id = 0;
@@ -788,6 +799,9 @@ unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyInfo &info
             throw NotImplementedException("Unrecognized option for PARQUET: %s", option.first.c_str());
         }
     }
+    if (!row_group_size_bytes_set) {
+        bind_data->row_group_size_bytes = bind_data->row_group_size * ParquetWriteBindData::BYTES_PER_ROW;
+    }
     bind_data->sql_types = sql_types;
     bind_data->column_names = names;
     return std::move(bind_data);
@@ -812,8 +826,10 @@ void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, Glob
 
     // append data to the local (buffered) chunk collection
     local_state.buffer.Append(local_state.append_state, input);
-    [… 2 removed lines not captured in this view …]
+
+    if (local_state.buffer.Count() > bind_data.row_group_size ||
+        local_state.buffer.SizeInBytes() > bind_data.row_group_size_bytes) {
+        // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the parquet file
         local_state.append_state.current_chunk_state.handles.clear();
         global_state.writer->Flush(local_state.buffer);
         local_state.buffer.InitializeAppend(local_state.append_state);
package/src/duckdb/extension/parquet/parquet_reader.cpp
CHANGED
@@ -399,8 +399,7 @@ void ParquetReader::InitializeSchema() {
     if (file_meta_data->schema.size() < 2) {
         throw FormatException("Need at least one non-root column in the file");
     }
-    [… 2 removed lines not captured in this view …]
+    root_reader = CreateReader();
     auto &root_type = root_reader->Type();
     auto &child_types = StructType::GetChildTypes(root_type);
     D_ASSERT(root_type.id() == LogicalTypeId::STRUCT);
@@ -450,7 +449,6 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, Parqu
             ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
         }
     }
-
     InitializeSchema();
 }
 
@@ -483,7 +481,6 @@ unique_ptr<BaseStatistics> ParquetReader::ReadStatistics(const string &name) {
 
     unique_ptr<BaseStatistics> column_stats;
     auto file_meta_data = GetFileMetadata();
-    auto root_reader = CreateReader();
    auto column_reader = root_reader->Cast<StructColumnReader>().GetChildReader(file_col_idx);
 
     for (idx_t row_group_idx = 0; row_group_idx < file_meta_data->row_groups.size(); row_group_idx++) {
package/src/duckdb/src/common/sort/sort_state.cpp
CHANGED
@@ -315,7 +315,7 @@ void LocalSortState::ReOrder(SortedData &sd, data_ptr_t sorting_ptr, RowDataColl
     sd.data_blocks.back()->block->SetSwizzling(nullptr);
     // Create a single heap block to store the ordered heap
     idx_t total_byte_offset =
-        std::accumulate(heap.blocks.begin(), heap.blocks.end(), 0,
+        std::accumulate(heap.blocks.begin(), heap.blocks.end(), (idx_t)0,
                         [](idx_t a, const unique_ptr<RowDataBlock> &b) { return a + b->byte_offset; });
     idx_t heap_block_size = MaxValue(total_byte_offset, (idx_t)Storage::BLOCK_SIZE);
     auto ordered_heap_block = make_uniq<RowDataBlock>(*buffer_manager, heap_block_size, 1);
package/src/duckdb/src/common/sort/sorted_block.cpp
CHANGED
@@ -85,7 +85,7 @@ SortedBlock::SortedBlock(BufferManager &buffer_manager, GlobalSortState &state)
 }
 
 idx_t SortedBlock::Count() const {
-    idx_t count = std::accumulate(radix_sorting_data.begin(), radix_sorting_data.end(), 0,
+    idx_t count = std::accumulate(radix_sorting_data.begin(), radix_sorting_data.end(), (idx_t)0,
                                   [](idx_t a, const unique_ptr<RowDataBlock> &b) { return a + b->count; });
     if (!sort_layout.all_constant) {
         D_ASSERT(count == blob_sorting_data->Count());
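Both sort fixes above are the same one-character-class bug: std::accumulate deduces its accumulator type from the initial value, so a literal 0 makes the running sum an int even over 64-bit inputs, and totals beyond INT_MAX are silently narrowed on every step. A standalone repro of the pitfall (values invented for illustration):

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
	// Three 2 GB byte counts: the true total (6 GB) does not fit in 32 bits.
	std::vector<uint64_t> byte_offsets(3, 2000000000ULL);
	// Accumulator deduced as int from the literal 0: each step narrows the
	// running sum back to 32 bits, so the result is wrong on typical platforms.
	long long bad = std::accumulate(byte_offsets.begin(), byte_offsets.end(), 0);
	// Seeding with a 64-bit zero, as in the (idx_t)0 fix above: correct.
	uint64_t good = std::accumulate(byte_offsets.begin(), byte_offsets.end(), uint64_t(0));
	std::cout << bad << " vs " << good << "\n"; // e.g. 1705032704 vs 6000000000
}
```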
package/src/duckdb/src/common/types/column/column_data_collection.cpp
CHANGED
@@ -100,6 +100,14 @@ Allocator &ColumnDataCollection::GetAllocator() const {
     return allocator->GetAllocator();
 }
 
+idx_t ColumnDataCollection::SizeInBytes() const {
+    idx_t total_size = 0;
+    for (const auto &segment : segments) {
+        total_size += segment->SizeInBytes();
+    }
+    return total_size;
+}
+
 //===--------------------------------------------------------------------===//
 // ColumnDataRow
 //===--------------------------------------------------------------------===//
package/src/duckdb/src/common/types/column/column_data_collection_segment.cpp
CHANGED
@@ -243,6 +243,11 @@ idx_t ColumnDataCollectionSegment::ChunkCount() const {
     return chunk_data.size();
 }
 
+idx_t ColumnDataCollectionSegment::SizeInBytes() const {
+    D_ASSERT(!allocator->IsShared());
+    return allocator->SizeInBytes() + heap->SizeInBytes();
+}
+
 void ColumnDataCollectionSegment::FetchChunk(idx_t chunk_idx, DataChunk &result) {
     vector<column_t> column_ids;
     column_ids.reserve(types.size());
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.8.2-dev1764"
+#define DUCKDB_VERSION "0.8.2-dev1791"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "[… not captured in this view …]"
+#define DUCKDB_SOURCE_ID "ecae3d0c87"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/common/types/column/column_data_allocator.hpp
CHANGED
@@ -43,9 +43,19 @@ public:
     void MakeShared() {
         shared = true;
     }
+    bool IsShared() const {
+        return shared;
+    }
     idx_t BlockCount() const {
         return blocks.size();
     }
+    idx_t SizeInBytes() const {
+        idx_t total_size = 0;
+        for (const auto &block : blocks) {
+            total_size += block.size;
+        }
+        return total_size;
+    }
 
 public:
     void AllocateData(idx_t size, uint32_t &block_id, uint32_t &offset, ChunkManagementState *chunk_state);
package/src/duckdb/src/parser/parser.cpp
CHANGED
@@ -193,48 +193,53 @@ void Parser::ParseQuery(const string &query) {
             auto query_statements = SplitQueryStringIntoStatements(query);
             auto stmt_loc = 0;
             for (auto const &query_statement : query_statements) {
-                [… 11 removed lines not captured in this view …]
-                    statements.back()->stmt_length = query_statement.size() - 1;
-                    statements.back()->stmt_location = stmt_loc;
-                    stmt_loc += query_statement.size();
-                } else {
-                    // let extensions parse the statement which DuckDB failed to parse
-                    bool parsed_single_statement = false;
-                    for (auto &ext : *options.extensions) {
-                        D_ASSERT(!parsed_single_statement);
-                        D_ASSERT(ext.parse_function);
-                        auto result = ext.parse_function(ext.parser_info.get(), query_statement);
-                        if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
-                            auto statement = make_uniq<ExtensionStatement>(ext, std::move(result.parse_data));
-                            statement->stmt_length = query_statement.size() - 1;
-                            statement->stmt_location = stmt_loc;
-                            stmt_loc += query_statement.size();
-                            statements.push_back(std::move(statement));
-                            parsed_single_statement = true;
-                            break;
-                        } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
-                            throw ParserException(result.error);
-                        } else {
-                            // We move to the next one!
+                string another_parser_error;
+                // Creating a new scope to allow extensions to use PostgresParser, which is not reentrant
+                {
+                    PostgresParser another_parser;
+                    another_parser.Parse(query_statement);
+                    // LCOV_EXCL_START
+                    // first see if DuckDB can parse this individual query statement
+                    if (another_parser.success) {
+                        if (!another_parser.parse_tree) {
+                            // empty statement
+                            continue;
                         }
+                        transformer.TransformParseTree(another_parser.parse_tree, statements);
+                        // important to set in the case of a mixture of DDB and parser ext statements
+                        statements.back()->stmt_length = query_statement.size() - 1;
+                        statements.back()->stmt_location = stmt_loc;
+                        stmt_loc += query_statement.size();
+                        continue;
+                    } else {
+                        another_parser_error = QueryErrorContext::Format(query, another_parser.error_message,
+                                                                         another_parser.error_location - 1);
                     }
-                    [… 4 removed lines not captured in this view …]
+                } // LCOV_EXCL_STOP
+                // LCOV_EXCL_START
+                // let extensions parse the statement which DuckDB failed to parse
+                bool parsed_single_statement = false;
+                for (auto &ext : *options.extensions) {
+                    D_ASSERT(!parsed_single_statement);
+                    D_ASSERT(ext.parse_function);
+                    auto result = ext.parse_function(ext.parser_info.get(), query_statement);
+                    if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
+                        auto statement = make_uniq<ExtensionStatement>(ext, std::move(result.parse_data));
+                        statement->stmt_length = query_statement.size() - 1;
+                        statement->stmt_location = stmt_loc;
+                        stmt_loc += query_statement.size();
+                        statements.push_back(std::move(statement));
+                        parsed_single_statement = true;
+                        break;
+                    } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
+                        throw ParserException(result.error);
+                    } else {
+                        // We move to the next one!
                    }
                }
-
+                if (!parsed_single_statement) {
+                    throw ParserException(parser_error);
+                } // LCOV_EXCL_STOP
             }
         }
     }
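The reshuffled control flow follows from the comment in the diff: PostgresParser is not reentrant, so the per-statement built-in parse attempt now lives in its own block, guaranteeing the parser object is destroyed before any extension's parse_function runs (an extension may construct its own PostgresParser). A minimal sketch of that scoping pattern, with hypothetical names standing in for the DuckDB types:

```cpp
#include <string>

// Hypothetical stand-in: at most one live instance is allowed at a time.
struct NonReentrantParser {
	bool success = false;
	void Parse(const std::string &) { /* built-in parse attempt */ }
};

// Stand-in for the extension fallback; it may construct its own parser.
bool TryExtensions(const std::string &) {
	return false;
}

void ParseWithFallback(const std::string &statement) {
	bool builtin_ok;
	{
		NonReentrantParser parser; // destroyed when this block ends
		parser.Parse(statement);
		builtin_ok = parser.success;
	}
	// Safe hand-off: no parser instance is alive at this point.
	if (!builtin_ok) {
		TryExtensions(statement);
	}
}
```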
package/src/duckdb/src/storage/arena_allocator.cpp
CHANGED
@@ -151,4 +151,16 @@ bool ArenaAllocator::IsEmpty() const {
     return head == nullptr;
 }
 
+idx_t ArenaAllocator::SizeInBytes() const {
+    idx_t total_size = 0;
+    if (!IsEmpty()) {
+        auto current = head.get();
+        while (current != nullptr) {
+            total_size += current->current_position;
+            current = current->next.get();
+        }
+    }
+    return total_size;
+}
+
 } // namespace duckdb
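The SizeInBytes additions across this diff form a single aggregation chain: ParquetWriteSink asks its buffered ColumnDataCollection for a byte size, the collection sums its segments, each segment adds its block allocator to its string heap, and the allocators walk their block or chunk lists. A self-contained sketch of that chain (plain stand-in structs; the names and fields are illustrative, not DuckDB's):

```cpp
#include <cstdint>
#include <memory>
#include <vector>

using idx_t = uint64_t;

// Stand-in for ArenaAllocator: a linked list of chunks, each tracking how
// many bytes of it are in use (cf. arena_allocator.cpp above).
struct ArenaChunk {
	idx_t current_position = 0;
	std::unique_ptr<ArenaChunk> next;
};

struct Arena {
	std::unique_ptr<ArenaChunk> head;
	idx_t SizeInBytes() const {
		idx_t total = 0;
		for (auto *chunk = head.get(); chunk; chunk = chunk->next.get()) {
			total += chunk->current_position;
		}
		return total;
	}
};

// Stand-in for ColumnDataCollectionSegment: allocator plus string heap.
struct Segment {
	Arena allocator;
	Arena heap;
	idx_t SizeInBytes() const {
		return allocator.SizeInBytes() + heap.SizeInBytes();
	}
};

// Stand-in for ColumnDataCollection: sums over its segments.
struct Collection {
	std::vector<Segment> segments;
	idx_t SizeInBytes() const {
		idx_t total = 0;
		for (const auto &segment : segments) {
			total += segment.SizeInBytes();
		}
		return total;
	}
};

int main() {
	Collection collection;
	collection.segments.emplace_back(); // an empty segment contributes 0 bytes
	return collection.SizeInBytes() == 0 ? 0 : 1;
}
```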