duckdb 0.8.2-dev1764.0 → 0.8.2-dev1791.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -100,6 +100,13 @@ var stmt = con.prepare('select ?::INTEGER as fortytwo', function(err, stmt) {
  });
  ```
 
+ ## Supported Node versions
+ We actively support only LTS and in-support Node versions. As of July 2023, these are Node 16, Node 18 and Node 20.
+ The release schedule for Node.js can be checked here: https://github.com/nodejs/release#release-schedule.
+
+ We currently also bundle and test DuckDB for Node 10, 12, 14, 17 and 19, and we plan to keep doing so as long as the tooling supports it.
+ As of July 2023, Node 15 has been removed from the supported versions.
+
  ## Development
 
  ### First install:
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "duckdb",
  "main": "./lib/duckdb.js",
  "types": "./lib/duckdb.d.ts",
- "version": "0.8.2-dev1764.0",
+ "version": "0.8.2-dev1791.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
@@ -93,6 +93,7 @@ public:
  shared_ptr<ParquetFileMetadataCache> metadata;
  ParquetOptions parquet_options;
  MultiFileReaderData reader_data;
+ unique_ptr<ColumnReader> root_reader;
 
  public:
  void InitializeScan(ParquetReaderScanState &state, vector<idx_t> groups_to_read);
@@ -116,6 +116,11 @@ struct ParquetWriteBindData : public TableFunctionData {
  vector<string> column_names;
  duckdb_parquet::format::CompressionCodec::type codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
  idx_t row_group_size = RowGroup::ROW_GROUP_SIZE;
+
+ //! If row_group_size_bytes is not set, we default to row_group_size * BYTES_PER_ROW
+ static constexpr const idx_t BYTES_PER_ROW = 1024;
+ idx_t row_group_size_bytes;
+
  ChildFieldIDs field_ids;
  };
 
@@ -741,33 +746,39 @@ static void GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids,
  unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyInfo &info, vector<string> &names,
  vector<LogicalType> &sql_types) {
  D_ASSERT(names.size() == sql_types.size());
+ bool row_group_size_bytes_set = false;
  auto bind_data = make_uniq<ParquetWriteBindData>();
  for (auto &option : info.options) {
- auto loption = StringUtil::Lower(option.first);
+ const auto loption = StringUtil::Lower(option.first);
+ if (option.second.size() != 1) {
+ // All parquet write options require exactly one argument
+ throw BinderException("%s requires exactly one argument", StringUtil::Upper(loption));
+ }
  if (loption == "row_group_size" || loption == "chunk_size") {
  bind_data->row_group_size = option.second[0].GetValue<uint64_t>();
+ } else if (loption == "row_group_size_bytes") {
+ auto roption = option.second[0];
+ if (roption.GetTypeMutable().id() == LogicalTypeId::VARCHAR) {
+ bind_data->row_group_size_bytes = DBConfig::ParseMemoryLimit(roption.ToString());
+ } else {
+ bind_data->row_group_size_bytes = option.second[0].GetValue<uint64_t>();
+ }
+ row_group_size_bytes_set = true;
  } else if (loption == "compression" || loption == "codec") {
- if (!option.second.empty()) {
- auto roption = StringUtil::Lower(option.second[0].ToString());
- if (roption == "uncompressed") {
- bind_data->codec = duckdb_parquet::format::CompressionCodec::UNCOMPRESSED;
- continue;
- } else if (roption == "snappy") {
- bind_data->codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
- continue;
- } else if (roption == "gzip") {
- bind_data->codec = duckdb_parquet::format::CompressionCodec::GZIP;
- continue;
- } else if (roption == "zstd") {
- bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD;
- continue;
- }
+ const auto roption = StringUtil::Lower(option.second[0].ToString());
+ if (roption == "uncompressed") {
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::UNCOMPRESSED;
+ } else if (roption == "snappy") {
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
+ } else if (roption == "gzip") {
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::GZIP;
+ } else if (roption == "zstd") {
+ bind_data->codec = duckdb_parquet::format::CompressionCodec::ZSTD;
+ } else {
+ throw BinderException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]",
+ loption);
  }
- throw BinderException("Expected %s argument to be either [uncompressed, snappy, gzip or zstd]", loption);
  } else if (loption == "field_ids") {
- if (option.second.size() != 1) {
- throw BinderException("FIELD_IDS requires exactly one argument");
- }
  if (option.second[0].type().id() == LogicalTypeId::VARCHAR &&
  StringUtil::Lower(StringValue::Get(option.second[0])) == "auto") {
  idx_t field_id = 0;
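A hedged usage sketch (not taken from the package) of the option parsed above, driven through DuckDB's C++ API; the table, row count and output path are made up, and the `'100MB'` string form goes through the `DBConfig::ParseMemoryLimit` branch added here:

```cpp
#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr); // in-memory database
	duckdb::Connection con(db);

	con.Query("CREATE TABLE t AS SELECT range AS i FROM range(1000000)");

	// A row group is flushed once it exceeds 50000 rows *or* roughly 100 MB of
	// buffered data, whichever comes first (see the ParquetWriteSink hunk below).
	auto result = con.Query(
	    "COPY t TO 'out.parquet' "
	    "(FORMAT PARQUET, ROW_GROUP_SIZE 50000, ROW_GROUP_SIZE_BYTES '100MB')");
	if (result->HasError()) {
		// e.g. inspect result->GetError()
	}
	return 0;
}
```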
@@ -788,6 +799,9 @@ unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyInfo &info
  throw NotImplementedException("Unrecognized option for PARQUET: %s", option.first.c_str());
  }
  }
+ if (!row_group_size_bytes_set) {
+ bind_data->row_group_size_bytes = bind_data->row_group_size * ParquetWriteBindData::BYTES_PER_ROW;
+ }
  bind_data->sql_types = sql_types;
  bind_data->column_names = names;
  return std::move(bind_data);
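For orientation, a minimal sketch (not part of the package) of the arithmetic behind this fallback. It assumes the 122880-row default that `RowGroup::ROW_GROUP_SIZE` carries in the DuckDB sources, which puts the implied byte budget at about 120 MiB per row group:

```cpp
#include <cstdint>

using idx_t = uint64_t;

// Mirrors the default added in this diff: 1024 bytes assumed per row.
static constexpr idx_t BYTES_PER_ROW = 1024;

constexpr idx_t DefaultRowGroupSizeBytes(idx_t row_group_size) {
	return row_group_size * BYTES_PER_ROW;
}

// Assumed default of 122880 rows per row group -> 120 MiB byte budget.
static_assert(DefaultRowGroupSizeBytes(122880) == idx_t(120) * 1024 * 1024,
              "122880 rows * 1024 bytes/row == 120 MiB");

int main() {
	return 0;
}
```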
@@ -812,8 +826,10 @@ void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, Glob
 
  // append data to the local (buffered) chunk collection
  local_state.buffer.Append(local_state.append_state, input);
- if (local_state.buffer.Count() > bind_data.row_group_size) {
- // if the chunk collection exceeds a certain size we flush it to the parquet file
+
+ if (local_state.buffer.Count() > bind_data.row_group_size ||
+ local_state.buffer.SizeInBytes() > bind_data.row_group_size_bytes) {
+ // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the parquet file
  local_state.append_state.current_chunk_state.handles.clear();
  global_state.writer->Flush(local_state.buffer);
  local_state.buffer.InitializeAppend(local_state.append_state);
@@ -399,8 +399,7 @@ void ParquetReader::InitializeSchema() {
  if (file_meta_data->schema.size() < 2) {
  throw FormatException("Need at least one non-root column in the file");
  }
- auto root_reader = CreateReader();
-
+ root_reader = CreateReader();
  auto &root_type = root_reader->Type();
  auto &child_types = StructType::GetChildTypes(root_type);
  D_ASSERT(root_type.id() == LogicalTypeId::STRUCT);
@@ -450,7 +449,6 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, Parqu
  ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
  }
  }
-
  InitializeSchema();
  }
 
@@ -483,7 +481,6 @@ unique_ptr<BaseStatistics> ParquetReader::ReadStatistics(const string &name) {
 
  unique_ptr<BaseStatistics> column_stats;
  auto file_meta_data = GetFileMetadata();
- auto root_reader = CreateReader();
  auto column_reader = root_reader->Cast<StructColumnReader>().GetChildReader(file_col_idx);
 
  for (idx_t row_group_idx = 0; row_group_idx < file_meta_data->row_groups.size(); row_group_idx++) {
@@ -315,7 +315,7 @@ void LocalSortState::ReOrder(SortedData &sd, data_ptr_t sorting_ptr, RowDataColl
  sd.data_blocks.back()->block->SetSwizzling(nullptr);
  // Create a single heap block to store the ordered heap
  idx_t total_byte_offset =
- std::accumulate(heap.blocks.begin(), heap.blocks.end(), 0,
+ std::accumulate(heap.blocks.begin(), heap.blocks.end(), (idx_t)0,
  [](idx_t a, const unique_ptr<RowDataBlock> &b) { return a + b->byte_offset; });
  idx_t heap_block_size = MaxValue(total_byte_offset, (idx_t)Storage::BLOCK_SIZE);
  auto ordered_heap_block = make_uniq<RowDataBlock>(*buffer_manager, heap_block_size, 1);
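The `(idx_t)0` change matters because `std::accumulate` uses the type of its initial value as the accumulator type; with a plain `0` the sum is carried in an `int`, so large 64-bit byte counts are narrowed at every step. A standalone illustration (not DuckDB code, sizes chosen only to overflow 32 bits):

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

using idx_t = uint64_t;

int main() {
	// Two "blocks" whose byte offsets only fit in 64 bits once added together.
	std::vector<idx_t> byte_offsets {3'000'000'000ULL, 3'000'000'000ULL};

	// With a literal 0 the accumulator is deduced as int, so each partial sum is
	// converted back to int and the result is wrong (and implementation-defined
	// before C++20):
	//   auto bad = std::accumulate(byte_offsets.begin(), byte_offsets.end(), 0);

	// Passing an idx_t-typed initial value keeps the whole accumulation in
	// 64 bits, which is exactly what the (idx_t)0 change guarantees.
	idx_t total = std::accumulate(byte_offsets.begin(), byte_offsets.end(), (idx_t)0);
	assert(total == 6'000'000'000ULL);
	return 0;
}
```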
@@ -85,7 +85,7 @@ SortedBlock::SortedBlock(BufferManager &buffer_manager, GlobalSortState &state)
  }
 
  idx_t SortedBlock::Count() const {
- idx_t count = std::accumulate(radix_sorting_data.begin(), radix_sorting_data.end(), 0,
+ idx_t count = std::accumulate(radix_sorting_data.begin(), radix_sorting_data.end(), (idx_t)0,
  [](idx_t a, const unique_ptr<RowDataBlock> &b) { return a + b->count; });
  if (!sort_layout.all_constant) {
  D_ASSERT(count == blob_sorting_data->Count());
@@ -100,6 +100,14 @@ Allocator &ColumnDataCollection::GetAllocator() const {
  return allocator->GetAllocator();
  }
 
+ idx_t ColumnDataCollection::SizeInBytes() const {
+ idx_t total_size = 0;
+ for (const auto &segment : segments) {
+ total_size += segment->SizeInBytes();
+ }
+ return total_size;
+ }
+
  //===--------------------------------------------------------------------===//
  // ColumnDataRow
  //===--------------------------------------------------------------------===//
@@ -243,6 +243,11 @@ idx_t ColumnDataCollectionSegment::ChunkCount() const {
  return chunk_data.size();
  }
 
+ idx_t ColumnDataCollectionSegment::SizeInBytes() const {
+ D_ASSERT(!allocator->IsShared());
+ return allocator->SizeInBytes() + heap->SizeInBytes();
+ }
+
  void ColumnDataCollectionSegment::FetchChunk(idx_t chunk_idx, DataChunk &result) {
  vector<column_t> column_ids;
  column_ids.reserve(types.size());
@@ -55,4 +55,8 @@ string_t StringHeap::EmptyString(idx_t len) {
  return string_t(insert_pos, len);
  }
 
+ idx_t StringHeap::SizeInBytes() const {
+ return allocator.SizeInBytes();
+ }
+
  } // namespace duckdb
@@ -1,8 +1,8 @@
  #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.8.2-dev1764"
+ #define DUCKDB_VERSION "0.8.2-dev1791"
  #endif
  #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "07b0b0a2a4"
+ #define DUCKDB_SOURCE_ID "ecae3d0c87"
  #endif
  #include "duckdb/function/table/system_functions.hpp"
  #include "duckdb/main/database.hpp"
@@ -43,9 +43,19 @@ public:
  void MakeShared() {
  shared = true;
  }
+ bool IsShared() const {
+ return shared;
+ }
  idx_t BlockCount() const {
  return blocks.size();
  }
+ idx_t SizeInBytes() const {
+ idx_t total_size = 0;
+ for (const auto &block : blocks) {
+ total_size += block.size;
+ }
+ return total_size;
+ }
 
  public:
  void AllocateData(idx_t size, uint32_t &block_id, uint32_t &offset, ChunkManagementState *chunk_state);
@@ -61,6 +61,9 @@ public:
  return types.size();
  }
 
+ //! The size (in bytes) of this ColumnDataCollection
+ idx_t SizeInBytes() const;
+
  //! Get the allocator
  DUCKDB_API Allocator &GetAllocator() const;
 
@@ -126,6 +126,8 @@ public:
  }
 
  idx_t ChunkCount() const;
+ idx_t SizeInBytes() const;
+
  void FetchChunk(idx_t chunk_idx, DataChunk &result);
  void FetchChunk(idx_t chunk_idx, DataChunk &result, const vector<column_t> &column_ids);
 
@@ -38,6 +38,9 @@ public:
  //! Allocates space for an empty string of size "len" on the heap
  DUCKDB_API string_t EmptyString(idx_t len);
 
+ //! Size of strings
+ DUCKDB_API idx_t SizeInBytes() const;
+
  private:
  ArenaAllocator allocator;
  };
@@ -46,6 +46,7 @@ public:
  DUCKDB_API ArenaChunk *GetTail();
 
  DUCKDB_API bool IsEmpty() const;
+ DUCKDB_API idx_t SizeInBytes() const;
 
  //! Returns an "Allocator" wrapper for this arena allocator
  Allocator &GetAllocator() {
@@ -193,48 +193,53 @@ void Parser::ParseQuery(const string &query) {
  auto query_statements = SplitQueryStringIntoStatements(query);
  auto stmt_loc = 0;
  for (auto const &query_statement : query_statements) {
- PostgresParser another_parser;
- another_parser.Parse(query_statement);
- // LCOV_EXCL_START
- // first see if DuckDB can parse this individual query statement
- if (another_parser.success) {
- if (!another_parser.parse_tree) {
- // empty statement
- continue;
- }
- transformer.TransformParseTree(another_parser.parse_tree, statements);
- // important to set in the case of a mixture of DDB and parser ext statements
- statements.back()->stmt_length = query_statement.size() - 1;
- statements.back()->stmt_location = stmt_loc;
- stmt_loc += query_statement.size();
- } else {
- // let extensions parse the statement which DuckDB failed to parse
- bool parsed_single_statement = false;
- for (auto &ext : *options.extensions) {
- D_ASSERT(!parsed_single_statement);
- D_ASSERT(ext.parse_function);
- auto result = ext.parse_function(ext.parser_info.get(), query_statement);
- if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
- auto statement = make_uniq<ExtensionStatement>(ext, std::move(result.parse_data));
- statement->stmt_length = query_statement.size() - 1;
- statement->stmt_location = stmt_loc;
- stmt_loc += query_statement.size();
- statements.push_back(std::move(statement));
- parsed_single_statement = true;
- break;
- } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
- throw ParserException(result.error);
- } else {
- // We move to the next one!
+ string another_parser_error;
+ // Creating a new scope to allow extensions to use PostgresParser, which is not reentrant
+ {
+ PostgresParser another_parser;
+ another_parser.Parse(query_statement);
+ // LCOV_EXCL_START
+ // first see if DuckDB can parse this individual query statement
+ if (another_parser.success) {
+ if (!another_parser.parse_tree) {
+ // empty statement
+ continue;
  }
+ transformer.TransformParseTree(another_parser.parse_tree, statements);
+ // important to set in the case of a mixture of DDB and parser ext statements
+ statements.back()->stmt_length = query_statement.size() - 1;
+ statements.back()->stmt_location = stmt_loc;
+ stmt_loc += query_statement.size();
+ continue;
+ } else {
+ another_parser_error = QueryErrorContext::Format(query, another_parser.error_message,
+ another_parser.error_location - 1);
  }
- if (!parsed_single_statement) {
- parser_error = QueryErrorContext::Format(query, another_parser.error_message,
- another_parser.error_location - 1);
- throw ParserException(parser_error);
+ } // LCOV_EXCL_STOP
+ // LCOV_EXCL_START
+ // let extensions parse the statement which DuckDB failed to parse
+ bool parsed_single_statement = false;
+ for (auto &ext : *options.extensions) {
+ D_ASSERT(!parsed_single_statement);
+ D_ASSERT(ext.parse_function);
+ auto result = ext.parse_function(ext.parser_info.get(), query_statement);
+ if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
+ auto statement = make_uniq<ExtensionStatement>(ext, std::move(result.parse_data));
+ statement->stmt_length = query_statement.size() - 1;
+ statement->stmt_location = stmt_loc;
+ stmt_loc += query_statement.size();
+ statements.push_back(std::move(statement));
+ parsed_single_statement = true;
+ break;
+ } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
+ throw ParserException(result.error);
+ } else {
+ // We move to the next one!
  }
  }
- // LCOV_EXCL_STOP
+ if (!parsed_single_statement) {
+ throw ParserException(parser_error);
+ } // LCOV_EXCL_STOP
  }
  }
  }
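The new explicit scope exists because the Postgres-derived parser keeps process-global state, so an extension cannot construct its own `PostgresParser` while the built-in one is still alive. A self-contained toy (not the real classes) of the same pattern: try the built-in parser inside a block, remember only its error, and let it be destroyed before any fallback parser is created.

```cpp
#include <cassert>
#include <string>

// Toy stand-in for a non-reentrant parser: only one live instance is allowed,
// like the global state behind PostgresParser.
struct ToyParser {
	static bool active;
	ToyParser() {
		assert(!active && "parser is not reentrant");
		active = true;
	}
	~ToyParser() {
		active = false;
	}
	bool Parse(const std::string &s) {
		return s == "SELECT 1";
	}
};
bool ToyParser::active = false;

bool ParseWithFallback(const std::string &stmt) {
	std::string builtin_error;
	{
		ToyParser builtin; // first attempt with the built-in parser
		if (builtin.Parse(stmt)) {
			return true;
		}
		builtin_error = "could not parse: " + stmt; // keep only the error
	} // builtin destroyed here, so a fallback may construct its own parser
	ToyParser fallback;                // safe: the previous instance is gone
	return fallback.Parse("SELECT 1"); // pretend the extension rewrites the input
}

int main() {
	assert(ParseWithFallback("SELECT 1"));
	assert(ParseWithFallback("NOT SQL"));
	return 0;
}
```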
@@ -151,4 +151,16 @@ bool ArenaAllocator::IsEmpty() const {
  return head == nullptr;
  }
 
+ idx_t ArenaAllocator::SizeInBytes() const {
+ idx_t total_size = 0;
+ if (!IsEmpty()) {
+ auto current = head.get();
+ while (current != nullptr) {
+ total_size += current->current_position;
+ current = current->next.get();
+ }
+ }
+ return total_size;
+ }
+
  } // namespace duckdb
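Taken together, the `SizeInBytes()` additions above form one chain: the collection sums its segments, each segment adds its block allocator and its string heap, and the heap reports its arena's used bytes. A compact toy model (not DuckDB code) of that delegation, which is ultimately what `ParquetWriteSink` consults when deciding to flush a row group by bytes:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using idx_t = uint64_t;

// Arena: bytes actually written into its chunks.
struct ToyArena {
	idx_t used = 0;
	idx_t SizeInBytes() const { return used; }
};

// String heap: just forwards to its arena.
struct ToyStringHeap {
	ToyArena arena;
	idx_t SizeInBytes() const { return arena.SizeInBytes(); }
};

// Block allocator: sums the sizes of the blocks it handed out.
struct ToyAllocator {
	std::vector<idx_t> block_sizes;
	idx_t SizeInBytes() const {
		idx_t total = 0;
		for (auto size : block_sizes) {
			total += size;
		}
		return total;
	}
};

// Segment: fixed-width data lives in the allocator, strings in the heap.
struct ToySegment {
	ToyAllocator allocator;
	ToyStringHeap heap;
	idx_t SizeInBytes() const { return allocator.SizeInBytes() + heap.SizeInBytes(); }
};

// Collection: sums all of its segments.
struct ToyCollection {
	std::vector<ToySegment> segments;
	idx_t SizeInBytes() const {
		idx_t total = 0;
		for (auto &segment : segments) {
			total += segment.SizeInBytes();
		}
		return total;
	}
};

int main() {
	ToySegment segment;
	segment.allocator.block_sizes = {4096, 4096}; // two data blocks
	segment.heap.arena.used = 128;                // string payload

	ToyCollection collection;
	collection.segments.push_back(segment);
	assert(collection.SizeInBytes() == 4096 + 4096 + 128);
	return 0;
}
```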