duckdb 0.9.1-dev120.0 → 0.9.1-dev157.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.9.1-dev120.0",
5
+ "version": "0.9.1-dev157.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
243
243
  void ColumnReader::PrepareRead(parquet_filter_t &filter) {
244
244
  dict_decoder.reset();
245
245
  defined_decoder.reset();
246
+ bss_decoder.reset();
246
247
  block.reset();
247
248
  PageHeader page_hdr;
248
249
  page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
443
444
  PrepareDeltaByteArray(*block);
444
445
  break;
445
446
  }
447
+ case Encoding::BYTE_STREAM_SPLIT: {
448
+ // Subtract 1 from length as the block is allocated with 1 extra byte,
449
+ // but the byte stream split encoder needs to know the correct data size.
450
+ bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
451
+ block->inc(block->len);
452
+ break;
453
+ }
446
454
  case Encoding::PLAIN:
447
455
  // nothing to do here, will be read directly below
448
456
  break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
488
496
 
489
497
  idx_t null_count = 0;
490
498
 
491
- if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
499
+ if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
492
500
  // we need the null count because the dictionary offsets have no entries for nulls
493
501
  for (idx_t i = 0; i < read_now; i++) {
494
502
  if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
534
542
  } else if (byte_array_data) {
535
543
  // DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
536
544
  DeltaByteArray(define_out, read_now, filter, result_offset, result);
545
+ } else if (bss_decoder) {
546
+ auto read_buf = make_shared<ResizeableBuffer>();
547
+
548
+ switch (schema.type) {
549
+ case duckdb_parquet::format::Type::FLOAT:
550
+ read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
551
+ bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
552
+ break;
553
+ case duckdb_parquet::format::Type::DOUBLE:
554
+ read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
555
+ bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
556
+ break;
557
+ default:
558
+ throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
559
+ }
560
+
561
+ Plain(read_buf, define_out, read_now, filter, result_offset, result);
537
562
  } else {
538
563
  PlainReference(block, result);
539
564
  Plain(block, define_out, read_now, filter, result_offset, result);
@@ -9,6 +9,7 @@
9
9
  #pragma once
10
10
 
11
11
  #include "duckdb.hpp"
12
+ #include "parquet_bss_decoder.hpp"
12
13
  #include "parquet_dbp_decoder.hpp"
13
14
  #include "parquet_rle_bp_decoder.hpp"
14
15
  #include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ private:
161
162
  unique_ptr<RleBpDecoder> repeated_decoder;
162
163
  unique_ptr<DbpDecoder> dbp_decoder;
163
164
  unique_ptr<RleBpDecoder> rle_decoder;
165
+ unique_ptr<BssDecoder> bss_decoder;
164
166
 
165
167
  // dummies for Skip()
166
168
  parquet_filter_t none_filter;
@@ -0,0 +1,49 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // parquet_bss_decoder.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+ #include "parquet_types.h"
11
+ #include "resizable_buffer.hpp"
12
+
13
+ namespace duckdb {
14
+
15
+ /// Decoder for the Byte Stream Split encoding
16
+ class BssDecoder {
17
+ public:
18
+ /// Create a decoder object. buffer/buffer_len is the encoded data.
19
+ BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
20
+ }
21
+
22
+ public:
23
+ template <typename T>
24
+ void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
25
+ if (buffer_.len % sizeof(T) != 0) {
26
+ std::stringstream error;
27
+ error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
28
+ << ") should be a multiple of the type size (" << sizeof(T) << ")";
29
+ throw std::runtime_error(error.str());
30
+ }
31
+ uint32_t num_buffer_values = buffer_.len / sizeof(T);
32
+
33
+ buffer_.available((value_offset_ + batch_size) * sizeof(T));
34
+
35
+ for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
36
+ data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
37
+ for (uint32_t i = 0; i < batch_size; ++i) {
38
+ values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
39
+ }
40
+ }
41
+ value_offset_ += batch_size;
42
+ }
43
+
44
+ private:
45
+ ByteBuffer buffer_;
46
+ uint32_t value_offset_;
47
+ };
48
+
49
+ } // namespace duckdb
@@ -21,10 +21,7 @@ void PhysicalReset::ResetExtensionVariable(ExecutionContext &context, DBConfig &
21
21
 
22
22
  SourceResultType PhysicalReset::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const {
23
23
  auto &config = DBConfig::GetConfig(context.client);
24
- if (config.options.lock_configuration) {
25
- throw InvalidInputException("Cannot reset configuration option \"%s\" - the configuration has been locked",
26
- name);
27
- }
24
+ config.CheckLock(name);
28
25
  auto option = DBConfig::GetOptionByName(name);
29
26
  if (!option) {
30
27
  // check if this is an extra extension variable
@@ -24,10 +24,8 @@ void PhysicalSet::SetExtensionVariable(ClientContext &context, ExtensionOption &
24
24
 
25
25
  SourceResultType PhysicalSet::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const {
26
26
  auto &config = DBConfig::GetConfig(context.client);
27
- if (config.options.lock_configuration) {
28
- throw InvalidInputException("Cannot change configuration option \"%s\" - the configuration has been locked",
29
- name);
30
- }
27
+ // check if we are allowed to change the configuration option
28
+ config.CheckLock(name);
31
29
  auto option = DBConfig::GetOptionByName(name);
32
30
  if (!option) {
33
31
  // check if this is an extra extension variable
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "v0.9.1-dev120"
2
+ #define DUCKDB_VERSION "v0.9.1-dev157"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "af666ad8ba"
5
+ #define DUCKDB_SOURCE_ID "fa87a54b70"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -240,6 +240,8 @@ public:
240
240
  DUCKDB_API void SetOption(const string &name, Value value);
241
241
  DUCKDB_API void ResetOption(const string &name);
242
242
 
243
+ DUCKDB_API void CheckLock(const string &name);
244
+
243
245
  DUCKDB_API static idx_t ParseMemoryLimit(const string &arg);
244
246
 
245
247
  //! Return the list of possible compression functions for the specific physical type
@@ -90,8 +90,8 @@ public:
90
90
  void QualifyColumnNames(unique_ptr<ParsedExpression> &expr);
91
91
  static void QualifyColumnNames(Binder &binder, unique_ptr<ParsedExpression> &expr);
92
92
 
93
- static unique_ptr<Expression> PushCollation(ClientContext &context, unique_ptr<Expression> source,
94
- const string &collation, bool equality_only = false);
93
+ static bool PushCollation(ClientContext &context, unique_ptr<Expression> &source, const LogicalType &sql_type,
94
+ bool equality_only = false);
95
95
  static void TestCollation(ClientContext &context, const string &collation);
96
96
 
97
97
  bool BindCorrelatedColumns(unique_ptr<ParsedExpression> &expr);
@@ -233,6 +233,20 @@ void DBConfig::SetDefaultMaxMemory() {
233
233
  }
234
234
  }
235
235
 
236
+ void DBConfig::CheckLock(const string &name) {
237
+ if (!options.lock_configuration) {
238
+ // not locked
239
+ return;
240
+ }
241
+ case_insensitive_set_t allowed_settings {"schema", "search_path"};
242
+ if (allowed_settings.find(name) != allowed_settings.end()) {
243
+ // we are always allowed to change these settings
244
+ return;
245
+ }
246
+ // not allowed!
247
+ throw InvalidInputException("Cannot change configuration option \"%s\" - the configuration has been locked", name);
248
+ }
249
+
236
250
  idx_t CGroupBandwidthQuota(idx_t physical_cores, FileSystem &fs) {
237
251
  static constexpr const char *CPU_MAX = "/sys/fs/cgroup/cpu.max";
238
252
  static constexpr const char *CFS_QUOTA = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us";
@@ -34,13 +34,11 @@ BindResult ExpressionBinder::BindExpression(BetweenExpression &expr, idx_t depth
34
34
  input = BoundCastExpression::AddCastToType(context, std::move(input), input_type);
35
35
  lower = BoundCastExpression::AddCastToType(context, std::move(lower), input_type);
36
36
  upper = BoundCastExpression::AddCastToType(context, std::move(upper), input_type);
37
- if (input_type.id() == LogicalTypeId::VARCHAR) {
38
- // handle collation
39
- auto collation = StringType::GetCollation(input_type);
40
- input = PushCollation(context, std::move(input), collation, false);
41
- lower = PushCollation(context, std::move(lower), collation, false);
42
- upper = PushCollation(context, std::move(upper), collation, false);
43
- }
37
+ // handle collation
38
+ PushCollation(context, input, input_type, false);
39
+ PushCollation(context, lower, input_type, false);
40
+ PushCollation(context, upper, input_type, false);
41
+
44
42
  if (!input->HasSideEffects() && !input->HasParameter() && !input->HasSubquery()) {
45
43
  // the expression does not have side effects and can be copied: create two comparisons
46
44
  // the reason we do this is that individual comparisons are easier to handle in optimizers
@@ -18,8 +18,10 @@ BindResult ExpressionBinder::BindExpression(CollateExpression &expr, idx_t depth
18
18
  throw BinderException("collations are only supported for type varchar");
19
19
  }
20
20
  // Validate the collation, but don't use it
21
- PushCollation(context, child->Copy(), expr.collation, false);
22
- child->return_type = LogicalType::VARCHAR_COLLATION(expr.collation);
21
+ auto child_copy = child->Copy();
22
+ auto collation_type = LogicalType::VARCHAR_COLLATION(expr.collation);
23
+ PushCollation(context, child_copy, collation_type, false);
24
+ child->return_type = collation_type;
23
25
  return BindResult(std::move(child));
24
26
  }
25
27
 
@@ -18,20 +18,25 @@
18
18
 
19
19
  namespace duckdb {
20
20
 
21
- unique_ptr<Expression> ExpressionBinder::PushCollation(ClientContext &context, unique_ptr<Expression> source,
22
- const string &collation_p, bool equality_only) {
21
+ bool ExpressionBinder::PushCollation(ClientContext &context, unique_ptr<Expression> &source,
22
+ const LogicalType &sql_type, bool equality_only) {
23
+ if (sql_type.id() != LogicalTypeId::VARCHAR) {
24
+ // only VARCHAR columns require collation
25
+ return false;
26
+ }
23
27
  // replace default collation with system collation
28
+ auto str_collation = StringType::GetCollation(sql_type);
24
29
  string collation;
25
- if (collation_p.empty()) {
30
+ if (str_collation.empty()) {
26
31
  collation = DBConfig::GetConfig(context).options.collation;
27
32
  } else {
28
- collation = collation_p;
33
+ collation = str_collation;
29
34
  }
30
35
  collation = StringUtil::Lower(collation);
31
36
  // bind the collation
32
37
  if (collation.empty() || collation == "binary" || collation == "c" || collation == "posix") {
33
- // binary collation: just skip
34
- return source;
38
+ // no collation or binary collation: skip
39
+ return false;
35
40
  }
36
41
  auto &catalog = Catalog::GetSystemCatalog(context);
37
42
  auto splits = StringUtil::Split(StringUtil::Lower(collation), ".");
@@ -60,11 +65,12 @@ unique_ptr<Expression> ExpressionBinder::PushCollation(ClientContext &context, u
60
65
  auto function = function_binder.BindScalarFunction(collation_entry.function, std::move(children));
61
66
  source = std::move(function);
62
67
  }
63
- return source;
68
+ return true;
64
69
  }
65
70
 
66
71
  void ExpressionBinder::TestCollation(ClientContext &context, const string &collation) {
67
- PushCollation(context, make_uniq<BoundConstantExpression>(Value("")), collation);
72
+ auto expr = make_uniq_base<Expression, BoundConstantExpression>(Value(""));
73
+ PushCollation(context, expr, LogicalType::VARCHAR_COLLATION(collation));
68
74
  }
69
75
 
70
76
  LogicalType BoundComparisonExpression::BindComparison(LogicalType left_type, LogicalType right_type) {
@@ -134,12 +140,9 @@ BindResult ExpressionBinder::BindExpression(ComparisonExpression &expr, idx_t de
134
140
  right = BoundCastExpression::AddCastToType(context, std::move(right), input_type,
135
141
  input_type.id() == LogicalTypeId::ENUM);
136
142
 
137
- if (input_type.id() == LogicalTypeId::VARCHAR) {
138
- // handle collation
139
- auto collation = StringType::GetCollation(input_type);
140
- left = PushCollation(context, std::move(left), collation, expr.type == ExpressionType::COMPARE_EQUAL);
141
- right = PushCollation(context, std::move(right), collation, expr.type == ExpressionType::COMPARE_EQUAL);
142
- }
143
+ PushCollation(context, left, input_type, expr.type == ExpressionType::COMPARE_EQUAL);
144
+ PushCollation(context, right, input_type, expr.type == ExpressionType::COMPARE_EQUAL);
145
+
143
146
  // now create the bound comparison expression
144
147
  return BindResult(make_uniq<BoundComparisonExpression>(expr.type, std::move(left), std::move(right)));
145
148
  }
@@ -222,10 +222,7 @@ void Binder::BindModifierTypes(BoundQueryNode &result, const vector<LogicalType>
222
222
  for (auto &target_distinct : distinct.target_distincts) {
223
223
  auto &bound_colref = target_distinct->Cast<BoundColumnRefExpression>();
224
224
  const auto &sql_type = sql_types[bound_colref.binding.column_index];
225
- if (sql_type.id() == LogicalTypeId::VARCHAR) {
226
- target_distinct = ExpressionBinder::PushCollation(context, std::move(target_distinct),
227
- StringType::GetCollation(sql_type), true);
228
- }
225
+ ExpressionBinder::PushCollation(context, target_distinct, sql_type, true);
229
226
  }
230
227
  break;
231
228
  }
@@ -253,10 +250,7 @@ void Binder::BindModifierTypes(BoundQueryNode &result, const vector<LogicalType>
253
250
  D_ASSERT(bound_colref.binding.column_index < sql_types.size());
254
251
  const auto &sql_type = sql_types[bound_colref.binding.column_index];
255
252
  bound_colref.return_type = sql_types[bound_colref.binding.column_index];
256
- if (sql_type.id() == LogicalTypeId::VARCHAR) {
257
- order_node.expression = ExpressionBinder::PushCollation(context, std::move(order_node.expression),
258
- StringType::GetCollation(sql_type));
259
- }
253
+ ExpressionBinder::PushCollation(context, order_node.expression, sql_type);
260
254
  }
261
255
  break;
262
256
  }
@@ -389,9 +383,8 @@ unique_ptr<BoundQueryNode> Binder::BindSelectNode(SelectNode &statement, unique_
389
383
  bool contains_subquery = bound_expr_ref.HasSubquery();
390
384
 
391
385
  // push a potential collation, if necessary
392
- auto collated_expr = ExpressionBinder::PushCollation(context, std::move(bound_expr),
393
- StringType::GetCollation(group_type), true);
394
- if (!contains_subquery && !collated_expr->Equals(bound_expr_ref)) {
386
+ bool requires_collation = ExpressionBinder::PushCollation(context, bound_expr, group_type, true);
387
+ if (!contains_subquery && requires_collation) {
395
388
  // if there is a collation on a group x, we should group by the collated expr,
396
389
  // but also push a first(x) aggregate in case x is selected (uncollated)
397
390
  info.collated_groups[i] = result->aggregates.size();
@@ -405,7 +398,7 @@ unique_ptr<BoundQueryNode> Binder::BindSelectNode(SelectNode &statement, unique_
405
398
  auto function = function_binder.BindAggregateFunction(first_fun, std::move(first_children));
406
399
  result->aggregates.push_back(std::move(function));
407
400
  }
408
- result->groups.group_expressions.push_back(std::move(collated_expr));
401
+ result->groups.group_expressions.push_back(std::move(bound_expr));
409
402
 
410
403
  // in the unbound expression we DO bind the table names of any ColumnRefs
411
404
  // we do this to make sure that "table.a" and "a" are treated the same
@@ -252,6 +252,7 @@ void DuckTransactionManager::RollbackTransaction(Transaction *transaction_p) {
252
252
  }
253
253
 
254
254
  void DuckTransactionManager::RemoveTransaction(DuckTransaction &transaction) noexcept {
255
+ bool changes_made = transaction.ChangesMade();
255
256
  // remove the transaction from the list of active transactions
256
257
  idx_t t_index = active_transactions.size();
257
258
  // check for the lowest and highest start time in the list of transactions
@@ -275,15 +276,18 @@ void DuckTransactionManager::RemoveTransaction(DuckTransaction &transaction) noe
275
276
  D_ASSERT(t_index != active_transactions.size());
276
277
  auto current_transaction = std::move(active_transactions[t_index]);
277
278
  auto current_query = DatabaseManager::Get(db).ActiveQueryNumber();
278
- if (transaction.commit_id != 0) {
279
- // the transaction was committed, add it to the list of recently
280
- // committed transactions
281
- recently_committed_transactions.push_back(std::move(current_transaction));
282
- } else {
283
- // the transaction was aborted, but we might still need its information
284
- // add it to the set of transactions awaiting GC
285
- current_transaction->highest_active_query = current_query;
286
- old_transactions.push_back(std::move(current_transaction));
279
+ if (changes_made) {
280
+ // if the transaction made any changes we need to keep it around
281
+ if (transaction.commit_id != 0) {
282
+ // the transaction was committed, add it to the list of recently
283
+ // committed transactions
284
+ recently_committed_transactions.push_back(std::move(current_transaction));
285
+ } else {
286
+ // the transaction was aborted, but we might still need its information
287
+ // add it to the set of transactions awaiting GC
288
+ current_transaction->highest_active_query = current_query;
289
+ old_transactions.push_back(std::move(current_transaction));
290
+ }
287
291
  }
288
292
  // remove the transaction from the set of currently active transactions
289
293
  active_transactions.erase(active_transactions.begin() + t_index);
@@ -92,7 +92,8 @@ struct Encoding {
92
92
  DELTA_BINARY_PACKED = 5,
93
93
  DELTA_LENGTH_BYTE_ARRAY = 6,
94
94
  DELTA_BYTE_ARRAY = 7,
95
- RLE_DICTIONARY = 8
95
+ RLE_DICTIONARY = 8,
96
+ BYTE_STREAM_SPLIT = 9,
96
97
  };
97
98
  };
98
99