duckdb 0.9.1-dev120.0 → 0.9.1-dev157.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +26 -1
- package/src/duckdb/extension/parquet/include/column_reader.hpp +2 -0
- package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +49 -0
- package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +1 -4
- package/src/duckdb/src/execution/operator/helper/physical_set.cpp +2 -4
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +2 -2
- package/src/duckdb/src/main/config.cpp +14 -0
- package/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp +5 -7
- package/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp +4 -2
- package/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +17 -14
- package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +5 -12
- package/src/duckdb/src/transaction/duck_transaction_manager.cpp +13 -9
- package/src/duckdb/third_party/parquet/parquet_types.h +2 -1
package/package.json
CHANGED
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
|
|
243
243
|
void ColumnReader::PrepareRead(parquet_filter_t &filter) {
|
244
244
|
dict_decoder.reset();
|
245
245
|
defined_decoder.reset();
|
246
|
+
bss_decoder.reset();
|
246
247
|
block.reset();
|
247
248
|
PageHeader page_hdr;
|
248
249
|
page_hdr.read(protocol);
|
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
|
|
443
444
|
PrepareDeltaByteArray(*block);
|
444
445
|
break;
|
445
446
|
}
|
447
|
+
case Encoding::BYTE_STREAM_SPLIT: {
|
448
|
+
// Subtract 1 from length as the block is allocated with 1 extra byte,
|
449
|
+
// but the byte stream split decoder needs to know the correct data size.
|
450
|
+
bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
|
451
|
+
block->inc(block->len);
|
452
|
+
break;
|
453
|
+
}
|
446
454
|
case Encoding::PLAIN:
|
447
455
|
// nothing to do here, will be read directly below
|
448
456
|
break;
|
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
|
|
488
496
|
|
489
497
|
idx_t null_count = 0;
|
490
498
|
|
491
|
-
if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
|
499
|
+
if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
|
492
500
|
// we need the null count because the dictionary offsets have no entries for nulls
|
493
501
|
for (idx_t i = 0; i < read_now; i++) {
|
494
502
|
if (define_out[i + result_offset] != max_define) {
|
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
|
|
534
542
|
} else if (byte_array_data) {
|
535
543
|
// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
|
536
544
|
DeltaByteArray(define_out, read_now, filter, result_offset, result);
|
545
|
+
} else if (bss_decoder) {
|
546
|
+
auto read_buf = make_shared<ResizeableBuffer>();
|
547
|
+
|
548
|
+
switch (schema.type) {
|
549
|
+
case duckdb_parquet::format::Type::FLOAT:
|
550
|
+
read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
|
551
|
+
bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
|
552
|
+
break;
|
553
|
+
case duckdb_parquet::format::Type::DOUBLE:
|
554
|
+
read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
|
555
|
+
bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
|
556
|
+
break;
|
557
|
+
default:
|
558
|
+
throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
|
559
|
+
}
|
560
|
+
|
561
|
+
Plain(read_buf, define_out, read_now, filter, result_offset, result);
|
537
562
|
} else {
|
538
563
|
PlainReference(block, result);
|
539
564
|
Plain(block, define_out, read_now, filter, result_offset, result);
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#pragma once
|
10
10
|
|
11
11
|
#include "duckdb.hpp"
|
12
|
+
#include "parquet_bss_decoder.hpp"
|
12
13
|
#include "parquet_dbp_decoder.hpp"
|
13
14
|
#include "parquet_rle_bp_decoder.hpp"
|
14
15
|
#include "parquet_statistics.hpp"
|
@@ -161,6 +162,7 @@ private:
|
|
161
162
|
unique_ptr<RleBpDecoder> repeated_decoder;
|
162
163
|
unique_ptr<DbpDecoder> dbp_decoder;
|
163
164
|
unique_ptr<RleBpDecoder> rle_decoder;
|
165
|
+
unique_ptr<BssDecoder> bss_decoder;
|
164
166
|
|
165
167
|
// dummies for Skip()
|
166
168
|
parquet_filter_t none_filter;
|
@@ -0,0 +1,49 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// parquet_bss_decoder.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
#include "parquet_types.h"
|
11
|
+
#include "resizable_buffer.hpp"
|
12
|
+
|
13
|
+
namespace duckdb {
|
14
|
+
|
15
|
+
/// Decoder for the Byte Stream Split encoding
|
16
|
+
class BssDecoder {
|
17
|
+
public:
|
18
|
+
/// Create a decoder object. buffer/buffer_len is the encoded data.
|
19
|
+
BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
|
20
|
+
}
|
21
|
+
|
22
|
+
public:
|
23
|
+
template <typename T>
|
24
|
+
void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
|
25
|
+
if (buffer_.len % sizeof(T) != 0) {
|
26
|
+
std::stringstream error;
|
27
|
+
error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
|
28
|
+
<< ") should be a multiple of the type size (" << sizeof(T) << ")";
|
29
|
+
throw std::runtime_error(error.str());
|
30
|
+
}
|
31
|
+
uint32_t num_buffer_values = buffer_.len / sizeof(T);
|
32
|
+
|
33
|
+
buffer_.available((value_offset_ + batch_size) * sizeof(T));
|
34
|
+
|
35
|
+
for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
|
36
|
+
data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
|
37
|
+
for (uint32_t i = 0; i < batch_size; ++i) {
|
38
|
+
values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
|
39
|
+
}
|
40
|
+
}
|
41
|
+
value_offset_ += batch_size;
|
42
|
+
}
|
43
|
+
|
44
|
+
private:
|
45
|
+
ByteBuffer buffer_;
|
46
|
+
uint32_t value_offset_;
|
47
|
+
};
|
48
|
+
|
49
|
+
} // namespace duckdb
|
@@ -21,10 +21,7 @@ void PhysicalReset::ResetExtensionVariable(ExecutionContext &context, DBConfig &
|
|
21
21
|
|
22
22
|
SourceResultType PhysicalReset::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const {
|
23
23
|
auto &config = DBConfig::GetConfig(context.client);
|
24
|
-
|
25
|
-
throw InvalidInputException("Cannot reset configuration option \"%s\" - the configuration has been locked",
|
26
|
-
name);
|
27
|
-
}
|
24
|
+
config.CheckLock(name);
|
28
25
|
auto option = DBConfig::GetOptionByName(name);
|
29
26
|
if (!option) {
|
30
27
|
// check if this is an extra extension variable
|
@@ -24,10 +24,8 @@ void PhysicalSet::SetExtensionVariable(ClientContext &context, ExtensionOption &
|
|
24
24
|
|
25
25
|
SourceResultType PhysicalSet::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const {
|
26
26
|
auto &config = DBConfig::GetConfig(context.client);
|
27
|
-
if
|
28
|
-
|
29
|
-
name);
|
30
|
-
}
|
27
|
+
// check if we are allowed to change the configuration option
|
28
|
+
config.CheckLock(name);
|
31
29
|
auto option = DBConfig::GetOptionByName(name);
|
32
30
|
if (!option) {
|
33
31
|
// check if this is an extra extension variable
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "v0.9.1-dev120"
|
2
|
+
#define DUCKDB_VERSION "v0.9.1-dev157"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "fa87a54b70"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -240,6 +240,8 @@ public:
|
|
240
240
|
DUCKDB_API void SetOption(const string &name, Value value);
|
241
241
|
DUCKDB_API void ResetOption(const string &name);
|
242
242
|
|
243
|
+
DUCKDB_API void CheckLock(const string &name);
|
244
|
+
|
243
245
|
DUCKDB_API static idx_t ParseMemoryLimit(const string &arg);
|
244
246
|
|
245
247
|
//! Return the list of possible compression functions for the specific physical type
|
@@ -90,8 +90,8 @@ public:
|
|
90
90
|
void QualifyColumnNames(unique_ptr<ParsedExpression> &expr);
|
91
91
|
static void QualifyColumnNames(Binder &binder, unique_ptr<ParsedExpression> &expr);
|
92
92
|
|
93
|
-
static
|
94
|
-
|
93
|
+
static bool PushCollation(ClientContext &context, unique_ptr<Expression> &source, const LogicalType &sql_type,
|
94
|
+
bool equality_only = false);
|
95
95
|
static void TestCollation(ClientContext &context, const string &collation);
|
96
96
|
|
97
97
|
bool BindCorrelatedColumns(unique_ptr<ParsedExpression> &expr);
|
@@ -233,6 +233,20 @@ void DBConfig::SetDefaultMaxMemory() {
|
|
233
233
|
}
|
234
234
|
}
|
235
235
|
|
236
|
+
void DBConfig::CheckLock(const string &name) {
|
237
|
+
if (!options.lock_configuration) {
|
238
|
+
// not locked
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
case_insensitive_set_t allowed_settings {"schema", "search_path"};
|
242
|
+
if (allowed_settings.find(name) != allowed_settings.end()) {
|
243
|
+
// we are always allowed to change these settings
|
244
|
+
return;
|
245
|
+
}
|
246
|
+
// not allowed!
|
247
|
+
throw InvalidInputException("Cannot change configuration option \"%s\" - the configuration has been locked", name);
|
248
|
+
}
|
249
|
+
|
236
250
|
idx_t CGroupBandwidthQuota(idx_t physical_cores, FileSystem &fs) {
|
237
251
|
static constexpr const char *CPU_MAX = "/sys/fs/cgroup/cpu.max";
|
238
252
|
static constexpr const char *CFS_QUOTA = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us";
|
@@ -34,13 +34,11 @@ BindResult ExpressionBinder::BindExpression(BetweenExpression &expr, idx_t depth
|
|
34
34
|
input = BoundCastExpression::AddCastToType(context, std::move(input), input_type);
|
35
35
|
lower = BoundCastExpression::AddCastToType(context, std::move(lower), input_type);
|
36
36
|
upper = BoundCastExpression::AddCastToType(context, std::move(upper), input_type);
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
upper = PushCollation(context, std::move(upper), collation, false);
|
43
|
-
}
|
37
|
+
// handle collation
|
38
|
+
PushCollation(context, input, input_type, false);
|
39
|
+
PushCollation(context, lower, input_type, false);
|
40
|
+
PushCollation(context, upper, input_type, false);
|
41
|
+
|
44
42
|
if (!input->HasSideEffects() && !input->HasParameter() && !input->HasSubquery()) {
|
45
43
|
// the expression does not have side effects and can be copied: create two comparisons
|
46
44
|
// the reason we do this is that individual comparisons are easier to handle in optimizers
|
@@ -18,8 +18,10 @@ BindResult ExpressionBinder::BindExpression(CollateExpression &expr, idx_t depth
|
|
18
18
|
throw BinderException("collations are only supported for type varchar");
|
19
19
|
}
|
20
20
|
// Validate the collation, but don't use it
|
21
|
-
|
22
|
-
|
21
|
+
auto child_copy = child->Copy();
|
22
|
+
auto collation_type = LogicalType::VARCHAR_COLLATION(expr.collation);
|
23
|
+
PushCollation(context, child_copy, collation_type, false);
|
24
|
+
child->return_type = collation_type;
|
23
25
|
return BindResult(std::move(child));
|
24
26
|
}
|
25
27
|
|
@@ -18,20 +18,25 @@
|
|
18
18
|
|
19
19
|
namespace duckdb {
|
20
20
|
|
21
|
-
|
22
|
-
|
21
|
+
bool ExpressionBinder::PushCollation(ClientContext &context, unique_ptr<Expression> &source,
|
22
|
+
const LogicalType &sql_type, bool equality_only) {
|
23
|
+
if (sql_type.id() != LogicalTypeId::VARCHAR) {
|
24
|
+
// only VARCHAR columns require collation
|
25
|
+
return false;
|
26
|
+
}
|
23
27
|
// replace default collation with system collation
|
28
|
+
auto str_collation = StringType::GetCollation(sql_type);
|
24
29
|
string collation;
|
25
|
-
if (
|
30
|
+
if (str_collation.empty()) {
|
26
31
|
collation = DBConfig::GetConfig(context).options.collation;
|
27
32
|
} else {
|
28
|
-
collation =
|
33
|
+
collation = str_collation;
|
29
34
|
}
|
30
35
|
collation = StringUtil::Lower(collation);
|
31
36
|
// bind the collation
|
32
37
|
if (collation.empty() || collation == "binary" || collation == "c" || collation == "posix") {
|
33
|
-
// binary collation:
|
34
|
-
return
|
38
|
+
// no collation or binary collation: skip
|
39
|
+
return false;
|
35
40
|
}
|
36
41
|
auto &catalog = Catalog::GetSystemCatalog(context);
|
37
42
|
auto splits = StringUtil::Split(StringUtil::Lower(collation), ".");
|
@@ -60,11 +65,12 @@ unique_ptr<Expression> ExpressionBinder::PushCollation(ClientContext &context, u
|
|
60
65
|
auto function = function_binder.BindScalarFunction(collation_entry.function, std::move(children));
|
61
66
|
source = std::move(function);
|
62
67
|
}
|
63
|
-
return
|
68
|
+
return true;
|
64
69
|
}
|
65
70
|
|
66
71
|
void ExpressionBinder::TestCollation(ClientContext &context, const string &collation) {
|
67
|
-
|
72
|
+
auto expr = make_uniq_base<Expression, BoundConstantExpression>(Value(""));
|
73
|
+
PushCollation(context, expr, LogicalType::VARCHAR_COLLATION(collation));
|
68
74
|
}
|
69
75
|
|
70
76
|
LogicalType BoundComparisonExpression::BindComparison(LogicalType left_type, LogicalType right_type) {
|
@@ -134,12 +140,9 @@ BindResult ExpressionBinder::BindExpression(ComparisonExpression &expr, idx_t de
|
|
134
140
|
right = BoundCastExpression::AddCastToType(context, std::move(right), input_type,
|
135
141
|
input_type.id() == LogicalTypeId::ENUM);
|
136
142
|
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
left = PushCollation(context, std::move(left), collation, expr.type == ExpressionType::COMPARE_EQUAL);
|
141
|
-
right = PushCollation(context, std::move(right), collation, expr.type == ExpressionType::COMPARE_EQUAL);
|
142
|
-
}
|
143
|
+
PushCollation(context, left, input_type, expr.type == ExpressionType::COMPARE_EQUAL);
|
144
|
+
PushCollation(context, right, input_type, expr.type == ExpressionType::COMPARE_EQUAL);
|
145
|
+
|
143
146
|
// now create the bound comparison expression
|
144
147
|
return BindResult(make_uniq<BoundComparisonExpression>(expr.type, std::move(left), std::move(right)));
|
145
148
|
}
|
@@ -222,10 +222,7 @@ void Binder::BindModifierTypes(BoundQueryNode &result, const vector<LogicalType>
|
|
222
222
|
for (auto &target_distinct : distinct.target_distincts) {
|
223
223
|
auto &bound_colref = target_distinct->Cast<BoundColumnRefExpression>();
|
224
224
|
const auto &sql_type = sql_types[bound_colref.binding.column_index];
|
225
|
-
|
226
|
-
target_distinct = ExpressionBinder::PushCollation(context, std::move(target_distinct),
|
227
|
-
StringType::GetCollation(sql_type), true);
|
228
|
-
}
|
225
|
+
ExpressionBinder::PushCollation(context, target_distinct, sql_type, true);
|
229
226
|
}
|
230
227
|
break;
|
231
228
|
}
|
@@ -253,10 +250,7 @@ void Binder::BindModifierTypes(BoundQueryNode &result, const vector<LogicalType>
|
|
253
250
|
D_ASSERT(bound_colref.binding.column_index < sql_types.size());
|
254
251
|
const auto &sql_type = sql_types[bound_colref.binding.column_index];
|
255
252
|
bound_colref.return_type = sql_types[bound_colref.binding.column_index];
|
256
|
-
|
257
|
-
order_node.expression = ExpressionBinder::PushCollation(context, std::move(order_node.expression),
|
258
|
-
StringType::GetCollation(sql_type));
|
259
|
-
}
|
253
|
+
ExpressionBinder::PushCollation(context, order_node.expression, sql_type);
|
260
254
|
}
|
261
255
|
break;
|
262
256
|
}
|
@@ -389,9 +383,8 @@ unique_ptr<BoundQueryNode> Binder::BindSelectNode(SelectNode &statement, unique_
|
|
389
383
|
bool contains_subquery = bound_expr_ref.HasSubquery();
|
390
384
|
|
391
385
|
// push a potential collation, if necessary
|
392
|
-
|
393
|
-
|
394
|
-
if (!contains_subquery && !collated_expr->Equals(bound_expr_ref)) {
|
386
|
+
bool requires_collation = ExpressionBinder::PushCollation(context, bound_expr, group_type, true);
|
387
|
+
if (!contains_subquery && requires_collation) {
|
395
388
|
// if there is a collation on a group x, we should group by the collated expr,
|
396
389
|
// but also push a first(x) aggregate in case x is selected (uncollated)
|
397
390
|
info.collated_groups[i] = result->aggregates.size();
|
@@ -405,7 +398,7 @@ unique_ptr<BoundQueryNode> Binder::BindSelectNode(SelectNode &statement, unique_
|
|
405
398
|
auto function = function_binder.BindAggregateFunction(first_fun, std::move(first_children));
|
406
399
|
result->aggregates.push_back(std::move(function));
|
407
400
|
}
|
408
|
-
result->groups.group_expressions.push_back(std::move(
|
401
|
+
result->groups.group_expressions.push_back(std::move(bound_expr));
|
409
402
|
|
410
403
|
// in the unbound expression we DO bind the table names of any ColumnRefs
|
411
404
|
// we do this to make sure that "table.a" and "a" are treated the same
|
@@ -252,6 +252,7 @@ void DuckTransactionManager::RollbackTransaction(Transaction *transaction_p) {
|
|
252
252
|
}
|
253
253
|
|
254
254
|
void DuckTransactionManager::RemoveTransaction(DuckTransaction &transaction) noexcept {
|
255
|
+
bool changes_made = transaction.ChangesMade();
|
255
256
|
// remove the transaction from the list of active transactions
|
256
257
|
idx_t t_index = active_transactions.size();
|
257
258
|
// check for the lowest and highest start time in the list of transactions
|
@@ -275,15 +276,18 @@ void DuckTransactionManager::RemoveTransaction(DuckTransaction &transaction) noe
|
|
275
276
|
D_ASSERT(t_index != active_transactions.size());
|
276
277
|
auto current_transaction = std::move(active_transactions[t_index]);
|
277
278
|
auto current_query = DatabaseManager::Get(db).ActiveQueryNumber();
|
278
|
-
if (
|
279
|
-
// the transaction
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
279
|
+
if (changes_made) {
|
280
|
+
// if the transaction made any changes we need to keep it around
|
281
|
+
if (transaction.commit_id != 0) {
|
282
|
+
// the transaction was committed, add it to the list of recently
|
283
|
+
// committed transactions
|
284
|
+
recently_committed_transactions.push_back(std::move(current_transaction));
|
285
|
+
} else {
|
286
|
+
// the transaction was aborted, but we might still need its information
|
287
|
+
// add it to the set of transactions awaiting GC
|
288
|
+
current_transaction->highest_active_query = current_query;
|
289
|
+
old_transactions.push_back(std::move(current_transaction));
|
290
|
+
}
|
287
291
|
}
|
288
292
|
// remove the transaction from the set of currently active transactions
|
289
293
|
active_transactions.erase(active_transactions.begin() + t_index);
|