duckdb 0.7.2-dev1138.0 → 0.7.2-dev1188.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp +1 -1
- package/src/duckdb/extension/parquet/parquet-extension.cpp +2 -1
- package/src/duckdb/src/common/local_file_system.cpp +64 -7
- package/src/duckdb/src/common/types/column_data_collection_segment.cpp +1 -4
- package/src/duckdb/src/common/types/validity_mask.cpp +24 -7
- package/src/duckdb/src/common/types/vector.cpp +2 -6
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +17 -10
- package/src/duckdb/src/function/cast_rules.cpp +9 -4
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
|
@@ -122,7 +122,8 @@ struct ParquetWriteGlobalState : public GlobalFunctionData {
|
|
|
122
122
|
};
|
|
123
123
|
|
|
124
124
|
struct ParquetWriteLocalState : public LocalFunctionData {
|
|
125
|
-
explicit ParquetWriteLocalState(ClientContext &context, const vector<LogicalType> &types)
|
|
125
|
+
explicit ParquetWriteLocalState(ClientContext &context, const vector<LogicalType> &types)
|
|
126
|
+
: buffer(Allocator::Get(context), types) {
|
|
126
127
|
}
|
|
127
128
|
|
|
128
129
|
ColumnDataCollection buffer;
|
|
@@ -832,6 +832,46 @@ static bool HasGlob(const string &str) {
|
|
|
832
832
|
}
|
|
833
833
|
return false;
|
|
834
834
|
}
|
|
835
|
+
static bool IsCrawl(const string &glob) {
|
|
836
|
+
// glob must match exactly
|
|
837
|
+
return glob == "**";
|
|
838
|
+
}
|
|
839
|
+
static bool HasMultipleCrawl(const vector<string> &splits) {
|
|
840
|
+
return std::count(splits.begin(), splits.end(), "**") > 1;
|
|
841
|
+
}
|
|
842
|
+
static bool IsSymbolicLink(const string &path) {
|
|
843
|
+
#ifndef _WIN32
|
|
844
|
+
struct stat status;
|
|
845
|
+
return (lstat(path.c_str(), &status) != -1 && S_ISLNK(status.st_mode));
|
|
846
|
+
#else
|
|
847
|
+
auto attributes = WindowsGetFileAttributes(path);
|
|
848
|
+
if (attributes == INVALID_FILE_ATTRIBUTES)
|
|
849
|
+
return false;
|
|
850
|
+
return attributes & FILE_ATTRIBUTE_REPARSE_POINT;
|
|
851
|
+
#endif
|
|
852
|
+
}
|
|
853
|
+
|
|
854
|
+
static void RecursiveGlobDirectories(FileSystem &fs, const string &path, vector<string> &result, bool match_directory,
|
|
855
|
+
bool join_path) {
|
|
856
|
+
|
|
857
|
+
fs.ListFiles(path, [&](const string &fname, bool is_directory) {
|
|
858
|
+
string concat;
|
|
859
|
+
if (join_path) {
|
|
860
|
+
concat = fs.JoinPath(path, fname);
|
|
861
|
+
} else {
|
|
862
|
+
concat = fname;
|
|
863
|
+
}
|
|
864
|
+
if (IsSymbolicLink(concat)) {
|
|
865
|
+
return;
|
|
866
|
+
}
|
|
867
|
+
if (is_directory == match_directory) {
|
|
868
|
+
result.push_back(concat);
|
|
869
|
+
}
|
|
870
|
+
if (is_directory) {
|
|
871
|
+
RecursiveGlobDirectories(fs, concat, result, match_directory, true);
|
|
872
|
+
}
|
|
873
|
+
});
|
|
874
|
+
}
|
|
835
875
|
|
|
836
876
|
static void GlobFilesInternal(FileSystem &fs, const string &path, const string &glob, bool match_directory,
|
|
837
877
|
vector<string> &result, bool join_path) {
|
|
@@ -933,6 +973,10 @@ vector<string> LocalFileSystem::Glob(const string &path, FileOpener *opener) {
|
|
|
933
973
|
}
|
|
934
974
|
}
|
|
935
975
|
|
|
976
|
+
if (HasMultipleCrawl(splits)) {
|
|
977
|
+
throw IOException("Cannot use multiple \'**\' in one path");
|
|
978
|
+
}
|
|
979
|
+
|
|
936
980
|
for (idx_t i = absolute_path ? 1 : 0; i < splits.size(); i++) {
|
|
937
981
|
bool is_last_chunk = i + 1 == splits.size();
|
|
938
982
|
bool has_glob = HasGlob(splits[i]);
|
|
@@ -949,14 +993,27 @@ vector<string> LocalFileSystem::Glob(const string &path, FileOpener *opener) {
|
|
|
949
993
|
}
|
|
950
994
|
}
|
|
951
995
|
} else {
|
|
952
|
-
if (
|
|
953
|
-
|
|
954
|
-
|
|
996
|
+
if (IsCrawl(splits[i])) {
|
|
997
|
+
if (!is_last_chunk) {
|
|
998
|
+
result = previous_directories;
|
|
999
|
+
}
|
|
1000
|
+
if (previous_directories.empty()) {
|
|
1001
|
+
RecursiveGlobDirectories(*this, ".", result, !is_last_chunk, false);
|
|
1002
|
+
} else {
|
|
1003
|
+
for (auto &prev_dir : previous_directories) {
|
|
1004
|
+
RecursiveGlobDirectories(*this, prev_dir, result, !is_last_chunk, true);
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
955
1007
|
} else {
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
1008
|
+
if (previous_directories.empty()) {
|
|
1009
|
+
// no previous directories: list in the current path
|
|
1010
|
+
GlobFilesInternal(*this, ".", splits[i], !is_last_chunk, result, false);
|
|
1011
|
+
} else {
|
|
1012
|
+
// previous directories
|
|
1013
|
+
// we iterate over each of the previous directories, and apply the glob of the current directory
|
|
1014
|
+
for (auto &prev_directory : previous_directories) {
|
|
1015
|
+
GlobFilesInternal(*this, prev_directory, splits[i], !is_last_chunk, result, true);
|
|
1016
|
+
}
|
|
960
1017
|
}
|
|
961
1018
|
}
|
|
962
1019
|
}
|
|
@@ -169,11 +169,8 @@ idx_t ColumnDataCollectionSegment::ReadVectorInternal(ChunkManagementState &stat
|
|
|
169
169
|
if (type_size > 0) {
|
|
170
170
|
memcpy(target_data + current_offset * type_size, base_ptr, current_vdata.count * type_size);
|
|
171
171
|
}
|
|
172
|
-
// FIXME: use bitwise operations here
|
|
173
172
|
ValidityMask current_validity(validity_data);
|
|
174
|
-
|
|
175
|
-
target_validity.Set(current_offset + k, current_validity.RowIsValid(k));
|
|
176
|
-
}
|
|
173
|
+
target_validity.SliceInPlace(current_validity, current_offset, 0, current_vdata.count);
|
|
177
174
|
current_offset += current_vdata.count;
|
|
178
175
|
next_index = current_vdata.next_data;
|
|
179
176
|
}
|
|
@@ -68,24 +68,41 @@ void ValidityMask::Resize(idx_t old_size, idx_t new_size) {
|
|
|
68
68
|
}
|
|
69
69
|
}
|
|
70
70
|
|
|
71
|
-
void ValidityMask::Slice(const ValidityMask &other, idx_t
|
|
71
|
+
void ValidityMask::Slice(const ValidityMask &other, idx_t source_offset, idx_t count) {
|
|
72
72
|
if (other.AllValid()) {
|
|
73
73
|
validity_mask = nullptr;
|
|
74
74
|
validity_data.reset();
|
|
75
75
|
return;
|
|
76
76
|
}
|
|
77
|
-
if (
|
|
77
|
+
if (source_offset == 0) {
|
|
78
78
|
Initialize(other);
|
|
79
79
|
return;
|
|
80
80
|
}
|
|
81
|
-
ValidityMask new_mask(
|
|
81
|
+
ValidityMask new_mask(count);
|
|
82
|
+
new_mask.SliceInPlace(other, 0, source_offset, count);
|
|
83
|
+
Initialize(new_mask);
|
|
84
|
+
}
|
|
82
85
|
|
|
83
|
-
|
|
86
|
+
bool ValidityMask::IsAligned(idx_t count) {
|
|
87
|
+
return count % BITS_PER_VALUE == 0;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
void ValidityMask::SliceInPlace(const ValidityMask &other, idx_t target_offset, idx_t source_offset, idx_t count) {
|
|
91
|
+
if (IsAligned(source_offset) && IsAligned(target_offset)) {
|
|
92
|
+
auto target_validity = GetData();
|
|
93
|
+
auto source_validity = other.GetData();
|
|
94
|
+
auto source_offset_entries = EntryCount(source_offset);
|
|
95
|
+
auto target_offset_entries = EntryCount(target_offset);
|
|
96
|
+
memcpy(target_validity + target_offset_entries, source_validity + source_offset_entries,
|
|
97
|
+
sizeof(validity_t) * EntryCount(count));
|
|
98
|
+
return;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// FIXME: use bitwise operations here
|
|
84
102
|
#if 1
|
|
85
|
-
for (idx_t i =
|
|
86
|
-
|
|
103
|
+
for (idx_t i = 0; i < count; i++) {
|
|
104
|
+
Set(target_offset + i, other.RowIsValid(source_offset + i));
|
|
87
105
|
}
|
|
88
|
-
Initialize(new_mask);
|
|
89
106
|
#else
|
|
90
107
|
// first shift the "whole" units
|
|
91
108
|
idx_t entire_units = offset / BITS_PER_VALUE;
|
|
@@ -136,17 +136,13 @@ void Vector::Slice(Vector &other, idx_t offset, idx_t end) {
|
|
|
136
136
|
for (idx_t i = 0; i < entries.size(); i++) {
|
|
137
137
|
entries[i]->Slice(*other_entries[i], offset, end);
|
|
138
138
|
}
|
|
139
|
-
|
|
140
|
-
new_vector.validity.Slice(other.validity, offset, end);
|
|
141
|
-
} else {
|
|
142
|
-
new_vector.validity = other.validity;
|
|
143
|
-
}
|
|
139
|
+
new_vector.validity.Slice(other.validity, offset, end - offset);
|
|
144
140
|
Reference(new_vector);
|
|
145
141
|
} else {
|
|
146
142
|
Reference(other);
|
|
147
143
|
if (offset > 0) {
|
|
148
144
|
data = data + GetTypeIdSize(internal_type) * offset;
|
|
149
|
-
validity.Slice(other.validity, offset, end);
|
|
145
|
+
validity.Slice(other.validity, offset, end - offset);
|
|
150
146
|
}
|
|
151
147
|
}
|
|
152
148
|
}
|
|
@@ -53,10 +53,13 @@ RadixPartitionedHashTable::RadixPartitionedHashTable(GroupingSet &grouping_set_p
|
|
|
53
53
|
// Sink
|
|
54
54
|
//===--------------------------------------------------------------------===//
|
|
55
55
|
class RadixHTGlobalState : public GlobalSinkState {
|
|
56
|
+
constexpr const static idx_t MAX_RADIX_PARTITIONS = 32;
|
|
57
|
+
|
|
56
58
|
public:
|
|
57
59
|
explicit RadixHTGlobalState(ClientContext &context)
|
|
58
|
-
: is_empty(true), multi_scan(true),
|
|
59
|
-
partition_info(
|
|
60
|
+
: is_empty(true), multi_scan(true), partitioned(false),
|
|
61
|
+
partition_info(
|
|
62
|
+
MinValue<idx_t>(MAX_RADIX_PARTITIONS, TaskScheduler::GetScheduler(context).NumberOfThreads())) {
|
|
60
63
|
}
|
|
61
64
|
|
|
62
65
|
vector<unique_ptr<PartitionableHashTable>> intermediate_hts;
|
|
@@ -68,8 +71,8 @@ public:
|
|
|
68
71
|
bool multi_scan;
|
|
69
72
|
//! The lock for updating the global aggregate state
|
|
70
73
|
mutex lock;
|
|
71
|
-
//!
|
|
72
|
-
atomic<
|
|
74
|
+
//! Whether or not any thread has crossed the partitioning threshold
|
|
75
|
+
atomic<bool> partitioned;
|
|
73
76
|
|
|
74
77
|
bool is_finalized = false;
|
|
75
78
|
bool is_partitioned = false;
|
|
@@ -79,7 +82,7 @@ public:
|
|
|
79
82
|
|
|
80
83
|
class RadixHTLocalState : public LocalSinkState {
|
|
81
84
|
public:
|
|
82
|
-
explicit RadixHTLocalState(const RadixPartitionedHashTable &ht) : is_empty(true) {
|
|
85
|
+
explicit RadixHTLocalState(const RadixPartitionedHashTable &ht) : total_groups(0), is_empty(true) {
|
|
83
86
|
// if there are no groups we create a fake group so everything has the same group
|
|
84
87
|
group_chunk.InitializeEmpty(ht.group_types);
|
|
85
88
|
if (ht.grouping_set.empty()) {
|
|
@@ -90,6 +93,8 @@ public:
|
|
|
90
93
|
DataChunk group_chunk;
|
|
91
94
|
//! The aggregate HT
|
|
92
95
|
unique_ptr<PartitionableHashTable> ht;
|
|
96
|
+
//! The total number of groups found by this thread
|
|
97
|
+
idx_t total_groups;
|
|
93
98
|
|
|
94
99
|
//! Whether or not any tuples were added to the HT
|
|
95
100
|
bool is_empty;
|
|
@@ -146,7 +151,7 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, GlobalSinkState
|
|
|
146
151
|
}
|
|
147
152
|
D_ASSERT(gstate.finalized_hts.size() == 1);
|
|
148
153
|
D_ASSERT(gstate.finalized_hts[0]);
|
|
149
|
-
|
|
154
|
+
llstate.total_groups += gstate.finalized_hts[0]->AddChunk(group_chunk, payload_input, filter);
|
|
150
155
|
return;
|
|
151
156
|
}
|
|
152
157
|
|
|
@@ -160,9 +165,11 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, GlobalSinkState
|
|
|
160
165
|
group_types, op.payload_types, op.bindings);
|
|
161
166
|
}
|
|
162
167
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
168
|
+
llstate.total_groups += llstate.ht->AddChunk(group_chunk, payload_input,
|
|
169
|
+
gstate.partitioned && gstate.partition_info.n_partitions > 1, filter);
|
|
170
|
+
if (llstate.total_groups >= radix_limit) {
|
|
171
|
+
gstate.partitioned = true;
|
|
172
|
+
}
|
|
166
173
|
}
|
|
167
174
|
|
|
168
175
|
void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkState &state,
|
|
@@ -183,7 +190,7 @@ void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkSta
|
|
|
183
190
|
return; // no data
|
|
184
191
|
}
|
|
185
192
|
|
|
186
|
-
if (!llstate.ht->IsPartitioned() && gstate.partition_info.n_partitions > 1 && gstate.
|
|
193
|
+
if (!llstate.ht->IsPartitioned() && gstate.partition_info.n_partitions > 1 && gstate.partitioned) {
|
|
187
194
|
llstate.ht->Partition();
|
|
188
195
|
}
|
|
189
196
|
|
|
@@ -207,6 +207,15 @@ int64_t CastRules::ImplicitCast(const LogicalType &from, const LogicalType &to)
|
|
|
207
207
|
// if aliases are different, an implicit cast is not possible
|
|
208
208
|
return -1;
|
|
209
209
|
}
|
|
210
|
+
if (from.id() == LogicalTypeId::LIST && to.id() == LogicalTypeId::LIST) {
|
|
211
|
+
// Lists can be cast if their child types can be cast
|
|
212
|
+
auto child_cost = ImplicitCast(ListType::GetChildType(from), ListType::GetChildType(to));
|
|
213
|
+
if (child_cost >= 100) {
|
|
214
|
+
// subtract one from the cost because we prefer LIST[X] -> LIST[VARCHAR] over LIST[X] -> VARCHAR
|
|
215
|
+
child_cost--;
|
|
216
|
+
}
|
|
217
|
+
return child_cost;
|
|
218
|
+
}
|
|
210
219
|
if (from.id() == to.id()) {
|
|
211
220
|
// arguments match: do nothing
|
|
212
221
|
return 0;
|
|
@@ -219,10 +228,6 @@ int64_t CastRules::ImplicitCast(const LogicalType &from, const LogicalType &to)
|
|
|
219
228
|
// everything can be cast to VARCHAR, but this cast has a high cost
|
|
220
229
|
return TargetTypeCost(to);
|
|
221
230
|
}
|
|
222
|
-
if (from.id() == LogicalTypeId::LIST && to.id() == LogicalTypeId::LIST) {
|
|
223
|
-
// Lists can be cast if their child types can be cast
|
|
224
|
-
return ImplicitCast(ListType::GetChildType(from), ListType::GetChildType(to));
|
|
225
|
-
}
|
|
226
231
|
|
|
227
232
|
if (from.id() == LogicalTypeId::UNION && to.id() == LogicalTypeId::UNION) {
|
|
228
233
|
// Unions can be cast if the source tags are a subset of the target tags
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
|
2
|
-
#define DUCKDB_VERSION "0.7.2-
|
|
2
|
+
#define DUCKDB_VERSION "0.7.2-dev1188"
|
|
3
3
|
#endif
|
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
|
5
|
+
#define DUCKDB_SOURCE_ID "d1518bdfe8"
|
|
6
6
|
#endif
|
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
|
8
8
|
#include "duckdb/main/database.hpp"
|
|
@@ -323,9 +323,12 @@ public:
|
|
|
323
323
|
public:
|
|
324
324
|
DUCKDB_API void Resize(idx_t old_size, idx_t new_size);
|
|
325
325
|
|
|
326
|
-
DUCKDB_API void
|
|
326
|
+
DUCKDB_API void SliceInPlace(const ValidityMask &other, idx_t target_offset, idx_t source_offset, idx_t count);
|
|
327
|
+
DUCKDB_API void Slice(const ValidityMask &other, idx_t source_offset, idx_t count);
|
|
327
328
|
DUCKDB_API void Combine(const ValidityMask &other, idx_t count);
|
|
328
329
|
DUCKDB_API string ToString(idx_t count) const;
|
|
330
|
+
|
|
331
|
+
DUCKDB_API static bool IsAligned(idx_t count);
|
|
329
332
|
};
|
|
330
333
|
|
|
331
334
|
} // namespace duckdb
|