duckdb 0.5.2-dev1579.0 → 0.5.2-dev1610.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +383 -281
- package/src/duckdb.hpp +26 -2
- package/src/parquet-amalgamation.cpp +25254 -25254
package/src/duckdb.cpp
CHANGED
|
@@ -9200,7 +9200,7 @@ void BoxRenderer::Render(ClientContext &context, const vector<string> &names, co
|
|
|
9200
9200
|
|
|
9201
9201
|
// for each column, figure out the width
|
|
9202
9202
|
// start off by figuring out the name of the header by looking at the column name and column type
|
|
9203
|
-
idx_t min_width = has_hidden_rows ? minimum_row_length : 0;
|
|
9203
|
+
idx_t min_width = has_hidden_rows || row_count == 0 ? minimum_row_length : 0;
|
|
9204
9204
|
vector<idx_t> column_map;
|
|
9205
9205
|
idx_t total_length;
|
|
9206
9206
|
auto widths = ComputeRenderWidths(names, result, collections, min_width, max_width, column_map, total_length);
|
|
@@ -52186,6 +52186,15 @@ idx_t ListVector::GetListSize(const Vector &vec) {
|
|
|
52186
52186
|
return ((VectorListBuffer &)*vec.auxiliary).size;
|
|
52187
52187
|
}
|
|
52188
52188
|
|
|
52189
|
+
idx_t ListVector::GetListCapacity(const Vector &vec) {
|
|
52190
|
+
if (vec.GetVectorType() == VectorType::DICTIONARY_VECTOR) {
|
|
52191
|
+
auto &child = DictionaryVector::Child(vec);
|
|
52192
|
+
return ListVector::GetListSize(child);
|
|
52193
|
+
}
|
|
52194
|
+
D_ASSERT(vec.auxiliary);
|
|
52195
|
+
return ((VectorListBuffer &)*vec.auxiliary).capacity;
|
|
52196
|
+
}
|
|
52197
|
+
|
|
52189
52198
|
void ListVector::ReferenceEntry(Vector &vector, Vector &other) {
|
|
52190
52199
|
D_ASSERT(vector.GetType().id() == LogicalTypeId::LIST);
|
|
52191
52200
|
D_ASSERT(vector.GetVectorType() == VectorType::FLAT_VECTOR ||
|
|
@@ -98646,6 +98655,7 @@ struct HandleVectorCastError {
|
|
|
98646
98655
|
|
|
98647
98656
|
|
|
98648
98657
|
|
|
98658
|
+
|
|
98649
98659
|
namespace duckdb {
|
|
98650
98660
|
|
|
98651
98661
|
template <class OP>
|
|
@@ -98829,9 +98839,20 @@ struct VectorCastHelpers {
|
|
|
98829
98839
|
}
|
|
98830
98840
|
};
|
|
98831
98841
|
|
|
98832
|
-
struct
|
|
98842
|
+
struct VectorStringToList {
|
|
98833
98843
|
static idx_t CountParts(const string_t &input);
|
|
98834
98844
|
static bool SplitStringifiedList(const string_t &input, string_t *child_data, idx_t &child_start, Vector &child);
|
|
98845
|
+
static bool StringToNestedTypeCastLoop(string_t *source_data, ValidityMask &source_mask, Vector &result,
|
|
98846
|
+
ValidityMask &result_mask, idx_t count, CastParameters &parameters,
|
|
98847
|
+
const SelectionVector *sel);
|
|
98848
|
+
};
|
|
98849
|
+
|
|
98850
|
+
struct VectorStringToStruct {
|
|
98851
|
+
static bool SplitStruct(string_t &input, std::vector<std::unique_ptr<Vector>> &varchar_vectors, idx_t &row_idx,
|
|
98852
|
+
string_map_t<idx_t> &child_names, std::vector<ValidityMask *> &child_masks);
|
|
98853
|
+
static bool StringToNestedTypeCastLoop(string_t *source_data, ValidityMask &source_mask, Vector &result,
|
|
98854
|
+
ValidityMask &result_mask, idx_t count, CastParameters &parameters,
|
|
98855
|
+
const SelectionVector *sel);
|
|
98835
98856
|
};
|
|
98836
98857
|
|
|
98837
98858
|
} // namespace duckdb
|
|
@@ -99956,6 +99977,8 @@ BoundCastInfo DefaultCasts::PointerCastSwitch(BindCastInput &input, const Logica
|
|
|
99956
99977
|
|
|
99957
99978
|
|
|
99958
99979
|
|
|
99980
|
+
|
|
99981
|
+
|
|
99959
99982
|
namespace duckdb {
|
|
99960
99983
|
|
|
99961
99984
|
template <class T>
|
|
@@ -100067,8 +100090,9 @@ static BoundCastInfo VectorStringCastNumericSwitch(BindCastInput &input, const L
|
|
|
100067
100090
|
}
|
|
100068
100091
|
}
|
|
100069
100092
|
|
|
100070
|
-
bool
|
|
100071
|
-
|
|
100093
|
+
bool VectorStringToList::StringToNestedTypeCastLoop(string_t *source_data, ValidityMask &source_mask, Vector &result,
|
|
100094
|
+
ValidityMask &result_mask, idx_t count, CastParameters &parameters,
|
|
100095
|
+
const SelectionVector *sel) {
|
|
100072
100096
|
|
|
100073
100097
|
idx_t total_list_size = 0;
|
|
100074
100098
|
for (idx_t i = 0; i < count; i++) {
|
|
@@ -100079,7 +100103,7 @@ bool StringListCastLoop(string_t *source_data, ValidityMask &source_mask, Vector
|
|
|
100079
100103
|
if (!source_mask.RowIsValid(idx)) {
|
|
100080
100104
|
continue;
|
|
100081
100105
|
}
|
|
100082
|
-
total_list_size +=
|
|
100106
|
+
total_list_size += VectorStringToList::CountParts(source_data[idx]);
|
|
100083
100107
|
}
|
|
100084
100108
|
|
|
100085
100109
|
Vector varchar_vector(LogicalType::VARCHAR, total_list_size);
|
|
@@ -100103,9 +100127,7 @@ bool StringListCastLoop(string_t *source_data, ValidityMask &source_mask, Vector
|
|
|
100103
100127
|
}
|
|
100104
100128
|
|
|
100105
100129
|
list_data[i].offset = total;
|
|
100106
|
-
|
|
100107
|
-
VectorStringifiedListParser::SplitStringifiedList(source_data[idx], child_data, total, varchar_vector);
|
|
100108
|
-
if (!valid) {
|
|
100130
|
+
if (!VectorStringToList::SplitStringifiedList(source_data[idx], child_data, total, varchar_vector)) {
|
|
100109
100131
|
string text = "Type VARCHAR with value '" + source_data[idx].GetString() +
|
|
100110
100132
|
"' can't be cast to the destination type LIST";
|
|
100111
100133
|
HandleVectorCastError::Operation<string_t>(text, result_mask, idx, parameters.error_message, all_converted);
|
|
@@ -100121,23 +100143,85 @@ bool StringListCastLoop(string_t *source_data, ValidityMask &source_mask, Vector
|
|
|
100121
100143
|
all_converted;
|
|
100122
100144
|
}
|
|
100123
100145
|
|
|
100124
|
-
|
|
100146
|
+
static LogicalType InitVarcharStructType(const LogicalType &target) {
|
|
100147
|
+
child_list_t<LogicalType> child_types;
|
|
100148
|
+
for (auto &child : StructType::GetChildTypes(target)) {
|
|
100149
|
+
child_types.push_back(make_pair(child.first, LogicalType::VARCHAR));
|
|
100150
|
+
}
|
|
100151
|
+
|
|
100152
|
+
return LogicalType::STRUCT(child_types);
|
|
100153
|
+
}
|
|
100154
|
+
|
|
100155
|
+
bool VectorStringToStruct::StringToNestedTypeCastLoop(string_t *source_data, ValidityMask &source_mask, Vector &result,
|
|
100156
|
+
ValidityMask &result_mask, idx_t count,
|
|
100157
|
+
CastParameters &parameters, const SelectionVector *sel) {
|
|
100158
|
+
|
|
100159
|
+
auto varchar_struct_type = InitVarcharStructType(result.GetType());
|
|
100160
|
+
Vector varchar_vector(varchar_struct_type, count);
|
|
100161
|
+
auto &child_vectors = StructVector::GetEntries(varchar_vector);
|
|
100162
|
+
auto &result_children = StructVector::GetEntries(result);
|
|
100163
|
+
|
|
100164
|
+
string_map_t<idx_t> child_names;
|
|
100165
|
+
vector<ValidityMask *> child_masks;
|
|
100166
|
+
for (idx_t child_idx = 0; child_idx < result_children.size(); child_idx++) {
|
|
100167
|
+
child_names.insert({StructType::GetChildName(result.GetType(), child_idx), child_idx});
|
|
100168
|
+
child_masks.emplace_back(&FlatVector::Validity(*child_vectors[child_idx]));
|
|
100169
|
+
child_masks[child_idx]->SetAllInvalid(count);
|
|
100170
|
+
}
|
|
100171
|
+
|
|
100172
|
+
bool all_converted = true;
|
|
100173
|
+
for (idx_t i = 0; i < count; i++) {
|
|
100174
|
+
idx_t idx = i;
|
|
100175
|
+
if (sel) {
|
|
100176
|
+
idx = sel->get_index(i);
|
|
100177
|
+
}
|
|
100178
|
+
if (!source_mask.RowIsValid(idx)) {
|
|
100179
|
+
result_mask.SetInvalid(i);
|
|
100180
|
+
continue;
|
|
100181
|
+
}
|
|
100182
|
+
if (!VectorStringToStruct::SplitStruct(source_data[idx], child_vectors, i, child_names, child_masks)) {
|
|
100183
|
+
string text = "Type VARCHAR with value '" + source_data[idx].GetString() +
|
|
100184
|
+
"' can't be cast to the destination type STRUCT";
|
|
100185
|
+
for (auto &child_mask : child_masks) {
|
|
100186
|
+
child_mask->SetInvalid(idx); // some values may have already been found and set valid
|
|
100187
|
+
}
|
|
100188
|
+
HandleVectorCastError::Operation<string_t>(text, result_mask, idx, parameters.error_message, all_converted);
|
|
100189
|
+
}
|
|
100190
|
+
}
|
|
100191
|
+
|
|
100192
|
+
auto &cast_data = (StructBoundCastData &)*parameters.cast_data;
|
|
100193
|
+
D_ASSERT(cast_data.child_cast_info.size() == result_children.size());
|
|
100194
|
+
|
|
100195
|
+
for (idx_t child_idx = 0; child_idx < result_children.size(); child_idx++) {
|
|
100196
|
+
auto &varchar_vector = *child_vectors[child_idx];
|
|
100197
|
+
auto &result_child_vector = *result_children[child_idx];
|
|
100198
|
+
auto &child_cast_info = cast_data.child_cast_info[child_idx];
|
|
100199
|
+
// get the correct casting function (VARCHAR -> result_child_type) from cast_data
|
|
100200
|
+
// casting functions are determined by BindStructtoStructCast
|
|
100201
|
+
CastParameters child_parameters(parameters, child_cast_info.cast_data.get());
|
|
100202
|
+
if (!child_cast_info.function(varchar_vector, result_child_vector, count, child_parameters)) {
|
|
100203
|
+
all_converted = false;
|
|
100204
|
+
}
|
|
100205
|
+
}
|
|
100206
|
+
return all_converted;
|
|
100207
|
+
}
|
|
100208
|
+
|
|
100209
|
+
template <class T>
|
|
100210
|
+
bool StringToNestedTypeCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
|
|
100125
100211
|
D_ASSERT(source.GetType().id() == LogicalTypeId::VARCHAR);
|
|
100126
|
-
D_ASSERT(result.GetType().id() == LogicalTypeId::LIST);
|
|
100127
100212
|
|
|
100128
100213
|
switch (source.GetVectorType()) {
|
|
100129
100214
|
case VectorType::CONSTANT_VECTOR: {
|
|
100130
|
-
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
100131
|
-
|
|
100132
100215
|
auto source_data = ConstantVector::GetData<string_t>(source);
|
|
100133
100216
|
auto &source_mask = ConstantVector::Validity(source);
|
|
100134
|
-
auto &result_mask =
|
|
100217
|
+
auto &result_mask = FlatVector::Validity(result);
|
|
100135
100218
|
|
|
100136
|
-
|
|
100219
|
+
auto ret = T::StringToNestedTypeCastLoop(source_data, source_mask, result, result_mask, 1, parameters, nullptr);
|
|
100220
|
+
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
100221
|
+
return ret;
|
|
100137
100222
|
}
|
|
100138
100223
|
default: {
|
|
100139
100224
|
UnifiedVectorFormat unified_source;
|
|
100140
|
-
result.SetVectorType(VectorType::FLAT_VECTOR);
|
|
100141
100225
|
|
|
100142
100226
|
source.ToUnifiedFormat(count, unified_source);
|
|
100143
100227
|
auto source_sel = unified_source.sel;
|
|
@@ -100145,17 +100229,12 @@ bool StringListCast(Vector &source, Vector &result, idx_t count, CastParameters
|
|
|
100145
100229
|
auto &source_mask = unified_source.validity;
|
|
100146
100230
|
auto &result_mask = FlatVector::Validity(result);
|
|
100147
100231
|
|
|
100148
|
-
return
|
|
100232
|
+
return T::StringToNestedTypeCastLoop(source_data, source_mask, result, result_mask, count, parameters,
|
|
100233
|
+
source_sel);
|
|
100149
100234
|
}
|
|
100150
100235
|
}
|
|
100151
100236
|
}
|
|
100152
100237
|
|
|
100153
|
-
BoundCastInfo StringToListCast(BindCastInput &input, const LogicalType &source, const LogicalType &target) {
|
|
100154
|
-
// second argument allows for a secondary casting function to be passed in the CastParameters
|
|
100155
|
-
return BoundCastInfo(&StringListCast,
|
|
100156
|
-
ListBoundCastData::BindListToListCast(input, LogicalType::LIST(LogicalType::VARCHAR), target));
|
|
100157
|
-
}
|
|
100158
|
-
|
|
100159
100238
|
BoundCastInfo DefaultCasts::StringCastSwitch(BindCastInput &input, const LogicalType &source,
|
|
100160
100239
|
const LogicalType &target) {
|
|
100161
100240
|
switch (target.id()) {
|
|
@@ -100186,7 +100265,13 @@ BoundCastInfo DefaultCasts::StringCastSwitch(BindCastInput &input, const Logical
|
|
|
100186
100265
|
case LogicalTypeId::JSON:
|
|
100187
100266
|
return &DefaultCasts::ReinterpretCast;
|
|
100188
100267
|
case LogicalTypeId::LIST:
|
|
100189
|
-
|
|
100268
|
+
// the second argument allows for a secondary casting function to be passed in the CastParameters
|
|
100269
|
+
return BoundCastInfo(
|
|
100270
|
+
&StringToNestedTypeCast<VectorStringToList>,
|
|
100271
|
+
ListBoundCastData::BindListToListCast(input, LogicalType::LIST(LogicalType::VARCHAR), target));
|
|
100272
|
+
case LogicalTypeId::STRUCT:
|
|
100273
|
+
return BoundCastInfo(&StringToNestedTypeCast<VectorStringToStruct>,
|
|
100274
|
+
StructBoundCastData::BindStructToStructCast(input, InitVarcharStructType(target), target));
|
|
100190
100275
|
default:
|
|
100191
100276
|
return VectorStringCastNumericSwitch(input, source, target);
|
|
100192
100277
|
}
|
|
@@ -100198,26 +100283,8 @@ BoundCastInfo DefaultCasts::StringCastSwitch(BindCastInput &input, const Logical
|
|
|
100198
100283
|
|
|
100199
100284
|
namespace duckdb {
|
|
100200
100285
|
|
|
100201
|
-
|
|
100202
|
-
|
|
100203
|
-
: child_cast_info(move(child_casts)), target(move(target_p)) {
|
|
100204
|
-
}
|
|
100205
|
-
|
|
100206
|
-
vector<BoundCastInfo> child_cast_info;
|
|
100207
|
-
LogicalType target;
|
|
100208
|
-
|
|
100209
|
-
public:
|
|
100210
|
-
unique_ptr<BoundCastData> Copy() const override {
|
|
100211
|
-
vector<BoundCastInfo> copy_info;
|
|
100212
|
-
for (auto &info : child_cast_info) {
|
|
100213
|
-
copy_info.push_back(info.Copy());
|
|
100214
|
-
}
|
|
100215
|
-
return make_unique<StructBoundCastData>(move(copy_info), target);
|
|
100216
|
-
}
|
|
100217
|
-
};
|
|
100218
|
-
|
|
100219
|
-
unique_ptr<BoundCastData> BindStructToStructCast(BindCastInput &input, const LogicalType &source,
|
|
100220
|
-
const LogicalType &target) {
|
|
100286
|
+
unique_ptr<BoundCastData> StructBoundCastData::BindStructToStructCast(BindCastInput &input, const LogicalType &source,
|
|
100287
|
+
const LogicalType &target) {
|
|
100221
100288
|
vector<BoundCastInfo> child_cast_info;
|
|
100222
100289
|
auto &source_child_types = StructType::GetChildTypes(source);
|
|
100223
100290
|
auto &result_child_types = StructType::GetChildTypes(target);
|
|
@@ -100238,13 +100305,14 @@ static bool StructToStructCast(Vector &source, Vector &result, idx_t count, Cast
|
|
|
100238
100305
|
D_ASSERT(source_children.size() == StructType::GetChildTypes(result.GetType()).size());
|
|
100239
100306
|
|
|
100240
100307
|
auto &result_children = StructVector::GetEntries(result);
|
|
100308
|
+
bool all_converted = true;
|
|
100241
100309
|
for (idx_t c_idx = 0; c_idx < source_child_types.size(); c_idx++) {
|
|
100242
100310
|
auto &result_child_vector = *result_children[c_idx];
|
|
100243
100311
|
auto &source_child_vector = *source_children[c_idx];
|
|
100244
100312
|
CastParameters child_parameters(parameters, cast_data.child_cast_info[c_idx].cast_data.get());
|
|
100245
100313
|
if (!cast_data.child_cast_info[c_idx].function(source_child_vector, result_child_vector, count,
|
|
100246
100314
|
child_parameters)) {
|
|
100247
|
-
|
|
100315
|
+
all_converted = false;
|
|
100248
100316
|
}
|
|
100249
100317
|
}
|
|
100250
100318
|
if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) {
|
|
@@ -100254,7 +100322,7 @@ static bool StructToStructCast(Vector &source, Vector &result, idx_t count, Cast
|
|
|
100254
100322
|
source.Flatten(count);
|
|
100255
100323
|
FlatVector::Validity(result) = FlatVector::Validity(source);
|
|
100256
100324
|
}
|
|
100257
|
-
return
|
|
100325
|
+
return all_converted;
|
|
100258
100326
|
}
|
|
100259
100327
|
|
|
100260
100328
|
static bool StructToVarcharCast(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
|
|
@@ -100333,7 +100401,7 @@ BoundCastInfo DefaultCasts::StructCastSwitch(BindCastInput &input, const Logical
|
|
|
100333
100401
|
const LogicalType &target) {
|
|
100334
100402
|
switch (target.id()) {
|
|
100335
100403
|
case LogicalTypeId::STRUCT:
|
|
100336
|
-
return BoundCastInfo(StructToStructCast, BindStructToStructCast(input, source, target));
|
|
100404
|
+
return BoundCastInfo(StructToStructCast, StructBoundCastData::BindStructToStructCast(input, source, target));
|
|
100337
100405
|
case LogicalTypeId::JSON:
|
|
100338
100406
|
case LogicalTypeId::VARCHAR: {
|
|
100339
100407
|
// bind a cast in which we convert all child entries to VARCHAR entries
|
|
@@ -100343,7 +100411,8 @@ BoundCastInfo DefaultCasts::StructCastSwitch(BindCastInput &input, const Logical
|
|
|
100343
100411
|
varchar_children.push_back(make_pair(child_entry.first, LogicalType::VARCHAR));
|
|
100344
100412
|
}
|
|
100345
100413
|
auto varchar_type = LogicalType::STRUCT(move(varchar_children));
|
|
100346
|
-
return BoundCastInfo(StructToVarcharCast,
|
|
100414
|
+
return BoundCastInfo(StructToVarcharCast,
|
|
100415
|
+
StructBoundCastData::BindStructToStructCast(input, source, varchar_type));
|
|
100347
100416
|
}
|
|
100348
100417
|
default:
|
|
100349
100418
|
return TryVectorNullCast;
|
|
@@ -100898,6 +100967,33 @@ BoundCastInfo DefaultCasts::UUIDCastSwitch(BindCastInput &input, const LogicalTy
|
|
|
100898
100967
|
|
|
100899
100968
|
namespace duckdb {
|
|
100900
100969
|
|
|
100970
|
+
static bool IsNull(const char *buf, idx_t start_pos, Vector &child, idx_t row_idx) {
|
|
100971
|
+
if (buf[start_pos] == 'N' && buf[start_pos + 1] == 'U' && buf[start_pos + 2] == 'L' && buf[start_pos + 3] == 'L') {
|
|
100972
|
+
FlatVector::SetNull(child, row_idx, true);
|
|
100973
|
+
return true;
|
|
100974
|
+
}
|
|
100975
|
+
return false;
|
|
100976
|
+
}
|
|
100977
|
+
|
|
100978
|
+
inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) {
|
|
100979
|
+
while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) {
|
|
100980
|
+
pos++;
|
|
100981
|
+
}
|
|
100982
|
+
}
|
|
100983
|
+
|
|
100984
|
+
static idx_t StringTrim(const char *buf, idx_t &start_pos, idx_t pos) {
|
|
100985
|
+
idx_t trailing_whitespace = 0;
|
|
100986
|
+
while (StringUtil::CharacterIsSpace(buf[pos - trailing_whitespace - 1])) {
|
|
100987
|
+
trailing_whitespace++;
|
|
100988
|
+
}
|
|
100989
|
+
if ((buf[start_pos] == '"' && buf[pos - trailing_whitespace - 1] == '"') ||
|
|
100990
|
+
(buf[start_pos] == '\'' && buf[pos - trailing_whitespace - 1] == '\'')) {
|
|
100991
|
+
start_pos++;
|
|
100992
|
+
trailing_whitespace++;
|
|
100993
|
+
}
|
|
100994
|
+
return (pos - trailing_whitespace);
|
|
100995
|
+
}
|
|
100996
|
+
|
|
100901
100997
|
struct CountPartOperation {
|
|
100902
100998
|
idx_t count = 0;
|
|
100903
100999
|
|
|
@@ -100916,10 +101012,7 @@ struct SplitStringOperation {
|
|
|
100916
101012
|
Vector &child;
|
|
100917
101013
|
|
|
100918
101014
|
void HandleValue(const char *buf, idx_t start_pos, idx_t pos) {
|
|
100919
|
-
|
|
100920
|
-
if ((pos - start_pos) >= 4 && buf[start_pos] == 'N' && buf[start_pos + 1] == 'U' && buf[start_pos + 2] == 'L' &&
|
|
100921
|
-
buf[start_pos + 3] == 'L') {
|
|
100922
|
-
FlatVector::SetNull(child, child_start, true);
|
|
101015
|
+
if ((pos - start_pos) == 4 && IsNull(buf, start_pos, child, child_start)) {
|
|
100923
101016
|
child_start++;
|
|
100924
101017
|
return;
|
|
100925
101018
|
}
|
|
@@ -100941,20 +101034,27 @@ static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) {
|
|
|
100941
101034
|
return false;
|
|
100942
101035
|
}
|
|
100943
101036
|
|
|
100944
|
-
static bool SkipToClose(idx_t &idx, const char *buf, idx_t &len, idx_t &lvl) {
|
|
101037
|
+
static bool SkipToClose(idx_t &idx, const char *buf, idx_t &len, idx_t &lvl, char close_bracket) {
|
|
101038
|
+
idx++;
|
|
101039
|
+
|
|
100945
101040
|
while (idx < len) {
|
|
100946
|
-
if (buf[idx] == '[') {
|
|
100947
|
-
if (!
|
|
101041
|
+
if (buf[idx] == '"' || buf[idx] == '\'') {
|
|
101042
|
+
if (!SkipToCloseQuotes(idx, buf, len)) {
|
|
101043
|
+
return false;
|
|
101044
|
+
}
|
|
101045
|
+
} else if (buf[idx] == '{') {
|
|
101046
|
+
if (!SkipToClose(idx, buf, len, lvl, '}')) {
|
|
101047
|
+
return false;
|
|
101048
|
+
}
|
|
101049
|
+
} else if (buf[idx] == '[') {
|
|
101050
|
+
if (!SkipToClose(idx, buf, len, lvl, ']')) {
|
|
100948
101051
|
return false;
|
|
100949
101052
|
}
|
|
100950
101053
|
lvl++;
|
|
100951
|
-
|
|
100952
|
-
|
|
100953
|
-
|
|
100954
|
-
|
|
100955
|
-
}
|
|
100956
|
-
if (buf[idx] == ']') {
|
|
100957
|
-
lvl--;
|
|
101054
|
+
} else if (buf[idx] == close_bracket) {
|
|
101055
|
+
if (close_bracket == ']') {
|
|
101056
|
+
lvl--;
|
|
101057
|
+
}
|
|
100958
101058
|
return true;
|
|
100959
101059
|
}
|
|
100960
101060
|
idx++;
|
|
@@ -100969,68 +101069,133 @@ static bool SplitStringifiedListInternal(const string_t &input, OP &state) {
|
|
|
100969
101069
|
idx_t lvl = 1;
|
|
100970
101070
|
idx_t pos = 0;
|
|
100971
101071
|
|
|
100972
|
-
|
|
100973
|
-
pos++;
|
|
100974
|
-
}
|
|
101072
|
+
SkipWhitespace(buf, pos, len);
|
|
100975
101073
|
if (pos == len || buf[pos] != '[') {
|
|
100976
101074
|
return false;
|
|
100977
101075
|
}
|
|
100978
|
-
pos++;
|
|
100979
|
-
while (pos < len && StringUtil::CharacterIsSpace(buf[pos])) {
|
|
100980
|
-
pos++;
|
|
100981
|
-
}
|
|
100982
101076
|
|
|
101077
|
+
SkipWhitespace(buf, ++pos, len);
|
|
100983
101078
|
idx_t start_pos = pos;
|
|
100984
101079
|
while (pos < len) {
|
|
100985
101080
|
if (buf[pos] == '[') {
|
|
100986
|
-
if (!SkipToClose(
|
|
101081
|
+
if (!SkipToClose(pos, buf, len, ++lvl, ']')) {
|
|
100987
101082
|
return false;
|
|
100988
101083
|
}
|
|
100989
101084
|
} else if (buf[pos] == '"' || buf[pos] == '\'') {
|
|
100990
101085
|
SkipToCloseQuotes(pos, buf, len);
|
|
101086
|
+
} else if (buf[pos] == '{') {
|
|
101087
|
+
idx_t struct_lvl = 0;
|
|
101088
|
+
SkipToClose(pos, buf, len, struct_lvl, '}');
|
|
100991
101089
|
} else if (buf[pos] == ',' || buf[pos] == ']') {
|
|
100992
101090
|
idx_t trailing_whitespace = 0;
|
|
100993
101091
|
while (StringUtil::CharacterIsSpace(buf[pos - trailing_whitespace - 1])) {
|
|
100994
101092
|
trailing_whitespace++;
|
|
100995
101093
|
}
|
|
100996
|
-
if (!(buf[pos] == ']' && start_pos ==
|
|
101094
|
+
if (!(buf[pos] == ']' && start_pos == pos)) {
|
|
100997
101095
|
state.HandleValue(buf, start_pos, pos - trailing_whitespace);
|
|
100998
101096
|
} // else the list is empty
|
|
100999
101097
|
if (buf[pos] == ']') {
|
|
101000
101098
|
lvl--;
|
|
101001
101099
|
break;
|
|
101002
101100
|
}
|
|
101003
|
-
|
|
101004
|
-
|
|
101005
|
-
|
|
101006
|
-
start_pos = pos + 1;
|
|
101007
|
-
}
|
|
101008
|
-
pos++;
|
|
101009
|
-
}
|
|
101010
|
-
pos++;
|
|
101011
|
-
while (pos < len) {
|
|
101012
|
-
if (!StringUtil::CharacterIsSpace(buf[pos])) {
|
|
101013
|
-
return false;
|
|
101101
|
+
SkipWhitespace(buf, ++pos, len);
|
|
101102
|
+
start_pos = pos;
|
|
101103
|
+
continue;
|
|
101014
101104
|
}
|
|
101015
101105
|
pos++;
|
|
101016
101106
|
}
|
|
101017
|
-
|
|
101018
|
-
|
|
101019
|
-
}
|
|
101020
|
-
return true;
|
|
101107
|
+
SkipWhitespace(buf, ++pos, len);
|
|
101108
|
+
return (pos == len && lvl == 0);
|
|
101021
101109
|
}
|
|
101022
101110
|
|
|
101023
|
-
bool
|
|
101024
|
-
|
|
101111
|
+
bool VectorStringToList::SplitStringifiedList(const string_t &input, string_t *child_data, idx_t &child_start,
|
|
101112
|
+
Vector &child) {
|
|
101025
101113
|
SplitStringOperation state(child_data, child_start, child);
|
|
101026
101114
|
return SplitStringifiedListInternal<SplitStringOperation>(input, state);
|
|
101027
101115
|
}
|
|
101028
101116
|
|
|
101029
|
-
idx_t
|
|
101117
|
+
idx_t VectorStringToList::CountParts(const string_t &input) {
|
|
101030
101118
|
CountPartOperation state;
|
|
101031
101119
|
SplitStringifiedListInternal<CountPartOperation>(input, state);
|
|
101032
101120
|
return state.count;
|
|
101033
101121
|
}
|
|
101122
|
+
|
|
101123
|
+
static bool FindKey(const char *buf, idx_t len, idx_t &pos) {
|
|
101124
|
+
while (pos < len) {
|
|
101125
|
+
if (buf[pos] == ':') {
|
|
101126
|
+
return true;
|
|
101127
|
+
}
|
|
101128
|
+
pos++;
|
|
101129
|
+
}
|
|
101130
|
+
return false;
|
|
101131
|
+
}
|
|
101132
|
+
|
|
101133
|
+
static bool FindValue(const char *buf, idx_t len, idx_t &pos, Vector &varchar_child, idx_t &row_idx,
|
|
101134
|
+
ValidityMask *child_mask) {
|
|
101135
|
+
auto start_pos = pos;
|
|
101136
|
+
idx_t lvl = 0;
|
|
101137
|
+
while (pos < len) {
|
|
101138
|
+
if (buf[pos] == '"' || buf[pos] == '\'') {
|
|
101139
|
+
SkipToCloseQuotes(pos, buf, len);
|
|
101140
|
+
} else if (buf[pos] == '{') {
|
|
101141
|
+
SkipToClose(pos, buf, len, lvl, '}');
|
|
101142
|
+
} else if (buf[pos] == '[') {
|
|
101143
|
+
SkipToClose(pos, buf, len, lvl, ']');
|
|
101144
|
+
} else if (buf[pos] == ',' || buf[pos] == '}') {
|
|
101145
|
+
idx_t end_pos = StringTrim(buf, start_pos, pos);
|
|
101146
|
+
if ((end_pos - start_pos) == 4 && IsNull(buf, start_pos, varchar_child, row_idx)) {
|
|
101147
|
+
return true;
|
|
101148
|
+
}
|
|
101149
|
+
FlatVector::GetData<string_t>(varchar_child)[row_idx] =
|
|
101150
|
+
StringVector::AddString(varchar_child, buf + start_pos, end_pos - start_pos);
|
|
101151
|
+
child_mask->SetValid(row_idx); // any child not set to valid will remain invalid
|
|
101152
|
+
return true;
|
|
101153
|
+
}
|
|
101154
|
+
pos++;
|
|
101155
|
+
}
|
|
101156
|
+
return false;
|
|
101157
|
+
}
|
|
101158
|
+
|
|
101159
|
+
bool VectorStringToStruct::SplitStruct(string_t &input, std::vector<std::unique_ptr<Vector>> &varchar_vectors,
|
|
101160
|
+
idx_t &row_idx, string_map_t<idx_t> &child_names,
|
|
101161
|
+
std::vector<ValidityMask *> &child_masks) {
|
|
101162
|
+
const char *buf = input.GetDataUnsafe();
|
|
101163
|
+
idx_t len = input.GetSize();
|
|
101164
|
+
idx_t pos = 0;
|
|
101165
|
+
idx_t child_idx;
|
|
101166
|
+
|
|
101167
|
+
SkipWhitespace(buf, pos, len);
|
|
101168
|
+
if (pos == len || buf[pos] != '{') {
|
|
101169
|
+
return false;
|
|
101170
|
+
}
|
|
101171
|
+
SkipWhitespace(buf, ++pos, len);
|
|
101172
|
+
if (buf[pos] == '}') {
|
|
101173
|
+
pos++;
|
|
101174
|
+
} else {
|
|
101175
|
+
while (pos < len) {
|
|
101176
|
+
auto key_start = pos;
|
|
101177
|
+
if (!FindKey(buf, len, pos)) {
|
|
101178
|
+
return false;
|
|
101179
|
+
}
|
|
101180
|
+
auto key_end = StringTrim(buf, key_start, pos);
|
|
101181
|
+
string_t found_key(buf + key_start, key_end - key_start);
|
|
101182
|
+
|
|
101183
|
+
auto it = child_names.find(found_key);
|
|
101184
|
+
if (it == child_names.end()) {
|
|
101185
|
+
return false; // false key
|
|
101186
|
+
}
|
|
101187
|
+
child_idx = it->second;
|
|
101188
|
+
SkipWhitespace(buf, ++pos, len);
|
|
101189
|
+
if (!FindValue(buf, len, pos, *varchar_vectors[child_idx], row_idx, child_masks[child_idx])) {
|
|
101190
|
+
return false;
|
|
101191
|
+
}
|
|
101192
|
+
SkipWhitespace(buf, ++pos, len);
|
|
101193
|
+
}
|
|
101194
|
+
}
|
|
101195
|
+
SkipWhitespace(buf, pos, len);
|
|
101196
|
+
return (pos == len);
|
|
101197
|
+
}
|
|
101198
|
+
|
|
101034
101199
|
} // namespace duckdb
|
|
101035
101200
|
|
|
101036
101201
|
|
|
@@ -118794,6 +118959,20 @@ struct RegexpExtractBindData : public RegexpBaseBindData {
|
|
|
118794
118959
|
bool Equals(const FunctionData &other_p) const override;
|
|
118795
118960
|
};
|
|
118796
118961
|
|
|
118962
|
+
struct RegexLocalState : public FunctionLocalState {
|
|
118963
|
+
explicit RegexLocalState(RegexpBaseBindData &info)
|
|
118964
|
+
: constant_pattern(duckdb_re2::StringPiece(info.constant_string.c_str(), info.constant_string.size()),
|
|
118965
|
+
info.options) {
|
|
118966
|
+
D_ASSERT(info.constant_pattern);
|
|
118967
|
+
}
|
|
118968
|
+
|
|
118969
|
+
RE2 constant_pattern;
|
|
118970
|
+
};
|
|
118971
|
+
|
|
118972
|
+
unique_ptr<FunctionLocalState> RegexInitLocalState(const BoundFunctionExpression &expr, FunctionData *bind_data);
|
|
118973
|
+
unique_ptr<FunctionData> RegexpMatchesBind(ClientContext &context, ScalarFunction &bound_function,
|
|
118974
|
+
vector<unique_ptr<Expression>> &arguments);
|
|
118975
|
+
|
|
118797
118976
|
} // namespace duckdb
|
|
118798
118977
|
|
|
118799
118978
|
|
|
@@ -118833,18 +119012,7 @@ static inline duckdb_re2::StringPiece CreateStringPiece(string_t &input) {
|
|
|
118833
119012
|
return duckdb_re2::StringPiece(input.GetDataUnsafe(), input.GetSize());
|
|
118834
119013
|
}
|
|
118835
119014
|
|
|
118836
|
-
|
|
118837
|
-
explicit RegexLocalState(RegexpBaseBindData &info)
|
|
118838
|
-
: constant_pattern(duckdb_re2::StringPiece(info.constant_string.c_str(), info.constant_string.size()),
|
|
118839
|
-
info.options) {
|
|
118840
|
-
D_ASSERT(info.constant_pattern);
|
|
118841
|
-
}
|
|
118842
|
-
|
|
118843
|
-
RE2 constant_pattern;
|
|
118844
|
-
};
|
|
118845
|
-
|
|
118846
|
-
static unique_ptr<FunctionLocalState> RegexInitLocalState(const BoundFunctionExpression &expr,
|
|
118847
|
-
FunctionData *bind_data) {
|
|
119015
|
+
unique_ptr<FunctionLocalState> RegexInitLocalState(const BoundFunctionExpression &expr, FunctionData *bind_data) {
|
|
118848
119016
|
auto &info = (RegexpBaseBindData &)*bind_data;
|
|
118849
119017
|
if (info.constant_pattern) {
|
|
118850
119018
|
return make_unique<RegexLocalState>(info);
|
|
@@ -118951,8 +119119,8 @@ unique_ptr<FunctionData> RegexpMatchesBindData::Copy() const {
|
|
|
118951
119119
|
range_success);
|
|
118952
119120
|
}
|
|
118953
119121
|
|
|
118954
|
-
|
|
118955
|
-
|
|
119122
|
+
unique_ptr<FunctionData> RegexpMatchesBind(ClientContext &context, ScalarFunction &bound_function,
|
|
119123
|
+
vector<unique_ptr<Expression>> &arguments) {
|
|
118956
119124
|
// pattern is the second argument. If its constant, we can already prepare the pattern and store it for later.
|
|
118957
119125
|
D_ASSERT(arguments.size() == 2 || arguments.size() == 3);
|
|
118958
119126
|
RE2::Options options;
|
|
@@ -119398,188 +119566,106 @@ void ReverseFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
119398
119566
|
|
|
119399
119567
|
|
|
119400
119568
|
|
|
119401
|
-
|
|
119402
|
-
|
|
119403
|
-
|
|
119404
119569
|
namespace duckdb {
|
|
119405
119570
|
|
|
119406
|
-
struct
|
|
119407
|
-
|
|
119408
|
-
|
|
119409
|
-
}
|
|
119410
|
-
virtual ~StringSplitIterator() {
|
|
119571
|
+
struct StringSplitInput {
|
|
119572
|
+
StringSplitInput(Vector &result_list, Vector &result_child, idx_t offset)
|
|
119573
|
+
: result_list(result_list), result_child(result_child), offset(offset) {
|
|
119411
119574
|
}
|
|
119412
119575
|
|
|
119413
|
-
|
|
119576
|
+
Vector &result_list;
|
|
119577
|
+
Vector &result_child;
|
|
119578
|
+
idx_t offset;
|
|
119414
119579
|
|
|
119415
|
-
|
|
119416
|
-
|
|
119417
|
-
|
|
119418
|
-
|
|
119419
|
-
|
|
119420
|
-
|
|
119421
|
-
|
|
119580
|
+
void AddSplit(const char *split_data, idx_t split_size, idx_t list_idx) {
|
|
119581
|
+
auto list_entry = offset + list_idx;
|
|
119582
|
+
if (list_entry >= ListVector::GetListCapacity(result_list)) {
|
|
119583
|
+
ListVector::SetListSize(result_list, offset + list_idx);
|
|
119584
|
+
ListVector::Reserve(result_list, ListVector::GetListCapacity(result_list) * 2);
|
|
119585
|
+
}
|
|
119586
|
+
FlatVector::GetData<string_t>(result_child)[list_entry] =
|
|
119587
|
+
StringVector::AddString(result_child, split_data, split_size);
|
|
119422
119588
|
}
|
|
119423
|
-
|
|
119424
|
-
protected:
|
|
119425
|
-
idx_t start = 0; // end of last place a delim match was found
|
|
119426
|
-
idx_t offset = 0; // current position
|
|
119427
119589
|
};
|
|
119428
119590
|
|
|
119429
|
-
struct
|
|
119430
|
-
|
|
119431
|
-
|
|
119432
|
-
|
|
119433
|
-
}
|
|
119434
|
-
idx_t Next(const char *input) override {
|
|
119435
|
-
// special case: separate by empty delimiter
|
|
119591
|
+
struct RegularStringSplit {
|
|
119592
|
+
static idx_t Find(const char *input_data, idx_t input_size, const char *delim_data, idx_t delim_size,
|
|
119593
|
+
idx_t &match_size, void *data) {
|
|
119594
|
+
match_size = delim_size;
|
|
119436
119595
|
if (delim_size == 0) {
|
|
119437
|
-
|
|
119438
|
-
start = offset;
|
|
119439
|
-
return offset;
|
|
119440
|
-
}
|
|
119441
|
-
for (offset = start; HasNext(); offset++) {
|
|
119442
|
-
// potential delimiter match
|
|
119443
|
-
if (input[offset] == delim[0] && offset + delim_size <= size) {
|
|
119444
|
-
idx_t i;
|
|
119445
|
-
for (i = 1; i < delim_size; i++) {
|
|
119446
|
-
if (input[offset + i] != delim[i]) {
|
|
119447
|
-
break;
|
|
119448
|
-
}
|
|
119449
|
-
}
|
|
119450
|
-
// delimiter found: skip start over delimiter
|
|
119451
|
-
if (i == delim_size) {
|
|
119452
|
-
start = offset + delim_size;
|
|
119453
|
-
return offset;
|
|
119454
|
-
}
|
|
119455
|
-
}
|
|
119596
|
+
return 0;
|
|
119456
119597
|
}
|
|
119457
|
-
return
|
|
119598
|
+
return ContainsFun::Find((const unsigned char *)input_data, input_size, (const unsigned char *)delim_data,
|
|
119599
|
+
delim_size);
|
|
119458
119600
|
}
|
|
119601
|
+
};
|
|
119459
119602
|
|
|
119460
|
-
|
|
119461
|
-
const char *
|
|
119462
|
-
|
|
119603
|
+
struct ConstantRegexpStringSplit {
|
|
119604
|
+
static idx_t Find(const char *input_data, idx_t input_size, const char *delim_data, idx_t delim_size,
|
|
119605
|
+
idx_t &match_size, void *data) {
|
|
119606
|
+
D_ASSERT(data);
|
|
119607
|
+
auto regex = (duckdb_re2::RE2 *)data;
|
|
119608
|
+
duckdb_re2::StringPiece match;
|
|
119609
|
+
if (!regex->Match(duckdb_re2::StringPiece(input_data, input_size), 0, input_size, RE2::UNANCHORED, &match, 1)) {
|
|
119610
|
+
return DConstants::INVALID_INDEX;
|
|
119611
|
+
}
|
|
119612
|
+
match_size = match.size();
|
|
119613
|
+
return match.data() - input_data;
|
|
119614
|
+
}
|
|
119463
119615
|
};
|
|
119464
119616
|
|
|
119465
|
-
struct
|
|
119466
|
-
|
|
119467
|
-
|
|
119468
|
-
|
|
119469
|
-
|
|
119470
|
-
|
|
119471
|
-
delim_cps.push_back(utf8proc_codepoint(delim, cp_sz));
|
|
119617
|
+
struct RegexpStringSplit {
|
|
119618
|
+
static idx_t Find(const char *input_data, idx_t input_size, const char *delim_data, idx_t delim_size,
|
|
119619
|
+
idx_t &match_size, void *data) {
|
|
119620
|
+
duckdb_re2::RE2 regex(duckdb_re2::StringPiece(delim_data, delim_size));
|
|
119621
|
+
if (!regex.ok()) {
|
|
119622
|
+
throw InvalidInputException(regex.error());
|
|
119472
119623
|
}
|
|
119624
|
+
return ConstantRegexpStringSplit::Find(input_data, input_size, delim_data, delim_size, match_size, &regex);
|
|
119473
119625
|
}
|
|
119474
|
-
|
|
119475
|
-
|
|
119476
|
-
|
|
119477
|
-
|
|
119478
|
-
|
|
119479
|
-
|
|
119480
|
-
|
|
119481
|
-
|
|
119482
|
-
|
|
119483
|
-
|
|
119484
|
-
|
|
119485
|
-
|
|
119486
|
-
|
|
119487
|
-
|
|
119626
|
+
};
|
|
119627
|
+
|
|
119628
|
+
struct StringSplitter {
|
|
119629
|
+
template <class OP>
|
|
119630
|
+
static idx_t Split(string_t input, string_t delim, StringSplitInput &state, void *data) {
|
|
119631
|
+
auto input_data = input.GetDataUnsafe();
|
|
119632
|
+
auto input_size = input.GetSize();
|
|
119633
|
+
auto delim_data = delim.GetDataUnsafe();
|
|
119634
|
+
auto delim_size = delim.GetSize();
|
|
119635
|
+
idx_t list_idx = 0;
|
|
119636
|
+
while (input_size > 0) {
|
|
119637
|
+
idx_t match_size;
|
|
119638
|
+
auto pos = OP::Find(input_data, input_size, delim_data, delim_size, match_size, data);
|
|
119639
|
+
if (pos > input_size) {
|
|
119640
|
+
break;
|
|
119641
|
+
}
|
|
119642
|
+
if (match_size == 0 && pos == 0) {
|
|
119643
|
+
// special case: 0 length match and pos is 0
|
|
119644
|
+
// move to the next character
|
|
119645
|
+
for (pos++; pos < input_size; pos++) {
|
|
119646
|
+
if (LengthFun::IsCharacter(input_data[pos])) {
|
|
119488
119647
|
break;
|
|
119489
119648
|
}
|
|
119490
|
-
delim_offset += cp_sz;
|
|
119491
119649
|
}
|
|
119492
|
-
|
|
119493
|
-
|
|
119494
|
-
start = offset + delim_size;
|
|
119495
|
-
return offset;
|
|
119650
|
+
if (pos == input_size) {
|
|
119651
|
+
break;
|
|
119496
119652
|
}
|
|
119497
119653
|
}
|
|
119498
|
-
|
|
119499
|
-
|
|
119500
|
-
}
|
|
119501
|
-
|
|
119502
|
-
protected:
|
|
119503
|
-
vector<utf8proc_int32_t> delim_cps;
|
|
119504
|
-
size_t delim_size;
|
|
119505
|
-
};
|
|
119654
|
+
D_ASSERT(input_size >= pos + match_size);
|
|
119655
|
+
state.AddSplit(input_data, pos, list_idx);
|
|
119506
119656
|
|
|
119507
|
-
|
|
119508
|
-
|
|
119509
|
-
|
|
119510
|
-
: StringSplitIterator(input_size), re(move(re)), ascii_only(ascii_only) {
|
|
119511
|
-
}
|
|
119512
|
-
idx_t Next(const char *input) override {
|
|
119513
|
-
duckdb_re2::StringPiece input_sp(input, size);
|
|
119514
|
-
duckdb_re2::StringPiece match;
|
|
119515
|
-
if (re->Match(input_sp, start, size, RE2::UNANCHORED, &match, 1)) {
|
|
119516
|
-
offset = match.data() - input;
|
|
119517
|
-
// special case: 0 length match
|
|
119518
|
-
if (match.empty() && start < size) {
|
|
119519
|
-
if (ascii_only) {
|
|
119520
|
-
offset++;
|
|
119521
|
-
} else {
|
|
119522
|
-
offset = utf8proc_next_grapheme(input, size, offset);
|
|
119523
|
-
}
|
|
119524
|
-
start = offset;
|
|
119525
|
-
} else {
|
|
119526
|
-
start = offset + match.size();
|
|
119527
|
-
}
|
|
119528
|
-
} else {
|
|
119529
|
-
offset = size;
|
|
119657
|
+
list_idx++;
|
|
119658
|
+
input_data += (pos + match_size);
|
|
119659
|
+
input_size -= (pos + match_size);
|
|
119530
119660
|
}
|
|
119531
|
-
|
|
119661
|
+
state.AddSplit(input_data, input_size, list_idx);
|
|
119662
|
+
list_idx++;
|
|
119663
|
+
return list_idx;
|
|
119532
119664
|
}
|
|
119533
|
-
|
|
119534
|
-
protected:
|
|
119535
|
-
unique_ptr<RE2> re;
|
|
119536
|
-
bool ascii_only;
|
|
119537
119665
|
};
|
|
119538
119666
|
|
|
119539
|
-
|
|
119540
|
-
|
|
119541
|
-
if (iter.size == 0) {
|
|
119542
|
-
Value val = StringVector::AddString(ListVector::GetEntry(result), &input[0], 0);
|
|
119543
|
-
ListVector::PushBack(result, val);
|
|
119544
|
-
return;
|
|
119545
|
-
}
|
|
119546
|
-
while (iter.HasNext()) {
|
|
119547
|
-
idx_t start = iter.Start();
|
|
119548
|
-
idx_t end = iter.Next(input);
|
|
119549
|
-
size_t length = end - start;
|
|
119550
|
-
Value to_insert(StringVector::AddString(ListVector::GetEntry(result), &input[start], length));
|
|
119551
|
-
ListVector::PushBack(result, to_insert);
|
|
119552
|
-
}
|
|
119553
|
-
}
|
|
119554
|
-
|
|
119555
|
-
unique_ptr<Vector> BaseStringSplitFunction(string_t input, string_t delim, const bool regex) {
|
|
119556
|
-
const char *input_data = input.GetDataUnsafe();
|
|
119557
|
-
size_t input_size = input.GetSize();
|
|
119558
|
-
const char *delim_data = delim.GetDataUnsafe();
|
|
119559
|
-
size_t delim_size = delim.GetSize();
|
|
119560
|
-
|
|
119561
|
-
bool ascii_only = Utf8Proc::Analyze(input_data, input_size) == UnicodeType::ASCII;
|
|
119562
|
-
|
|
119563
|
-
auto list_type = LogicalType::LIST(LogicalType::VARCHAR);
|
|
119564
|
-
auto output = make_unique<Vector>(list_type);
|
|
119565
|
-
unique_ptr<StringSplitIterator> iter;
|
|
119566
|
-
if (regex) {
|
|
119567
|
-
auto re = make_unique<RE2>(duckdb_re2::StringPiece(delim_data, delim_size));
|
|
119568
|
-
if (!re->ok()) {
|
|
119569
|
-
throw Exception(re->error());
|
|
119570
|
-
}
|
|
119571
|
-
iter = make_unique_base<StringSplitIterator, RegexStringSplitIterator>(input_size, move(re), ascii_only);
|
|
119572
|
-
} else if (ascii_only) {
|
|
119573
|
-
iter = make_unique_base<StringSplitIterator, AsciiStringSplitIterator>(input_size, delim_data, delim_size);
|
|
119574
|
-
} else {
|
|
119575
|
-
iter = make_unique_base<StringSplitIterator, UnicodeStringSplitIterator>(input_size, delim_data, delim_size);
|
|
119576
|
-
}
|
|
119577
|
-
BaseStringSplitFunction(input_data, *iter, *output);
|
|
119578
|
-
|
|
119579
|
-
return output;
|
|
119580
|
-
}
|
|
119581
|
-
|
|
119582
|
-
static void StringSplitExecutor(DataChunk &args, ExpressionState &state, Vector &result, const bool regex) {
|
|
119667
|
+
template <class OP>
|
|
119668
|
+
static void StringSplitExecutor(DataChunk &args, ExpressionState &state, Vector &result, void *data = nullptr) {
|
|
119583
119669
|
UnifiedVectorFormat input_data;
|
|
119584
119670
|
args.data[0].ToUnifiedFormat(args.size(), input_data);
|
|
119585
119671
|
auto inputs = (string_t *)input_data.data;
|
|
@@ -119594,10 +119680,11 @@ static void StringSplitExecutor(DataChunk &args, ExpressionState &state, Vector
|
|
|
119594
119680
|
ListVector::SetListSize(result, 0);
|
|
119595
119681
|
|
|
119596
119682
|
auto list_struct_data = FlatVector::GetData<list_entry_t>(result);
|
|
119597
|
-
auto list_vector_type = LogicalType::LIST(LogicalType::VARCHAR);
|
|
119598
119683
|
|
|
119599
|
-
|
|
119684
|
+
// count all the splits and set up the list entries
|
|
119685
|
+
auto &child_entry = ListVector::GetEntry(result);
|
|
119600
119686
|
auto &result_mask = FlatVector::Validity(result);
|
|
119687
|
+
idx_t total_splits = 0;
|
|
119601
119688
|
for (idx_t i = 0; i < args.size(); i++) {
|
|
119602
119689
|
auto input_idx = input_data.sel->get_index(i);
|
|
119603
119690
|
auto delim_idx = delim_data.sel->get_index(i);
|
|
@@ -119605,36 +119692,43 @@ static void StringSplitExecutor(DataChunk &args, ExpressionState &state, Vector
|
|
|
119605
119692
|
result_mask.SetInvalid(i);
|
|
119606
119693
|
continue;
|
|
119607
119694
|
}
|
|
119608
|
-
|
|
119609
|
-
|
|
119610
|
-
unique_ptr<Vector> split_input;
|
|
119695
|
+
StringSplitInput split_input(result, child_entry, total_splits);
|
|
119611
119696
|
if (!delim_data.validity.RowIsValid(delim_idx)) {
|
|
119612
|
-
//
|
|
119613
|
-
split_input
|
|
119614
|
-
|
|
119615
|
-
|
|
119616
|
-
|
|
119617
|
-
|
|
119618
|
-
split_input = BaseStringSplitFunction(input, delim, regex);
|
|
119697
|
+
// delim is NULL: copy the complete entry
|
|
119698
|
+
split_input.AddSplit(inputs[input_idx].GetDataUnsafe(), inputs[input_idx].GetSize(), 0);
|
|
119699
|
+
list_struct_data[i].length = 1;
|
|
119700
|
+
list_struct_data[i].offset = total_splits;
|
|
119701
|
+
total_splits++;
|
|
119702
|
+
continue;
|
|
119619
119703
|
}
|
|
119620
|
-
|
|
119621
|
-
list_struct_data[i].
|
|
119622
|
-
|
|
119623
|
-
|
|
119704
|
+
auto list_length = StringSplitter::Split<OP>(inputs[input_idx], delims[delim_idx], split_input, data);
|
|
119705
|
+
list_struct_data[i].length = list_length;
|
|
119706
|
+
list_struct_data[i].offset = total_splits;
|
|
119707
|
+
total_splits += list_length;
|
|
119624
119708
|
}
|
|
119709
|
+
ListVector::SetListSize(result, total_splits);
|
|
119710
|
+
D_ASSERT(ListVector::GetListSize(result) == total_splits);
|
|
119625
119711
|
|
|
119626
|
-
D_ASSERT(ListVector::GetListSize(result) == total_len);
|
|
119627
119712
|
if (args.AllConstant()) {
|
|
119628
119713
|
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
119629
119714
|
}
|
|
119630
119715
|
}
|
|
119631
119716
|
|
|
119632
119717
|
static void StringSplitFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
119633
|
-
StringSplitExecutor(args, state, result,
|
|
119718
|
+
StringSplitExecutor<RegularStringSplit>(args, state, result, nullptr);
|
|
119634
119719
|
}
|
|
119635
119720
|
|
|
119636
119721
|
static void StringSplitRegexFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
119637
|
-
|
|
119722
|
+
auto &func_expr = (BoundFunctionExpression &)state.expr;
|
|
119723
|
+
auto &info = (RegexpMatchesBindData &)*func_expr.bind_info;
|
|
119724
|
+
if (info.constant_pattern) {
|
|
119725
|
+
// fast path: pre-compiled regex
|
|
119726
|
+
auto &lstate = (RegexLocalState &)*ExecuteFunctionState::GetFunctionState(state);
|
|
119727
|
+
StringSplitExecutor<ConstantRegexpStringSplit>(args, state, result, &lstate.constant_pattern);
|
|
119728
|
+
} else {
|
|
119729
|
+
// slow path: have to re-compile regex for every row
|
|
119730
|
+
StringSplitExecutor<RegexpStringSplit>(args, state, result);
|
|
119731
|
+
}
|
|
119638
119732
|
}
|
|
119639
119733
|
|
|
119640
119734
|
void StringSplitFun::RegisterFunction(BuiltinFunctions &set) {
|
|
@@ -119645,10 +119739,18 @@ void StringSplitFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
119645
119739
|
regular_fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
|
|
119646
119740
|
set.AddFunction({"string_split", "str_split", "string_to_array", "split"}, regular_fun);
|
|
119647
119741
|
|
|
119648
|
-
|
|
119649
|
-
|
|
119650
|
-
|
|
119651
|
-
|
|
119742
|
+
ScalarFunctionSet regexp_split("string_split_regex");
|
|
119743
|
+
ScalarFunction regex_fun({LogicalType::VARCHAR, LogicalType::VARCHAR}, varchar_list_type, StringSplitRegexFunction,
|
|
119744
|
+
RegexpMatchesBind, nullptr, nullptr, RegexInitLocalState, LogicalType::INVALID,
|
|
119745
|
+
FunctionSideEffects::NO_SIDE_EFFECTS, FunctionNullHandling::SPECIAL_HANDLING);
|
|
119746
|
+
regexp_split.AddFunction(regex_fun);
|
|
119747
|
+
// regexp options
|
|
119748
|
+
regex_fun.arguments.emplace_back(LogicalType::VARCHAR);
|
|
119749
|
+
regexp_split.AddFunction(regex_fun);
|
|
119750
|
+
for (auto &name : {"string_split_regex", "str_split_regex", "regexp_split_to_array"}) {
|
|
119751
|
+
regexp_split.name = name;
|
|
119752
|
+
set.AddFunction(regexp_split);
|
|
119753
|
+
}
|
|
119652
119754
|
}
|
|
119653
119755
|
|
|
119654
119756
|
} // namespace duckdb
|