duckdb 0.5.2-dev863.0 → 0.5.2-dev870.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +188 -26
- package/src/duckdb.hpp +685 -685
- package/src/parquet-amalgamation.cpp +33813 -33813
package/package.json
CHANGED
package/src/duckdb.cpp
CHANGED
|
@@ -16820,14 +16820,25 @@ struct ConcatFun {
|
|
|
16820
16820
|
static void RegisterFunction(BuiltinFunctions &set);
|
|
16821
16821
|
};
|
|
16822
16822
|
|
|
16823
|
-
struct ConcatWSFun {
|
|
16824
|
-
static void RegisterFunction(BuiltinFunctions &set);
|
|
16825
|
-
};
|
|
16826
|
-
|
|
16827
16823
|
struct LengthFun {
|
|
16828
16824
|
static void RegisterFunction(BuiltinFunctions &set);
|
|
16825
|
+
static inline bool IsCharacter(char c) {
|
|
16826
|
+
return (c & 0xc0) != 0x80;
|
|
16827
|
+
}
|
|
16828
|
+
|
|
16829
16829
|
template <class TA, class TR>
|
|
16830
16830
|
static inline TR Length(TA input) {
|
|
16831
|
+
auto input_data = input.GetDataUnsafe();
|
|
16832
|
+
auto input_length = input.GetSize();
|
|
16833
|
+
TR length = 0;
|
|
16834
|
+
for (idx_t i = 0; i < input_length; i++) {
|
|
16835
|
+
length += IsCharacter(input_data[i]);
|
|
16836
|
+
}
|
|
16837
|
+
return length;
|
|
16838
|
+
}
|
|
16839
|
+
|
|
16840
|
+
template <class TA, class TR>
|
|
16841
|
+
static inline TR GraphemeCount(TA input) {
|
|
16831
16842
|
auto input_data = input.GetDataUnsafe();
|
|
16832
16843
|
auto input_length = input.GetSize();
|
|
16833
16844
|
for (idx_t i = 0; i < input_length; i++) {
|
|
@@ -16881,7 +16892,8 @@ struct RegexpFun {
|
|
|
16881
16892
|
|
|
16882
16893
|
struct SubstringFun {
|
|
16883
16894
|
static void RegisterFunction(BuiltinFunctions &set);
|
|
16884
|
-
static string_t
|
|
16895
|
+
static string_t SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length);
|
|
16896
|
+
static string_t SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length);
|
|
16885
16897
|
};
|
|
16886
16898
|
|
|
16887
16899
|
struct PrintfFun {
|
|
@@ -104993,7 +105005,7 @@ list_entry_t SliceValue(Vector &result, list_entry_t input, int64_t begin, int64
|
|
|
104993
105005
|
template <>
|
|
104994
105006
|
string_t SliceValue(Vector &result, string_t input, int32_t begin, int32_t end) {
|
|
104995
105007
|
// one-based - zero has strange semantics
|
|
104996
|
-
return SubstringFun::
|
|
105008
|
+
return SubstringFun::SubstringUnicode(result, input, begin + 1, end - begin);
|
|
104997
105009
|
}
|
|
104998
105010
|
|
|
104999
105011
|
template <typename INPUT_TYPE, typename INDEX_TYPE>
|
|
@@ -106341,7 +106353,7 @@ static void ExecuteListExtract(Vector &result, Vector &list, Vector &offsets, co
|
|
|
106341
106353
|
static void ExecuteStringExtract(Vector &result, Vector &input_vector, Vector &subscript_vector, const idx_t count) {
|
|
106342
106354
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
106343
106355
|
input_vector, subscript_vector, result, count, [&](string_t input_string, int64_t subscript) {
|
|
106344
|
-
return SubstringFun::
|
|
106356
|
+
return SubstringFun::SubstringUnicode(result, input_string, subscript, 1);
|
|
106345
106357
|
});
|
|
106346
106358
|
}
|
|
106347
106359
|
|
|
@@ -113771,54 +113783,84 @@ void JaroWinklerFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
113771
113783
|
|
|
113772
113784
|
namespace duckdb {
|
|
113773
113785
|
|
|
113786
|
+
struct LeftRightUnicode {
|
|
113787
|
+
template <class TA, class TR>
|
|
113788
|
+
static inline TR Operation(TA input) {
|
|
113789
|
+
return LengthFun::Length<TA, TR>(input);
|
|
113790
|
+
}
|
|
113791
|
+
|
|
113792
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
113793
|
+
return SubstringFun::SubstringUnicode(result, input, offset, length);
|
|
113794
|
+
}
|
|
113795
|
+
};
|
|
113796
|
+
|
|
113797
|
+
struct LeftRightGrapheme {
|
|
113798
|
+
template <class TA, class TR>
|
|
113799
|
+
static inline TR Operation(TA input) {
|
|
113800
|
+
return LengthFun::GraphemeCount<TA, TR>(input);
|
|
113801
|
+
}
|
|
113802
|
+
|
|
113803
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
113804
|
+
return SubstringFun::SubstringGrapheme(result, input, offset, length);
|
|
113805
|
+
}
|
|
113806
|
+
};
|
|
113807
|
+
|
|
113808
|
+
template <class OP>
|
|
113774
113809
|
static string_t LeftScalarFunction(Vector &result, const string_t str, int64_t pos) {
|
|
113775
113810
|
if (pos >= 0) {
|
|
113776
|
-
return
|
|
113811
|
+
return OP::Substring(result, str, 1, pos);
|
|
113777
113812
|
}
|
|
113778
113813
|
|
|
113779
|
-
int64_t num_characters =
|
|
113814
|
+
int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
|
|
113780
113815
|
pos = MaxValue<int64_t>(0, num_characters + pos);
|
|
113781
|
-
return
|
|
113816
|
+
return OP::Substring(result, str, 1, pos);
|
|
113782
113817
|
}
|
|
113783
113818
|
|
|
113819
|
+
template <class OP>
|
|
113784
113820
|
static void LeftFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
113785
113821
|
auto &str_vec = args.data[0];
|
|
113786
113822
|
auto &pos_vec = args.data[1];
|
|
113787
113823
|
|
|
113788
113824
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
113789
113825
|
str_vec, pos_vec, result, args.size(),
|
|
113790
|
-
[&](string_t str, int64_t pos) { return LeftScalarFunction(result, str, pos); });
|
|
113826
|
+
[&](string_t str, int64_t pos) { return LeftScalarFunction<OP>(result, str, pos); });
|
|
113791
113827
|
}
|
|
113792
113828
|
|
|
113793
113829
|
void LeftFun::RegisterFunction(BuiltinFunctions &set) {
|
|
113794
|
-
set.AddFunction(
|
|
113795
|
-
|
|
113830
|
+
set.AddFunction(ScalarFunction("left", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113831
|
+
LeftFunction<LeftRightUnicode>));
|
|
113832
|
+
set.AddFunction(ScalarFunction("left_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113833
|
+
LeftFunction<LeftRightGrapheme>));
|
|
113796
113834
|
}
|
|
113797
113835
|
|
|
113836
|
+
template <class OP>
|
|
113798
113837
|
static string_t RightScalarFunction(Vector &result, const string_t str, int64_t pos) {
|
|
113799
|
-
int64_t num_characters =
|
|
113838
|
+
int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
|
|
113800
113839
|
if (pos >= 0) {
|
|
113801
113840
|
int64_t len = MinValue<int64_t>(num_characters, pos);
|
|
113802
113841
|
int64_t start = num_characters - len + 1;
|
|
113803
|
-
return
|
|
113842
|
+
return OP::Substring(result, str, start, len);
|
|
113804
113843
|
}
|
|
113805
113844
|
|
|
113806
113845
|
int64_t len = num_characters - MinValue<int64_t>(num_characters, -pos);
|
|
113807
113846
|
int64_t start = num_characters - len + 1;
|
|
113808
|
-
return
|
|
113847
|
+
return OP::Substring(result, str, start, len);
|
|
113809
113848
|
}
|
|
113810
113849
|
|
|
113850
|
+
template <class OP>
|
|
113811
113851
|
static void RightFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
113812
113852
|
auto &str_vec = args.data[0];
|
|
113813
113853
|
auto &pos_vec = args.data[1];
|
|
113814
113854
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
113815
113855
|
str_vec, pos_vec, result, args.size(),
|
|
113816
|
-
[&](string_t str, int64_t pos) { return RightScalarFunction(result, str, pos); });
|
|
113856
|
+
[&](string_t str, int64_t pos) { return RightScalarFunction<OP>(result, str, pos); });
|
|
113817
113857
|
}
|
|
113818
113858
|
|
|
113819
113859
|
void RightFun::RegisterFunction(BuiltinFunctions &set) {
|
|
113820
|
-
set.AddFunction(
|
|
113821
|
-
|
|
113860
|
+
set.AddFunction(ScalarFunction("right", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113861
|
+
RightFunction<LeftRightUnicode>));
|
|
113862
|
+
set.AddFunction(ScalarFunction("right_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113863
|
+
RightFunction<LeftRightGrapheme>));
|
|
113822
113864
|
}
|
|
113823
113865
|
|
|
113824
113866
|
} // namespace duckdb
|
|
@@ -113833,7 +113875,7 @@ void RightFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
113833
113875
|
|
|
113834
113876
|
namespace duckdb {
|
|
113835
113877
|
|
|
113836
|
-
// length returns the
|
|
113878
|
+
// length returns the number of unicode codepoints
|
|
113837
113879
|
struct StringLengthOperator {
|
|
113838
113880
|
template <class TA, class TR>
|
|
113839
113881
|
static inline TR Operation(TA input) {
|
|
@@ -113841,6 +113883,13 @@ struct StringLengthOperator {
|
|
|
113841
113883
|
}
|
|
113842
113884
|
};
|
|
113843
113885
|
|
|
113886
|
+
struct GraphemeCountOperator {
|
|
113887
|
+
template <class TA, class TR>
|
|
113888
|
+
static inline TR Operation(TA input) {
|
|
113889
|
+
return LengthFun::GraphemeCount<TA, TR>(input);
|
|
113890
|
+
}
|
|
113891
|
+
};
|
|
113892
|
+
|
|
113844
113893
|
struct ArrayLengthOperator {
|
|
113845
113894
|
template <class TA, class TR>
|
|
113846
113895
|
static inline TR Operation(TA input) {
|
|
@@ -113911,6 +113960,12 @@ void LengthFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
113911
113960
|
length.name = "len";
|
|
113912
113961
|
set.AddFunction(length);
|
|
113913
113962
|
|
|
113963
|
+
ScalarFunctionSet length_grapheme("length_grapheme");
|
|
113964
|
+
length_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::BIGINT,
|
|
113965
|
+
ScalarFunction::UnaryFunction<string_t, int64_t, GraphemeCountOperator>,
|
|
113966
|
+
nullptr, nullptr, LengthPropagateStats));
|
|
113967
|
+
set.AddFunction(length_grapheme);
|
|
113968
|
+
|
|
113914
113969
|
ScalarFunctionSet array_length("array_length");
|
|
113915
113970
|
array_length.AddFunction(array_length_unary);
|
|
113916
113971
|
array_length.AddFunction(ScalarFunction(
|
|
@@ -116186,7 +116241,92 @@ string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t
|
|
|
116186
116241
|
return SubstringSlice(result, input_data, start, end - start);
|
|
116187
116242
|
}
|
|
116188
116243
|
|
|
116189
|
-
string_t SubstringFun::
|
|
116244
|
+
string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116245
|
+
auto input_data = input.GetDataUnsafe();
|
|
116246
|
+
auto input_size = input.GetSize();
|
|
116247
|
+
|
|
116248
|
+
if (length == 0) {
|
|
116249
|
+
return SubstringEmptyString(result);
|
|
116250
|
+
}
|
|
116251
|
+
// first figure out which direction we need to scan
|
|
116252
|
+
idx_t start_pos;
|
|
116253
|
+
idx_t end_pos;
|
|
116254
|
+
if (offset < 0) {
|
|
116255
|
+
start_pos = 0;
|
|
116256
|
+
end_pos = DConstants::INVALID_INDEX;
|
|
116257
|
+
|
|
116258
|
+
// negative offset: scan backwards
|
|
116259
|
+
int64_t start, end;
|
|
116260
|
+
|
|
116261
|
+
// we express start and end as unicode codepoints from the back
|
|
116262
|
+
offset--;
|
|
116263
|
+
if (length < 0) {
|
|
116264
|
+
// negative length
|
|
116265
|
+
start = -offset - length;
|
|
116266
|
+
end = -offset;
|
|
116267
|
+
} else {
|
|
116268
|
+
// positive length
|
|
116269
|
+
start = -offset;
|
|
116270
|
+
end = -offset - length;
|
|
116271
|
+
}
|
|
116272
|
+
if (end <= 0) {
|
|
116273
|
+
end_pos = input_size;
|
|
116274
|
+
}
|
|
116275
|
+
int64_t current_character = 0;
|
|
116276
|
+
for (idx_t i = input_size; i > 0; i--) {
|
|
116277
|
+
if (LengthFun::IsCharacter(input_data[i - 1])) {
|
|
116278
|
+
current_character++;
|
|
116279
|
+
if (current_character == start) {
|
|
116280
|
+
start_pos = i;
|
|
116281
|
+
break;
|
|
116282
|
+
} else if (current_character == end) {
|
|
116283
|
+
end_pos = i;
|
|
116284
|
+
}
|
|
116285
|
+
}
|
|
116286
|
+
}
|
|
116287
|
+
if (end_pos == DConstants::INVALID_INDEX) {
|
|
116288
|
+
return SubstringEmptyString(result);
|
|
116289
|
+
}
|
|
116290
|
+
} else {
|
|
116291
|
+
start_pos = DConstants::INVALID_INDEX;
|
|
116292
|
+
end_pos = input_size;
|
|
116293
|
+
|
|
116294
|
+
// positive offset: scan forwards
|
|
116295
|
+
int64_t start, end;
|
|
116296
|
+
|
|
116297
|
+
// we express start and end as unicode codepoints from the front
|
|
116298
|
+
if (length < 0) {
|
|
116299
|
+
// negative length
|
|
116300
|
+
start = MaxValue<int64_t>(0, offset + length - 1);
|
|
116301
|
+
end = offset - 1;
|
|
116302
|
+
} else {
|
|
116303
|
+
// positive length
|
|
116304
|
+
start = MaxValue<int64_t>(0, offset - 1);
|
|
116305
|
+
end = offset + length - 1;
|
|
116306
|
+
}
|
|
116307
|
+
|
|
116308
|
+
int64_t current_character = 0;
|
|
116309
|
+
for (idx_t i = 0; i < input_size; i++) {
|
|
116310
|
+
if (LengthFun::IsCharacter(input_data[i])) {
|
|
116311
|
+
if (current_character == start) {
|
|
116312
|
+
start_pos = i;
|
|
116313
|
+
} else if (current_character == end) {
|
|
116314
|
+
end_pos = i;
|
|
116315
|
+
break;
|
|
116316
|
+
}
|
|
116317
|
+
current_character++;
|
|
116318
|
+
}
|
|
116319
|
+
}
|
|
116320
|
+
if (start_pos == DConstants::INVALID_INDEX || end == 0 || end <= start) {
|
|
116321
|
+
return SubstringEmptyString(result);
|
|
116322
|
+
}
|
|
116323
|
+
}
|
|
116324
|
+
D_ASSERT(end_pos >= start_pos);
|
|
116325
|
+
// after we have found these, we can slice the substring
|
|
116326
|
+
return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
|
|
116327
|
+
}
|
|
116328
|
+
|
|
116329
|
+
string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116190
116330
|
auto input_data = input.GetDataUnsafe();
|
|
116191
116331
|
auto input_size = input.GetSize();
|
|
116192
116332
|
|
|
@@ -116247,6 +116387,19 @@ string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, i
|
|
|
116247
116387
|
return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
|
|
116248
116388
|
}
|
|
116249
116389
|
|
|
116390
|
+
struct SubstringUnicodeOp {
|
|
116391
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116392
|
+
return SubstringFun::SubstringUnicode(result, input, offset, length);
|
|
116393
|
+
}
|
|
116394
|
+
};
|
|
116395
|
+
|
|
116396
|
+
struct SubstringGraphemeOp {
|
|
116397
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116398
|
+
return SubstringFun::SubstringGrapheme(result, input, offset, length);
|
|
116399
|
+
}
|
|
116400
|
+
};
|
|
116401
|
+
|
|
116402
|
+
template <class OP>
|
|
116250
116403
|
static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
116251
116404
|
auto &input_vector = args.data[0];
|
|
116252
116405
|
auto &offset_vector = args.data[1];
|
|
@@ -116256,13 +116409,12 @@ static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &r
|
|
|
116256
116409
|
TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
|
|
116257
116410
|
input_vector, offset_vector, length_vector, result, args.size(),
|
|
116258
116411
|
[&](string_t input_string, int64_t offset, int64_t length) {
|
|
116259
|
-
return
|
|
116412
|
+
return OP::Substring(result, input_string, offset, length);
|
|
116260
116413
|
});
|
|
116261
116414
|
} else {
|
|
116262
116415
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
116263
116416
|
input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
|
|
116264
|
-
return
|
|
116265
|
-
NumericLimits<int64_t>::Maximum() - offset);
|
|
116417
|
+
return OP::Substring(result, input_string, offset, NumericLimits<int64_t>::Maximum() - offset);
|
|
116266
116418
|
});
|
|
116267
116419
|
}
|
|
116268
116420
|
}
|
|
@@ -116304,13 +116456,23 @@ static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context
|
|
|
116304
116456
|
void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
|
|
116305
116457
|
ScalarFunctionSet substr("substring");
|
|
116306
116458
|
substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
|
|
116307
|
-
LogicalType::VARCHAR, SubstringFunction
|
|
116459
|
+
LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
|
|
116308
116460
|
SubstringPropagateStats));
|
|
116309
116461
|
substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
116310
|
-
SubstringFunction
|
|
116462
|
+
SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
|
|
116463
|
+
SubstringPropagateStats));
|
|
116311
116464
|
set.AddFunction(substr);
|
|
116312
116465
|
substr.name = "substr";
|
|
116313
116466
|
set.AddFunction(substr);
|
|
116467
|
+
|
|
116468
|
+
ScalarFunctionSet substr_grapheme("substring_grapheme");
|
|
116469
|
+
substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
|
|
116470
|
+
LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr,
|
|
116471
|
+
nullptr, SubstringPropagateStats));
|
|
116472
|
+
substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
116473
|
+
SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr,
|
|
116474
|
+
SubstringPropagateStats));
|
|
116475
|
+
set.AddFunction(substr_grapheme);
|
|
116314
116476
|
}
|
|
116315
116477
|
|
|
116316
116478
|
} // namespace duckdb
|