duckdb 0.5.2-dev863.0 → 0.5.2-dev874.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +200 -27
- package/src/duckdb.hpp +685 -685
- package/src/parquet-amalgamation.cpp +37394 -37394
package/package.json
CHANGED
package/src/duckdb.cpp
CHANGED
|
@@ -16820,14 +16820,25 @@ struct ConcatFun {
|
|
|
16820
16820
|
static void RegisterFunction(BuiltinFunctions &set);
|
|
16821
16821
|
};
|
|
16822
16822
|
|
|
16823
|
-
struct ConcatWSFun {
|
|
16824
|
-
static void RegisterFunction(BuiltinFunctions &set);
|
|
16825
|
-
};
|
|
16826
|
-
|
|
16827
16823
|
struct LengthFun {
|
|
16828
16824
|
static void RegisterFunction(BuiltinFunctions &set);
|
|
16825
|
+
static inline bool IsCharacter(char c) {
|
|
16826
|
+
return (c & 0xc0) != 0x80;
|
|
16827
|
+
}
|
|
16828
|
+
|
|
16829
16829
|
template <class TA, class TR>
|
|
16830
16830
|
static inline TR Length(TA input) {
|
|
16831
|
+
auto input_data = input.GetDataUnsafe();
|
|
16832
|
+
auto input_length = input.GetSize();
|
|
16833
|
+
TR length = 0;
|
|
16834
|
+
for (idx_t i = 0; i < input_length; i++) {
|
|
16835
|
+
length += IsCharacter(input_data[i]);
|
|
16836
|
+
}
|
|
16837
|
+
return length;
|
|
16838
|
+
}
|
|
16839
|
+
|
|
16840
|
+
template <class TA, class TR>
|
|
16841
|
+
static inline TR GraphemeCount(TA input) {
|
|
16831
16842
|
auto input_data = input.GetDataUnsafe();
|
|
16832
16843
|
auto input_length = input.GetSize();
|
|
16833
16844
|
for (idx_t i = 0; i < input_length; i++) {
|
|
@@ -16881,7 +16892,8 @@ struct RegexpFun {
|
|
|
16881
16892
|
|
|
16882
16893
|
struct SubstringFun {
|
|
16883
16894
|
static void RegisterFunction(BuiltinFunctions &set);
|
|
16884
|
-
static string_t SubstringScalarFunction(Vector &result, string_t input, int64_t offset, int64_t length);
|
|
16895
|
+
static string_t SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length);
|
|
16896
|
+
static string_t SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length);
|
|
16885
16897
|
};
|
|
16886
16898
|
|
|
16887
16899
|
struct PrintfFun {
|
|
@@ -21114,7 +21126,7 @@ vector<string> LocalFileSystem::FetchFileWithoutGlob(const string &path, FileOpe
|
|
|
21114
21126
|
result.push_back(path);
|
|
21115
21127
|
} else if (!absolute_path) {
|
|
21116
21128
|
Value value;
|
|
21117
|
-
if (opener->TryGetCurrentSetting("file_search_path", value)) {
|
|
21129
|
+
if (opener && opener->TryGetCurrentSetting("file_search_path", value)) {
|
|
21118
21130
|
auto search_paths_str = value.ToString();
|
|
21119
21131
|
std::vector<std::string> search_paths = StringUtil::Split(search_paths_str, ',');
|
|
21120
21132
|
for (const auto &search_path : search_paths) {
|
|
@@ -21180,7 +21192,18 @@ vector<string> LocalFileSystem::Glob(const string &path, FileOpener *opener) {
|
|
|
21180
21192
|
if (absolute_path) {
|
|
21181
21193
|
// for absolute paths, we don't start by scanning the current directory
|
|
21182
21194
|
previous_directories.push_back(splits[0]);
|
|
21195
|
+
} else {
|
|
21196
|
+
// If file_search_path is set, use those paths as the first glob elements
|
|
21197
|
+
Value value;
|
|
21198
|
+
if (opener && opener->TryGetCurrentSetting("file_search_path", value)) {
|
|
21199
|
+
auto search_paths_str = value.ToString();
|
|
21200
|
+
std::vector<std::string> search_paths = StringUtil::Split(search_paths_str, ',');
|
|
21201
|
+
for (const auto &search_path : search_paths) {
|
|
21202
|
+
previous_directories.push_back(search_path);
|
|
21203
|
+
}
|
|
21204
|
+
}
|
|
21183
21205
|
}
|
|
21206
|
+
|
|
21184
21207
|
for (idx_t i = absolute_path ? 1 : 0; i < splits.size(); i++) {
|
|
21185
21208
|
bool is_last_chunk = i + 1 == splits.size();
|
|
21186
21209
|
bool has_glob = HasGlob(splits[i]);
|
|
@@ -104993,7 +105016,7 @@ list_entry_t SliceValue(Vector &result, list_entry_t input, int64_t begin, int64
|
|
|
104993
105016
|
template <>
|
|
104994
105017
|
string_t SliceValue(Vector &result, string_t input, int32_t begin, int32_t end) {
|
|
104995
105018
|
// one-based - zero has strange semantics
|
|
104996
|
-
return SubstringFun::SubstringScalarFunction(result, input, begin + 1, end - begin);
|
|
105019
|
+
return SubstringFun::SubstringUnicode(result, input, begin + 1, end - begin);
|
|
104997
105020
|
}
|
|
104998
105021
|
|
|
104999
105022
|
template <typename INPUT_TYPE, typename INDEX_TYPE>
|
|
@@ -106341,7 +106364,7 @@ static void ExecuteListExtract(Vector &result, Vector &list, Vector &offsets, co
|
|
|
106341
106364
|
static void ExecuteStringExtract(Vector &result, Vector &input_vector, Vector &subscript_vector, const idx_t count) {
|
|
106342
106365
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
106343
106366
|
input_vector, subscript_vector, result, count, [&](string_t input_string, int64_t subscript) {
|
|
106344
|
-
return SubstringFun::SubstringScalarFunction(result, input_string, subscript, 1);
|
|
106367
|
+
return SubstringFun::SubstringUnicode(result, input_string, subscript, 1);
|
|
106345
106368
|
});
|
|
106346
106369
|
}
|
|
106347
106370
|
|
|
@@ -113771,54 +113794,84 @@ void JaroWinklerFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
113771
113794
|
|
|
113772
113795
|
namespace duckdb {
|
|
113773
113796
|
|
|
113797
|
+
struct LeftRightUnicode {
|
|
113798
|
+
template <class TA, class TR>
|
|
113799
|
+
static inline TR Operation(TA input) {
|
|
113800
|
+
return LengthFun::Length<TA, TR>(input);
|
|
113801
|
+
}
|
|
113802
|
+
|
|
113803
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
113804
|
+
return SubstringFun::SubstringUnicode(result, input, offset, length);
|
|
113805
|
+
}
|
|
113806
|
+
};
|
|
113807
|
+
|
|
113808
|
+
struct LeftRightGrapheme {
|
|
113809
|
+
template <class TA, class TR>
|
|
113810
|
+
static inline TR Operation(TA input) {
|
|
113811
|
+
return LengthFun::GraphemeCount<TA, TR>(input);
|
|
113812
|
+
}
|
|
113813
|
+
|
|
113814
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
113815
|
+
return SubstringFun::SubstringGrapheme(result, input, offset, length);
|
|
113816
|
+
}
|
|
113817
|
+
};
|
|
113818
|
+
|
|
113819
|
+
template <class OP>
|
|
113774
113820
|
static string_t LeftScalarFunction(Vector &result, const string_t str, int64_t pos) {
|
|
113775
113821
|
if (pos >= 0) {
|
|
113776
|
-
return SubstringFun::SubstringScalarFunction(result, str, 1, pos);
|
|
113822
|
+
return OP::Substring(result, str, 1, pos);
|
|
113777
113823
|
}
|
|
113778
113824
|
|
|
113779
|
-
int64_t num_characters = LengthFun::Length<string_t, int64_t>(str);
|
|
113825
|
+
int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
|
|
113780
113826
|
pos = MaxValue<int64_t>(0, num_characters + pos);
|
|
113781
|
-
return SubstringFun::SubstringScalarFunction(result, str, 1, pos);
|
|
113827
|
+
return OP::Substring(result, str, 1, pos);
|
|
113782
113828
|
}
|
|
113783
113829
|
|
|
113830
|
+
template <class OP>
|
|
113784
113831
|
static void LeftFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
113785
113832
|
auto &str_vec = args.data[0];
|
|
113786
113833
|
auto &pos_vec = args.data[1];
|
|
113787
113834
|
|
|
113788
113835
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
113789
113836
|
str_vec, pos_vec, result, args.size(),
|
|
113790
|
-
[&](string_t str, int64_t pos) { return LeftScalarFunction(result, str, pos); });
|
|
113837
|
+
[&](string_t str, int64_t pos) { return LeftScalarFunction<OP>(result, str, pos); });
|
|
113791
113838
|
}
|
|
113792
113839
|
|
|
113793
113840
|
void LeftFun::RegisterFunction(BuiltinFunctions &set) {
|
|
113794
|
-
set.AddFunction(
|
|
113795
|
-
|
|
113841
|
+
set.AddFunction(ScalarFunction("left", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113842
|
+
LeftFunction<LeftRightUnicode>));
|
|
113843
|
+
set.AddFunction(ScalarFunction("left_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113844
|
+
LeftFunction<LeftRightGrapheme>));
|
|
113796
113845
|
}
|
|
113797
113846
|
|
|
113847
|
+
template <class OP>
|
|
113798
113848
|
static string_t RightScalarFunction(Vector &result, const string_t str, int64_t pos) {
|
|
113799
|
-
int64_t num_characters = LengthFun::Length<string_t, int64_t>(str);
|
|
113849
|
+
int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
|
|
113800
113850
|
if (pos >= 0) {
|
|
113801
113851
|
int64_t len = MinValue<int64_t>(num_characters, pos);
|
|
113802
113852
|
int64_t start = num_characters - len + 1;
|
|
113803
|
-
return SubstringFun::SubstringScalarFunction(result, str, start, len);
|
|
113853
|
+
return OP::Substring(result, str, start, len);
|
|
113804
113854
|
}
|
|
113805
113855
|
|
|
113806
113856
|
int64_t len = num_characters - MinValue<int64_t>(num_characters, -pos);
|
|
113807
113857
|
int64_t start = num_characters - len + 1;
|
|
113808
|
-
return SubstringFun::SubstringScalarFunction(result, str, start, len);
|
|
113858
|
+
return OP::Substring(result, str, start, len);
|
|
113809
113859
|
}
|
|
113810
113860
|
|
|
113861
|
+
template <class OP>
|
|
113811
113862
|
static void RightFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
113812
113863
|
auto &str_vec = args.data[0];
|
|
113813
113864
|
auto &pos_vec = args.data[1];
|
|
113814
113865
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
113815
113866
|
str_vec, pos_vec, result, args.size(),
|
|
113816
|
-
[&](string_t str, int64_t pos) { return RightScalarFunction(result, str, pos); });
|
|
113867
|
+
[&](string_t str, int64_t pos) { return RightScalarFunction<OP>(result, str, pos); });
|
|
113817
113868
|
}
|
|
113818
113869
|
|
|
113819
113870
|
void RightFun::RegisterFunction(BuiltinFunctions &set) {
|
|
113820
|
-
set.AddFunction(
|
|
113821
|
-
|
|
113871
|
+
set.AddFunction(ScalarFunction("right", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113872
|
+
RightFunction<LeftRightUnicode>));
|
|
113873
|
+
set.AddFunction(ScalarFunction("right_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
113874
|
+
RightFunction<LeftRightGrapheme>));
|
|
113822
113875
|
}
|
|
113823
113876
|
|
|
113824
113877
|
} // namespace duckdb
|
|
@@ -113833,7 +113886,7 @@ void RightFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
113833
113886
|
|
|
113834
113887
|
namespace duckdb {
|
|
113835
113888
|
|
|
113836
|
-
// length returns the
|
|
113889
|
+
// length returns the number of unicode codepoints
|
|
113837
113890
|
struct StringLengthOperator {
|
|
113838
113891
|
template <class TA, class TR>
|
|
113839
113892
|
static inline TR Operation(TA input) {
|
|
@@ -113841,6 +113894,13 @@ struct StringLengthOperator {
|
|
|
113841
113894
|
}
|
|
113842
113895
|
};
|
|
113843
113896
|
|
|
113897
|
+
struct GraphemeCountOperator {
|
|
113898
|
+
template <class TA, class TR>
|
|
113899
|
+
static inline TR Operation(TA input) {
|
|
113900
|
+
return LengthFun::GraphemeCount<TA, TR>(input);
|
|
113901
|
+
}
|
|
113902
|
+
};
|
|
113903
|
+
|
|
113844
113904
|
struct ArrayLengthOperator {
|
|
113845
113905
|
template <class TA, class TR>
|
|
113846
113906
|
static inline TR Operation(TA input) {
|
|
@@ -113911,6 +113971,12 @@ void LengthFun::RegisterFunction(BuiltinFunctions &set) {
|
|
|
113911
113971
|
length.name = "len";
|
|
113912
113972
|
set.AddFunction(length);
|
|
113913
113973
|
|
|
113974
|
+
ScalarFunctionSet length_grapheme("length_grapheme");
|
|
113975
|
+
length_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::BIGINT,
|
|
113976
|
+
ScalarFunction::UnaryFunction<string_t, int64_t, GraphemeCountOperator>,
|
|
113977
|
+
nullptr, nullptr, LengthPropagateStats));
|
|
113978
|
+
set.AddFunction(length_grapheme);
|
|
113979
|
+
|
|
113914
113980
|
ScalarFunctionSet array_length("array_length");
|
|
113915
113981
|
array_length.AddFunction(array_length_unary);
|
|
113916
113982
|
array_length.AddFunction(ScalarFunction(
|
|
@@ -116186,7 +116252,92 @@ string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t
|
|
|
116186
116252
|
return SubstringSlice(result, input_data, start, end - start);
|
|
116187
116253
|
}
|
|
116188
116254
|
|
|
116189
|
-
string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116255
|
+
string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116256
|
+
auto input_data = input.GetDataUnsafe();
|
|
116257
|
+
auto input_size = input.GetSize();
|
|
116258
|
+
|
|
116259
|
+
if (length == 0) {
|
|
116260
|
+
return SubstringEmptyString(result);
|
|
116261
|
+
}
|
|
116262
|
+
// first figure out which direction we need to scan
|
|
116263
|
+
idx_t start_pos;
|
|
116264
|
+
idx_t end_pos;
|
|
116265
|
+
if (offset < 0) {
|
|
116266
|
+
start_pos = 0;
|
|
116267
|
+
end_pos = DConstants::INVALID_INDEX;
|
|
116268
|
+
|
|
116269
|
+
// negative offset: scan backwards
|
|
116270
|
+
int64_t start, end;
|
|
116271
|
+
|
|
116272
|
+
// we express start and end as unicode codepoints from the back
|
|
116273
|
+
offset--;
|
|
116274
|
+
if (length < 0) {
|
|
116275
|
+
// negative length
|
|
116276
|
+
start = -offset - length;
|
|
116277
|
+
end = -offset;
|
|
116278
|
+
} else {
|
|
116279
|
+
// positive length
|
|
116280
|
+
start = -offset;
|
|
116281
|
+
end = -offset - length;
|
|
116282
|
+
}
|
|
116283
|
+
if (end <= 0) {
|
|
116284
|
+
end_pos = input_size;
|
|
116285
|
+
}
|
|
116286
|
+
int64_t current_character = 0;
|
|
116287
|
+
for (idx_t i = input_size; i > 0; i--) {
|
|
116288
|
+
if (LengthFun::IsCharacter(input_data[i - 1])) {
|
|
116289
|
+
current_character++;
|
|
116290
|
+
if (current_character == start) {
|
|
116291
|
+
start_pos = i;
|
|
116292
|
+
break;
|
|
116293
|
+
} else if (current_character == end) {
|
|
116294
|
+
end_pos = i;
|
|
116295
|
+
}
|
|
116296
|
+
}
|
|
116297
|
+
}
|
|
116298
|
+
if (end_pos == DConstants::INVALID_INDEX) {
|
|
116299
|
+
return SubstringEmptyString(result);
|
|
116300
|
+
}
|
|
116301
|
+
} else {
|
|
116302
|
+
start_pos = DConstants::INVALID_INDEX;
|
|
116303
|
+
end_pos = input_size;
|
|
116304
|
+
|
|
116305
|
+
// positive offset: scan forwards
|
|
116306
|
+
int64_t start, end;
|
|
116307
|
+
|
|
116308
|
+
// we express start and end as unicode codepoints from the front
|
|
116309
|
+
if (length < 0) {
|
|
116310
|
+
// negative length
|
|
116311
|
+
start = MaxValue<int64_t>(0, offset + length - 1);
|
|
116312
|
+
end = offset - 1;
|
|
116313
|
+
} else {
|
|
116314
|
+
// positive length
|
|
116315
|
+
start = MaxValue<int64_t>(0, offset - 1);
|
|
116316
|
+
end = offset + length - 1;
|
|
116317
|
+
}
|
|
116318
|
+
|
|
116319
|
+
int64_t current_character = 0;
|
|
116320
|
+
for (idx_t i = 0; i < input_size; i++) {
|
|
116321
|
+
if (LengthFun::IsCharacter(input_data[i])) {
|
|
116322
|
+
if (current_character == start) {
|
|
116323
|
+
start_pos = i;
|
|
116324
|
+
} else if (current_character == end) {
|
|
116325
|
+
end_pos = i;
|
|
116326
|
+
break;
|
|
116327
|
+
}
|
|
116328
|
+
current_character++;
|
|
116329
|
+
}
|
|
116330
|
+
}
|
|
116331
|
+
if (start_pos == DConstants::INVALID_INDEX || end == 0 || end <= start) {
|
|
116332
|
+
return SubstringEmptyString(result);
|
|
116333
|
+
}
|
|
116334
|
+
}
|
|
116335
|
+
D_ASSERT(end_pos >= start_pos);
|
|
116336
|
+
// after we have found these, we can slice the substring
|
|
116337
|
+
return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
|
|
116338
|
+
}
|
|
116339
|
+
|
|
116340
|
+
string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116190
116341
|
auto input_data = input.GetDataUnsafe();
|
|
116191
116342
|
auto input_size = input.GetSize();
|
|
116192
116343
|
|
|
@@ -116247,6 +116398,19 @@ string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, i
|
|
|
116247
116398
|
return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
|
|
116248
116399
|
}
|
|
116249
116400
|
|
|
116401
|
+
struct SubstringUnicodeOp {
|
|
116402
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116403
|
+
return SubstringFun::SubstringUnicode(result, input, offset, length);
|
|
116404
|
+
}
|
|
116405
|
+
};
|
|
116406
|
+
|
|
116407
|
+
struct SubstringGraphemeOp {
|
|
116408
|
+
static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
|
|
116409
|
+
return SubstringFun::SubstringGrapheme(result, input, offset, length);
|
|
116410
|
+
}
|
|
116411
|
+
};
|
|
116412
|
+
|
|
116413
|
+
template <class OP>
|
|
116250
116414
|
static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
|
116251
116415
|
auto &input_vector = args.data[0];
|
|
116252
116416
|
auto &offset_vector = args.data[1];
|
|
@@ -116256,13 +116420,12 @@ static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &r
|
|
|
116256
116420
|
TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
|
|
116257
116421
|
input_vector, offset_vector, length_vector, result, args.size(),
|
|
116258
116422
|
[&](string_t input_string, int64_t offset, int64_t length) {
|
|
116259
|
-
return SubstringFun::SubstringScalarFunction(result, input_string, offset, length);
|
|
116423
|
+
return OP::Substring(result, input_string, offset, length);
|
|
116260
116424
|
});
|
|
116261
116425
|
} else {
|
|
116262
116426
|
BinaryExecutor::Execute<string_t, int64_t, string_t>(
|
|
116263
116427
|
input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
|
|
116264
|
-
return SubstringFun::SubstringScalarFunction(result, input_string, offset,
|
|
116265
|
-
NumericLimits<int64_t>::Maximum() - offset);
|
|
116428
|
+
return OP::Substring(result, input_string, offset, NumericLimits<int64_t>::Maximum() - offset);
|
|
116266
116429
|
});
|
|
116267
116430
|
}
|
|
116268
116431
|
}
|
|
@@ -116304,13 +116467,23 @@ static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context
|
|
|
116304
116467
|
void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
|
|
116305
116468
|
ScalarFunctionSet substr("substring");
|
|
116306
116469
|
substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
|
|
116307
|
-
LogicalType::VARCHAR, SubstringFunction
|
|
116470
|
+
LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
|
|
116308
116471
|
SubstringPropagateStats));
|
|
116309
116472
|
substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
116310
|
-
SubstringFunction
|
|
116473
|
+
SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
|
|
116474
|
+
SubstringPropagateStats));
|
|
116311
116475
|
set.AddFunction(substr);
|
|
116312
116476
|
substr.name = "substr";
|
|
116313
116477
|
set.AddFunction(substr);
|
|
116478
|
+
|
|
116479
|
+
ScalarFunctionSet substr_grapheme("substring_grapheme");
|
|
116480
|
+
substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
|
|
116481
|
+
LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr,
|
|
116482
|
+
nullptr, SubstringPropagateStats));
|
|
116483
|
+
substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
|
|
116484
|
+
SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr,
|
|
116485
|
+
SubstringPropagateStats));
|
|
116486
|
+
set.AddFunction(substr_grapheme);
|
|
116314
116487
|
}
|
|
116315
116488
|
|
|
116316
116489
|
} // namespace duckdb
|