duckdb 0.5.2-dev863.0 → 0.5.2-dev874.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
- "version": "0.5.2-dev863.0",
4
+ "version": "0.5.2-dev874.0",
5
5
  "description": "DuckDB node.js API",
6
6
  "gypfile": true,
7
7
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -16820,14 +16820,25 @@ struct ConcatFun {
16820
16820
  static void RegisterFunction(BuiltinFunctions &set);
16821
16821
  };
16822
16822
 
16823
- struct ConcatWSFun {
16824
- static void RegisterFunction(BuiltinFunctions &set);
16825
- };
16826
-
16827
16823
  struct LengthFun {
16828
16824
  static void RegisterFunction(BuiltinFunctions &set);
16825
+ static inline bool IsCharacter(char c) {
16826
+ return (c & 0xc0) != 0x80;
16827
+ }
16828
+
16829
16829
  template <class TA, class TR>
16830
16830
  static inline TR Length(TA input) {
16831
+ auto input_data = input.GetDataUnsafe();
16832
+ auto input_length = input.GetSize();
16833
+ TR length = 0;
16834
+ for (idx_t i = 0; i < input_length; i++) {
16835
+ length += IsCharacter(input_data[i]);
16836
+ }
16837
+ return length;
16838
+ }
16839
+
16840
+ template <class TA, class TR>
16841
+ static inline TR GraphemeCount(TA input) {
16831
16842
  auto input_data = input.GetDataUnsafe();
16832
16843
  auto input_length = input.GetSize();
16833
16844
  for (idx_t i = 0; i < input_length; i++) {
@@ -16881,7 +16892,8 @@ struct RegexpFun {
16881
16892
 
16882
16893
  struct SubstringFun {
16883
16894
  static void RegisterFunction(BuiltinFunctions &set);
16884
- static string_t SubstringScalarFunction(Vector &result, string_t input, int64_t offset, int64_t length);
16895
+ static string_t SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length);
16896
+ static string_t SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length);
16885
16897
  };
16886
16898
 
16887
16899
  struct PrintfFun {
@@ -21114,7 +21126,7 @@ vector<string> LocalFileSystem::FetchFileWithoutGlob(const string &path, FileOpe
21114
21126
  result.push_back(path);
21115
21127
  } else if (!absolute_path) {
21116
21128
  Value value;
21117
- if (opener->TryGetCurrentSetting("file_search_path", value)) {
21129
+ if (opener && opener->TryGetCurrentSetting("file_search_path", value)) {
21118
21130
  auto search_paths_str = value.ToString();
21119
21131
  std::vector<std::string> search_paths = StringUtil::Split(search_paths_str, ',');
21120
21132
  for (const auto &search_path : search_paths) {
@@ -21180,7 +21192,18 @@ vector<string> LocalFileSystem::Glob(const string &path, FileOpener *opener) {
21180
21192
  if (absolute_path) {
21181
21193
  // for absolute paths, we don't start by scanning the current directory
21182
21194
  previous_directories.push_back(splits[0]);
21195
+ } else {
21196
+ // If file_search_path is set, use those paths as the first glob elements
21197
+ Value value;
21198
+ if (opener && opener->TryGetCurrentSetting("file_search_path", value)) {
21199
+ auto search_paths_str = value.ToString();
21200
+ std::vector<std::string> search_paths = StringUtil::Split(search_paths_str, ',');
21201
+ for (const auto &search_path : search_paths) {
21202
+ previous_directories.push_back(search_path);
21203
+ }
21204
+ }
21183
21205
  }
21206
+
21184
21207
  for (idx_t i = absolute_path ? 1 : 0; i < splits.size(); i++) {
21185
21208
  bool is_last_chunk = i + 1 == splits.size();
21186
21209
  bool has_glob = HasGlob(splits[i]);
@@ -104993,7 +105016,7 @@ list_entry_t SliceValue(Vector &result, list_entry_t input, int64_t begin, int64
104993
105016
  template <>
104994
105017
  string_t SliceValue(Vector &result, string_t input, int32_t begin, int32_t end) {
104995
105018
  // one-based - zero has strange semantics
104996
- return SubstringFun::SubstringScalarFunction(result, input, begin + 1, end - begin);
105019
+ return SubstringFun::SubstringUnicode(result, input, begin + 1, end - begin);
104997
105020
  }
104998
105021
 
104999
105022
  template <typename INPUT_TYPE, typename INDEX_TYPE>
@@ -106341,7 +106364,7 @@ static void ExecuteListExtract(Vector &result, Vector &list, Vector &offsets, co
106341
106364
  static void ExecuteStringExtract(Vector &result, Vector &input_vector, Vector &subscript_vector, const idx_t count) {
106342
106365
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
106343
106366
  input_vector, subscript_vector, result, count, [&](string_t input_string, int64_t subscript) {
106344
- return SubstringFun::SubstringScalarFunction(result, input_string, subscript, 1);
106367
+ return SubstringFun::SubstringUnicode(result, input_string, subscript, 1);
106345
106368
  });
106346
106369
  }
106347
106370
 
@@ -113771,54 +113794,84 @@ void JaroWinklerFun::RegisterFunction(BuiltinFunctions &set) {
113771
113794
 
113772
113795
  namespace duckdb {
113773
113796
 
113797
+ struct LeftRightUnicode {
113798
+ template <class TA, class TR>
113799
+ static inline TR Operation(TA input) {
113800
+ return LengthFun::Length<TA, TR>(input);
113801
+ }
113802
+
113803
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
113804
+ return SubstringFun::SubstringUnicode(result, input, offset, length);
113805
+ }
113806
+ };
113807
+
113808
+ struct LeftRightGrapheme {
113809
+ template <class TA, class TR>
113810
+ static inline TR Operation(TA input) {
113811
+ return LengthFun::GraphemeCount<TA, TR>(input);
113812
+ }
113813
+
113814
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
113815
+ return SubstringFun::SubstringGrapheme(result, input, offset, length);
113816
+ }
113817
+ };
113818
+
113819
+ template <class OP>
113774
113820
  static string_t LeftScalarFunction(Vector &result, const string_t str, int64_t pos) {
113775
113821
  if (pos >= 0) {
113776
- return SubstringFun::SubstringScalarFunction(result, str, 1, pos);
113822
+ return OP::Substring(result, str, 1, pos);
113777
113823
  }
113778
113824
 
113779
- int64_t num_characters = LengthFun::Length<string_t, int64_t>(str);
113825
+ int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
113780
113826
  pos = MaxValue<int64_t>(0, num_characters + pos);
113781
- return SubstringFun::SubstringScalarFunction(result, str, 1, pos);
113827
+ return OP::Substring(result, str, 1, pos);
113782
113828
  }
113783
113829
 
113830
+ template <class OP>
113784
113831
  static void LeftFunction(DataChunk &args, ExpressionState &state, Vector &result) {
113785
113832
  auto &str_vec = args.data[0];
113786
113833
  auto &pos_vec = args.data[1];
113787
113834
 
113788
113835
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
113789
113836
  str_vec, pos_vec, result, args.size(),
113790
- [&](string_t str, int64_t pos) { return LeftScalarFunction(result, str, pos); });
113837
+ [&](string_t str, int64_t pos) { return LeftScalarFunction<OP>(result, str, pos); });
113791
113838
  }
113792
113839
 
113793
113840
  void LeftFun::RegisterFunction(BuiltinFunctions &set) {
113794
- set.AddFunction(
113795
- ScalarFunction("left", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR, LeftFunction));
113841
+ set.AddFunction(ScalarFunction("left", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113842
+ LeftFunction<LeftRightUnicode>));
113843
+ set.AddFunction(ScalarFunction("left_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113844
+ LeftFunction<LeftRightGrapheme>));
113796
113845
  }
113797
113846
 
113847
+ template <class OP>
113798
113848
  static string_t RightScalarFunction(Vector &result, const string_t str, int64_t pos) {
113799
- int64_t num_characters = LengthFun::Length<string_t, int64_t>(str);
113849
+ int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
113800
113850
  if (pos >= 0) {
113801
113851
  int64_t len = MinValue<int64_t>(num_characters, pos);
113802
113852
  int64_t start = num_characters - len + 1;
113803
- return SubstringFun::SubstringScalarFunction(result, str, start, len);
113853
+ return OP::Substring(result, str, start, len);
113804
113854
  }
113805
113855
 
113806
113856
  int64_t len = num_characters - MinValue<int64_t>(num_characters, -pos);
113807
113857
  int64_t start = num_characters - len + 1;
113808
- return SubstringFun::SubstringScalarFunction(result, str, start, len);
113858
+ return OP::Substring(result, str, start, len);
113809
113859
  }
113810
113860
 
113861
+ template <class OP>
113811
113862
  static void RightFunction(DataChunk &args, ExpressionState &state, Vector &result) {
113812
113863
  auto &str_vec = args.data[0];
113813
113864
  auto &pos_vec = args.data[1];
113814
113865
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
113815
113866
  str_vec, pos_vec, result, args.size(),
113816
- [&](string_t str, int64_t pos) { return RightScalarFunction(result, str, pos); });
113867
+ [&](string_t str, int64_t pos) { return RightScalarFunction<OP>(result, str, pos); });
113817
113868
  }
113818
113869
 
113819
113870
  void RightFun::RegisterFunction(BuiltinFunctions &set) {
113820
- set.AddFunction(
113821
- ScalarFunction("right", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR, RightFunction));
113871
+ set.AddFunction(ScalarFunction("right", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113872
+ RightFunction<LeftRightUnicode>));
113873
+ set.AddFunction(ScalarFunction("right_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113874
+ RightFunction<LeftRightGrapheme>));
113822
113875
  }
113823
113876
 
113824
113877
  } // namespace duckdb
@@ -113833,7 +113886,7 @@ void RightFun::RegisterFunction(BuiltinFunctions &set) {
113833
113886
 
113834
113887
  namespace duckdb {
113835
113888
 
113836
- // length returns the size in characters
113889
+ // length returns the number of unicode codepoints
113837
113890
  struct StringLengthOperator {
113838
113891
  template <class TA, class TR>
113839
113892
  static inline TR Operation(TA input) {
@@ -113841,6 +113894,13 @@ struct StringLengthOperator {
113841
113894
  }
113842
113895
  };
113843
113896
 
113897
+ struct GraphemeCountOperator {
113898
+ template <class TA, class TR>
113899
+ static inline TR Operation(TA input) {
113900
+ return LengthFun::GraphemeCount<TA, TR>(input);
113901
+ }
113902
+ };
113903
+
113844
113904
  struct ArrayLengthOperator {
113845
113905
  template <class TA, class TR>
113846
113906
  static inline TR Operation(TA input) {
@@ -113911,6 +113971,12 @@ void LengthFun::RegisterFunction(BuiltinFunctions &set) {
113911
113971
  length.name = "len";
113912
113972
  set.AddFunction(length);
113913
113973
 
113974
+ ScalarFunctionSet length_grapheme("length_grapheme");
113975
+ length_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::BIGINT,
113976
+ ScalarFunction::UnaryFunction<string_t, int64_t, GraphemeCountOperator>,
113977
+ nullptr, nullptr, LengthPropagateStats));
113978
+ set.AddFunction(length_grapheme);
113979
+
113914
113980
  ScalarFunctionSet array_length("array_length");
113915
113981
  array_length.AddFunction(array_length_unary);
113916
113982
  array_length.AddFunction(ScalarFunction(
@@ -116186,7 +116252,92 @@ string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t
116186
116252
  return SubstringSlice(result, input_data, start, end - start);
116187
116253
  }
116188
116254
 
116189
- string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, int64_t offset, int64_t length) {
116255
+ string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) {
116256
+ auto input_data = input.GetDataUnsafe();
116257
+ auto input_size = input.GetSize();
116258
+
116259
+ if (length == 0) {
116260
+ return SubstringEmptyString(result);
116261
+ }
116262
+ // first figure out which direction we need to scan
116263
+ idx_t start_pos;
116264
+ idx_t end_pos;
116265
+ if (offset < 0) {
116266
+ start_pos = 0;
116267
+ end_pos = DConstants::INVALID_INDEX;
116268
+
116269
+ // negative offset: scan backwards
116270
+ int64_t start, end;
116271
+
116272
+ // we express start and end as unicode codepoints from the back
116273
+ offset--;
116274
+ if (length < 0) {
116275
+ // negative length
116276
+ start = -offset - length;
116277
+ end = -offset;
116278
+ } else {
116279
+ // positive length
116280
+ start = -offset;
116281
+ end = -offset - length;
116282
+ }
116283
+ if (end <= 0) {
116284
+ end_pos = input_size;
116285
+ }
116286
+ int64_t current_character = 0;
116287
+ for (idx_t i = input_size; i > 0; i--) {
116288
+ if (LengthFun::IsCharacter(input_data[i - 1])) {
116289
+ current_character++;
116290
+ if (current_character == start) {
116291
+ start_pos = i;
116292
+ break;
116293
+ } else if (current_character == end) {
116294
+ end_pos = i;
116295
+ }
116296
+ }
116297
+ }
116298
+ if (end_pos == DConstants::INVALID_INDEX) {
116299
+ return SubstringEmptyString(result);
116300
+ }
116301
+ } else {
116302
+ start_pos = DConstants::INVALID_INDEX;
116303
+ end_pos = input_size;
116304
+
116305
+ // positive offset: scan forwards
116306
+ int64_t start, end;
116307
+
116308
+ // we express start and end as unicode codepoints from the front
116309
+ if (length < 0) {
116310
+ // negative length
116311
+ start = MaxValue<int64_t>(0, offset + length - 1);
116312
+ end = offset - 1;
116313
+ } else {
116314
+ // positive length
116315
+ start = MaxValue<int64_t>(0, offset - 1);
116316
+ end = offset + length - 1;
116317
+ }
116318
+
116319
+ int64_t current_character = 0;
116320
+ for (idx_t i = 0; i < input_size; i++) {
116321
+ if (LengthFun::IsCharacter(input_data[i])) {
116322
+ if (current_character == start) {
116323
+ start_pos = i;
116324
+ } else if (current_character == end) {
116325
+ end_pos = i;
116326
+ break;
116327
+ }
116328
+ current_character++;
116329
+ }
116330
+ }
116331
+ if (start_pos == DConstants::INVALID_INDEX || end == 0 || end <= start) {
116332
+ return SubstringEmptyString(result);
116333
+ }
116334
+ }
116335
+ D_ASSERT(end_pos >= start_pos);
116336
+ // after we have found these, we can slice the substring
116337
+ return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
116338
+ }
116339
+
116340
+ string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) {
116190
116341
  auto input_data = input.GetDataUnsafe();
116191
116342
  auto input_size = input.GetSize();
116192
116343
 
@@ -116247,6 +116398,19 @@ string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, i
116247
116398
  return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
116248
116399
  }
116249
116400
 
116401
+ struct SubstringUnicodeOp {
116402
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
116403
+ return SubstringFun::SubstringUnicode(result, input, offset, length);
116404
+ }
116405
+ };
116406
+
116407
+ struct SubstringGraphemeOp {
116408
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
116409
+ return SubstringFun::SubstringGrapheme(result, input, offset, length);
116410
+ }
116411
+ };
116412
+
116413
+ template <class OP>
116250
116414
  static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
116251
116415
  auto &input_vector = args.data[0];
116252
116416
  auto &offset_vector = args.data[1];
@@ -116256,13 +116420,12 @@ static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &r
116256
116420
  TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
116257
116421
  input_vector, offset_vector, length_vector, result, args.size(),
116258
116422
  [&](string_t input_string, int64_t offset, int64_t length) {
116259
- return SubstringFun::SubstringScalarFunction(result, input_string, offset, length);
116423
+ return OP::Substring(result, input_string, offset, length);
116260
116424
  });
116261
116425
  } else {
116262
116426
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
116263
116427
  input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
116264
- return SubstringFun::SubstringScalarFunction(result, input_string, offset,
116265
- NumericLimits<int64_t>::Maximum() - offset);
116428
+ return OP::Substring(result, input_string, offset, NumericLimits<int64_t>::Maximum() - offset);
116266
116429
  });
116267
116430
  }
116268
116431
  }
@@ -116304,13 +116467,23 @@ static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context
116304
116467
  void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
116305
116468
  ScalarFunctionSet substr("substring");
116306
116469
  substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
116307
- LogicalType::VARCHAR, SubstringFunction, nullptr, nullptr,
116470
+ LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
116308
116471
  SubstringPropagateStats));
116309
116472
  substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
116310
- SubstringFunction, nullptr, nullptr, SubstringPropagateStats));
116473
+ SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
116474
+ SubstringPropagateStats));
116311
116475
  set.AddFunction(substr);
116312
116476
  substr.name = "substr";
116313
116477
  set.AddFunction(substr);
116478
+
116479
+ ScalarFunctionSet substr_grapheme("substring_grapheme");
116480
+ substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
116481
+ LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr,
116482
+ nullptr, SubstringPropagateStats));
116483
+ substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
116484
+ SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr,
116485
+ SubstringPropagateStats));
116486
+ set.AddFunction(substr_grapheme);
116314
116487
  }
116315
116488
 
116316
116489
  } // namespace duckdb