duckdb 0.5.2-dev863.0 → 0.5.2-dev870.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
- "version": "0.5.2-dev863.0",
4
+ "version": "0.5.2-dev870.0",
5
5
  "description": "DuckDB node.js API",
6
6
  "gypfile": true,
7
7
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -16820,14 +16820,25 @@ struct ConcatFun {
16820
16820
  static void RegisterFunction(BuiltinFunctions &set);
16821
16821
  };
16822
16822
 
16823
- struct ConcatWSFun {
16824
- static void RegisterFunction(BuiltinFunctions &set);
16825
- };
16826
-
16827
16823
  struct LengthFun {
16828
16824
  static void RegisterFunction(BuiltinFunctions &set);
16825
+ static inline bool IsCharacter(char c) {
16826
+ return (c & 0xc0) != 0x80;
16827
+ }
16828
+
16829
16829
  template <class TA, class TR>
16830
16830
  static inline TR Length(TA input) {
16831
+ auto input_data = input.GetDataUnsafe();
16832
+ auto input_length = input.GetSize();
16833
+ TR length = 0;
16834
+ for (idx_t i = 0; i < input_length; i++) {
16835
+ length += IsCharacter(input_data[i]);
16836
+ }
16837
+ return length;
16838
+ }
16839
+
16840
+ template <class TA, class TR>
16841
+ static inline TR GraphemeCount(TA input) {
16831
16842
  auto input_data = input.GetDataUnsafe();
16832
16843
  auto input_length = input.GetSize();
16833
16844
  for (idx_t i = 0; i < input_length; i++) {
@@ -16881,7 +16892,8 @@ struct RegexpFun {
16881
16892
 
16882
16893
  struct SubstringFun {
16883
16894
  static void RegisterFunction(BuiltinFunctions &set);
16884
- static string_t SubstringScalarFunction(Vector &result, string_t input, int64_t offset, int64_t length);
16895
+ static string_t SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length);
16896
+ static string_t SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length);
16885
16897
  };
16886
16898
 
16887
16899
  struct PrintfFun {
@@ -104993,7 +105005,7 @@ list_entry_t SliceValue(Vector &result, list_entry_t input, int64_t begin, int64
104993
105005
  template <>
104994
105006
  string_t SliceValue(Vector &result, string_t input, int32_t begin, int32_t end) {
104995
105007
  // one-based - zero has strange semantics
104996
- return SubstringFun::SubstringScalarFunction(result, input, begin + 1, end - begin);
105008
+ return SubstringFun::SubstringUnicode(result, input, begin + 1, end - begin);
104997
105009
  }
104998
105010
 
104999
105011
  template <typename INPUT_TYPE, typename INDEX_TYPE>
@@ -106341,7 +106353,7 @@ static void ExecuteListExtract(Vector &result, Vector &list, Vector &offsets, co
106341
106353
  static void ExecuteStringExtract(Vector &result, Vector &input_vector, Vector &subscript_vector, const idx_t count) {
106342
106354
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
106343
106355
  input_vector, subscript_vector, result, count, [&](string_t input_string, int64_t subscript) {
106344
- return SubstringFun::SubstringScalarFunction(result, input_string, subscript, 1);
106356
+ return SubstringFun::SubstringUnicode(result, input_string, subscript, 1);
106345
106357
  });
106346
106358
  }
106347
106359
 
@@ -113771,54 +113783,84 @@ void JaroWinklerFun::RegisterFunction(BuiltinFunctions &set) {
113771
113783
 
113772
113784
  namespace duckdb {
113773
113785
 
113786
+ struct LeftRightUnicode {
113787
+ template <class TA, class TR>
113788
+ static inline TR Operation(TA input) {
113789
+ return LengthFun::Length<TA, TR>(input);
113790
+ }
113791
+
113792
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
113793
+ return SubstringFun::SubstringUnicode(result, input, offset, length);
113794
+ }
113795
+ };
113796
+
113797
+ struct LeftRightGrapheme {
113798
+ template <class TA, class TR>
113799
+ static inline TR Operation(TA input) {
113800
+ return LengthFun::GraphemeCount<TA, TR>(input);
113801
+ }
113802
+
113803
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
113804
+ return SubstringFun::SubstringGrapheme(result, input, offset, length);
113805
+ }
113806
+ };
113807
+
113808
+ template <class OP>
113774
113809
  static string_t LeftScalarFunction(Vector &result, const string_t str, int64_t pos) {
113775
113810
  if (pos >= 0) {
113776
- return SubstringFun::SubstringScalarFunction(result, str, 1, pos);
113811
+ return OP::Substring(result, str, 1, pos);
113777
113812
  }
113778
113813
 
113779
- int64_t num_characters = LengthFun::Length<string_t, int64_t>(str);
113814
+ int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
113780
113815
  pos = MaxValue<int64_t>(0, num_characters + pos);
113781
- return SubstringFun::SubstringScalarFunction(result, str, 1, pos);
113816
+ return OP::Substring(result, str, 1, pos);
113782
113817
  }
113783
113818
 
113819
+ template <class OP>
113784
113820
  static void LeftFunction(DataChunk &args, ExpressionState &state, Vector &result) {
113785
113821
  auto &str_vec = args.data[0];
113786
113822
  auto &pos_vec = args.data[1];
113787
113823
 
113788
113824
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
113789
113825
  str_vec, pos_vec, result, args.size(),
113790
- [&](string_t str, int64_t pos) { return LeftScalarFunction(result, str, pos); });
113826
+ [&](string_t str, int64_t pos) { return LeftScalarFunction<OP>(result, str, pos); });
113791
113827
  }
113792
113828
 
113793
113829
  void LeftFun::RegisterFunction(BuiltinFunctions &set) {
113794
- set.AddFunction(
113795
- ScalarFunction("left", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR, LeftFunction));
113830
+ set.AddFunction(ScalarFunction("left", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113831
+ LeftFunction<LeftRightUnicode>));
113832
+ set.AddFunction(ScalarFunction("left_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113833
+ LeftFunction<LeftRightGrapheme>));
113796
113834
  }
113797
113835
 
113836
+ template <class OP>
113798
113837
  static string_t RightScalarFunction(Vector &result, const string_t str, int64_t pos) {
113799
- int64_t num_characters = LengthFun::Length<string_t, int64_t>(str);
113838
+ int64_t num_characters = OP::template Operation<string_t, int64_t>(str);
113800
113839
  if (pos >= 0) {
113801
113840
  int64_t len = MinValue<int64_t>(num_characters, pos);
113802
113841
  int64_t start = num_characters - len + 1;
113803
- return SubstringFun::SubstringScalarFunction(result, str, start, len);
113842
+ return OP::Substring(result, str, start, len);
113804
113843
  }
113805
113844
 
113806
113845
  int64_t len = num_characters - MinValue<int64_t>(num_characters, -pos);
113807
113846
  int64_t start = num_characters - len + 1;
113808
- return SubstringFun::SubstringScalarFunction(result, str, start, len);
113847
+ return OP::Substring(result, str, start, len);
113809
113848
  }
113810
113849
 
113850
+ template <class OP>
113811
113851
  static void RightFunction(DataChunk &args, ExpressionState &state, Vector &result) {
113812
113852
  auto &str_vec = args.data[0];
113813
113853
  auto &pos_vec = args.data[1];
113814
113854
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
113815
113855
  str_vec, pos_vec, result, args.size(),
113816
- [&](string_t str, int64_t pos) { return RightScalarFunction(result, str, pos); });
113856
+ [&](string_t str, int64_t pos) { return RightScalarFunction<OP>(result, str, pos); });
113817
113857
  }
113818
113858
 
113819
113859
  void RightFun::RegisterFunction(BuiltinFunctions &set) {
113820
- set.AddFunction(
113821
- ScalarFunction("right", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR, RightFunction));
113860
+ set.AddFunction(ScalarFunction("right", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113861
+ RightFunction<LeftRightUnicode>));
113862
+ set.AddFunction(ScalarFunction("right_grapheme", {LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
113863
+ RightFunction<LeftRightGrapheme>));
113822
113864
  }
113823
113865
 
113824
113866
  } // namespace duckdb
@@ -113833,7 +113875,7 @@ void RightFun::RegisterFunction(BuiltinFunctions &set) {
113833
113875
 
113834
113876
  namespace duckdb {
113835
113877
 
113836
- // length returns the size in characters
113878
+ // length returns the number of unicode codepoints
113837
113879
  struct StringLengthOperator {
113838
113880
  template <class TA, class TR>
113839
113881
  static inline TR Operation(TA input) {
@@ -113841,6 +113883,13 @@ struct StringLengthOperator {
113841
113883
  }
113842
113884
  };
113843
113885
 
113886
+ struct GraphemeCountOperator {
113887
+ template <class TA, class TR>
113888
+ static inline TR Operation(TA input) {
113889
+ return LengthFun::GraphemeCount<TA, TR>(input);
113890
+ }
113891
+ };
113892
+
113844
113893
  struct ArrayLengthOperator {
113845
113894
  template <class TA, class TR>
113846
113895
  static inline TR Operation(TA input) {
@@ -113911,6 +113960,12 @@ void LengthFun::RegisterFunction(BuiltinFunctions &set) {
113911
113960
  length.name = "len";
113912
113961
  set.AddFunction(length);
113913
113962
 
113963
+ ScalarFunctionSet length_grapheme("length_grapheme");
113964
+ length_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR}, LogicalType::BIGINT,
113965
+ ScalarFunction::UnaryFunction<string_t, int64_t, GraphemeCountOperator>,
113966
+ nullptr, nullptr, LengthPropagateStats));
113967
+ set.AddFunction(length_grapheme);
113968
+
113914
113969
  ScalarFunctionSet array_length("array_length");
113915
113970
  array_length.AddFunction(array_length_unary);
113916
113971
  array_length.AddFunction(ScalarFunction(
@@ -116186,7 +116241,92 @@ string_t SubstringASCII(Vector &result, string_t input, int64_t offset, int64_t
116186
116241
  return SubstringSlice(result, input_data, start, end - start);
116187
116242
  }
116188
116243
 
116189
- string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, int64_t offset, int64_t length) {
116244
+ string_t SubstringFun::SubstringUnicode(Vector &result, string_t input, int64_t offset, int64_t length) {
116245
+ auto input_data = input.GetDataUnsafe();
116246
+ auto input_size = input.GetSize();
116247
+
116248
+ if (length == 0) {
116249
+ return SubstringEmptyString(result);
116250
+ }
116251
+ // first figure out which direction we need to scan
116252
+ idx_t start_pos;
116253
+ idx_t end_pos;
116254
+ if (offset < 0) {
116255
+ start_pos = 0;
116256
+ end_pos = DConstants::INVALID_INDEX;
116257
+
116258
+ // negative offset: scan backwards
116259
+ int64_t start, end;
116260
+
116261
+ // we express start and end as unicode codepoints from the back
116262
+ offset--;
116263
+ if (length < 0) {
116264
+ // negative length
116265
+ start = -offset - length;
116266
+ end = -offset;
116267
+ } else {
116268
+ // positive length
116269
+ start = -offset;
116270
+ end = -offset - length;
116271
+ }
116272
+ if (end <= 0) {
116273
+ end_pos = input_size;
116274
+ }
116275
+ int64_t current_character = 0;
116276
+ for (idx_t i = input_size; i > 0; i--) {
116277
+ if (LengthFun::IsCharacter(input_data[i - 1])) {
116278
+ current_character++;
116279
+ if (current_character == start) {
116280
+ start_pos = i;
116281
+ break;
116282
+ } else if (current_character == end) {
116283
+ end_pos = i;
116284
+ }
116285
+ }
116286
+ }
116287
+ if (end_pos == DConstants::INVALID_INDEX) {
116288
+ return SubstringEmptyString(result);
116289
+ }
116290
+ } else {
116291
+ start_pos = DConstants::INVALID_INDEX;
116292
+ end_pos = input_size;
116293
+
116294
+ // positive offset: scan forwards
116295
+ int64_t start, end;
116296
+
116297
+ // we express start and end as unicode codepoints from the front
116298
+ if (length < 0) {
116299
+ // negative length
116300
+ start = MaxValue<int64_t>(0, offset + length - 1);
116301
+ end = offset - 1;
116302
+ } else {
116303
+ // positive length
116304
+ start = MaxValue<int64_t>(0, offset - 1);
116305
+ end = offset + length - 1;
116306
+ }
116307
+
116308
+ int64_t current_character = 0;
116309
+ for (idx_t i = 0; i < input_size; i++) {
116310
+ if (LengthFun::IsCharacter(input_data[i])) {
116311
+ if (current_character == start) {
116312
+ start_pos = i;
116313
+ } else if (current_character == end) {
116314
+ end_pos = i;
116315
+ break;
116316
+ }
116317
+ current_character++;
116318
+ }
116319
+ }
116320
+ if (start_pos == DConstants::INVALID_INDEX || end == 0 || end <= start) {
116321
+ return SubstringEmptyString(result);
116322
+ }
116323
+ }
116324
+ D_ASSERT(end_pos >= start_pos);
116325
+ // after we have found these, we can slice the substring
116326
+ return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
116327
+ }
116328
+
116329
+ string_t SubstringFun::SubstringGrapheme(Vector &result, string_t input, int64_t offset, int64_t length) {
116190
116330
  auto input_data = input.GetDataUnsafe();
116191
116331
  auto input_size = input.GetSize();
116192
116332
 
@@ -116247,6 +116387,19 @@ string_t SubstringFun::SubstringScalarFunction(Vector &result, string_t input, i
116247
116387
  return SubstringSlice(result, input_data, start_pos, end_pos - start_pos);
116248
116388
  }
116249
116389
 
116390
+ struct SubstringUnicodeOp {
116391
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
116392
+ return SubstringFun::SubstringUnicode(result, input, offset, length);
116393
+ }
116394
+ };
116395
+
116396
+ struct SubstringGraphemeOp {
116397
+ static string_t Substring(Vector &result, string_t input, int64_t offset, int64_t length) {
116398
+ return SubstringFun::SubstringGrapheme(result, input, offset, length);
116399
+ }
116400
+ };
116401
+
116402
+ template <class OP>
116250
116403
  static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &result) {
116251
116404
  auto &input_vector = args.data[0];
116252
116405
  auto &offset_vector = args.data[1];
@@ -116256,13 +116409,12 @@ static void SubstringFunction(DataChunk &args, ExpressionState &state, Vector &r
116256
116409
  TernaryExecutor::Execute<string_t, int64_t, int64_t, string_t>(
116257
116410
  input_vector, offset_vector, length_vector, result, args.size(),
116258
116411
  [&](string_t input_string, int64_t offset, int64_t length) {
116259
- return SubstringFun::SubstringScalarFunction(result, input_string, offset, length);
116412
+ return OP::Substring(result, input_string, offset, length);
116260
116413
  });
116261
116414
  } else {
116262
116415
  BinaryExecutor::Execute<string_t, int64_t, string_t>(
116263
116416
  input_vector, offset_vector, result, args.size(), [&](string_t input_string, int64_t offset) {
116264
- return SubstringFun::SubstringScalarFunction(result, input_string, offset,
116265
- NumericLimits<int64_t>::Maximum() - offset);
116417
+ return OP::Substring(result, input_string, offset, NumericLimits<int64_t>::Maximum() - offset);
116266
116418
  });
116267
116419
  }
116268
116420
  }
@@ -116304,13 +116456,23 @@ static unique_ptr<BaseStatistics> SubstringPropagateStats(ClientContext &context
116304
116456
  void SubstringFun::RegisterFunction(BuiltinFunctions &set) {
116305
116457
  ScalarFunctionSet substr("substring");
116306
116458
  substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
116307
- LogicalType::VARCHAR, SubstringFunction, nullptr, nullptr,
116459
+ LogicalType::VARCHAR, SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
116308
116460
  SubstringPropagateStats));
116309
116461
  substr.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
116310
- SubstringFunction, nullptr, nullptr, SubstringPropagateStats));
116462
+ SubstringFunction<SubstringUnicodeOp>, nullptr, nullptr,
116463
+ SubstringPropagateStats));
116311
116464
  set.AddFunction(substr);
116312
116465
  substr.name = "substr";
116313
116466
  set.AddFunction(substr);
116467
+
116468
+ ScalarFunctionSet substr_grapheme("substring_grapheme");
116469
+ substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT, LogicalType::BIGINT},
116470
+ LogicalType::VARCHAR, SubstringFunction<SubstringGraphemeOp>, nullptr,
116471
+ nullptr, SubstringPropagateStats));
116472
+ substr_grapheme.AddFunction(ScalarFunction({LogicalType::VARCHAR, LogicalType::BIGINT}, LogicalType::VARCHAR,
116473
+ SubstringFunction<SubstringGraphemeOp>, nullptr, nullptr,
116474
+ SubstringPropagateStats));
116475
+ set.AddFunction(substr_grapheme);
116314
116476
  }
116315
116477
 
116316
116478
  } // namespace duckdb