duckdb 0.9.1-dev97.0 → 0.9.2-dev2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/.github/workflows/NodeJS.yml +250 -0
  2. package/LICENSE +7 -0
  3. package/Makefile +3 -9
  4. package/README.md +2 -2
  5. package/binding.gyp +8 -8
  6. package/package.json +4 -4
  7. package/scripts/install_node.sh +21 -0
  8. package/scripts/node_build.sh +40 -0
  9. package/scripts/node_build_win.sh +21 -0
  10. package/scripts/node_version.sh +33 -0
  11. package/src/duckdb/extension/icu/icu-makedate.cpp +1 -1
  12. package/src/duckdb/extension/icu/icu-strptime.cpp +0 -2
  13. package/src/duckdb/extension/icu/icu_extension.cpp +0 -1
  14. package/src/duckdb/extension/json/json_functions/json_create.cpp +27 -14
  15. package/src/duckdb/extension/json/json_functions/json_transform.cpp +26 -14
  16. package/src/duckdb/extension/json/json_functions.cpp +1 -10
  17. package/src/duckdb/extension/parquet/column_reader.cpp +26 -1
  18. package/src/duckdb/extension/parquet/column_writer.cpp +10 -1
  19. package/src/duckdb/extension/parquet/include/column_reader.hpp +2 -0
  20. package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +49 -0
  21. package/src/duckdb/extension/parquet/parquet_extension.cpp +3 -4
  22. package/src/duckdb/extension/parquet/parquet_timestamp.cpp +3 -4
  23. package/src/duckdb/src/common/arrow/appender/list_data.cpp +2 -2
  24. package/src/duckdb/src/common/arrow/appender/map_data.cpp +15 -10
  25. package/src/duckdb/src/common/arrow/appender/struct_data.cpp +2 -2
  26. package/src/duckdb/src/common/arrow/appender/union_data.cpp +2 -2
  27. package/src/duckdb/src/common/arrow/arrow_appender.cpp +26 -7
  28. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +3 -3
  29. package/src/duckdb/src/common/exception.cpp +60 -84
  30. package/src/duckdb/src/common/preserved_error.cpp +20 -0
  31. package/src/duckdb/src/common/types/data_chunk.cpp +1 -1
  32. package/src/duckdb/src/execution/expression_executor/execute_reference.cpp +1 -1
  33. package/src/duckdb/src/execution/expression_executor_state.cpp +8 -2
  34. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +1 -1
  35. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +2 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +5 -5
  37. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +4 -4
  38. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +2 -2
  39. package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +1 -4
  40. package/src/duckdb/src/execution/operator/helper/physical_set.cpp +2 -4
  41. package/src/duckdb/src/function/function_binder.cpp +1 -1
  42. package/src/duckdb/src/function/table/arrow_conversion.cpp +2 -1
  43. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  44. package/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +4 -0
  45. package/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +3 -1
  46. package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +2 -1
  47. package/src/duckdb/src/include/duckdb/common/arrow/arrow_wrapper.hpp +3 -0
  48. package/src/duckdb/src/include/duckdb/common/exception.hpp +1 -0
  49. package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +1 -3
  50. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +0 -4
  52. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +10 -10
  53. package/src/duckdb/src/include/duckdb/function/replacement_scan.hpp +20 -0
  54. package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
  55. package/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp +2 -0
  56. package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -0
  57. package/src/duckdb/src/include/duckdb/planner/bound_parameter_map.hpp +3 -0
  58. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +2 -2
  59. package/src/duckdb/src/main/capi/arrow-c.cpp +7 -4
  60. package/src/duckdb/src/main/config.cpp +14 -0
  61. package/src/duckdb/src/main/extension/extension_install.cpp +14 -12
  62. package/src/duckdb/src/optimizer/filter_pushdown.cpp +1 -0
  63. package/src/duckdb/src/optimizer/pushdown/pushdown_distinct.cpp +19 -0
  64. package/src/duckdb/src/parser/transform/statement/transform_copy.cpp +4 -2
  65. package/src/duckdb/src/parser/transform/statement/transform_create_sequence.cpp +10 -5
  66. package/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp +5 -7
  67. package/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp +4 -2
  68. package/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +17 -14
  69. package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +5 -12
  70. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +15 -1
  71. package/src/duckdb/src/planner/bound_parameter_map.cpp +16 -5
  72. package/src/duckdb/src/planner/expression_binder/base_select_binder.cpp +2 -5
  73. package/src/duckdb/src/planner/planner.cpp +1 -1
  74. package/src/duckdb/src/transaction/duck_transaction_manager.cpp +13 -9
  75. package/src/duckdb/third_party/parquet/parquet_types.h +2 -1
  76. package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +5 -5
  77. package/src/duckdb/ub_src_optimizer_pushdown.cpp +2 -0
  78. package/src/statement.cpp +4 -4
  79. package/test/arrow.test.ts +3 -1
  80. package/test/parquet.test.ts +1 -1
  81. package/test/userdata1.parquet +0 -0
  82. package/{configure → vendor} +1 -1
  83. package/{configure.py → vendor.py} +12 -1
  84. package/duckdb_extension_config.cmake +0 -10
@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
243
243
  void ColumnReader::PrepareRead(parquet_filter_t &filter) {
244
244
  dict_decoder.reset();
245
245
  defined_decoder.reset();
246
+ bss_decoder.reset();
246
247
  block.reset();
247
248
  PageHeader page_hdr;
248
249
  page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
443
444
  PrepareDeltaByteArray(*block);
444
445
  break;
445
446
  }
447
+ case Encoding::BYTE_STREAM_SPLIT: {
448
+ // Subtract 1 from length as the block is allocated with 1 extra byte,
449
+ // but the byte stream split decoder needs to know the correct data size.
450
+ bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
451
+ block->inc(block->len);
452
+ break;
453
+ }
446
454
  case Encoding::PLAIN:
447
455
  // nothing to do here, will be read directly below
448
456
  break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
488
496
 
489
497
  idx_t null_count = 0;
490
498
 
491
- if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
499
+ if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
492
500
  // we need the null count because the dictionary offsets have no entries for nulls
493
501
  for (idx_t i = 0; i < read_now; i++) {
494
502
  if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr
534
542
  } else if (byte_array_data) {
535
543
  // DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
536
544
  DeltaByteArray(define_out, read_now, filter, result_offset, result);
545
+ } else if (bss_decoder) {
546
+ auto read_buf = make_shared<ResizeableBuffer>();
547
+
548
+ switch (schema.type) {
549
+ case duckdb_parquet::format::Type::FLOAT:
550
+ read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
551
+ bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
552
+ break;
553
+ case duckdb_parquet::format::Type::DOUBLE:
554
+ read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
555
+ bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
556
+ break;
557
+ default:
558
+ throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
559
+ }
560
+
561
+ Plain(read_buf, define_out, read_now, filter, result_offset, result);
537
562
  } else {
538
563
  PlainReference(block, result);
539
564
  Plain(block, define_out, read_now, filter, result_offset, result);
@@ -796,6 +796,13 @@ struct ParquetTimestampSOperator : public BaseParquetOperator {
796
796
  }
797
797
  };
798
798
 
799
+ struct ParquetTimeTZOperator : public BaseParquetOperator {
800
+ template <class SRC, class TGT>
801
+ static TGT Operation(SRC input) {
802
+ return input.time().micros;
803
+ }
804
+ };
805
+
799
806
  struct ParquetHugeintOperator {
800
807
  template <class SRC, class TGT>
801
808
  static TGT Operation(SRC input) {
@@ -1975,12 +1982,14 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
1975
1982
  max_define, can_have_nulls);
1976
1983
  case LogicalTypeId::BIGINT:
1977
1984
  case LogicalTypeId::TIME:
1978
- case LogicalTypeId::TIME_TZ:
1979
1985
  case LogicalTypeId::TIMESTAMP:
1980
1986
  case LogicalTypeId::TIMESTAMP_TZ:
1981
1987
  case LogicalTypeId::TIMESTAMP_MS:
1982
1988
  return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema_idx, std::move(schema_path), max_repeat,
1983
1989
  max_define, can_have_nulls);
1990
+ case LogicalTypeId::TIME_TZ:
1991
+ return make_uniq<StandardColumnWriter<dtime_tz_t, int64_t, ParquetTimeTZOperator>>(
1992
+ writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
1984
1993
  case LogicalTypeId::HUGEINT:
1985
1994
  return make_uniq<StandardColumnWriter<hugeint_t, double, ParquetHugeintOperator>>(
1986
1995
  writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
@@ -9,6 +9,7 @@
9
9
  #pragma once
10
10
 
11
11
  #include "duckdb.hpp"
12
+ #include "parquet_bss_decoder.hpp"
12
13
  #include "parquet_dbp_decoder.hpp"
13
14
  #include "parquet_rle_bp_decoder.hpp"
14
15
  #include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ private:
161
162
  unique_ptr<RleBpDecoder> repeated_decoder;
162
163
  unique_ptr<DbpDecoder> dbp_decoder;
163
164
  unique_ptr<RleBpDecoder> rle_decoder;
165
+ unique_ptr<BssDecoder> bss_decoder;
164
166
 
165
167
  // dummies for Skip()
166
168
  parquet_filter_t none_filter;
@@ -0,0 +1,49 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // parquet_bss_decoder.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+ #include "parquet_types.h"
11
+ #include "resizable_buffer.hpp"
12
+
13
+ namespace duckdb {
14
+
15
+ /// Decoder for the Byte Stream Split encoding
16
+ class BssDecoder {
17
+ public:
18
+ /// Create a decoder object. buffer/buffer_len is the encoded data.
19
+ BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
20
+ }
21
+
22
+ public:
23
+ template <typename T>
24
+ void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
25
+ if (buffer_.len % sizeof(T) != 0) {
26
+ std::stringstream error;
27
+ error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
28
+ << ") should be a multiple of the type size (" << sizeof(T) << ")";
29
+ throw std::runtime_error(error.str());
30
+ }
31
+ uint32_t num_buffer_values = buffer_.len / sizeof(T);
32
+
33
+ buffer_.available((value_offset_ + batch_size) * sizeof(T));
34
+
35
+ for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
36
+ data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
37
+ for (uint32_t i = 0; i < batch_size; ++i) {
38
+ values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
39
+ }
40
+ }
41
+ value_offset_ += batch_size;
42
+ }
43
+
44
+ private:
45
+ ByteBuffer buffer_;
46
+ uint32_t value_offset_;
47
+ };
48
+
49
+ } // namespace duckdb
@@ -20,6 +20,8 @@
20
20
  #include "duckdb/common/enums/file_compression_type.hpp"
21
21
  #include "duckdb/common/file_system.hpp"
22
22
  #include "duckdb/common/multi_file_reader.hpp"
23
+ #include "duckdb/common/serializer/deserializer.hpp"
24
+ #include "duckdb/common/serializer/serializer.hpp"
23
25
  #include "duckdb/common/types/chunk_collection.hpp"
24
26
  #include "duckdb/function/copy_function.hpp"
25
27
  #include "duckdb/function/table_function.hpp"
@@ -34,8 +36,6 @@
34
36
  #include "duckdb/planner/operator/logical_get.hpp"
35
37
  #include "duckdb/storage/statistics/base_statistics.hpp"
36
38
  #include "duckdb/storage/table/row_group.hpp"
37
- #include "duckdb/common/serializer/serializer.hpp"
38
- #include "duckdb/common/serializer/deserializer.hpp"
39
39
  #endif
40
40
 
41
41
  namespace duckdb {
@@ -983,8 +983,7 @@ idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_da
983
983
  //===--------------------------------------------------------------------===//
984
984
  unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
985
985
  ReplacementScanData *data) {
986
- auto lower_name = StringUtil::Lower(table_name);
987
- if (!StringUtil::EndsWith(lower_name, ".parquet") && !StringUtil::Contains(lower_name, ".parquet?")) {
986
+ if (!ReplacementScan::CanReplace(table_name, {"parquet"})) {
988
987
  return nullptr;
989
988
  }
990
989
  auto table_function = make_uniq<TableFunctionRef>();
@@ -66,10 +66,9 @@ dtime_t ParquetIntToTimeNs(const int64_t &raw_time) {
66
66
  return Time::FromTimeNs(raw_time);
67
67
  }
68
68
 
69
- dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_time) {
70
- dtime_tz_t result;
71
- result.bits = raw_time;
72
- return result;
69
+ dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros) {
70
+ dtime_t t(raw_micros);
71
+ return dtime_tz_t(t, 0);
73
72
  }
74
73
 
75
74
  } // namespace duckdb
@@ -69,10 +69,10 @@ void ArrowListData::Finalize(ArrowAppendData &append_data, const LogicalType &ty
69
69
  result->buffers[1] = append_data.main_buffer.data();
70
70
 
71
71
  auto &child_type = ListType::GetChildType(type);
72
- append_data.child_pointers.resize(1);
72
+ ArrowAppender::AddChildren(append_data, 1);
73
73
  result->children = append_data.child_pointers.data();
74
74
  result->n_children = 1;
75
- append_data.child_pointers[0] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[0]);
75
+ append_data.child_arrays[0] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[0]));
76
76
  }
77
77
 
78
78
  } // namespace duckdb
@@ -52,33 +52,38 @@ void ArrowMapData::Append(ArrowAppendData &append_data, Vector &input, idx_t fro
52
52
 
53
53
  void ArrowMapData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) {
54
54
  // set up the main map buffer
55
+ D_ASSERT(result);
55
56
  result->n_buffers = 2;
56
57
  result->buffers[1] = append_data.main_buffer.data();
57
58
 
58
59
  // the main map buffer has a single child: a struct
59
- append_data.child_pointers.resize(1);
60
+ ArrowAppender::AddChildren(append_data, 1);
60
61
  result->children = append_data.child_pointers.data();
61
62
  result->n_children = 1;
62
- append_data.child_pointers[0] = ArrowAppender::FinalizeChild(type, *append_data.child_data[0]);
63
63
 
64
- // now that struct has two children: the key and the value type
65
64
  auto &struct_data = *append_data.child_data[0];
66
- auto &struct_result = append_data.child_pointers[0];
67
- struct_data.child_pointers.resize(2);
65
+ auto struct_result = ArrowAppender::FinalizeChild(type, std::move(append_data.child_data[0]));
66
+
67
+ // Initialize the struct array data
68
+ const auto struct_child_count = 2;
69
+ ArrowAppender::AddChildren(struct_data, struct_child_count);
70
+ struct_result->children = struct_data.child_pointers.data();
68
71
  struct_result->n_buffers = 1;
69
- struct_result->n_children = 2;
72
+ struct_result->n_children = struct_child_count;
70
73
  struct_result->length = struct_data.child_data[0]->row_count;
71
- struct_result->children = struct_data.child_pointers.data();
74
+
75
+ append_data.child_arrays[0] = *struct_result;
72
76
 
73
77
  D_ASSERT(struct_data.child_data[0]->row_count == struct_data.child_data[1]->row_count);
74
78
 
75
79
  auto &key_type = MapType::KeyType(type);
76
80
  auto &value_type = MapType::ValueType(type);
77
- struct_data.child_pointers[0] = ArrowAppender::FinalizeChild(key_type, *struct_data.child_data[0]);
78
- struct_data.child_pointers[1] = ArrowAppender::FinalizeChild(value_type, *struct_data.child_data[1]);
81
+ auto key_data = ArrowAppender::FinalizeChild(key_type, std::move(struct_data.child_data[0]));
82
+ struct_data.child_arrays[0] = *key_data;
83
+ struct_data.child_arrays[1] = *ArrowAppender::FinalizeChild(value_type, std::move(struct_data.child_data[1]));
79
84
 
80
85
  // keys cannot have null values
81
- if (struct_data.child_pointers[0]->null_count > 0) {
86
+ if (key_data->null_count > 0) {
82
87
  throw std::runtime_error("Arrow doesn't accept NULL keys on Maps");
83
88
  }
84
89
  }
@@ -33,12 +33,12 @@ void ArrowStructData::Finalize(ArrowAppendData &append_data, const LogicalType &
33
33
  result->n_buffers = 1;
34
34
 
35
35
  auto &child_types = StructType::GetChildTypes(type);
36
- append_data.child_pointers.resize(child_types.size());
36
+ ArrowAppender::AddChildren(append_data, child_types.size());
37
37
  result->children = append_data.child_pointers.data();
38
38
  result->n_children = child_types.size();
39
39
  for (idx_t i = 0; i < child_types.size(); i++) {
40
40
  auto &child_type = child_types[i].second;
41
- append_data.child_pointers[i] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[i]);
41
+ append_data.child_arrays[i] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[i]));
42
42
  }
43
43
  }
44
44
 
@@ -58,12 +58,12 @@ void ArrowUnionData::Finalize(ArrowAppendData &append_data, const LogicalType &t
58
58
  result->buffers[1] = append_data.main_buffer.data();
59
59
 
60
60
  auto &child_types = UnionType::CopyMemberTypes(type);
61
- append_data.child_pointers.resize(child_types.size());
61
+ ArrowAppender::AddChildren(append_data, child_types.size());
62
62
  result->children = append_data.child_pointers.data();
63
63
  result->n_children = child_types.size();
64
64
  for (idx_t i = 0; i < child_types.size(); i++) {
65
65
  auto &child_type = child_types[i].second;
66
- append_data.child_pointers[i] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[i]);
66
+ append_data.child_arrays[i] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[i]));
67
67
  }
68
68
  }
69
69
 
@@ -39,18 +39,31 @@ void ArrowAppender::ReleaseArray(ArrowArray *array) {
39
39
  if (!array || !array->release) {
40
40
  return;
41
41
  }
42
- array->release = nullptr;
43
42
  auto holder = static_cast<ArrowAppendData *>(array->private_data);
43
+ for (int64_t i = 0; i < array->n_children; i++) {
44
+ auto child = array->children[i];
45
+ if (!child->release) {
46
+ // Child was moved out of the array
47
+ continue;
48
+ }
49
+ child->release(child);
50
+ D_ASSERT(!child->release);
51
+ }
52
+ if (array->dictionary && array->dictionary->release) {
53
+ array->dictionary->release(array->dictionary);
54
+ }
55
+ array->release = nullptr;
44
56
  delete holder;
45
57
  }
46
58
 
47
59
  //===--------------------------------------------------------------------===//
48
60
  // Finalize Arrow Child
49
61
  //===--------------------------------------------------------------------===//
50
- ArrowArray *ArrowAppender::FinalizeChild(const LogicalType &type, ArrowAppendData &append_data) {
62
+ ArrowArray *ArrowAppender::FinalizeChild(const LogicalType &type, unique_ptr<ArrowAppendData> append_data_p) {
51
63
  auto result = make_uniq<ArrowArray>();
52
64
 
53
- result->private_data = nullptr;
65
+ auto &append_data = *append_data_p;
66
+ result->private_data = append_data_p.release();
54
67
  result->release = ArrowAppender::ReleaseArray;
55
68
  result->n_children = 0;
56
69
  result->null_count = 0;
@@ -75,7 +88,7 @@ ArrowArray ArrowAppender::Finalize() {
75
88
  auto root_holder = make_uniq<ArrowAppendData>(options);
76
89
 
77
90
  ArrowArray result;
78
- root_holder->child_pointers.resize(types.size());
91
+ AddChildren(*root_holder, types.size());
79
92
  result.children = root_holder->child_pointers.data();
80
93
  result.n_children = types.size();
81
94
 
@@ -88,10 +101,8 @@ ArrowArray ArrowAppender::Finalize() {
88
101
  result.dictionary = nullptr;
89
102
  root_holder->child_data = std::move(root_data);
90
103
 
91
- // FIXME: this violates a property of the arrow format, if root owns all the child memory then consumers can't move
92
- // child arrays https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays
93
104
  for (idx_t i = 0; i < root_holder->child_data.size(); i++) {
94
- root_holder->child_pointers[i] = ArrowAppender::FinalizeChild(types[i], *root_holder->child_data[i]);
105
+ root_holder->child_arrays[i] = *ArrowAppender::FinalizeChild(types[i], std::move(root_holder->child_data[i]));
95
106
  }
96
107
 
97
108
  // Release ownership to caller
@@ -238,4 +249,12 @@ unique_ptr<ArrowAppendData> ArrowAppender::InitializeChild(const LogicalType &ty
238
249
  return result;
239
250
  }
240
251
 
252
+ void ArrowAppender::AddChildren(ArrowAppendData &data, idx_t count) {
253
+ data.child_pointers.resize(count);
254
+ data.child_arrays.resize(count);
255
+ for (idx_t i = 0; i < count; i++) {
256
+ data.child_pointers[i] = &data.child_arrays[i];
257
+ }
258
+ }
259
+
241
260
  } // namespace duckdb
@@ -16,21 +16,21 @@ namespace duckdb {
16
16
  ArrowSchemaWrapper::~ArrowSchemaWrapper() {
17
17
  if (arrow_schema.release) {
18
18
  arrow_schema.release(&arrow_schema);
19
- arrow_schema.release = nullptr;
19
+ D_ASSERT(!arrow_schema.release);
20
20
  }
21
21
  }
22
22
 
23
23
  ArrowArrayWrapper::~ArrowArrayWrapper() {
24
24
  if (arrow_array.release) {
25
25
  arrow_array.release(&arrow_array);
26
- arrow_array.release = nullptr;
26
+ D_ASSERT(!arrow_array.release);
27
27
  }
28
28
  }
29
29
 
30
30
  ArrowArrayStreamWrapper::~ArrowArrayStreamWrapper() {
31
31
  if (arrow_array_stream.release) {
32
32
  arrow_array_stream.release(&arrow_array_stream);
33
- arrow_array_stream.release = nullptr;
33
+ D_ASSERT(!arrow_array_stream.release);
34
34
  }
35
35
  }
36
36
 
@@ -1,5 +1,4 @@
1
1
  #include "duckdb/common/exception.hpp"
2
-
3
2
  #include "duckdb/common/string_util.hpp"
4
3
  #include "duckdb/common/to_string.hpp"
5
4
  #include "duckdb/common/types.hpp"
@@ -82,91 +81,68 @@ string Exception::ConstructMessageRecursive(const string &msg, std::vector<Excep
82
81
  return ExceptionFormatValue::Format(msg, values);
83
82
  }
84
83
 
84
+ struct ExceptionEntry {
85
+ ExceptionType type;
86
+ char text[48];
87
+ };
88
+
89
+ static constexpr ExceptionEntry EXCEPTION_MAP[] = {{ExceptionType::INVALID, "Invalid"},
90
+ {ExceptionType::OUT_OF_RANGE, "Out of Range"},
91
+ {ExceptionType::CONVERSION, "Conversion"},
92
+ {ExceptionType::UNKNOWN_TYPE, "Unknown Type"},
93
+ {ExceptionType::DECIMAL, "Decimal"},
94
+ {ExceptionType::MISMATCH_TYPE, "Mismatch Type"},
95
+ {ExceptionType::DIVIDE_BY_ZERO, "Divide by Zero"},
96
+ {ExceptionType::OBJECT_SIZE, "Object Size"},
97
+ {ExceptionType::INVALID_TYPE, "Invalid type"},
98
+ {ExceptionType::SERIALIZATION, "Serialization"},
99
+ {ExceptionType::TRANSACTION, "TransactionContext"},
100
+ {ExceptionType::NOT_IMPLEMENTED, "Not implemented"},
101
+ {ExceptionType::EXPRESSION, "Expression"},
102
+ {ExceptionType::CATALOG, "Catalog"},
103
+ {ExceptionType::PARSER, "Parser"},
104
+ {ExceptionType::BINDER, "Binder"},
105
+ {ExceptionType::PLANNER, "Planner"},
106
+ {ExceptionType::SCHEDULER, "Scheduler"},
107
+ {ExceptionType::EXECUTOR, "Executor"},
108
+ {ExceptionType::CONSTRAINT, "Constraint"},
109
+ {ExceptionType::INDEX, "Index"},
110
+ {ExceptionType::STAT, "Stat"},
111
+ {ExceptionType::CONNECTION, "Connection"},
112
+ {ExceptionType::SYNTAX, "Syntax"},
113
+ {ExceptionType::SETTINGS, "Settings"},
114
+ {ExceptionType::OPTIMIZER, "Optimizer"},
115
+ {ExceptionType::NULL_POINTER, "NullPointer"},
116
+ {ExceptionType::IO, "IO"},
117
+ {ExceptionType::INTERRUPT, "INTERRUPT"},
118
+ {ExceptionType::FATAL, "FATAL"},
119
+ {ExceptionType::INTERNAL, "INTERNAL"},
120
+ {ExceptionType::INVALID_INPUT, "Invalid Input"},
121
+ {ExceptionType::OUT_OF_MEMORY, "Out of Memory"},
122
+ {ExceptionType::PERMISSION, "Permission"},
123
+ {ExceptionType::PARAMETER_NOT_RESOLVED, "Parameter Not Resolved"},
124
+ {ExceptionType::PARAMETER_NOT_ALLOWED, "Parameter Not Allowed"},
125
+ {ExceptionType::DEPENDENCY, "Dependency"},
126
+ {ExceptionType::MISSING_EXTENSION, "Missing Extension"},
127
+ {ExceptionType::HTTP, "HTTP"},
128
+ {ExceptionType::AUTOLOAD, "Extension Autoloading"}};
129
+
85
130
  string Exception::ExceptionTypeToString(ExceptionType type) {
86
- switch (type) {
87
- case ExceptionType::INVALID:
88
- return "Invalid";
89
- case ExceptionType::OUT_OF_RANGE:
90
- return "Out of Range";
91
- case ExceptionType::CONVERSION:
92
- return "Conversion";
93
- case ExceptionType::UNKNOWN_TYPE:
94
- return "Unknown Type";
95
- case ExceptionType::DECIMAL:
96
- return "Decimal";
97
- case ExceptionType::MISMATCH_TYPE:
98
- return "Mismatch Type";
99
- case ExceptionType::DIVIDE_BY_ZERO:
100
- return "Divide by Zero";
101
- case ExceptionType::OBJECT_SIZE:
102
- return "Object Size";
103
- case ExceptionType::INVALID_TYPE:
104
- return "Invalid type";
105
- case ExceptionType::SERIALIZATION:
106
- return "Serialization";
107
- case ExceptionType::TRANSACTION:
108
- return "TransactionContext";
109
- case ExceptionType::NOT_IMPLEMENTED:
110
- return "Not implemented";
111
- case ExceptionType::EXPRESSION:
112
- return "Expression";
113
- case ExceptionType::CATALOG:
114
- return "Catalog";
115
- case ExceptionType::PARSER:
116
- return "Parser";
117
- case ExceptionType::BINDER:
118
- return "Binder";
119
- case ExceptionType::PLANNER:
120
- return "Planner";
121
- case ExceptionType::SCHEDULER:
122
- return "Scheduler";
123
- case ExceptionType::EXECUTOR:
124
- return "Executor";
125
- case ExceptionType::CONSTRAINT:
126
- return "Constraint";
127
- case ExceptionType::INDEX:
128
- return "Index";
129
- case ExceptionType::STAT:
130
- return "Stat";
131
- case ExceptionType::CONNECTION:
132
- return "Connection";
133
- case ExceptionType::SYNTAX:
134
- return "Syntax";
135
- case ExceptionType::SETTINGS:
136
- return "Settings";
137
- case ExceptionType::OPTIMIZER:
138
- return "Optimizer";
139
- case ExceptionType::NULL_POINTER:
140
- return "NullPointer";
141
- case ExceptionType::IO:
142
- return "IO";
143
- case ExceptionType::INTERRUPT:
144
- return "INTERRUPT";
145
- case ExceptionType::FATAL:
146
- return "FATAL";
147
- case ExceptionType::INTERNAL:
148
- return "INTERNAL";
149
- case ExceptionType::INVALID_INPUT:
150
- return "Invalid Input";
151
- case ExceptionType::OUT_OF_MEMORY:
152
- return "Out of Memory";
153
- case ExceptionType::PERMISSION:
154
- return "Permission";
155
- case ExceptionType::PARAMETER_NOT_RESOLVED:
156
- return "Parameter Not Resolved";
157
- case ExceptionType::PARAMETER_NOT_ALLOWED:
158
- return "Parameter Not Allowed";
159
- case ExceptionType::DEPENDENCY:
160
- return "Dependency";
161
- case ExceptionType::MISSING_EXTENSION:
162
- return "Missing Extension";
163
- case ExceptionType::HTTP:
164
- return "HTTP";
165
- case ExceptionType::AUTOLOAD:
166
- return "Extension Autoloading";
167
- default:
168
- return "Unknown";
131
+ for (auto &e : EXCEPTION_MAP) {
132
+ if (e.type == type) {
133
+ return e.text;
134
+ }
135
+ }
136
+ return "Unknown";
137
+ }
138
+
139
+ ExceptionType Exception::StringToExceptionType(const string &type) {
140
+ for (auto &e : EXCEPTION_MAP) {
141
+ if (e.text == type) {
142
+ return e.type;
143
+ }
169
144
  }
145
+ return ExceptionType::INVALID;
170
146
  }
171
147
 
172
148
  const HTTPException &Exception::AsHTTPException() const {
@@ -18,6 +18,26 @@ PreservedError::PreservedError(const Exception &exception)
18
18
  PreservedError::PreservedError(const string &message)
19
19
  : initialized(true), type(ExceptionType::INVALID), raw_message(SanitizeErrorMessage(message)),
20
20
  exception_instance(nullptr) {
21
+ // Given a message in the form: xxxxx Error: yyyyy
22
+ // Try to match xxxxx against a known error type so that the original error type can potentially be reconstructed
23
+ auto position_semicolon = raw_message.find(':');
24
+ if (position_semicolon == std::string::npos) {
25
+ // Colon not found, bail out
26
+ return;
27
+ }
28
+ if (position_semicolon + 2 >= raw_message.size()) {
29
+ // Not enough characters afterward, bail out
30
+ return;
31
+ }
32
+ string err = raw_message.substr(0, position_semicolon);
33
+ string msg = raw_message.substr(position_semicolon + 2);
34
+ if (err.size() > 6 && err.substr(err.size() - 6) == " Error" && !msg.empty()) {
35
+ ExceptionType new_type = Exception::StringToExceptionType(err.substr(0, err.size() - 6));
36
+ if (new_type != type) {
37
+ type = new_type;
38
+ raw_message = msg;
39
+ }
40
+ }
21
41
  }
22
42
 
23
43
  const string &PreservedError::Message() {
@@ -64,7 +64,7 @@ void DataChunk::InitializeEmpty(vector<LogicalType>::const_iterator begin, vecto
64
64
  }
65
65
 
66
66
  void DataChunk::Reset() {
67
- if (data.empty()) {
67
+ if (data.empty() || vector_caches.empty()) {
68
68
  return;
69
69
  }
70
70
  if (vector_caches.size() != data.size()) {
@@ -6,7 +6,7 @@ namespace duckdb {
6
6
  unique_ptr<ExpressionState> ExpressionExecutor::InitializeState(const BoundReferenceExpression &expr,
7
7
  ExpressionExecutorState &root) {
8
8
  auto result = make_uniq<ExpressionState>(expr, root);
9
- result->Finalize();
9
+ result->Finalize(true);
10
10
  return result;
11
11
  }
12
12
 
@@ -1,4 +1,5 @@
1
1
  #include "duckdb/execution/expression_executor_state.hpp"
2
+
2
3
  #include "duckdb/execution/expression_executor.hpp"
3
4
  #include "duckdb/planner/expression.hpp"
4
5
  #include "duckdb/planner/expression/bound_function_expression.hpp"
@@ -10,8 +11,13 @@ void ExpressionState::AddChild(Expression *expr) {
10
11
  child_states.push_back(ExpressionExecutor::InitializeState(*expr, root));
11
12
  }
12
13
 
13
- void ExpressionState::Finalize() {
14
- if (!types.empty()) {
14
+ void ExpressionState::Finalize(bool empty) {
15
+ if (types.empty()) {
16
+ return;
17
+ }
18
+ if (empty) {
19
+ intermediate_chunk.InitializeEmpty(types);
20
+ } else {
15
21
  intermediate_chunk.Initialize(GetAllocator(), types);
16
22
  }
17
23
  }
@@ -192,6 +192,7 @@ void BufferedCSVReader::ParseCSV(ParserMode mode) {
192
192
  }
193
193
 
194
194
  bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) {
195
+ cached_buffers.clear();
195
196
  mode = parser_mode;
196
197
  // used for parsing algorithm
197
198
  bool finished_chunk = false;
@@ -427,7 +428,6 @@ final_state:
427
428
  Flush(insert_chunk);
428
429
  }
429
430
 
430
- end_of_file_reached = true;
431
431
  return true;
432
432
  }
433
433
 
@@ -70,6 +70,8 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
70
70
  // 8) Empty Line State
71
71
  transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\r')] = CSVState::EMPTY_LINE;
72
72
  transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>('\n')] = CSVState::EMPTY_LINE;
73
+ transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.delimiter)] = CSVState::DELIMITER;
74
+ transition_array[CSVState::EMPTY_LINE][static_cast<uint8_t>(state_machine_options.quote)] = CSVState::QUOTED;
73
75
  }
74
76
 
75
77
  CSVStateMachineCache::CSVStateMachineCache() {