duckdb 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/binding.gyp +2 -1
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp +1 -1
  4. package/src/duckdb/extension/json/include/json_common.hpp +14 -4
  5. package/src/duckdb/extension/json/include/json_executors.hpp +11 -3
  6. package/src/duckdb/extension/json/json_extension.cpp +1 -1
  7. package/src/duckdb/extension/json/json_functions/json_extract.cpp +11 -3
  8. package/src/duckdb/extension/json/json_functions/json_value.cpp +4 -3
  9. package/src/duckdb/extension/json/json_functions.cpp +16 -7
  10. package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
  11. package/src/duckdb/extension/parquet/column_writer.cpp +54 -43
  12. package/src/duckdb/extension/parquet/geo_parquet.cpp +19 -0
  13. package/src/duckdb/extension/parquet/include/geo_parquet.hpp +10 -6
  14. package/src/duckdb/extension/parquet/include/templated_column_reader.hpp +3 -3
  15. package/src/duckdb/extension/parquet/parquet_writer.cpp +2 -1
  16. package/src/duckdb/src/common/arrow/arrow_converter.cpp +1 -1
  17. package/src/duckdb/src/common/arrow/arrow_merge_event.cpp +1 -0
  18. package/src/duckdb/src/common/arrow/arrow_util.cpp +60 -0
  19. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +1 -53
  20. package/src/duckdb/src/common/cgroups.cpp +15 -24
  21. package/src/duckdb/src/common/constants.cpp +8 -0
  22. package/src/duckdb/src/common/enum_util.cpp +331 -326
  23. package/src/duckdb/src/common/http_util.cpp +5 -1
  24. package/src/duckdb/src/common/operator/cast_operators.cpp +6 -60
  25. package/src/duckdb/src/common/types/bit.cpp +1 -1
  26. package/src/duckdb/src/common/types/column/column_data_allocator.cpp +18 -1
  27. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +2 -1
  28. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +5 -0
  29. package/src/duckdb/src/core_functions/aggregate/distributive/arg_min_max.cpp +1 -1
  30. package/src/duckdb/src/core_functions/aggregate/distributive/minmax.cpp +2 -1
  31. package/src/duckdb/src/execution/index/art/iterator.cpp +17 -15
  32. package/src/duckdb/src/execution/index/art/prefix.cpp +9 -34
  33. package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +4 -3
  34. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +1 -0
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +2 -1
  36. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +2 -2
  37. package/src/duckdb/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +23 -1
  38. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +33 -4
  39. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +23 -13
  40. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +23 -19
  41. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +12 -11
  42. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +20 -14
  43. package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +4 -4
  44. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +3 -1
  45. package/src/duckdb/src/execution/operator/join/physical_piecewise_merge_join.cpp +6 -1
  46. package/src/duckdb/src/function/cast/decimal_cast.cpp +33 -3
  47. package/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp +9 -0
  48. package/src/duckdb/src/function/table/arrow.cpp +34 -22
  49. package/src/duckdb/src/function/table/sniff_csv.cpp +4 -1
  50. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  51. package/src/duckdb/src/include/duckdb/common/arrow/arrow_util.hpp +31 -0
  52. package/src/duckdb/src/include/duckdb/common/arrow/arrow_wrapper.hpp +2 -16
  53. package/src/duckdb/src/include/duckdb/common/operator/cast_operators.hpp +60 -0
  54. package/src/duckdb/src/include/duckdb/common/types/column/column_data_allocator.hpp +1 -0
  55. package/src/duckdb/src/include/duckdb/common/types/hugeint.hpp +0 -1
  56. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection.hpp +2 -1
  57. package/src/duckdb/src/include/duckdb/core_functions/aggregate/minmax_n_helpers.hpp +9 -5
  58. package/src/duckdb/src/include/duckdb/execution/executor.hpp +1 -0
  59. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +5 -2
  60. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/column_count_scanner.hpp +5 -1
  61. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp +5 -5
  62. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_result_collector.hpp +1 -0
  63. package/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp +11 -0
  64. package/src/duckdb/src/include/duckdb/main/config.hpp +2 -2
  65. package/src/duckdb/src/include/duckdb/main/extension.hpp +1 -0
  66. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +14 -5
  67. package/src/duckdb/src/include/duckdb/main/extension_helper.hpp +1 -1
  68. package/src/duckdb/src/include/duckdb/main/settings.hpp +4 -2
  69. package/src/duckdb/src/include/duckdb/parser/keyword_helper.hpp +3 -0
  70. package/src/duckdb/src/include/duckdb/parser/parser.hpp +1 -1
  71. package/src/duckdb/src/include/duckdb/parser/simplified_token.hpp +7 -1
  72. package/src/duckdb/src/include/duckdb/planner/binder.hpp +2 -0
  73. package/src/duckdb/src/include/duckdb/planner/expression_binder/select_binder.hpp +2 -0
  74. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +3 -1
  75. package/src/duckdb/src/include/duckdb/storage/block_manager.hpp +3 -1
  76. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +7 -4
  77. package/src/duckdb/src/include/duckdb/storage/buffer/buffer_handle.hpp +2 -2
  78. package/src/duckdb/src/include/duckdb/storage/buffer/buffer_pool.hpp +2 -1
  79. package/src/duckdb/src/include/duckdb/storage/buffer_manager.hpp +4 -4
  80. package/src/duckdb/src/include/duckdb/storage/standard_buffer_manager.hpp +3 -4
  81. package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +1 -1
  82. package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +4 -2
  83. package/src/duckdb/src/include/duckdb/storage/table/standard_column_data.hpp +1 -1
  84. package/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +1 -0
  85. package/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +1 -0
  86. package/src/duckdb/src/include/duckdb/transaction/transaction_manager.hpp +1 -1
  87. package/src/duckdb/src/include/duckdb.h +8 -8
  88. package/src/duckdb/src/main/appender.cpp +1 -1
  89. package/src/duckdb/src/main/capi/duckdb_value-c.cpp +3 -3
  90. package/src/duckdb/src/main/capi/helper-c.cpp +4 -0
  91. package/src/duckdb/src/main/config.cpp +24 -11
  92. package/src/duckdb/src/main/database.cpp +6 -5
  93. package/src/duckdb/src/main/extension/extension_install.cpp +13 -8
  94. package/src/duckdb/src/main/extension/extension_load.cpp +10 -4
  95. package/src/duckdb/src/main/extension.cpp +1 -1
  96. package/src/duckdb/src/optimizer/filter_pushdown.cpp +10 -1
  97. package/src/duckdb/src/optimizer/join_filter_pushdown_optimizer.cpp +9 -5
  98. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +14 -8
  99. package/src/duckdb/src/optimizer/join_order/query_graph_manager.cpp +2 -0
  100. package/src/duckdb/src/optimizer/join_order/relation_manager.cpp +15 -0
  101. package/src/duckdb/src/optimizer/optimizer.cpp +4 -1
  102. package/src/duckdb/src/optimizer/pushdown/pushdown_cross_product.cpp +1 -11
  103. package/src/duckdb/src/optimizer/pushdown/pushdown_inner_join.cpp +1 -7
  104. package/src/duckdb/src/optimizer/pushdown/pushdown_left_join.cpp +1 -1
  105. package/src/duckdb/src/optimizer/statistics/expression/propagate_cast.cpp +3 -0
  106. package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +1 -0
  107. package/src/duckdb/src/parser/keyword_helper.cpp +4 -0
  108. package/src/duckdb/src/parser/parser.cpp +20 -18
  109. package/src/duckdb/src/parser/transform/statement/transform_select_node.cpp +8 -3
  110. package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +3 -0
  111. package/src/duckdb/src/planner/binder/expression/bind_lambda.cpp +7 -1
  112. package/src/duckdb/src/planner/binder/expression/bind_unnest_expression.cpp +13 -0
  113. package/src/duckdb/src/planner/binder/statement/bind_copy_database.cpp +7 -11
  114. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +27 -10
  115. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +24 -9
  116. package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +1 -3
  117. package/src/duckdb/src/planner/binder.cpp +5 -6
  118. package/src/duckdb/src/planner/expression/bound_cast_expression.cpp +1 -0
  119. package/src/duckdb/src/planner/expression_binder/select_binder.cpp +9 -0
  120. package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +2 -2
  121. package/src/duckdb/src/planner/operator/logical_positional_join.cpp +1 -0
  122. package/src/duckdb/src/storage/buffer/block_handle.cpp +18 -21
  123. package/src/duckdb/src/storage/buffer/block_manager.cpp +12 -4
  124. package/src/duckdb/src/storage/buffer/buffer_handle.cpp +2 -2
  125. package/src/duckdb/src/storage/buffer/buffer_pool.cpp +12 -2
  126. package/src/duckdb/src/storage/buffer_manager.cpp +3 -2
  127. package/src/duckdb/src/storage/compression/rle.cpp +5 -2
  128. package/src/duckdb/src/storage/compression/string_uncompressed.cpp +2 -1
  129. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +8 -7
  130. package/src/duckdb/src/storage/standard_buffer_manager.cpp +19 -20
  131. package/src/duckdb/src/storage/statistics/column_statistics.cpp +1 -2
  132. package/src/duckdb/src/storage/table/column_data.cpp +5 -2
  133. package/src/duckdb/src/storage/table/column_segment.cpp +2 -2
  134. package/src/duckdb/src/storage/table/row_group_collection.cpp +18 -14
  135. package/src/duckdb/src/storage/table/standard_column_data.cpp +3 -3
  136. package/src/duckdb/src/storage/wal_replay.cpp +2 -3
  137. package/src/duckdb/third_party/libpg_query/include/common/keywords.hpp +1 -0
  138. package/src/duckdb/third_party/libpg_query/include/nodes/parsenodes.hpp +1 -0
  139. package/src/duckdb/third_party/libpg_query/include/parser/parser.hpp +1 -2
  140. package/src/duckdb/third_party/libpg_query/include/pg_simplified_token.hpp +6 -4
  141. package/src/duckdb/third_party/libpg_query/include/postgres_parser.hpp +1 -1
  142. package/src/duckdb/third_party/libpg_query/postgres_parser.cpp +1 -1
  143. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +801 -799
  144. package/src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp +6 -2
  145. package/src/duckdb/third_party/libpg_query/src_common_keywords.cpp +0 -1
  146. package/src/duckdb/ub_src_common_arrow.cpp +2 -0
  147. package/vendor.py +1 -2
@@ -241,12 +241,16 @@ public:
241
241
  };
242
242
 
243
243
  //! Get JSON value using JSON path query (safe, checks the path query)
244
- static inline yyjson_val *Get(yyjson_val *val, const string_t &path_str) {
244
+ static inline yyjson_val *Get(yyjson_val *val, const string_t &path_str, bool integral_argument) {
245
245
  auto ptr = path_str.GetData();
246
246
  auto len = path_str.GetSize();
247
247
  if (len == 0) {
248
248
  return GetUnsafe(val, ptr, len);
249
249
  }
250
+ if (integral_argument) {
251
+ auto str = "$[" + path_str.GetString() + "]";
252
+ return GetUnsafe(val, str.c_str(), str.length());
253
+ }
250
254
  switch (*ptr) {
251
255
  case '/': {
252
256
  // '/' notation must be '\0'-terminated
@@ -260,9 +264,15 @@ public:
260
264
  }
261
265
  return GetUnsafe(val, ptr, len);
262
266
  }
263
- default:
264
- auto str = "/" + string(ptr, len);
265
- return GetUnsafe(val, str.c_str(), len + 1);
267
+ default: {
268
+ string path;
269
+ if (memchr(ptr, '"', len)) {
270
+ path = "/" + string(ptr, len);
271
+ } else {
272
+ path = "$.\"" + path_str.GetString() + "\"";
273
+ }
274
+ return GetUnsafe(val, path.c_str(), path.length());
275
+ }
266
276
  }
267
277
  }
268
278
 
@@ -8,6 +8,7 @@
8
8
 
9
9
  #pragma once
10
10
 
11
+ #include "duckdb/common/vector_operations/vector_operations.hpp"
11
12
  #include "duckdb/execution/expression_executor.hpp"
12
13
  #include "json_functions.hpp"
13
14
 
@@ -88,11 +89,18 @@ public:
88
89
  }
89
90
  } else { // Columnref path
90
91
  D_ASSERT(info.path_type == JSONCommon::JSONPathType::REGULAR);
91
- auto &paths = args.data[1];
92
+ unique_ptr<Vector> casted_paths;
93
+ if (args.data[1].GetType().id() == LogicalTypeId::VARCHAR) {
94
+ casted_paths = make_uniq<Vector>(args.data[1]);
95
+ } else {
96
+ casted_paths = make_uniq<Vector>(LogicalTypeId::VARCHAR);
97
+ VectorOperations::DefaultCast(args.data[1], *casted_paths, args.size(), true);
98
+ }
92
99
  BinaryExecutor::ExecuteWithNulls<string_t, string_t, T>(
93
- inputs, paths, result, args.size(), [&](string_t input, string_t path, ValidityMask &mask, idx_t idx) {
100
+ inputs, *casted_paths, result, args.size(),
101
+ [&](string_t input, string_t path, ValidityMask &mask, idx_t idx) {
94
102
  auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
95
- auto val = JSONCommon::Get(doc->root, path);
103
+ auto val = JSONCommon::Get(doc->root, path, args.data[1].GetType().IsIntegral());
96
104
  if (SET_NULL_IF_NOT_FOUND && !val) {
97
105
  mask.SetInvalid(idx);
98
106
  return T {};
@@ -27,7 +27,7 @@ static DefaultMacro json_macros[] = {
27
27
  "json_group_structure",
28
28
  {"x", nullptr},
29
29
  {{nullptr, nullptr}},
30
- "json_structure(json_group_array(x))->'0'"},
30
+ "json_structure(json_group_array(x))->0"},
31
31
  {DEFAULT_SCHEMA, "json", {"x", nullptr}, {{nullptr, nullptr}}, "json_extract(x, '$')"},
32
32
  {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr}};
33
33
 
@@ -6,9 +6,17 @@ static inline string_t ExtractFromVal(yyjson_val *val, yyjson_alc *alc, Vector &
6
6
  return JSONCommon::WriteVal<yyjson_val>(val, alc);
7
7
  }
8
8
 
9
- static inline string_t ExtractStringFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &, idx_t) {
10
- return yyjson_is_str(val) ? string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val))
11
- : JSONCommon::WriteVal<yyjson_val>(val, alc);
9
+ static inline string_t ExtractStringFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
10
+ switch (yyjson_get_tag(val)) {
11
+ case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
12
+ mask.SetInvalid(idx);
13
+ return string_t {};
14
+ case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC:
15
+ case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
16
+ return string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val));
17
+ default:
18
+ return JSONCommon::WriteVal<yyjson_val>(val, alc);
19
+ }
12
20
  }
13
21
 
14
22
  static void ExtractFunction(DataChunk &args, ExpressionState &state, Vector &result) {
@@ -4,6 +4,7 @@ namespace duckdb {
4
4
 
5
5
  static inline string_t ValueFromVal(yyjson_val *val, yyjson_alc *alc, Vector &, ValidityMask &mask, idx_t idx) {
6
6
  switch (yyjson_get_tag(val)) {
7
+ case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
7
8
  case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
8
9
  case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
9
10
  mask.SetInvalid(idx);
@@ -22,12 +23,12 @@ static void ValueManyFunction(DataChunk &args, ExpressionState &state, Vector &r
22
23
  }
23
24
 
24
25
  static void GetValueFunctionsInternal(ScalarFunctionSet &set, const LogicalType &input_type) {
25
- set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::JSON(), ValueFunction,
26
+ set.AddFunction(ScalarFunction({input_type, LogicalType::BIGINT}, LogicalType::VARCHAR, ValueFunction,
26
27
  JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
27
- set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::JSON(), ValueFunction,
28
+ set.AddFunction(ScalarFunction({input_type, LogicalType::VARCHAR}, LogicalType::VARCHAR, ValueFunction,
28
29
  JSONReadFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
29
30
  set.AddFunction(ScalarFunction({input_type, LogicalType::LIST(LogicalType::VARCHAR)},
30
- LogicalType::LIST(LogicalType::JSON()), ValueManyFunction,
31
+ LogicalType::LIST(LogicalType::VARCHAR), ValueManyFunction,
31
32
  JSONReadManyFunctionData::Bind, nullptr, nullptr, JSONFunctionLocalState::Init));
32
33
  }
33
34
 
@@ -21,21 +21,25 @@ static JSONPathType CheckPath(const Value &path_val, string &path, size_t &len)
21
21
  const auto path_str_val = path_val.DefaultCastAs(LogicalType::VARCHAR);
22
22
  auto path_str = path_str_val.GetValueUnsafe<string_t>();
23
23
  len = path_str.GetSize();
24
- auto ptr = path_str.GetData();
24
+ const auto ptr = path_str.GetData();
25
25
  // Empty strings and invalid $ paths yield an error
26
26
  if (len == 0) {
27
27
  throw BinderException("Empty JSON path");
28
28
  }
29
29
  JSONPathType path_type = JSONPathType::REGULAR;
30
- if (*ptr == '$') {
31
- path_type = JSONCommon::ValidatePath(ptr, len, true);
32
- }
33
30
  // Copy over string to the bind data
34
31
  if (*ptr == '/' || *ptr == '$') {
35
32
  path = string(ptr, len);
36
- } else {
33
+ } else if (path_val.type().IsIntegral()) {
34
+ path = "$[" + string(ptr, len) + "]";
35
+ } else if (memchr(ptr, '"', len)) {
37
36
  path = "/" + string(ptr, len);
38
- len++;
37
+ } else {
38
+ path = "$.\"" + string(ptr, len) + "\"";
39
+ }
40
+ len = path.length();
41
+ if (*path.c_str() == '$') {
42
+ path_type = JSONCommon::ValidatePath(path.c_str(), len, true);
39
43
  }
40
44
  return path_type;
41
45
  }
@@ -67,7 +71,11 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, Scal
67
71
  path_type = CheckPath(path_val, path, len);
68
72
  }
69
73
  }
70
- bound_function.arguments[1] = LogicalType::VARCHAR;
74
+ if (arguments[1]->return_type.IsIntegral()) {
75
+ bound_function.arguments[1] = LogicalType::BIGINT;
76
+ } else {
77
+ bound_function.arguments[1] = LogicalType::VARCHAR;
78
+ }
71
79
  if (path_type == JSONCommon::JSONPathType::WILDCARD) {
72
80
  bound_function.return_type = LogicalType::LIST(bound_function.return_type);
73
81
  }
@@ -117,6 +125,7 @@ unique_ptr<FunctionData> JSONReadManyFunctionData::Bind(ClientContext &context,
117
125
 
118
126
  JSONFunctionLocalState::JSONFunctionLocalState(Allocator &allocator) : json_allocator(allocator) {
119
127
  }
128
+
120
129
  JSONFunctionLocalState::JSONFunctionLocalState(ClientContext &context)
121
130
  : JSONFunctionLocalState(BufferAllocator::Get(context)) {
122
131
  }
@@ -259,6 +259,9 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
259
259
  break;
260
260
  case PageType::DICTIONARY_PAGE:
261
261
  PreparePage(page_hdr);
262
+ if (page_hdr.dictionary_page_header.num_values < 0) {
263
+ throw std::runtime_error("Invalid dictionary page header (num_values < 0)");
264
+ }
262
265
  Dictionary(std::move(block), page_hdr.dictionary_page_header.num_values);
263
266
  break;
264
267
  default:
@@ -1209,47 +1209,6 @@ public:
1209
1209
  }
1210
1210
  };
1211
1211
 
1212
- //===--------------------------------------------------------------------===//
1213
- // Geometry Column Writer
1214
- //===--------------------------------------------------------------------===//
1215
- // This class just wraps another column writer, but also calculates the extent
1216
- // of the geometry column by updating the geodata object with every written
1217
- // vector.
1218
- template <class WRITER_IMPL>
1219
- class GeometryColumnWriter : public WRITER_IMPL {
1220
- GeoParquetColumnMetadata geo_data;
1221
- GeoParquetColumnMetadataWriter geo_data_writer;
1222
- string column_name;
1223
-
1224
- public:
1225
- void Write(ColumnWriterState &state, Vector &vector, idx_t count) override {
1226
- // Just write normally
1227
- WRITER_IMPL::Write(state, vector, count);
1228
-
1229
- // And update the geodata object
1230
- geo_data_writer.Update(geo_data, vector, count);
1231
- }
1232
- void FinalizeWrite(ColumnWriterState &state) override {
1233
- WRITER_IMPL::FinalizeWrite(state);
1234
-
1235
- // Add the geodata object to the writer
1236
- this->writer.GetGeoParquetData().geometry_columns[column_name] = geo_data;
1237
- }
1238
-
1239
- public:
1240
- GeometryColumnWriter(ClientContext &context, ParquetWriter &writer, idx_t schema_idx, vector<string> schema_path_p,
1241
- idx_t max_repeat, idx_t max_define, bool can_have_nulls, string name)
1242
- : WRITER_IMPL(writer, schema_idx, std::move(schema_path_p), max_repeat, max_define, can_have_nulls),
1243
- geo_data_writer(context), column_name(std::move(name)) {
1244
-
1245
- auto &geo_data = writer.GetGeoParquetData();
1246
- if (geo_data.primary_geometry_column.empty()) {
1247
- // Set the first column to the primary column
1248
- geo_data.primary_geometry_column = column_name;
1249
- }
1250
- }
1251
- };
1252
-
1253
1212
  //===--------------------------------------------------------------------===//
1254
1213
  // String Column Writer
1255
1214
  //===--------------------------------------------------------------------===//
@@ -1563,6 +1522,58 @@ private:
1563
1522
  }
1564
1523
  };
1565
1524
 
1525
+ //===--------------------------------------------------------------------===//
1526
+ // WKB Column Writer
1527
+ //===--------------------------------------------------------------------===//
1528
+ // Used to store the metadata for a WKB-encoded geometry column when writing
1529
+ // GeoParquet files.
1530
+ class WKBColumnWriterState final : public StringColumnWriterState {
1531
+ public:
1532
+ WKBColumnWriterState(ClientContext &context, duckdb_parquet::format::RowGroup &row_group, idx_t col_idx)
1533
+ : StringColumnWriterState(row_group, col_idx), geo_data(), geo_data_writer(context) {
1534
+ }
1535
+
1536
+ GeoParquetColumnMetadata geo_data;
1537
+ GeoParquetColumnMetadataWriter geo_data_writer;
1538
+ };
1539
+
1540
+ class WKBColumnWriter final : public StringColumnWriter {
1541
+ public:
1542
+ WKBColumnWriter(ClientContext &context_p, ParquetWriter &writer, idx_t schema_idx, vector<string> schema_path_p,
1543
+ idx_t max_repeat, idx_t max_define, bool can_have_nulls, string name)
1544
+ : StringColumnWriter(writer, schema_idx, std::move(schema_path_p), max_repeat, max_define, can_have_nulls),
1545
+ column_name(std::move(name)), context(context_p) {
1546
+
1547
+ this->writer.GetGeoParquetData().RegisterGeometryColumn(column_name);
1548
+ }
1549
+
1550
+ unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group) override {
1551
+ auto result = make_uniq<WKBColumnWriterState>(context, row_group, row_group.columns.size());
1552
+ RegisterToRowGroup(row_group);
1553
+ return std::move(result);
1554
+ }
1555
+ void Write(ColumnWriterState &state, Vector &vector, idx_t count) override {
1556
+ StringColumnWriter::Write(state, vector, count);
1557
+
1558
+ auto &geo_state = state.Cast<WKBColumnWriterState>();
1559
+ geo_state.geo_data_writer.Update(geo_state.geo_data, vector, count);
1560
+ }
1561
+
1562
+ void FinalizeWrite(ColumnWriterState &state) override {
1563
+ StringColumnWriter::FinalizeWrite(state);
1564
+
1565
+ // Add the geodata object to the writer
1566
+ const auto &geo_state = state.Cast<WKBColumnWriterState>();
1567
+
1568
+ // Merge this state's geo column data with the writer's geo column data
1569
+ writer.GetGeoParquetData().FlushColumnMeta(column_name, geo_state.geo_data);
1570
+ }
1571
+
1572
+ private:
1573
+ string column_name;
1574
+ ClientContext &context;
1575
+ };
1576
+
1566
1577
  //===--------------------------------------------------------------------===//
1567
1578
  // Enum Column Writer
1568
1579
  //===--------------------------------------------------------------------===//
@@ -2234,8 +2245,8 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(ClientContext &cont
2234
2245
  schema_path.push_back(name);
2235
2246
 
2236
2247
  if (type.id() == LogicalTypeId::BLOB && type.GetAlias() == "WKB_BLOB") {
2237
- return make_uniq<GeometryColumnWriter<StringColumnWriter>>(context, writer, schema_idx, std::move(schema_path),
2238
- max_repeat, max_define, can_have_nulls, name);
2248
+ return make_uniq<WKBColumnWriter>(context, writer, schema_idx, std::move(schema_path), max_repeat, max_define,
2249
+ can_have_nulls, name);
2239
2250
  }
2240
2251
 
2241
2252
  switch (type.id()) {
@@ -279,6 +279,17 @@ GeoParquetFileMetadata::TryRead(const duckdb_parquet::format::FileMetaData &file
279
279
  return nullptr;
280
280
  }
281
281
 
282
+ void GeoParquetFileMetadata::FlushColumnMeta(const string &column_name, const GeoParquetColumnMetadata &meta) {
283
+ // Lock the metadata
284
+ lock_guard<mutex> glock(write_lock);
285
+
286
+ auto &column = geometry_columns[column_name];
287
+
288
+ // Combine the metadata
289
+ column.geometry_types.insert(meta.geometry_types.begin(), meta.geometry_types.end());
290
+ column.bbox.Combine(meta.bbox);
291
+ }
292
+
282
293
  void GeoParquetFileMetadata::Write(duckdb_parquet::format::FileMetaData &file_meta_data) const {
283
294
 
284
295
  yyjson_mut_doc *doc = yyjson_mut_doc_new(nullptr);
@@ -349,6 +360,14 @@ bool GeoParquetFileMetadata::IsGeometryColumn(const string &column_name) const {
349
360
  return geometry_columns.find(column_name) != geometry_columns.end();
350
361
  }
351
362
 
363
+ void GeoParquetFileMetadata::RegisterGeometryColumn(const string &column_name) {
364
+ lock_guard<mutex> glock(write_lock);
365
+ if (primary_geometry_column.empty()) {
366
+ primary_geometry_column = column_name;
367
+ }
368
+ geometry_columns[column_name] = GeoParquetColumnMetadata();
369
+ }
370
+
352
371
  unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReader &reader,
353
372
  const LogicalType &logical_type,
354
373
  const SchemaElement &s_ele, idx_t schema_idx_p,
@@ -115,7 +115,7 @@ public:
115
115
  void Update(GeoParquetColumnMetadata &meta, Vector &vector, idx_t count);
116
116
  };
117
117
 
118
- struct GeoParquetFileMetadata {
118
+ class GeoParquetFileMetadata {
119
119
  public:
120
120
  // Try to read GeoParquet metadata. Returns nullptr if not found, invalid or the required spatial extension is not
121
121
  // available.
@@ -123,17 +123,21 @@ public:
123
123
  ClientContext &context);
124
124
  void Write(duckdb_parquet::format::FileMetaData &file_meta_data) const;
125
125
 
126
- public:
127
- // Default to 1.1.0 for now
128
- string version = "1.1.0";
129
- string primary_geometry_column;
130
- unordered_map<string, GeoParquetColumnMetadata> geometry_columns;
126
+ void FlushColumnMeta(const string &column_name, const GeoParquetColumnMetadata &meta);
127
+ const unordered_map<string, GeoParquetColumnMetadata> &GetColumnMeta() const;
131
128
 
132
129
  unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const LogicalType &logical_type,
133
130
  const duckdb_parquet::format::SchemaElement &s_ele, idx_t schema_idx_p,
134
131
  idx_t max_define_p, idx_t max_repeat_p, ClientContext &context);
135
132
 
136
133
  bool IsGeometryColumn(const string &column_name) const;
134
+ void RegisterGeometryColumn(const string &column_name);
135
+
136
+ private:
137
+ mutex write_lock;
138
+ string version = "1.1.0";
139
+ string primary_geometry_column;
140
+ unordered_map<string, GeoParquetColumnMetadata> geometry_columns;
137
141
  };
138
142
 
139
143
  } // namespace duckdb
@@ -68,9 +68,9 @@ public:
68
68
 
69
69
  void Offsets(uint32_t *offsets, uint8_t *defines, uint64_t num_values, parquet_filter_t &filter,
70
70
  idx_t result_offset, Vector &result) override {
71
- if (!dict) {
72
- throw IOException(
73
- "Parquet file is likely corrupted, cannot have dictionary offsets without seeing a dictionary first.");
71
+ if (!dict || dict->len == 0) {
72
+ throw IOException("Parquet file is likely corrupted, cannot have dictionary offsets without seeing a "
73
+ "non-empty dictionary first.");
74
74
  }
75
75
  if (HasDefines()) {
76
76
  OffsetsInternal<true>(*dict, offsets, defines, num_values, filter, result_offset, result);
@@ -350,7 +350,8 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
350
350
  file_meta_data.version = 1;
351
351
 
352
352
  file_meta_data.__isset.created_by = true;
353
- file_meta_data.created_by = "DuckDB";
353
+ file_meta_data.created_by =
354
+ StringUtil::Format("DuckDB version %s (build %s)", DuckDB::LibraryVersion(), DuckDB::SourceID());
354
355
 
355
356
  file_meta_data.schema.resize(1);
356
357
 
@@ -166,7 +166,7 @@ void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, co
166
166
  break;
167
167
  }
168
168
  case LogicalTypeId::VARCHAR:
169
- if (type.IsJSONType()) {
169
+ if (type.IsJSONType() && options.arrow_lossless_conversion) {
170
170
  auto schema_metadata = ArrowSchemaMetadata::MetadataFromName("arrow.json");
171
171
  root_holder.metadata_info.emplace_back(schema_metadata.SerializeMetadata());
172
172
  child.metadata = root_holder.metadata_info.back().get();
@@ -1,4 +1,5 @@
1
1
  #include "duckdb/common/arrow/arrow_merge_event.hpp"
2
+ #include "duckdb/common/arrow/arrow_util.hpp"
2
3
  #include "duckdb/storage/storage_info.hpp"
3
4
 
4
5
  namespace duckdb {
@@ -0,0 +1,60 @@
1
+ #include "duckdb/common/arrow/arrow_util.hpp"
2
+ #include "duckdb/common/arrow/arrow_appender.hpp"
3
+ #include "duckdb/common/types/data_chunk.hpp"
4
+
5
+ namespace duckdb {
6
+
7
+ bool ArrowUtil::TryFetchChunk(ChunkScanState &scan_state, ClientProperties options, idx_t batch_size, ArrowArray *out,
8
+ idx_t &count, ErrorData &error) {
9
+ count = 0;
10
+ ArrowAppender appender(scan_state.Types(), batch_size, std::move(options));
11
+ auto remaining_tuples_in_chunk = scan_state.RemainingInChunk();
12
+ if (remaining_tuples_in_chunk) {
13
+ // We start by scanning the non-finished current chunk
14
+ idx_t cur_consumption = MinValue(remaining_tuples_in_chunk, batch_size);
15
+ count += cur_consumption;
16
+ auto &current_chunk = scan_state.CurrentChunk();
17
+ appender.Append(current_chunk, scan_state.CurrentOffset(), scan_state.CurrentOffset() + cur_consumption,
18
+ current_chunk.size());
19
+ scan_state.IncreaseOffset(cur_consumption);
20
+ }
21
+ while (count < batch_size) {
22
+ if (!scan_state.LoadNextChunk(error)) {
23
+ if (scan_state.HasError()) {
24
+ error = scan_state.GetError();
25
+ }
26
+ return false;
27
+ }
28
+ if (scan_state.ChunkIsEmpty()) {
29
+ // The scan was successful, but an empty chunk was returned
30
+ break;
31
+ }
32
+ auto &current_chunk = scan_state.CurrentChunk();
33
+ if (scan_state.Finished() || current_chunk.size() == 0) {
34
+ break;
35
+ }
36
+ // The amount we still need to append into this chunk
37
+ auto remaining = batch_size - count;
38
+
39
+ // The amount remaining, capped by the amount left in the current chunk
40
+ auto to_append_to_batch = MinValue(remaining, scan_state.RemainingInChunk());
41
+ appender.Append(current_chunk, 0, to_append_to_batch, current_chunk.size());
42
+ count += to_append_to_batch;
43
+ scan_state.IncreaseOffset(to_append_to_batch);
44
+ }
45
+ if (count > 0) {
46
+ *out = appender.Finalize();
47
+ }
48
+ return true;
49
+ }
50
+
51
+ idx_t ArrowUtil::FetchChunk(ChunkScanState &scan_state, ClientProperties options, idx_t chunk_size, ArrowArray *out) {
52
+ ErrorData error;
53
+ idx_t result_count;
54
+ if (!TryFetchChunk(scan_state, std::move(options), chunk_size, out, result_count, error)) {
55
+ error.Throw();
56
+ }
57
+ return result_count;
58
+ }
59
+
60
+ } // namespace duckdb
@@ -1,4 +1,5 @@
1
1
  #include "duckdb/common/arrow/arrow_wrapper.hpp"
2
+ #include "duckdb/common/arrow/arrow_util.hpp"
2
3
  #include "duckdb/common/arrow/arrow_converter.hpp"
3
4
 
4
5
  #include "duckdb/common/assert.hpp"
@@ -176,57 +177,4 @@ ResultArrowArrayStreamWrapper::ResultArrowArrayStreamWrapper(unique_ptr<QueryRes
176
177
  stream.get_last_error = ResultArrowArrayStreamWrapper::MyStreamGetLastError;
177
178
  }
178
179
 
179
// Removed definition of ArrowUtil::TryFetchChunk. The body below is line-for-line
// the same as the definition added earlier in this diff, so this hunk reads as a
// move of the function out of this file rather than a behavior change.
- bool ArrowUtil::TryFetchChunk(ChunkScanState &scan_state, ClientProperties options, idx_t batch_size, ArrowArray *out,
180
- idx_t &count, ErrorData &error) {
181
- count = 0;
182
- ArrowAppender appender(scan_state.Types(), batch_size, std::move(options));
183
- auto remaining_tuples_in_chunk = scan_state.RemainingInChunk();
184
- if (remaining_tuples_in_chunk) {
185
- // We start by scanning the non-finished current chunk
186
- idx_t cur_consumption = MinValue(remaining_tuples_in_chunk, batch_size);
187
- count += cur_consumption;
188
- auto &current_chunk = scan_state.CurrentChunk();
189
- appender.Append(current_chunk, scan_state.CurrentOffset(), scan_state.CurrentOffset() + cur_consumption,
190
- current_chunk.size());
191
- scan_state.IncreaseOffset(cur_consumption);
192
- }
193
- while (count < batch_size) {
194
- if (!scan_state.LoadNextChunk(error)) {
195
- if (scan_state.HasError()) {
196
- error = scan_state.GetError();
197
- }
198
- return false;
199
- }
200
- if (scan_state.ChunkIsEmpty()) {
201
- // The scan was successful, but an empty chunk was returned
202
- break;
203
- }
204
- auto &current_chunk = scan_state.CurrentChunk();
205
- if (scan_state.Finished() || current_chunk.size() == 0) {
206
- break;
207
- }
208
- // The amount we still need to append into this chunk
209
- auto remaining = batch_size - count;
210
-
211
- // The amount remaining, capped by the amount left in the current chunk
212
- auto to_append_to_batch = MinValue(remaining, scan_state.RemainingInChunk());
213
- appender.Append(current_chunk, 0, to_append_to_batch, current_chunk.size());
214
- count += to_append_to_batch;
215
- scan_state.IncreaseOffset(to_append_to_batch);
216
- }
217
- if (count > 0) {
218
- *out = appender.Finalize();
219
- }
220
- return true;
221
- }
222
-
223
// Removed definition of ArrowUtil::FetchChunk (throwing wrapper around
// TryFetchChunk). The body is line-for-line the same as the definition added
// earlier in this diff.
- idx_t ArrowUtil::FetchChunk(ChunkScanState &scan_state, ClientProperties options, idx_t chunk_size, ArrowArray *out) {
224
- ErrorData error;
225
- idx_t result_count;
226
- if (!TryFetchChunk(scan_state, std::move(options), chunk_size, out, result_count, error)) {
227
- error.Throw();
228
- }
229
- return result_count;
230
- }
231
-
232
180
  } // namespace duckdb
@@ -22,9 +22,7 @@ optional_idx CGroups::GetMemoryLimit(FileSystem &fs) {
22
22
  }
23
23
 
24
24
  optional_idx CGroups::GetCGroupV2MemoryLimit(FileSystem &fs) {
25
- #ifdef DUCKDB_WASM
26
- return optional_idx();
27
- #else
25
+ #if defined(__linux__) && !defined(DUCKDB_WASM)
28
26
  const char *cgroup_self = "/proc/self/cgroup";
29
27
  const char *memory_max = "/sys/fs/cgroup/%s/memory.max";
30
28
 
@@ -45,13 +43,13 @@ optional_idx CGroups::GetCGroupV2MemoryLimit(FileSystem &fs) {
45
43
  }
46
44
 
47
45
  return ReadCGroupValue(fs, memory_max_path);
46
+ #else
47
+ return optional_idx();
48
48
  #endif
49
49
  }
50
50
 
51
51
  optional_idx CGroups::GetCGroupV1MemoryLimit(FileSystem &fs) {
52
- #ifdef DUCKDB_WASM
53
- return optional_idx();
54
- #else
52
+ #if defined(__linux__) && !defined(DUCKDB_WASM)
55
53
  const char *cgroup_self = "/proc/self/cgroup";
56
54
  const char *memory_limit = "/sys/fs/cgroup/memory/%s/memory.limit_in_bytes";
57
55
 
@@ -72,13 +70,13 @@ optional_idx CGroups::GetCGroupV1MemoryLimit(FileSystem &fs) {
72
70
  }
73
71
 
74
72
  return ReadCGroupValue(fs, memory_limit_path);
73
+ #else
74
+ return optional_idx();
75
75
  #endif
76
76
  }
77
77
 
78
78
  string CGroups::ReadCGroupPath(FileSystem &fs, const char *cgroup_file) {
79
- #ifdef DUCKDB_WASM
80
- return "";
81
- #else
79
+ #if defined(__linux__) && !defined(DUCKDB_WASM)
82
80
  auto handle = fs.OpenFile(cgroup_file, FileFlags::FILE_FLAGS_READ);
83
81
  char buffer[1024];
84
82
  auto bytes_read = fs.Read(*handle, buffer, sizeof(buffer) - 1);
@@ -90,15 +88,12 @@ string CGroups::ReadCGroupPath(FileSystem &fs, const char *cgroup_file) {
90
88
  if (pos != string::npos) {
91
89
  return content.substr(pos + 2);
92
90
  }
93
-
94
- return "";
95
91
  #endif
92
+ return "";
96
93
  }
97
94
 
98
95
  string CGroups::ReadMemoryCGroupPath(FileSystem &fs, const char *cgroup_file) {
99
- #ifdef DUCKDB_WASM
100
- return "";
101
- #else
96
+ #if defined(__linux__) && !defined(DUCKDB_WASM)
102
97
  auto handle = fs.OpenFile(cgroup_file, FileFlags::FILE_FLAGS_READ);
103
98
  char buffer[1024];
104
99
  auto bytes_read = fs.Read(*handle, buffer, sizeof(buffer) - 1);
@@ -115,15 +110,12 @@ string CGroups::ReadMemoryCGroupPath(FileSystem &fs, const char *cgroup_file) {
115
110
  }
116
111
  content.erase(0, pos + 1);
117
112
  }
118
-
119
- return "";
120
113
  #endif
114
+ return "";
121
115
  }
122
116
 
123
117
  optional_idx CGroups::ReadCGroupValue(FileSystem &fs, const char *file_path) {
124
- #ifdef DUCKDB_WASM
125
- return optional_idx();
126
- #else
118
+ #if defined(__linux__) && !defined(DUCKDB_WASM)
127
119
  auto handle = fs.OpenFile(file_path, FileFlags::FILE_FLAGS_READ);
128
120
  char buffer[100];
129
121
  auto bytes_read = fs.Read(*handle, buffer, 99);
@@ -133,15 +125,12 @@ optional_idx CGroups::ReadCGroupValue(FileSystem &fs, const char *file_path) {
133
125
  if (TryCast::Operation<string_t, idx_t>(string_t(buffer), value)) {
134
126
  return optional_idx(value);
135
127
  }
136
- return optional_idx();
137
128
  #endif
129
+ return optional_idx();
138
130
  }
139
131
 
140
132
  idx_t CGroups::GetCPULimit(FileSystem &fs, idx_t physical_cores) {
141
- #ifdef DUCKDB_WASM
142
- return physical_cores;
143
- #else
144
-
133
+ #if defined(__linux__) && !defined(DUCKDB_WASM)
145
134
  static constexpr const char *cpu_max = "/sys/fs/cgroup/cpu.max";
146
135
  static constexpr const char *cfs_quota = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us";
147
136
  static constexpr const char *cfs_period = "/sys/fs/cgroup/cpu/cpu.cfs_period_us";
@@ -183,6 +172,8 @@ idx_t CGroups::GetCPULimit(FileSystem &fs, idx_t physical_cores) {
183
172
  } else {
184
173
  return physical_cores;
185
174
  }
175
+ #else
176
+ return physical_cores;
186
177
  #endif
187
178
  }
188
179