duckdb 1.3.1-dev6.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/core_functions/aggregate/distributive/arg_min_max.cpp +27 -39
  3. package/src/duckdb/extension/core_functions/aggregate/holistic/quantile.cpp +2 -3
  4. package/src/duckdb/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp +1 -1
  5. package/src/duckdb/extension/core_functions/lambda_functions.cpp +16 -14
  6. package/src/duckdb/extension/core_functions/scalar/list/list_filter.cpp +3 -2
  7. package/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp +46 -10
  8. package/src/duckdb/extension/core_functions/scalar/list/list_transform.cpp +3 -2
  9. package/src/duckdb/extension/core_functions/scalar/random/random.cpp +3 -1
  10. package/src/duckdb/extension/icu/icu-datefunc.cpp +5 -3
  11. package/src/duckdb/extension/icu/icu-strptime.cpp +6 -1
  12. package/src/duckdb/extension/icu/icu-timezone.cpp +4 -0
  13. package/src/duckdb/extension/icu/icu_extension.cpp +7 -2
  14. package/src/duckdb/extension/icu/include/icu-datefunc.hpp +1 -1
  15. package/src/duckdb/extension/icu/include/icu-helpers.hpp +1 -1
  16. package/src/duckdb/extension/icu/third_party/icu/common/uloc.cpp +5 -5
  17. package/src/duckdb/extension/json/include/json_common.hpp +19 -0
  18. package/src/duckdb/extension/json/include/json_deserializer.hpp +1 -4
  19. package/src/duckdb/extension/json/include/json_functions.hpp +4 -4
  20. package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +38 -17
  21. package/src/duckdb/extension/json/json_functions/json_table_in_out.cpp +11 -7
  22. package/src/duckdb/extension/json/json_functions.cpp +4 -4
  23. package/src/duckdb/extension/json/json_reader.cpp +1 -1
  24. package/src/duckdb/extension/parquet/column_reader.cpp +7 -1
  25. package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +2 -2
  26. package/src/duckdb/extension/parquet/include/parquet_dbp_encoder.hpp +2 -2
  27. package/src/duckdb/extension/parquet/include/parquet_reader.hpp +2 -1
  28. package/src/duckdb/extension/parquet/include/parquet_statistics.hpp +1 -1
  29. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +3 -0
  30. package/src/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp +3 -1
  31. package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +1 -1
  32. package/src/duckdb/extension/parquet/parquet_crypto.cpp +9 -5
  33. package/src/duckdb/extension/parquet/parquet_extension.cpp +26 -0
  34. package/src/duckdb/extension/parquet/parquet_float16.cpp +4 -2
  35. package/src/duckdb/extension/parquet/parquet_metadata.cpp +3 -3
  36. package/src/duckdb/extension/parquet/parquet_multi_file_info.cpp +12 -0
  37. package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -4
  38. package/src/duckdb/extension/parquet/parquet_statistics.cpp +13 -3
  39. package/src/duckdb/extension/parquet/parquet_writer.cpp +1 -1
  40. package/src/duckdb/extension/parquet/reader/decimal_column_reader.cpp +1 -1
  41. package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +1 -1
  42. package/src/duckdb/extension/parquet/reader/struct_column_reader.cpp +13 -4
  43. package/src/duckdb/extension/parquet/serialize_parquet.cpp +2 -0
  44. package/src/duckdb/src/catalog/catalog.cpp +10 -4
  45. package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +4 -10
  46. package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp +1 -2
  47. package/src/duckdb/src/catalog/catalog_entry/sequence_catalog_entry.cpp +1 -1
  48. package/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp +2 -2
  49. package/src/duckdb/src/catalog/catalog_entry/type_catalog_entry.cpp +1 -1
  50. package/src/duckdb/src/catalog/catalog_search_path.cpp +7 -1
  51. package/src/duckdb/src/catalog/catalog_set.cpp +21 -1
  52. package/src/duckdb/src/common/adbc/adbc.cpp +1 -1
  53. package/src/duckdb/src/common/arrow/arrow_appender.cpp +17 -5
  54. package/src/duckdb/src/common/arrow/arrow_converter.cpp +23 -15
  55. package/src/duckdb/src/common/box_renderer.cpp +1 -2
  56. package/src/duckdb/src/common/enum_util.cpp +4 -3
  57. package/src/duckdb/src/common/local_file_system.cpp +13 -12
  58. package/src/duckdb/src/common/multi_file/multi_file_column_mapper.cpp +35 -12
  59. package/src/duckdb/src/common/multi_file/multi_file_reader.cpp +13 -3
  60. package/src/duckdb/src/common/string_util.cpp +7 -5
  61. package/src/duckdb/src/common/tree_renderer/graphviz_tree_renderer.cpp +4 -4
  62. package/src/duckdb/src/common/tree_renderer/html_tree_renderer.cpp +4 -4
  63. package/src/duckdb/src/common/tree_renderer/json_tree_renderer.cpp +4 -4
  64. package/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp +4 -4
  65. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +1 -1
  66. package/src/duckdb/src/common/types/uuid.cpp +5 -1
  67. package/src/duckdb/src/common/types.cpp +28 -0
  68. package/src/duckdb/src/common/virtual_file_system.cpp +5 -0
  69. package/src/duckdb/src/execution/column_binding_resolver.cpp +49 -30
  70. package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +4 -0
  71. package/src/duckdb/src/execution/join_hashtable.cpp +10 -7
  72. package/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp +3 -3
  73. package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +1 -1
  74. package/src/duckdb/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +2 -1
  75. package/src/duckdb/src/execution/operator/csv_scanner/scanner/skip_scanner.cpp +1 -4
  76. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +53 -1
  77. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +58 -59
  78. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +10 -5
  79. package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +4 -0
  80. package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +18 -8
  81. package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +1 -1
  82. package/src/duckdb/src/execution/operator/schema/physical_attach.cpp +1 -0
  83. package/src/duckdb/src/execution/physical_plan_generator.cpp +5 -5
  84. package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +2 -1
  85. package/src/duckdb/src/function/function.cpp +4 -0
  86. package/src/duckdb/src/function/scalar/operator/arithmetic.cpp +6 -0
  87. package/src/duckdb/src/function/scalar/struct/remap_struct.cpp +10 -1
  88. package/src/duckdb/src/function/table/copy_csv.cpp +1 -0
  89. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  90. package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +1 -0
  91. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp +1 -1
  92. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
  93. package/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp +2 -0
  94. package/src/duckdb/src/include/duckdb/common/file_buffer.hpp +2 -2
  95. package/src/duckdb/src/include/duckdb/common/helper.hpp +9 -9
  96. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +1 -1
  97. package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_column_mapper.hpp +3 -5
  98. package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_reader.hpp +7 -0
  99. package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp +3 -0
  100. package/src/duckdb/src/include/duckdb/common/shadow_forbidden_functions.hpp +40 -0
  101. package/src/duckdb/src/include/duckdb/common/string.hpp +25 -2
  102. package/src/duckdb/src/include/duckdb/common/types/hugeint.hpp +20 -24
  103. package/src/duckdb/src/include/duckdb/common/types/uhugeint.hpp +20 -24
  104. package/src/duckdb/src/include/duckdb/common/types.hpp +3 -0
  105. package/src/duckdb/src/include/duckdb/common/unique_ptr.hpp +34 -8
  106. package/src/duckdb/src/include/duckdb/execution/column_binding_resolver.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +3 -2
  108. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/column_count_scanner.hpp +3 -0
  109. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
  110. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +15 -3
  111. package/src/duckdb/src/include/duckdb/function/cast/vector_cast_helpers.hpp +2 -2
  112. package/src/duckdb/src/include/duckdb/function/copy_function.hpp +7 -3
  113. package/src/duckdb/src/include/duckdb/function/function.hpp +1 -0
  114. package/src/duckdb/src/include/duckdb/function/function_binder.hpp +2 -1
  115. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +20 -12
  116. package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +4 -3
  117. package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +3 -1
  118. package/src/duckdb/src/include/duckdb/logging/log_type.hpp +17 -0
  119. package/src/duckdb/src/include/duckdb/main/attached_database.hpp +1 -0
  120. package/src/duckdb/src/include/duckdb/main/client_properties.hpp +22 -6
  121. package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
  122. package/src/duckdb/src/include/duckdb/main/database_manager.hpp +4 -1
  123. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +27 -13
  124. package/src/duckdb/src/include/duckdb/main/secret/secret_manager.hpp +1 -0
  125. package/src/duckdb/src/include/duckdb/main/settings.hpp +11 -0
  126. package/src/duckdb/src/include/duckdb/optimizer/topn_optimizer.hpp +7 -1
  127. package/src/duckdb/src/include/duckdb/original/std/locale.hpp +10 -0
  128. package/src/duckdb/src/include/duckdb/original/std/memory.hpp +12 -0
  129. package/src/duckdb/src/include/duckdb/original/std/sstream.hpp +11 -0
  130. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +5 -3
  131. package/src/duckdb/src/include/duckdb/storage/buffer/buffer_pool.hpp +4 -2
  132. package/src/duckdb/src/logging/log_manager.cpp +1 -0
  133. package/src/duckdb/src/logging/log_types.cpp +40 -0
  134. package/src/duckdb/src/main/attached_database.cpp +4 -0
  135. package/src/duckdb/src/main/client_context.cpp +1 -0
  136. package/src/duckdb/src/main/config.cpp +1 -0
  137. package/src/duckdb/src/main/database.cpp +1 -0
  138. package/src/duckdb/src/main/database_manager.cpp +19 -2
  139. package/src/duckdb/src/main/extension/extension_helper.cpp +4 -3
  140. package/src/duckdb/src/main/query_profiler.cpp +2 -2
  141. package/src/duckdb/src/main/query_result.cpp +1 -1
  142. package/src/duckdb/src/main/secret/secret_manager.cpp +2 -0
  143. package/src/duckdb/src/main/settings/autogenerated_settings.cpp +7 -0
  144. package/src/duckdb/src/main/settings/custom_settings.cpp +106 -34
  145. package/src/duckdb/src/optimizer/optimizer.cpp +1 -1
  146. package/src/duckdb/src/optimizer/topn_optimizer.cpp +18 -8
  147. package/src/duckdb/src/parallel/executor.cpp +5 -0
  148. package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +1 -1
  149. package/src/duckdb/src/parser/transform/expression/transform_interval.cpp +5 -1
  150. package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +21 -24
  151. package/src/duckdb/src/planner/binder/expression/bind_lambda.cpp +10 -8
  152. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +3 -2
  153. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +0 -4
  154. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +3 -0
  155. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +3 -0
  156. package/src/duckdb/src/planner/expression/bound_cast_expression.cpp +3 -0
  157. package/src/duckdb/src/planner/expression/bound_columnref_expression.cpp +1 -1
  158. package/src/duckdb/src/planner/expression/bound_function_expression.cpp +0 -1
  159. package/src/duckdb/src/planner/expression/bound_reference_expression.cpp +1 -1
  160. package/src/duckdb/src/planner/expression_binder.cpp +4 -2
  161. package/src/duckdb/src/planner/logical_operator.cpp +2 -1
  162. package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +4 -1
  163. package/src/duckdb/src/storage/buffer/block_handle.cpp +8 -0
  164. package/src/duckdb/src/storage/buffer/buffer_pool.cpp +44 -18
  165. package/src/duckdb/src/storage/caching_file_system.cpp +7 -7
  166. package/src/duckdb/src/storage/standard_buffer_manager.cpp +4 -3
  167. package/src/duckdb/src/storage/storage_info.cpp +2 -0
  168. package/src/duckdb/src/storage/wal_replay.cpp +9 -4
  169. package/src/duckdb/third_party/fmt/include/fmt/format.h +8 -1
  170. package/src/duckdb/third_party/fsst/libfsst.cpp +4 -3
  171. package/src/duckdb/third_party/httplib/httplib.hpp +25 -22
  172. package/src/duckdb/third_party/hyperloglog/sds.cpp +7 -3
  173. package/src/duckdb/third_party/libpg_query/src_common_keywords.cpp +8 -1
  174. package/src/duckdb/third_party/re2/re2/filtered_re2.h +8 -2
  175. package/src/duckdb/third_party/re2/re2/pod_array.h +7 -1
  176. package/src/duckdb/third_party/re2/re2/re2.cc +6 -2
  177. package/src/duckdb/third_party/re2/re2/set.cc +1 -1
  178. package/src/duckdb/third_party/re2/re2/set.h +7 -1
  179. package/src/duckdb/ub_src_logging.cpp +4 -4
@@ -171,8 +171,8 @@ ScalarFunctionSet JSONFunctions::GetSerializeSqlFunction() {
 //----------------------------------------------------------------------
 // JSON DESERIALIZE
 //----------------------------------------------------------------------
-static unique_ptr<SelectStatement> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
-    auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
+static vector<unique_ptr<SelectStatement>> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
+    auto doc = yyjson_doc_ptr(JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc));
     if (!doc) {
         throw ParserException("Could not parse json");
     }
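Note: the `yyjson_doc_ptr` wrapper (added in json_common.hpp, +19 lines in the file list) holds the raw `yyjson_doc *` so the document is released on every exit path, including the ParserException throws during deserialization. A minimal sketch of such a RAII alias, assuming `yyjson_doc_free` is the matching deallocator and that DuckDB vendors yyjson under the `duckdb_yyjson` namespace (the actual definition in json_common.hpp may differ):

    #include <memory>
    #include "yyjson.hpp" // DuckDB's vendored yyjson

    // Hypothetical shape of the wrapper; shown for illustration only.
    struct YyjsonDocDeleter {
        void operator()(duckdb_yyjson::yyjson_doc *doc) const {
            duckdb_yyjson::yyjson_doc_free(doc); // frees via the allocator stored in the doc
        }
    };
    using yyjson_doc_ptr = std::unique_ptr<duckdb_yyjson::yyjson_doc, YyjsonDocDeleter>;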
@@ -196,16 +196,22 @@ static unique_ptr<SelectStatement> DeserializeSelectStatement(string_t input, yy
     if (size == 0) {
         throw ParserException("Error parsing json: no statements");
     }
-    if (size > 1) {
-        throw ParserException("Error parsing json: more than one statement");
-    }
-    auto stmt_json = yyjson_arr_get(statements, 0);
-    JsonDeserializer deserializer(stmt_json, doc);
-    auto stmt = SelectStatement::Deserialize(deserializer);
-    if (!stmt->node) {
-        throw ParserException("Error parsing json: no select node found in json");
+
+    vector<unique_ptr<SelectStatement>> result;
+
+    idx_t idx;
+    idx_t max;
+    yyjson_val *stmt_json;
+    yyjson_arr_foreach(statements, idx, max, stmt_json) {
+        JsonDeserializer deserializer(stmt_json, doc);
+        auto stmt = SelectStatement::Deserialize(deserializer);
+        if (!stmt->node) {
+            throw ParserException("Error parsing json: no select node found in json");
+        }
+        result.push_back(std::move(stmt));
     }
-    return stmt;
+
+    return result;
 }
 
 //----------------------------------------------------------------------
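For context, `yyjson_arr_foreach(arr, idx, max, val)` is yyjson's iteration macro over immutable arrays: it walks `arr` once, setting `idx` to the current element index and `max` to the total count. A self-contained sketch of the pattern against stock yyjson (DuckDB's vendored copy namespaces these symbols):

    #include <cstdio>
    #include <cstring>
    #include "yyjson.h"

    int main() {
        const char *json = "[\"SELECT 1\", \"SELECT 2\"]";
        yyjson_doc *doc = yyjson_read(json, strlen(json), 0);
        yyjson_val *arr = yyjson_doc_get_root(doc);
        size_t idx, max;
        yyjson_val *val;
        yyjson_arr_foreach(arr, idx, max, val) {
            printf("statement %zu: %s\n", idx, yyjson_get_str(val));
        }
        yyjson_doc_free(doc);
        return 0;
    }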
@@ -217,8 +223,17 @@ static void JsonDeserializeFunction(DataChunk &args, ExpressionState &state, Vec
     auto &inputs = args.data[0];
 
     UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
-        auto stmt = DeserializeSelectStatement(input, alc);
-        return StringVector::AddString(result, stmt->ToString());
+        auto stmts = DeserializeSelectStatement(input, alc);
+        // Combine all statements into a single semicolon separated string
+        string str;
+        for (idx_t i = 0; i < stmts.size(); i++) {
+            if (i > 0) {
+                str += "; ";
+            }
+            str += stmts[i]->ToString();
+        }
+
+        return StringVector::AddString(result, str);
     });
 }
 
@@ -237,8 +252,11 @@ static string ExecuteJsonSerializedSqlPragmaFunction(ClientContext &context, con
     auto alc = local_state.json_allocator->GetYYAlc();
 
     auto input = parameters.values[0].GetValueUnsafe<string_t>();
-    auto stmt = DeserializeSelectStatement(input, alc);
-    return stmt->ToString();
+    auto stmts = DeserializeSelectStatement(input, alc);
+    if (stmts.size() != 1) {
+        throw BinderException("json_execute_serialized_sql pragma expects exactly one statement");
+    }
+    return stmts[0]->ToString();
 }
 
 PragmaFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlPragmaFunction() {
@@ -268,8 +286,11 @@ struct ExecuteSqlTableFunction {
             throw BinderException("json_execute_serialized_sql cannot execute NULL plan");
         }
         auto serialized = input.inputs[0].GetValueUnsafe<string>();
-        auto stmt = DeserializeSelectStatement(serialized, alc);
-        result->plan = result->con->RelationFromQuery(std::move(stmt));
+        auto stmts = DeserializeSelectStatement(serialized, alc);
+        if (stmts.size() != 1) {
+            throw BinderException("json_execute_serialized_sql expects exactly one statement");
+        }
+        result->plan = result->con->RelationFromQuery(std::move(stmts[0]));
 
         for (auto &col : result->plan->Columns()) {
             return_types.emplace_back(col.Type());
@@ -124,17 +124,21 @@ struct JSONTableInOutLocalState : LocalTableFunctionState {
         return result;
     }
 
-    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey) {
-        const auto vkey_str =
-            vkey ? "." + string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get())) : "";
-        recursion_nodes.emplace_back(vkey_str, val);
+    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey, const optional_idx arr_index) {
+        string str;
+        if (vkey) {
+            str = "." + string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
+        } else if (arr_index.IsValid()) {
+            str = "[" + to_string(arr_index.GetIndex()) + "]";
+        }
+        recursion_nodes.emplace_back(str, val);
     }
 
     JSONAllocator json_allocator;
     yyjson_alc *alc;
 
     string path;
-    size_t len;
+    idx_t len;
     yyjson_doc *doc;
     bool initialized;
 
@@ -269,7 +273,7 @@ static void InitializeLocalState(JSONTableInOutLocalState &lstate, DataChunk &in
         result.AddRow<TYPE>(lstate, nullptr, root);
     }
     if (is_container) {
-        lstate.AddRecursionNode(root, nullptr);
+        lstate.AddRecursionNode(root, nullptr, optional_idx());
     }
 }
 
@@ -283,7 +287,7 @@ static bool JSONTableInOutHandleValue(JSONTableInOutLocalState &lstate, JSONTabl
         result.AddRow<TYPE>(lstate, child_key, child_val);
         child_index++; // We finished processing the array element
         if (TYPE == JSONTableInOutType::TREE && (unsafe_yyjson_is_arr(child_val) || unsafe_yyjson_is_obj(child_val))) {
-            lstate.AddRecursionNode(child_val, child_key);
+            lstate.AddRecursionNode(child_val, child_key, idx);
             return true; // Break: We added a recursion node, go depth-first
         }
         if (result.count == STANDARD_VECTOR_SIZE) {
@@ -14,7 +14,7 @@ namespace duckdb {
 
 using JSONPathType = JSONCommon::JSONPathType;
 
-JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, size_t &len) {
+JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, idx_t &len) {
     if (path_val.IsNull()) {
         throw BinderException("JSON path cannot be NULL");
     }
@@ -60,7 +60,7 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, Scal
     D_ASSERT(bound_function.arguments.size() == 2);
     bool constant = false;
     string path;
-    size_t len = 0;
+    idx_t len = 0;
     JSONPathType path_type = JSONPathType::REGULAR;
     if (arguments[1]->IsFoldable()) {
         const auto path_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
@@ -80,7 +80,7 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, Scal
     return make_uniq<JSONReadFunctionData>(constant, std::move(path), len, path_type);
 }
 
-JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<size_t> lens_p)
+JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<idx_t> lens_p)
     : paths(std::move(paths_p)), lens(std::move(lens_p)) {
     for (const auto &path : paths) {
         ptrs.push_back(path.c_str());
@@ -107,7 +107,7 @@ unique_ptr<FunctionData> JSONReadManyFunctionData::Bind(ClientContext &context,
     }
 
     vector<string> paths;
-    vector<size_t> lens;
+    vector<idx_t> lens;
     auto paths_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
 
     for (auto &path_val : ListValue::GetChildren(paths_val)) {
@@ -737,7 +737,7 @@ bool JSONReader::CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state
     idx_t prev_buffer_size = previous_buffer_handle->buffer_size - previous_buffer_handle->buffer_start;
     auto prev_buffer_ptr = char_ptr_cast(previous_buffer_handle->buffer.get()) + previous_buffer_handle->buffer_size;
     auto prev_object_start = PreviousNewline(prev_buffer_ptr, prev_buffer_size);
-    auto prev_object_size = prev_buffer_ptr - prev_object_start;
+    auto prev_object_size = NumericCast<idx_t>(prev_buffer_ptr - prev_object_start);
 
     D_ASSERT(scan_state.buffer_offset == options.maximum_object_size);
     if (prev_object_size > scan_state.buffer_offset) {
@@ -412,7 +412,7 @@ void ColumnReader::DecompressInternal(CompressionCodec::type codec, const_data_p
     }
 
     default: {
-        std::stringstream codec_name;
+        duckdb::stringstream codec_name;
         codec_name << codec;
         throw std::runtime_error("Unsupported compression codec \"" + codec_name.str() +
                                  "\". Supported options are uncompressed, brotli, gzip, lz4_raw, snappy or zstd");
@@ -713,6 +713,12 @@ void ColumnReader::ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_ou
 
     while (to_skip > 0) {
         auto skip_now = ReadPageHeaders(to_skip);
+        if (page_is_filtered_out) {
+            // the page has been filtered out entirely - skip
+            page_rows_available -= skip_now;
+            to_skip -= skip_now;
+            continue;
+        }
         const auto all_valid = PrepareRead(skip_now, define_out, repeat_out, 0);
 
         const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
@@ -23,7 +23,7 @@ public:
     template <typename T>
     void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
         if (buffer_.len % sizeof(T) != 0) {
-            std::stringstream error;
+            duckdb::stringstream error;
             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
             throw std::runtime_error(error.str());
@@ -44,7 +44,7 @@ public:
     template <typename T>
     void Skip(uint32_t batch_size) {
         if (buffer_.len % sizeof(T) != 0) {
-            std::stringstream error;
+            duckdb::stringstream error;
             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
             throw std::runtime_error(error.str());
@@ -155,8 +155,8 @@ private:
     int64_t verification_data[NUMBER_OF_VALUES_IN_A_MINIBLOCK];
     ByteBuffer byte_buffer(data_ptr_cast(data_packed), write_size);
     bitpacking_width_t bitpack_pos = 0;
-    ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, verification_data, NUMBER_OF_VALUES_IN_A_MINIBLOCK,
-                                  width);
+    ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, reinterpret_cast<uint64_t *>(verification_data),
+                                  NUMBER_OF_VALUES_IN_A_MINIBLOCK, width);
     for (idx_t i = 0; i < NUMBER_OF_VALUES_IN_A_MINIBLOCK; i++) {
         D_ASSERT(src[i] == verification_data[i]);
     }
@@ -62,7 +62,7 @@ struct ParquetReaderScanState {
     idx_t group_offset;
     unique_ptr<CachingFileHandle> file_handle;
     unique_ptr<ColumnReader> root_reader;
-    std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
+    duckdb_base_std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
 
     bool finished;
     SelectionVector sel;
@@ -108,6 +108,7 @@ struct ParquetOptions {
 
     vector<ParquetColumnDefinition> schema;
     idx_t explicit_cardinality = 0;
+    bool can_have_nan = false; // if floats or doubles can contain NaN values
 };
 
 struct ParquetOptionsSerialization {
@@ -27,7 +27,7 @@ class ResizeableBuffer;
 struct ParquetStatisticsUtils {
 
     static unique_ptr<BaseStatistics> TransformColumnStatistics(const ParquetColumnSchema &reader,
-                                                                const vector<ColumnChunk> &columns);
+                                                                const vector<ColumnChunk> &columns, bool can_have_nan);
 
     static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats);
 
@@ -134,6 +134,9 @@ public:
     ParquetVersion GetParquetVersion() const {
         return parquet_version;
     }
+    const string &GetFileName() const {
+        return file_name;
+    }
 
     uint32_t Write(const duckdb_apache::thrift::TBase &object);
     uint32_t WriteData(const const_data_ptr_t buffer, const uint32_t buffer_size);
@@ -138,7 +138,9 @@ struct ParquetBaseStringOperator : public BaseParquetOperator {
 
     template <class SRC, class TGT>
     static idx_t GetRowSize(const Vector &vector, idx_t index) {
-        return FlatVector::GetData<string_t>(vector)[index].GetSize();
+        // This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
+        // This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
+        return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
     }
 };
 
@@ -403,7 +403,7 @@ private:
         break;
     }
     case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: {
-        if (page_state.bss_initialized) {
+        if (!page_state.bss_initialized) {
             page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
             page_state.bss_initialized = true;
         }
@@ -300,14 +300,15 @@
 uint32_t ParquetCrypto::Read(TBase &object, TProtocol &iprot, const string &key,
                              const EncryptionUtil &encryption_util_p) {
     TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
-    auto dprot = tproto_factory.getProtocol(std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
+    auto dprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
     auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
 
     // We have to read the whole thing otherwise thrift throws an error before we realize we're decryption is wrong
     auto all = dtrans.ReadAll();
     TCompactProtocolFactoryT<SimpleReadTransport> tsimple_proto_factory;
     auto simple_prot =
-        tsimple_proto_factory.getProtocol(std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
+        tsimple_proto_factory.getProtocol(duckdb_base_std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
 
     // Read the object
     object.read(simple_prot.get());
@@ -319,7 +320,8 @@ uint32_t ParquetCrypto::Write(const TBase &object, TProtocol &oprot, const strin
                               const EncryptionUtil &encryption_util_p) {
     // Create encryption protocol
     TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
-    auto eprot = tproto_factory.getProtocol(std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
+    auto eprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
     auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
 
     // Write the object in memory
@@ -333,7 +335,8 @@ uint32_t ParquetCrypto::ReadData(TProtocol &iprot, const data_ptr_t buffer, cons
                                  const string &key, const EncryptionUtil &encryption_util_p) {
     // Create decryption protocol
     TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
-    auto dprot = tproto_factory.getProtocol(std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
+    auto dprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
     auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
 
     // Read buffer
@@ -348,7 +351,8 @@ uint32_t ParquetCrypto::WriteData(TProtocol &oprot, const const_data_ptr_t buffe
     // FIXME: we know the size upfront so we could do a streaming write instead of this
     // Create encryption protocol
     TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
-    auto eprot = tproto_factory.getProtocol(std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
+    auto eprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
     auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
 
     // Write the data in memory
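Likewise, `duckdb_base_std` (cf. duckdb/original/std/memory.hpp in the file list) re-exports the original std smart-pointer facilities under one sanctioned name, so a shadowing header can poison bare `std::make_shared` calls elsewhere in the tree. A hedged sketch of the re-export side:

    // Sketch; the real duckdb/original/std/memory.hpp may differ in detail.
    #include <memory>

    namespace duckdb_base_std {
    using std::make_shared; // the sanctioned spelling for shared-pointer construction
    using std::unique_ptr;
    } // namespace duckdb_base_std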
@@ -243,6 +243,18 @@ struct ParquetWriteBindData : public TableFunctionData {
 
 struct ParquetWriteGlobalState : public GlobalFunctionData {
     unique_ptr<ParquetWriter> writer;
+    optional_ptr<const PhysicalOperator> op;
+
+    void LogFlushingRowGroup(const ColumnDataCollection &buffer, const string &reason) {
+        if (!op) {
+            return;
+        }
+        DUCKDB_LOG(writer->GetContext(), PhysicalOperatorLogType, *op, "ParquetWriter", "FlushRowGroup",
+                   {{"file", writer->GetFileName()},
+                    {"rows", to_string(buffer.Count())},
+                    {"size", to_string(buffer.SizeInBytes())},
+                    {"reason", reason}});
+    }
 
     mutex lock;
     unique_ptr<ColumnDataCollection> combine_buffer;
@@ -446,6 +458,9 @@ void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, Glob
 
     if (local_state.buffer.Count() >= bind_data.row_group_size ||
         local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes) {
+        const string reason =
+            local_state.buffer.Count() >= bind_data.row_group_size ? "ROW_GROUP_SIZE" : "ROW_GROUP_SIZE_BYTES";
+        global_state.LogFlushingRowGroup(local_state.buffer, reason);
         // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the parquet file
         local_state.append_state.current_chunk_state.handles.clear();
         global_state.writer->Flush(local_state.buffer);
@@ -462,6 +477,7 @@ void ParquetWriteCombine(ExecutionContext &context, FunctionData &bind_data_p, G
     if (local_state.buffer.Count() >= bind_data.row_group_size / 2 ||
         local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes / 2) {
         // local state buffer is more than half of the row_group_size(_bytes), just flush it
+        global_state.LogFlushingRowGroup(local_state.buffer, "Combine");
         global_state.writer->Flush(local_state.buffer);
         return;
     }
@@ -475,6 +491,7 @@
     // After combining, the combine buffer is more than half of the row_group_size(_bytes), so we flush
     auto owned_combine_buffer = std::move(global_state.combine_buffer);
     guard.unlock();
+    global_state.LogFlushingRowGroup(*owned_combine_buffer, "Combine");
     // Lock free, of course
     global_state.writer->Flush(*owned_combine_buffer);
 }
@@ -489,6 +506,7 @@ void ParquetWriteFinalize(ClientContext &context, FunctionData &bind_data, Globa
     auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
     // flush the combine buffer (if it's there)
     if (global_state.combine_buffer) {
+        global_state.LogFlushingRowGroup(*global_state.combine_buffer, "Finalize");
         global_state.writer->Flush(*global_state.combine_buffer);
     }
 
@@ -691,6 +709,13 @@ CopyFunctionExecutionMode ParquetWriteExecutionMode(bool preserve_insertion_orde
     return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE;
 }
 //===--------------------------------------------------------------------===//
+// Initialize Logger
+//===--------------------------------------------------------------------===//
+void ParquetWriteInitializeOperator(GlobalFunctionData &gstate, const PhysicalOperator &op) {
+    auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
+    global_state.op = &op;
+}
+//===--------------------------------------------------------------------===//
 // Prepare Batch
 //===--------------------------------------------------------------------===//
 struct ParquetWriteBatchData : public PreparedBatchData {
@@ -889,6 +914,7 @@ void ParquetExtension::Load(DuckDB &db) {
     function.copy_to_combine = ParquetWriteCombine;
     function.copy_to_finalize = ParquetWriteFinalize;
     function.execution_mode = ParquetWriteExecutionMode;
+    function.initialize_operator = ParquetWriteInitializeOperator;
     function.copy_from_bind = MultiFileFunction<ParquetMultiFileInfo>::MultiFileBindCopy;
     function.copy_from_function = scan_fun.functions[0];
    function.prepare_batch = ParquetWritePrepareBatch;
@@ -11,7 +11,9 @@ float Float16ToFloat32(const uint16_t &float16_value) {
     uint32_t sign = float16_value >> 15;
     uint32_t exponent = (float16_value >> 10) & 0x1F;
     uint32_t fraction = (float16_value & 0x3FF);
-    uint32_t float32_value;
+    // Avoid strict aliasing issues and compiler warnings
+    uint32_t float32_value = 0;
+
     if (exponent == 0) {
         if (fraction == 0) {
             // zero
@@ -39,7 +41,7 @@ float Float16ToFloat32(const uint16_t &float16_value) {
         float32_value = (sign << 31) | ((exponent + (127 - 15)) << 23) | (fraction << 13);
     }
 
-    return *reinterpret_cast<float *>(&float32_value);
+    return Load<float>(const_data_ptr_cast(&float32_value));
 }
 
 } // namespace duckdb
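Reading a `float` through a `uint32_t *` via `reinterpret_cast` is undefined behavior under C++ strict-aliasing rules; DuckDB's `Load<T>` copies the bytes instead. A standalone equivalent, assuming only that `Load<T>` behaves like a `memcpy`-based bit cast:

    #include <cstdint>
    #include <cstring>

    // Well-defined bit reinterpretation: copy the bytes rather than aliasing them.
    static float BitCastToFloat(uint32_t bits) {
        float result;
        std::memcpy(&result, &bits, sizeof(result)); // compilers fold this to a single move
        return result;
    }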
@@ -63,14 +63,14 @@ public:
 
     template <class T>
     string ConvertParquetElementToString(T &&entry) {
-        std::stringstream ss;
+        duckdb::stringstream ss;
         ss << entry;
         return ss.str();
     }
 
     template <class T>
     string PrintParquetElementToString(T &&entry) {
-        std::stringstream ss;
+        duckdb::stringstream ss;
         entry.printTo(ss);
         return ss.str();
     }
@@ -652,7 +652,7 @@ void ParquetMetaDataOperatorData::ExecuteBloomProbe(ClientContext &context, cons
     }
 
     auto &allocator = Allocator::DefaultAllocator();
-    auto transport = std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
+    auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
     auto protocol =
         make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 
@@ -318,6 +318,7 @@ TableFunctionSet ParquetScanFunction::GetFunctionSet() {
     table_function.named_parameters["schema"] = LogicalTypeId::ANY;
     table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY;
     table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR;
+    table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN;
     table_function.statistics = MultiFileFunction<ParquetMultiFileInfo>::MultiFileScanStats;
     table_function.serialize = ParquetScanSerialize;
     table_function.deserialize = ParquetScanDeserialize;
@@ -365,6 +366,13 @@ bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string
         options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]);
         return true;
     }
+    if (key == "can_have_nan") {
+        if (values.size() != 1) {
+            throw BinderException("Parquet can_have_nan cannot be empty!");
+        }
+        options.can_have_nan = GetBooleanArgument(key, values);
+        return true;
+    }
     return false;
 }
 
@@ -393,6 +401,10 @@ bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &ori
         options.debug_use_openssl = BooleanValue::Get(val);
         return true;
     }
+    if (key == "can_have_nan") {
+        options.can_have_nan = BooleanValue::Get(val);
+        return true;
+    }
     if (key == "schema") {
         // Argument is a map that defines the schema
         const auto &schema_value = val;
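Taken together, `can_have_nan` is now accepted both as a `read_parquet` named parameter and as a Parquet scan/COPY option, and it is serialized with the other options (see serialize_parquet.cpp below). A usage sketch via the C++ API; the file path is hypothetical:

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        // Declare that FLOAT/DOUBLE columns may contain NaN so min/max zone-map
        // pruning stays conservative for this scan.
        auto result = con.Query("SELECT * FROM read_parquet('data.parquet', can_have_nan = true) WHERE x > 100");
        result->Print();
        return 0;
    }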
@@ -48,7 +48,7 @@ using duckdb_parquet::Type;
 
 static unique_ptr<duckdb_apache::thrift::protocol::TProtocol> CreateThriftFileProtocol(CachingFileHandle &file_handle,
                                                                                        bool prefetch_mode) {
-    auto transport = std::make_shared<ThriftFileTransport>(file_handle, prefetch_mode);
+    auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(file_handle, prefetch_mode);
     return make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 }
 
@@ -501,7 +501,7 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(ParquetReader &reader, idx
         stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
         return stats.ToUnique();
     }
-    return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns);
+    return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, reader.parquet_options.can_have_nan);
 }
 
 ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat,
@@ -1052,7 +1052,8 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t i
         *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
     } else if (!is_generated_column && has_min_max &&
                (column_reader.Type().id() == LogicalTypeId::FLOAT ||
-                column_reader.Type().id() == LogicalTypeId::DOUBLE)) {
+                column_reader.Type().id() == LogicalTypeId::DOUBLE) &&
+               parquet_options.can_have_nan) {
         // floating point columns can have NaN values in addition to the min/max bounds defined in the file
         // in order to do optimal pruning - we prune based on the [min, max] of the file followed by pruning
         // based on nan
@@ -1116,7 +1117,7 @@ void ParquetReader::InitializeScan(ClientContext &context, ParquetReaderScanStat
             state.prefetch_mode = false;
         }
 
-        state.file_handle = fs.OpenFile(file_handle->GetPath(), flags);
+        state.file_handle = fs.OpenFile(file, flags);
     }
     state.adaptive_filter.reset();
     state.scan_filters.clear();
@@ -304,7 +304,8 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, cons
 }
 
 unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
-                                                                             const vector<ColumnChunk> &columns) {
+                                                                             const vector<ColumnChunk> &columns,
+                                                                             bool can_have_nan) {
 
     // Not supported types
     auto &type = schema.type;
@@ -320,7 +321,7 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
     // Recurse into child readers
     for (idx_t i = 0; i < schema.children.size(); i++) {
         auto &child_schema = schema.children[i];
-        auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns);
+        auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
         StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
     }
     row_group_stats = struct_stats.ToUnique();
@@ -363,7 +364,16 @@
         break;
     case LogicalTypeId::FLOAT:
     case LogicalTypeId::DOUBLE:
-        row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
+        if (can_have_nan) {
+            // Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it
+            // does, we create stats without an upper max value, as NaN compares larger than anything else.
+            row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
+        } else {
+            // Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains
+            // NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is
+            // probably the best we can do for now.
+            row_group_stats = CreateNumericStats(type, schema, parquet_stats);
+        }
         break;
     case LogicalTypeId::VARCHAR: {
         auto string_stats = StringStats::CreateEmpty(type);
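DuckDB's total order places NaN above every other value, so once a column may contain NaN, a finite max from the file can no longer rule out predicates like `x > threshold`. A simplified sketch of the zone-map consequence (illustrative, not DuckDB's actual pruning code):

    // Zone-map test for "x > threshold" against a row group's [min, max] stats.
    static bool CanPruneGreaterThan(double stats_max, double threshold, bool can_have_nan) {
        if (can_have_nan) {
            // A NaN in the data sorts above any threshold, so the group may still match.
            return false;
        }
        return stats_max <= threshold;
    }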
@@ -376,7 +376,7 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
     }
 
     TCompactProtocolFactoryT<MyTransport> tproto_factory;
-    protocol = tproto_factory.getProtocol(std::make_shared<MyTransport>(*writer));
+    protocol = tproto_factory.getProtocol(duckdb_base_std::make_shared<MyTransport>(*writer));
 
     file_meta_data.num_rows = 0;
     file_meta_data.version = 1;
@@ -46,7 +46,7 @@ double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t siz
 }
 
 unique_ptr<ColumnReader> ParquetDecimalUtils::CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema) {
-    if (schema.type_length > 0) {
+    if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         return CreateDecimalReaderInternal<true>(reader, schema);
     } else {
         return CreateDecimalReaderInternal<false>(reader, schema);
@@ -11,7 +11,7 @@ namespace duckdb {
 StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
     : ColumnReader(reader, schema) {
     fixed_width_string_length = 0;
-    if (schema.type_length > 0) {
+    if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         fixed_width_string_length = schema.type_length;
     }
 }
@@ -118,12 +118,21 @@ static bool TypeHasExactRowCount(const LogicalType &type) {
 }
 
 idx_t StructColumnReader::GroupRowsAvailable() {
-    for (idx_t i = 0; i < child_readers.size(); i++) {
-        if (TypeHasExactRowCount(child_readers[i]->Type())) {
-            return child_readers[i]->GroupRowsAvailable();
+    for (auto &child : child_readers) {
+        if (!child) {
+            continue;
+        }
+        if (TypeHasExactRowCount(child->Type())) {
+            return child->GroupRowsAvailable();
+        }
+    }
+    for (auto &child : child_readers) {
+        if (!child) {
+            continue;
         }
+        return child->GroupRowsAvailable();
     }
-    return child_readers[0]->GroupRowsAvailable();
+    throw InternalException("No projected columns in struct?");
 }
 
 } // namespace duckdb
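The rewritten `GroupRowsAvailable` must tolerate children that were projected out (null entries in `child_readers`): pass one prefers a child whose type guarantees an exact row count, pass two falls back to any projected child, and an empty projection is now an internal error rather than a null dereference. The same two-pass shape in isolation (generic sketch, not DuckDB's classes):

    #include <memory>
    #include <stdexcept>
    #include <vector>

    struct ChildReader {
        bool exact_row_count;
        size_t rows_available;
    };

    static size_t GroupRowsAvailable(const std::vector<std::unique_ptr<ChildReader>> &children) {
        for (auto &child : children) {
            if (child && child->exact_row_count) {
                return child->rows_available; // prefer an exact count
            }
        }
        for (auto &child : children) {
            if (child) {
                return child->rows_available; // fall back to any projected child
            }
        }
        throw std::runtime_error("no projected columns in struct");
    }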
@@ -73,6 +73,7 @@ void ParquetOptionsSerialization::Serialize(Serializer &serializer) const {
     serializer.WritePropertyWithDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", parquet_options.encryption_config, nullptr);
     serializer.WritePropertyWithDefault<bool>(105, "debug_use_openssl", parquet_options.debug_use_openssl, true);
     serializer.WritePropertyWithDefault<idx_t>(106, "explicit_cardinality", parquet_options.explicit_cardinality, 0);
+    serializer.WritePropertyWithDefault<bool>(107, "can_have_nan", parquet_options.can_have_nan, false);
 }
 
 ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
@@ -84,6 +85,7 @@ ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserialize
     deserializer.ReadPropertyWithExplicitDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", result.parquet_options.encryption_config, nullptr);
     deserializer.ReadPropertyWithExplicitDefault<bool>(105, "debug_use_openssl", result.parquet_options.debug_use_openssl, true);
     deserializer.ReadPropertyWithExplicitDefault<idx_t>(106, "explicit_cardinality", result.parquet_options.explicit_cardinality, 0);
+    deserializer.ReadPropertyWithExplicitDefault<bool>(107, "can_have_nan", result.parquet_options.can_have_nan, false);
     return result;
 }