duckdb 1.3.1-dev6.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/core_functions/aggregate/distributive/arg_min_max.cpp +27 -39
- package/src/duckdb/extension/core_functions/aggregate/holistic/quantile.cpp +2 -3
- package/src/duckdb/extension/core_functions/include/core_functions/aggregate/quantile_sort_tree.hpp +1 -1
- package/src/duckdb/extension/core_functions/lambda_functions.cpp +16 -14
- package/src/duckdb/extension/core_functions/scalar/list/list_filter.cpp +3 -2
- package/src/duckdb/extension/core_functions/scalar/list/list_reduce.cpp +46 -10
- package/src/duckdb/extension/core_functions/scalar/list/list_transform.cpp +3 -2
- package/src/duckdb/extension/core_functions/scalar/random/random.cpp +3 -1
- package/src/duckdb/extension/icu/icu-datefunc.cpp +5 -3
- package/src/duckdb/extension/icu/icu-strptime.cpp +6 -1
- package/src/duckdb/extension/icu/icu-timezone.cpp +4 -0
- package/src/duckdb/extension/icu/icu_extension.cpp +7 -2
- package/src/duckdb/extension/icu/include/icu-datefunc.hpp +1 -1
- package/src/duckdb/extension/icu/include/icu-helpers.hpp +1 -1
- package/src/duckdb/extension/icu/third_party/icu/common/uloc.cpp +5 -5
- package/src/duckdb/extension/json/include/json_common.hpp +19 -0
- package/src/duckdb/extension/json/include/json_deserializer.hpp +1 -4
- package/src/duckdb/extension/json/include/json_functions.hpp +4 -4
- package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +38 -17
- package/src/duckdb/extension/json/json_functions/json_table_in_out.cpp +11 -7
- package/src/duckdb/extension/json/json_functions.cpp +4 -4
- package/src/duckdb/extension/json/json_reader.cpp +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +7 -1
- package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp +2 -2
- package/src/duckdb/extension/parquet/include/parquet_dbp_encoder.hpp +2 -2
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +2 -1
- package/src/duckdb/extension/parquet/include/parquet_statistics.hpp +1 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +3 -0
- package/src/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp +3 -1
- package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp +1 -1
- package/src/duckdb/extension/parquet/parquet_crypto.cpp +9 -5
- package/src/duckdb/extension/parquet/parquet_extension.cpp +26 -0
- package/src/duckdb/extension/parquet/parquet_float16.cpp +4 -2
- package/src/duckdb/extension/parquet/parquet_metadata.cpp +3 -3
- package/src/duckdb/extension/parquet/parquet_multi_file_info.cpp +12 -0
- package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -4
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +13 -3
- package/src/duckdb/extension/parquet/parquet_writer.cpp +1 -1
- package/src/duckdb/extension/parquet/reader/decimal_column_reader.cpp +1 -1
- package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +1 -1
- package/src/duckdb/extension/parquet/reader/struct_column_reader.cpp +13 -4
- package/src/duckdb/extension/parquet/serialize_parquet.cpp +2 -0
- package/src/duckdb/src/catalog/catalog.cpp +10 -4
- package/src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp +4 -10
- package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp +1 -2
- package/src/duckdb/src/catalog/catalog_entry/sequence_catalog_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_entry/table_catalog_entry.cpp +2 -2
- package/src/duckdb/src/catalog/catalog_entry/type_catalog_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_search_path.cpp +7 -1
- package/src/duckdb/src/catalog/catalog_set.cpp +21 -1
- package/src/duckdb/src/common/adbc/adbc.cpp +1 -1
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +17 -5
- package/src/duckdb/src/common/arrow/arrow_converter.cpp +23 -15
- package/src/duckdb/src/common/box_renderer.cpp +1 -2
- package/src/duckdb/src/common/enum_util.cpp +4 -3
- package/src/duckdb/src/common/local_file_system.cpp +13 -12
- package/src/duckdb/src/common/multi_file/multi_file_column_mapper.cpp +35 -12
- package/src/duckdb/src/common/multi_file/multi_file_reader.cpp +13 -3
- package/src/duckdb/src/common/string_util.cpp +7 -5
- package/src/duckdb/src/common/tree_renderer/graphviz_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/tree_renderer/html_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/tree_renderer/json_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/tree_renderer/text_tree_renderer.cpp +4 -4
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +1 -1
- package/src/duckdb/src/common/types/uuid.cpp +5 -1
- package/src/duckdb/src/common/types.cpp +28 -0
- package/src/duckdb/src/common/virtual_file_system.cpp +5 -0
- package/src/duckdb/src/execution/column_binding_resolver.cpp +49 -30
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +4 -0
- package/src/duckdb/src/execution/join_hashtable.cpp +10 -7
- package/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp +3 -3
- package/src/duckdb/src/execution/operator/csv_scanner/encode/csv_encoder.cpp +1 -1
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +2 -1
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/skip_scanner.cpp +1 -4
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +53 -1
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +58 -59
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +10 -5
- package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +4 -0
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +18 -8
- package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_attach.cpp +1 -0
- package/src/duckdb/src/execution/physical_plan_generator.cpp +5 -5
- package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +2 -1
- package/src/duckdb/src/function/function.cpp +4 -0
- package/src/duckdb/src/function/scalar/operator/arithmetic.cpp +6 -0
- package/src/duckdb/src/function/scalar/struct/remap_struct.cpp +10 -1
- package/src/duckdb/src/function/table/copy_csv.cpp +1 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
- package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +1 -0
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_table_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/file_buffer.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/helper.hpp +9 -9
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_column_mapper.hpp +3 -5
- package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_reader.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/multi_file/multi_file_states.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/shadow_forbidden_functions.hpp +40 -0
- package/src/duckdb/src/include/duckdb/common/string.hpp +25 -2
- package/src/duckdb/src/include/duckdb/common/types/hugeint.hpp +20 -24
- package/src/duckdb/src/include/duckdb/common/types/uhugeint.hpp +20 -24
- package/src/duckdb/src/include/duckdb/common/types.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/unique_ptr.hpp +34 -8
- package/src/duckdb/src/include/duckdb/execution/column_binding_resolver.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/column_count_scanner.hpp +3 -0
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/encode/csv_encoder.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +15 -3
- package/src/duckdb/src/include/duckdb/function/cast/vector_cast_helpers.hpp +2 -2
- package/src/duckdb/src/include/duckdb/function/copy_function.hpp +7 -3
- package/src/duckdb/src/include/duckdb/function/function.hpp +1 -0
- package/src/duckdb/src/include/duckdb/function/function_binder.hpp +2 -1
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +20 -12
- package/src/duckdb/src/include/duckdb/function/lambda_functions.hpp +4 -3
- package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +3 -1
- package/src/duckdb/src/include/duckdb/logging/log_type.hpp +17 -0
- package/src/duckdb/src/include/duckdb/main/attached_database.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_properties.hpp +22 -6
- package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/database_manager.hpp +4 -1
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +27 -13
- package/src/duckdb/src/include/duckdb/main/secret/secret_manager.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/settings.hpp +11 -0
- package/src/duckdb/src/include/duckdb/optimizer/topn_optimizer.hpp +7 -1
- package/src/duckdb/src/include/duckdb/original/std/locale.hpp +10 -0
- package/src/duckdb/src/include/duckdb/original/std/memory.hpp +12 -0
- package/src/duckdb/src/include/duckdb/original/std/sstream.hpp +11 -0
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +5 -3
- package/src/duckdb/src/include/duckdb/storage/buffer/buffer_pool.hpp +4 -2
- package/src/duckdb/src/logging/log_manager.cpp +1 -0
- package/src/duckdb/src/logging/log_types.cpp +40 -0
- package/src/duckdb/src/main/attached_database.cpp +4 -0
- package/src/duckdb/src/main/client_context.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +1 -0
- package/src/duckdb/src/main/database.cpp +1 -0
- package/src/duckdb/src/main/database_manager.cpp +19 -2
- package/src/duckdb/src/main/extension/extension_helper.cpp +4 -3
- package/src/duckdb/src/main/query_profiler.cpp +2 -2
- package/src/duckdb/src/main/query_result.cpp +1 -1
- package/src/duckdb/src/main/secret/secret_manager.cpp +2 -0
- package/src/duckdb/src/main/settings/autogenerated_settings.cpp +7 -0
- package/src/duckdb/src/main/settings/custom_settings.cpp +106 -34
- package/src/duckdb/src/optimizer/optimizer.cpp +1 -1
- package/src/duckdb/src/optimizer/topn_optimizer.cpp +18 -8
- package/src/duckdb/src/parallel/executor.cpp +5 -0
- package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +1 -1
- package/src/duckdb/src/parser/transform/expression/transform_interval.cpp +5 -1
- package/src/duckdb/src/planner/binder/expression/bind_function_expression.cpp +21 -24
- package/src/duckdb/src/planner/binder/expression/bind_lambda.cpp +10 -8
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +3 -2
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +0 -4
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +3 -0
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +3 -0
- package/src/duckdb/src/planner/expression/bound_cast_expression.cpp +3 -0
- package/src/duckdb/src/planner/expression/bound_columnref_expression.cpp +1 -1
- package/src/duckdb/src/planner/expression/bound_function_expression.cpp +0 -1
- package/src/duckdb/src/planner/expression/bound_reference_expression.cpp +1 -1
- package/src/duckdb/src/planner/expression_binder.cpp +4 -2
- package/src/duckdb/src/planner/logical_operator.cpp +2 -1
- package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +4 -1
- package/src/duckdb/src/storage/buffer/block_handle.cpp +8 -0
- package/src/duckdb/src/storage/buffer/buffer_pool.cpp +44 -18
- package/src/duckdb/src/storage/caching_file_system.cpp +7 -7
- package/src/duckdb/src/storage/standard_buffer_manager.cpp +4 -3
- package/src/duckdb/src/storage/storage_info.cpp +2 -0
- package/src/duckdb/src/storage/wal_replay.cpp +9 -4
- package/src/duckdb/third_party/fmt/include/fmt/format.h +8 -1
- package/src/duckdb/third_party/fsst/libfsst.cpp +4 -3
- package/src/duckdb/third_party/httplib/httplib.hpp +25 -22
- package/src/duckdb/third_party/hyperloglog/sds.cpp +7 -3
- package/src/duckdb/third_party/libpg_query/src_common_keywords.cpp +8 -1
- package/src/duckdb/third_party/re2/re2/filtered_re2.h +8 -2
- package/src/duckdb/third_party/re2/re2/pod_array.h +7 -1
- package/src/duckdb/third_party/re2/re2/re2.cc +6 -2
- package/src/duckdb/third_party/re2/re2/set.cc +1 -1
- package/src/duckdb/third_party/re2/re2/set.h +7 -1
- package/src/duckdb/ub_src_logging.cpp +4 -4
package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp

@@ -171,8 +171,8 @@ ScalarFunctionSet JSONFunctions::GetSerializeSqlFunction() {
 //----------------------------------------------------------------------
 // JSON DESERIALIZE
 //----------------------------------------------------------------------
-static unique_ptr<SelectStatement> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
-    auto doc = JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc);
+static vector<unique_ptr<SelectStatement>> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
+    auto doc = yyjson_doc_ptr(JSONCommon::ReadDocument(input, JSONCommon::READ_FLAG, alc));
     if (!doc) {
         throw ParserException("Could not parse json");
     }
@@ -196,16 +196,22 @@ static unique_ptr<SelectStatement> DeserializeSelectStatement(string_t input, yyjson_alc *alc) {
     if (size == 0) {
         throw ParserException("Error parsing json: no statements");
     }
-    if (size > 1) {
-        throw ParserException("Error parsing json: more than one statement");
-    }
-    auto stmt_json = yyjson_arr_get_first(statements);
-    JsonDeserializer deserializer(stmt_json, doc);
-    auto stmt = SelectStatement::Deserialize(deserializer);
-    if (!stmt->node) {
-        throw ParserException("Error parsing json: no select node found in json");
+
+    vector<unique_ptr<SelectStatement>> result;
+
+    idx_t idx;
+    idx_t max;
+    yyjson_val *stmt_json;
+    yyjson_arr_foreach(statements, idx, max, stmt_json) {
+        JsonDeserializer deserializer(stmt_json, doc);
+        auto stmt = SelectStatement::Deserialize(deserializer);
+        if (!stmt->node) {
+            throw ParserException("Error parsing json: no select node found in json");
+        }
+        result.push_back(std::move(stmt));
     }
-    return stmt;
+
+    return result;
 }
 
 //----------------------------------------------------------------------
@@ -217,8 +223,17 @@ static void JsonDeserializeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
     auto &inputs = args.data[0];
 
     UnaryExecutor::Execute<string_t, string_t>(inputs, result, args.size(), [&](string_t input) {
-        auto stmt = DeserializeSelectStatement(input, alc);
-        return StringVector::AddString(result, stmt->ToString());
+        auto stmts = DeserializeSelectStatement(input, alc);
+        // Combine all statements into a single semicolon separated string
+        string str;
+        for (idx_t i = 0; i < stmts.size(); i++) {
+            if (i > 0) {
+                str += "; ";
+            }
+            str += stmts[i]->ToString();
+        }
+
+        return StringVector::AddString(result, str);
     });
 }
 
@@ -237,8 +252,11 @@ static string ExecuteJsonSerializedSqlPragmaFunction(ClientContext &context, const FunctionParameters &parameters) {
     auto alc = local_state.json_allocator->GetYYAlc();
 
     auto input = parameters.values[0].GetValueUnsafe<string_t>();
-    auto stmt = DeserializeSelectStatement(input, alc);
-    return stmt->ToString();
+    auto stmts = DeserializeSelectStatement(input, alc);
+    if (stmts.size() != 1) {
+        throw BinderException("json_execute_serialized_sql pragma expects exactly one statement");
+    }
+    return stmts[0]->ToString();
 }
 
 PragmaFunctionSet JSONFunctions::GetExecuteJsonSerializedSqlPragmaFunction() {
@@ -268,8 +286,11 @@ struct ExecuteSqlTableFunction {
             throw BinderException("json_execute_serialized_sql cannot execute NULL plan");
         }
         auto serialized = input.inputs[0].GetValueUnsafe<string>();
-        auto stmt = DeserializeSelectStatement(serialized, alc);
-        result->plan = result->con->RelationFromQuery(std::move(stmt));
+        auto stmts = DeserializeSelectStatement(serialized, alc);
+        if (stmts.size() != 1) {
+            throw BinderException("json_execute_serialized_sql expects exactly one statement");
+        }
+        result->plan = result->con->RelationFromQuery(std::move(stmts[0]));
 
         for (auto &col : result->plan->Columns()) {
             return_types.emplace_back(col.Type());
package/src/duckdb/extension/json/json_functions/json_table_in_out.cpp

@@ -124,17 +124,21 @@ struct JSONTableInOutLocalState : LocalTableFunctionState {
         return result;
     }
 
-    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey) {
-
-
-
+    void AddRecursionNode(yyjson_val *val, optional_ptr<yyjson_val> vkey, const optional_idx arr_index) {
+        string str;
+        if (vkey) {
+            str = "." + string(unsafe_yyjson_get_str(vkey.get()), unsafe_yyjson_get_len(vkey.get()));
+        } else if (arr_index.IsValid()) {
+            str = "[" + to_string(arr_index.GetIndex()) + "]";
+        }
+        recursion_nodes.emplace_back(str, val);
     }
 
     JSONAllocator json_allocator;
     yyjson_alc *alc;
 
     string path;
-    size_t len;
+    idx_t len;
     yyjson_doc *doc;
     bool initialized;
 
@@ -269,7 +273,7 @@ static void InitializeLocalState(JSONTableInOutLocalState &lstate, DataChunk &input, ...
         result.AddRow<TYPE>(lstate, nullptr, root);
     }
     if (is_container) {
-        lstate.AddRecursionNode(root, nullptr);
+        lstate.AddRecursionNode(root, nullptr, optional_idx());
     }
 }
 
@@ -283,7 +287,7 @@ static bool JSONTableInOutHandleValue(JSONTableInOutLocalState &lstate, ...
     result.AddRow<TYPE>(lstate, child_key, child_val);
     child_index++; // We finished processing the array element
     if (TYPE == JSONTableInOutType::TREE && (unsafe_yyjson_is_arr(child_val) || unsafe_yyjson_is_obj(child_val))) {
-        lstate.AddRecursionNode(child_val, child_key);
+        lstate.AddRecursionNode(child_val, child_key, idx);
         return true; // Break: We added a recursion node, go depth-first
     }
     if (result.count == STANDARD_VECTOR_SIZE) {
package/src/duckdb/extension/json/json_functions.cpp

@@ -14,7 +14,7 @@ namespace duckdb {
 
 using JSONPathType = JSONCommon::JSONPathType;
 
-JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, size_t &len) {
+JSONPathType JSONReadFunctionData::CheckPath(const Value &path_val, string &path, idx_t &len) {
     if (path_val.IsNull()) {
         throw BinderException("JSON path cannot be NULL");
     }
@@ -60,7 +60,7 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function, ...
     D_ASSERT(bound_function.arguments.size() == 2);
     bool constant = false;
     string path;
-    size_t len = 0;
+    idx_t len = 0;
     JSONPathType path_type = JSONPathType::REGULAR;
     if (arguments[1]->IsFoldable()) {
         const auto path_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
@@ -80,7 +80,7 @@ unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function, ...
     return make_uniq<JSONReadFunctionData>(constant, std::move(path), len, path_type);
 }
 
-JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<size_t> lens_p)
+JSONReadManyFunctionData::JSONReadManyFunctionData(vector<string> paths_p, vector<idx_t> lens_p)
     : paths(std::move(paths_p)), lens(std::move(lens_p)) {
     for (const auto &path : paths) {
         ptrs.push_back(path.c_str());
@@ -107,7 +107,7 @@ unique_ptr<FunctionData> JSONReadManyFunctionData::Bind(ClientContext &context, ...
     }
 
     vector<string> paths;
-    vector<size_t> lens;
+    vector<idx_t> lens;
     auto paths_val = ExpressionExecutor::EvaluateScalar(context, *arguments[1]);
 
     for (auto &path_val : ListValue::GetChildren(paths_val)) {
package/src/duckdb/extension/json/json_reader.cpp

@@ -737,7 +737,7 @@ bool JSONReader::CopyRemainderFromPreviousBuffer(JSONReaderScanState &scan_state) {
     idx_t prev_buffer_size = previous_buffer_handle->buffer_size - previous_buffer_handle->buffer_start;
     auto prev_buffer_ptr = char_ptr_cast(previous_buffer_handle->buffer.get()) + previous_buffer_handle->buffer_size;
     auto prev_object_start = PreviousNewline(prev_buffer_ptr, prev_buffer_size);
-    auto prev_object_size = prev_buffer_ptr - prev_object_start;
+    auto prev_object_size = NumericCast<idx_t>(prev_buffer_ptr - prev_object_start);
 
     D_ASSERT(scan_state.buffer_offset == options.maximum_object_size);
     if (prev_object_size > scan_state.buffer_offset) {
package/src/duckdb/extension/parquet/column_reader.cpp

@@ -412,7 +412,7 @@ void ColumnReader::DecompressInternal(CompressionCodec::type codec, const_data_ptr_t src, ...
     }
 
     default: {
-        std::stringstream codec_name;
+        duckdb::stringstream codec_name;
         codec_name << codec;
         throw std::runtime_error("Unsupported compression codec \"" + codec_name.str() +
                                  "\". Supported options are uncompressed, brotli, gzip, lz4_raw, snappy or zstd");
@@ -713,6 +713,12 @@ void ColumnReader::ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out) {
 
     while (to_skip > 0) {
         auto skip_now = ReadPageHeaders(to_skip);
+        if (page_is_filtered_out) {
+            // the page has been filtered out entirely - skip
+            page_rows_available -= skip_now;
+            to_skip -= skip_now;
+            continue;
+        }
         const auto all_valid = PrepareRead(skip_now, define_out, repeat_out, 0);
 
         const auto define_ptr = all_valid ? nullptr : static_cast<uint8_t *>(define_out);
package/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp

@@ -23,7 +23,7 @@ public:
     template <typename T>
     void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
         if (buffer_.len % sizeof(T) != 0) {
-            std::stringstream error;
+            duckdb::stringstream error;
             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
             throw std::runtime_error(error.str());
@@ -44,7 +44,7 @@ public:
     template <typename T>
     void Skip(uint32_t batch_size) {
         if (buffer_.len % sizeof(T) != 0) {
-            std::stringstream error;
+            duckdb::stringstream error;
             error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
                   << ") should be a multiple of the type size (" << sizeof(T) << ")";
             throw std::runtime_error(error.str());
package/src/duckdb/extension/parquet/include/parquet_dbp_encoder.hpp

@@ -155,8 +155,8 @@ private:
     int64_t verification_data[NUMBER_OF_VALUES_IN_A_MINIBLOCK];
     ByteBuffer byte_buffer(data_ptr_cast(data_packed), write_size);
     bitpacking_width_t bitpack_pos = 0;
-    ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, verification_data,
-                                  width);
+    ParquetDecodeUtils::BitUnpack(byte_buffer, bitpack_pos, reinterpret_cast<uint64_t *>(verification_data),
+                                  NUMBER_OF_VALUES_IN_A_MINIBLOCK, width);
     for (idx_t i = 0; i < NUMBER_OF_VALUES_IN_A_MINIBLOCK; i++) {
         D_ASSERT(src[i] == verification_data[i]);
     }
package/src/duckdb/extension/parquet/include/parquet_reader.hpp

@@ -62,7 +62,7 @@ struct ParquetReaderScanState {
     idx_t group_offset;
     unique_ptr<CachingFileHandle> file_handle;
     unique_ptr<ColumnReader> root_reader;
-    std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
+    duckdb_base_std::unique_ptr<duckdb_apache::thrift::protocol::TProtocol> thrift_file_proto;
 
     bool finished;
     SelectionVector sel;
@@ -108,6 +108,7 @@ struct ParquetOptions {
 
     vector<ParquetColumnDefinition> schema;
     idx_t explicit_cardinality = 0;
+    bool can_have_nan = false; // if floats or doubles can contain NaN values
 };
 
 struct ParquetOptionsSerialization {
package/src/duckdb/extension/parquet/include/parquet_statistics.hpp

@@ -27,7 +27,7 @@ class ResizeableBuffer;
 struct ParquetStatisticsUtils {
 
     static unique_ptr<BaseStatistics> TransformColumnStatistics(const ParquetColumnSchema &reader,
-                                                                const vector<ColumnChunk> &columns);
+                                                                const vector<ColumnChunk> &columns, bool can_have_nan);
 
     static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats);
 
package/src/duckdb/extension/parquet/include/parquet_writer.hpp

@@ -134,6 +134,9 @@ public:
     ParquetVersion GetParquetVersion() const {
         return parquet_version;
     }
+    const string &GetFileName() const {
+        return file_name;
+    }
 
     uint32_t Write(const duckdb_apache::thrift::TBase &object);
     uint32_t WriteData(const const_data_ptr_t buffer, const uint32_t buffer_size);
package/src/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp

@@ -138,7 +138,9 @@ struct ParquetBaseStringOperator : public BaseParquetOperator {
 
     template <class SRC, class TGT>
     static idx_t GetRowSize(const Vector &vector, idx_t index) {
-        return FlatVector::GetData<string_t>(vector)[index].GetSize();
+        // This needs to add the 4 bytes (just like WriteSize) otherwise we underestimate and we have to realloc
+        // This seriously harms performance, mostly by making it very inconsistent (see internal issue #4990)
+        return sizeof(uint32_t) + FlatVector::GetData<string_t>(vector)[index].GetSize();
     }
 };
 
package/src/duckdb/extension/parquet/include/writer/templated_column_writer.hpp

@@ -403,7 +403,7 @@ private:
         break;
     }
     case duckdb_parquet::Encoding::BYTE_STREAM_SPLIT: {
-        if (page_state.bss_initialized) {
+        if (!page_state.bss_initialized) {
             page_state.bss_encoder.BeginWrite(BufferAllocator::Get(writer.GetContext()));
             page_state.bss_initialized = true;
         }
package/src/duckdb/extension/parquet/parquet_crypto.cpp

@@ -300,14 +300,15 @@
 uint32_t ParquetCrypto::Read(TBase &object, TProtocol &iprot, const string &key,
                              const EncryptionUtil &encryption_util_p) {
     TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
-    auto dprot = tproto_factory.getProtocol(std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
+    auto dprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
     auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
 
     // We have to read the whole thing otherwise thrift throws an error before we realize we're decryption is wrong
     auto all = dtrans.ReadAll();
     TCompactProtocolFactoryT<SimpleReadTransport> tsimple_proto_factory;
     auto simple_prot =
-        tsimple_proto_factory.getProtocol(std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
+        tsimple_proto_factory.getProtocol(duckdb_base_std::make_shared<SimpleReadTransport>(all.get(), all.GetSize()));
 
     // Read the object
     object.read(simple_prot.get());
@@ -319,7 +320,8 @@ uint32_t ParquetCrypto::Write(const TBase &object, TProtocol &oprot, const string &key,
                               const EncryptionUtil &encryption_util_p) {
     // Create encryption protocol
     TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
-    auto eprot = tproto_factory.getProtocol(std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
+    auto eprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
     auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
 
     // Write the object in memory
@@ -333,7 +335,8 @@ uint32_t ParquetCrypto::ReadData(TProtocol &iprot, const data_ptr_t buffer, const uint32_t buffer_size,
                                  const string &key, const EncryptionUtil &encryption_util_p) {
     // Create decryption protocol
     TCompactProtocolFactoryT<DecryptionTransport> tproto_factory;
-    auto dprot = tproto_factory.getProtocol(std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
+    auto dprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<DecryptionTransport>(iprot, key, encryption_util_p));
     auto &dtrans = reinterpret_cast<DecryptionTransport &>(*dprot->getTransport());
 
     // Read buffer
@@ -348,7 +351,8 @@ uint32_t ParquetCrypto::WriteData(TProtocol &oprot, const const_data_ptr_t buffer, const uint32_t buffer_size,
     // FIXME: we know the size upfront so we could do a streaming write instead of this
     // Create encryption protocol
     TCompactProtocolFactoryT<EncryptionTransport> tproto_factory;
-    auto eprot = tproto_factory.getProtocol(std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
+    auto eprot =
+        tproto_factory.getProtocol(duckdb_base_std::make_shared<EncryptionTransport>(oprot, key, encryption_util_p));
     auto &etrans = reinterpret_cast<EncryptionTransport &>(*eprot->getTransport());
 
     // Write the data in memory
package/src/duckdb/extension/parquet/parquet_extension.cpp

@@ -243,6 +243,18 @@ struct ParquetWriteBindData : public TableFunctionData {
 
 struct ParquetWriteGlobalState : public GlobalFunctionData {
     unique_ptr<ParquetWriter> writer;
+    optional_ptr<const PhysicalOperator> op;
+
+    void LogFlushingRowGroup(const ColumnDataCollection &buffer, const string &reason) {
+        if (!op) {
+            return;
+        }
+        DUCKDB_LOG(writer->GetContext(), PhysicalOperatorLogType, *op, "ParquetWriter", "FlushRowGroup",
+                   {{"file", writer->GetFileName()},
+                    {"rows", to_string(buffer.Count())},
+                    {"size", to_string(buffer.SizeInBytes())},
+                    {"reason", reason}});
+    }
 
     mutex lock;
     unique_ptr<ColumnDataCollection> combine_buffer;
@@ -446,6 +458,9 @@ void ParquetWriteSink(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, ...
 
     if (local_state.buffer.Count() >= bind_data.row_group_size ||
         local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes) {
+        const string reason =
+            local_state.buffer.Count() >= bind_data.row_group_size ? "ROW_GROUP_SIZE" : "ROW_GROUP_SIZE_BYTES";
+        global_state.LogFlushingRowGroup(local_state.buffer, reason);
         // if the chunk collection exceeds a certain size (rows/bytes) we flush it to the parquet file
         local_state.append_state.current_chunk_state.handles.clear();
         global_state.writer->Flush(local_state.buffer);
@@ -462,6 +477,7 @@ void ParquetWriteCombine(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, ...
     if (local_state.buffer.Count() >= bind_data.row_group_size / 2 ||
         local_state.buffer.SizeInBytes() >= bind_data.row_group_size_bytes / 2) {
         // local state buffer is more than half of the row_group_size(_bytes), just flush it
+        global_state.LogFlushingRowGroup(local_state.buffer, "Combine");
         global_state.writer->Flush(local_state.buffer);
         return;
     }
@@ -475,6 +491,7 @@ void ParquetWriteCombine(ExecutionContext &context, FunctionData &bind_data_p, GlobalFunctionData &gstate, ...
     // After combining, the combine buffer is more than half of the row_group_size(_bytes), so we flush
     auto owned_combine_buffer = std::move(global_state.combine_buffer);
     guard.unlock();
+    global_state.LogFlushingRowGroup(*owned_combine_buffer, "Combine");
     // Lock free, of course
     global_state.writer->Flush(*owned_combine_buffer);
 }
@@ -489,6 +506,7 @@ void ParquetWriteFinalize(ClientContext &context, FunctionData &bind_data, GlobalFunctionData &gstate) {
     auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
     // flush the combine buffer (if it's there)
     if (global_state.combine_buffer) {
+        global_state.LogFlushingRowGroup(*global_state.combine_buffer, "Finalize");
         global_state.writer->Flush(*global_state.combine_buffer);
     }
 
@@ -691,6 +709,13 @@ CopyFunctionExecutionMode ParquetWriteExecutionMode(bool preserve_insertion_order, ...
     return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE;
 }
 //===--------------------------------------------------------------------===//
+// Initialize Logger
+//===--------------------------------------------------------------------===//
+void ParquetWriteInitializeOperator(GlobalFunctionData &gstate, const PhysicalOperator &op) {
+    auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
+    global_state.op = &op;
+}
+//===--------------------------------------------------------------------===//
 // Prepare Batch
 //===--------------------------------------------------------------------===//
 struct ParquetWriteBatchData : public PreparedBatchData {
@@ -889,6 +914,7 @@ void ParquetExtension::Load(DuckDB &db) {
     function.copy_to_combine = ParquetWriteCombine;
     function.copy_to_finalize = ParquetWriteFinalize;
     function.execution_mode = ParquetWriteExecutionMode;
+    function.initialize_operator = ParquetWriteInitializeOperator;
     function.copy_from_bind = MultiFileFunction<ParquetMultiFileInfo>::MultiFileBindCopy;
     function.copy_from_function = scan_fun.functions[0];
     function.prepare_batch = ParquetWritePrepareBatch;
package/src/duckdb/extension/parquet/parquet_float16.cpp

@@ -11,7 +11,9 @@ float Float16ToFloat32(const uint16_t &float16_value) {
     uint32_t sign = float16_value >> 15;
     uint32_t exponent = (float16_value >> 10) & 0x1F;
     uint32_t fraction = (float16_value & 0x3FF);
-    uint32_t float32_value;
+    // Avoid strict aliasing issues and compiler warnings
+    uint32_t float32_value = 0;
+
     if (exponent == 0) {
         if (fraction == 0) {
             // zero
@@ -39,7 +41,7 @@ float Float16ToFloat32(const uint16_t &float16_value) {
         float32_value = (sign << 31) | ((exponent + (127 - 15)) << 23) | (fraction << 13);
     }
 
-    return *reinterpret_cast<float *>(&float32_value);
+    return Load<float>(const_data_ptr_cast(&float32_value));
 }
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/parquet_metadata.cpp

@@ -63,14 +63,14 @@ public:
 
     template <class T>
     string ConvertParquetElementToString(T &&entry) {
-        std::stringstream ss;
+        duckdb::stringstream ss;
         ss << entry;
         return ss.str();
     }
 
     template <class T>
     string PrintParquetElementToString(T &&entry) {
-        std::stringstream ss;
+        duckdb::stringstream ss;
         entry.printTo(ss);
         return ss.str();
     }
@@ -652,7 +652,7 @@ void ParquetMetaDataOperatorData::ExecuteBloomProbe(ClientContext &context, ...
     }
 
     auto &allocator = Allocator::DefaultAllocator();
-    auto transport = std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
+    auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(reader->GetHandle(), false);
     auto protocol =
         make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 
package/src/duckdb/extension/parquet/parquet_multi_file_info.cpp

@@ -318,6 +318,7 @@ TableFunctionSet ParquetScanFunction::GetFunctionSet() {
     table_function.named_parameters["schema"] = LogicalTypeId::ANY;
     table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY;
     table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR;
+    table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN;
     table_function.statistics = MultiFileFunction<ParquetMultiFileInfo>::MultiFileScanStats;
     table_function.serialize = ParquetScanSerialize;
     table_function.deserialize = ParquetScanDeserialize;
@@ -365,6 +366,13 @@ bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string &key, ...
         options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]);
         return true;
     }
+    if (key == "can_have_nan") {
+        if (values.size() != 1) {
+            throw BinderException("Parquet can_have_nan cannot be empty!");
+        }
+        options.can_have_nan = GetBooleanArgument(key, values);
+        return true;
+    }
     return false;
 }
 
@@ -393,6 +401,10 @@ bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &original_key, ...
         options.debug_use_openssl = BooleanValue::Get(val);
         return true;
     }
+    if (key == "can_have_nan") {
+        options.can_have_nan = BooleanValue::Get(val);
+        return true;
+    }
     if (key == "schema") {
         // Argument is a map that defines the schema
         const auto &schema_value = val;
package/src/duckdb/extension/parquet/parquet_reader.cpp

@@ -48,7 +48,7 @@ using duckdb_parquet::Type;
 
 static unique_ptr<duckdb_apache::thrift::protocol::TProtocol> CreateThriftFileProtocol(CachingFileHandle &file_handle,
                                                                                        bool prefetch_mode) {
-    auto transport = std::make_shared<ThriftFileTransport>(file_handle, prefetch_mode);
+    auto transport = duckdb_base_std::make_shared<ThriftFileTransport>(file_handle, prefetch_mode);
     return make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 }
 
@@ -501,7 +501,7 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(ParquetReader &reader, ...
         stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
         return stats.ToUnique();
     }
-    return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns);
+    return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, reader.parquet_options.can_have_nan);
 }
 
 ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat,
@@ -1052,7 +1052,8 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, ...
             *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
     } else if (!is_generated_column && has_min_max &&
                (column_reader.Type().id() == LogicalTypeId::FLOAT ||
-                column_reader.Type().id() == LogicalTypeId::DOUBLE)) {
+                column_reader.Type().id() == LogicalTypeId::DOUBLE) &&
+               parquet_options.can_have_nan) {
         // floating point columns can have NaN values in addition to the min/max bounds defined in the file
         // in order to do optimal pruning - we prune based on the [min, max] of the file followed by pruning
         // based on nan
@@ -1116,7 +1117,7 @@ void ParquetReader::InitializeScan(ClientContext &context, ParquetReaderScanState &state, ...
         state.prefetch_mode = false;
     }
 
-        state.file_handle = fs.OpenFile( ...
+        state.file_handle = fs.OpenFile(file, flags);
     }
     state.adaptive_filter.reset();
     state.scan_filters.clear();
package/src/duckdb/extension/parquet/parquet_statistics.cpp

@@ -304,7 +304,8 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, ...
     }
 
 unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
-                                                                             const vector<ColumnChunk> &columns) {
+                                                                             const vector<ColumnChunk> &columns,
+                                                                             bool can_have_nan) {
 
     // Not supported types
     auto &type = schema.type;
@@ -320,7 +321,7 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema, ...
     // Recurse into child readers
     for (idx_t i = 0; i < schema.children.size(); i++) {
         auto &child_schema = schema.children[i];
-        auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns);
+        auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
         StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
     }
     row_group_stats = struct_stats.ToUnique();
@@ -363,7 +364,16 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema, ...
         break;
     case LogicalTypeId::FLOAT:
     case LogicalTypeId::DOUBLE:
-        row_group_stats = CreateNumericStats(type, schema, parquet_stats);
+        if (can_have_nan) {
+            // Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it
+            // does, we create stats without an upper max value, as NaN compares larger than anything else.
+            row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
+        } else {
+            // Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains
+            // NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is
+            // probably the best we can do for now.
+            row_group_stats = CreateNumericStats(type, schema, parquet_stats);
+        }
         break;
     case LogicalTypeId::VARCHAR: {
         auto string_stats = StringStats::CreateEmpty(type);
package/src/duckdb/extension/parquet/parquet_writer.cpp

@@ -376,7 +376,7 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file_name_p, ...
     }
 
     TCompactProtocolFactoryT<MyTransport> tproto_factory;
-    protocol = tproto_factory.getProtocol(std::make_shared<MyTransport>(*writer));
+    protocol = tproto_factory.getProtocol(duckdb_base_std::make_shared<MyTransport>(*writer));
 
     file_meta_data.num_rows = 0;
     file_meta_data.version = 1;
package/src/duckdb/extension/parquet/reader/decimal_column_reader.cpp

@@ -46,7 +46,7 @@ double ParquetDecimalUtils::ReadDecimalValue(const_data_ptr_t pointer, idx_t size, ...
 }
 
 unique_ptr<ColumnReader> ParquetDecimalUtils::CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema) {
-    if (schema. ...
+    if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         return CreateDecimalReaderInternal<true>(reader, schema);
     } else {
         return CreateDecimalReaderInternal<false>(reader, schema);
package/src/duckdb/extension/parquet/reader/string_column_reader.cpp

@@ -11,7 +11,7 @@ namespace duckdb {
 StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
     : ColumnReader(reader, schema) {
     fixed_width_string_length = 0;
-    if (schema. ...
+    if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         fixed_width_string_length = schema.type_length;
     }
 }
package/src/duckdb/extension/parquet/reader/struct_column_reader.cpp

@@ -118,12 +118,21 @@ static bool TypeHasExactRowCount(const LogicalType &type) {
 }
 
 idx_t StructColumnReader::GroupRowsAvailable() {
-    for (auto &child : child_readers) {
-        if (TypeHasExactRowCount(child->Type())) {
-            return child->GroupRowsAvailable();
+    for (auto &child : child_readers) {
+        if (!child) {
+            continue;
+        }
+        if (TypeHasExactRowCount(child->Type())) {
+            return child->GroupRowsAvailable();
+        }
+    }
+    for (auto &child : child_readers) {
+        if (!child) {
+            continue;
         }
+        return child->GroupRowsAvailable();
     }
-    return child_readers[0]->GroupRowsAvailable();
+    throw InternalException("No projected columns in struct?");
 }
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/serialize_parquet.cpp

@@ -73,6 +73,7 @@ void ParquetOptionsSerialization::Serialize(Serializer &serializer) const {
     serializer.WritePropertyWithDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", parquet_options.encryption_config, nullptr);
     serializer.WritePropertyWithDefault<bool>(105, "debug_use_openssl", parquet_options.debug_use_openssl, true);
    serializer.WritePropertyWithDefault<idx_t>(106, "explicit_cardinality", parquet_options.explicit_cardinality, 0);
+    serializer.WritePropertyWithDefault<bool>(107, "can_have_nan", parquet_options.can_have_nan, false);
 }
 
 ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
@@ -84,6 +85,7 @@ ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
     deserializer.ReadPropertyWithExplicitDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", result.parquet_options.encryption_config, nullptr);
     deserializer.ReadPropertyWithExplicitDefault<bool>(105, "debug_use_openssl", result.parquet_options.debug_use_openssl, true);
     deserializer.ReadPropertyWithExplicitDefault<idx_t>(106, "explicit_cardinality", result.parquet_options.explicit_cardinality, 0);
+    deserializer.ReadPropertyWithExplicitDefault<bool>(107, "can_have_nan", result.parquet_options.can_have_nan, false);
     return result;
 }