duckdb 1.4.0 → 1.4.1-dev2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/core_functions/scalar/generic/current_setting.cpp +1 -4
- package/src/duckdb/extension/icu/icu-strptime.cpp +2 -1
- package/src/duckdb/extension/json/include/json_common.hpp +2 -4
- package/src/duckdb/extension/json/json_functions.cpp +5 -1
- package/src/duckdb/extension/parquet/column_writer.cpp +31 -21
- package/src/duckdb/extension/parquet/geo_parquet.cpp +21 -6
- package/src/duckdb/extension/parquet/include/column_writer.hpp +2 -2
- package/src/duckdb/extension/parquet/include/geo_parquet.hpp +28 -1
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +7 -2
- package/src/duckdb/extension/parquet/include/reader/string_column_reader.hpp +13 -0
- package/src/duckdb/extension/parquet/include/writer/array_column_writer.hpp +4 -0
- package/src/duckdb/extension/parquet/parquet_extension.cpp +56 -1
- package/src/duckdb/extension/parquet/parquet_reader.cpp +4 -1
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +5 -7
- package/src/duckdb/extension/parquet/parquet_writer.cpp +15 -8
- package/src/duckdb/extension/parquet/reader/string_column_reader.cpp +17 -4
- package/src/duckdb/extension/parquet/writer/array_column_writer.cpp +22 -28
- package/src/duckdb/extension/parquet/writer/primitive_column_writer.cpp +17 -5
- package/src/duckdb/extension/parquet/writer/struct_column_writer.cpp +3 -2
- package/src/duckdb/src/catalog/catalog_search_path.cpp +2 -2
- package/src/duckdb/src/catalog/catalog_set.cpp +1 -2
- package/src/duckdb/src/common/enum_util.cpp +20 -0
- package/src/duckdb/src/common/file_system.cpp +0 -30
- package/src/duckdb/src/common/sorting/sort.cpp +25 -6
- package/src/duckdb/src/common/sorting/sorted_run_merger.cpp +1 -0
- package/src/duckdb/src/common/string_util.cpp +24 -0
- package/src/duckdb/src/common/virtual_file_system.cpp +59 -10
- package/src/duckdb/src/execution/index/art/art_merger.cpp +0 -3
- package/src/duckdb/src/execution/index/art/prefix.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +1 -1
- package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +2 -2
- package/src/duckdb/src/execution/operator/schema/physical_attach.cpp +1 -1
- package/src/duckdb/src/execution/physical_plan/plan_asof_join.cpp +3 -3
- package/src/duckdb/src/function/table/system/duckdb_connection_count.cpp +45 -0
- package/src/duckdb/src/function/table/system/duckdb_settings.cpp +11 -1
- package/src/duckdb/src/function/table/system_functions.cpp +1 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/virtual_file_system.hpp +4 -1
- package/src/duckdb/src/include/duckdb/function/scalar/variant_functions.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
- package/src/duckdb/src/include/duckdb/logging/log_storage.hpp +6 -6
- package/src/duckdb/src/include/duckdb/logging/log_type.hpp +26 -3
- package/src/duckdb/src/include/duckdb/main/attached_database.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +0 -1
- package/src/duckdb/src/include/duckdb/main/connection_manager.hpp +0 -1
- package/src/duckdb/src/include/duckdb/main/database_file_path_manager.hpp +12 -1
- package/src/duckdb/src/include/duckdb/main/database_manager.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/relation/create_table_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/create_view_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/delete_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/explain_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/insert_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/query_relation.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/relation/update_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/write_csv_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation/write_parquet_relation.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/relation.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/secret/secret.hpp +3 -1
- package/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp +3 -2
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +62 -3
- package/src/duckdb/src/include/duckdb/planner/expression_binder/lateral_binder.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/operator/logical_cte.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_dependent_join.hpp +3 -3
- package/src/duckdb/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/subquery/has_correlated_expressions.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/subquery/rewrite_cte_scan.hpp +2 -2
- package/src/duckdb/src/include/duckdb/planner/tableref/bound_joinref.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/compression/alp/alp_analyze.hpp +6 -1
- package/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_analyze.hpp +5 -1
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +9 -7
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/table/array_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +6 -6
- package/src/duckdb/src/include/duckdb/storage/table/list_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +5 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_id_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/standard_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/struct_column_data.hpp +4 -4
- package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +2 -2
- package/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +2 -1
- package/src/duckdb/src/include/duckdb/transaction/update_info.hpp +4 -1
- package/src/duckdb/src/include/duckdb/transaction/wal_write_state.hpp +1 -1
- package/src/duckdb/src/logging/log_types.cpp +63 -0
- package/src/duckdb/src/main/attached_database.cpp +16 -3
- package/src/duckdb/src/main/client_context.cpp +27 -19
- package/src/duckdb/src/main/connection.cpp +2 -5
- package/src/duckdb/src/main/database_file_path_manager.cpp +23 -6
- package/src/duckdb/src/main/database_manager.cpp +18 -3
- package/src/duckdb/src/main/http/http_util.cpp +3 -1
- package/src/duckdb/src/main/relation/create_table_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/create_view_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/delete_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/explain_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/insert_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/query_relation.cpp +4 -0
- package/src/duckdb/src/main/relation/update_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/write_csv_relation.cpp +8 -0
- package/src/duckdb/src/main/relation/write_parquet_relation.cpp +8 -0
- package/src/duckdb/src/main/relation.cpp +2 -2
- package/src/duckdb/src/optimizer/filter_combiner.cpp +7 -0
- package/src/duckdb/src/optimizer/filter_pushdown.cpp +9 -3
- package/src/duckdb/src/optimizer/pushdown/pushdown_get.cpp +4 -1
- package/src/duckdb/src/optimizer/rule/comparison_simplification.cpp +3 -7
- package/src/duckdb/src/parser/statement/relation_statement.cpp +1 -4
- package/src/duckdb/src/parser/transform/statement/transform_create_function.cpp +2 -0
- package/src/duckdb/src/planner/binder/query_node/plan_subquery.cpp +8 -6
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +1 -5
- package/src/duckdb/src/planner/binder/statement/bind_merge_into.cpp +10 -2
- package/src/duckdb/src/planner/binder/statement/bind_pragma.cpp +20 -3
- package/src/duckdb/src/planner/binder/tableref/bind_pivot.cpp +8 -3
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -2
- package/src/duckdb/src/planner/binder.cpp +2 -2
- package/src/duckdb/src/planner/expression_binder/lateral_binder.cpp +9 -13
- package/src/duckdb/src/planner/expression_binder/table_function_binder.cpp +4 -0
- package/src/duckdb/src/planner/expression_binder.cpp +3 -1
- package/src/duckdb/src/planner/operator/logical_dependent_join.cpp +2 -2
- package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +12 -14
- package/src/duckdb/src/planner/subquery/has_correlated_expressions.cpp +1 -1
- package/src/duckdb/src/planner/subquery/rewrite_cte_scan.cpp +2 -2
- package/src/duckdb/src/storage/compression/bitpacking.cpp +1 -2
- package/src/duckdb/src/storage/data_table.cpp +2 -2
- package/src/duckdb/src/storage/local_storage.cpp +1 -1
- package/src/duckdb/src/storage/metadata/metadata_manager.cpp +67 -25
- package/src/duckdb/src/storage/statistics/string_stats.cpp +8 -0
- package/src/duckdb/src/storage/table/array_column_data.cpp +6 -5
- package/src/duckdb/src/storage/table/column_data.cpp +23 -9
- package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +15 -1
- package/src/duckdb/src/storage/table/list_column_data.cpp +5 -4
- package/src/duckdb/src/storage/table/row_group.cpp +8 -8
- package/src/duckdb/src/storage/table/row_group_collection.cpp +12 -8
- package/src/duckdb/src/storage/table/row_id_column_data.cpp +5 -4
- package/src/duckdb/src/storage/table/standard_column_data.cpp +9 -8
- package/src/duckdb/src/storage/table/struct_column_data.cpp +10 -9
- package/src/duckdb/src/storage/table/update_segment.cpp +12 -10
- package/src/duckdb/src/transaction/commit_state.cpp +18 -0
- package/src/duckdb/src/transaction/duck_transaction.cpp +2 -10
- package/src/duckdb/src/transaction/wal_write_state.cpp +5 -5
- package/src/duckdb/third_party/httplib/httplib.hpp +6 -1
- package/src/duckdb/ub_src_function_table_system.cpp +2 -0
package/package.json
CHANGED
-  "version": "1.4.0",
+  "version": "1.4.1-dev2.0",

package/src/duckdb/extension/core_functions/scalar/generic/current_setting.cpp
CHANGED
@@ -53,10 +53,7 @@ unique_ptr<FunctionData> CurrentSettingBind(ClientContext &context, ScalarFuncti
     if (!context.TryGetCurrentSetting(key, val)) {
         auto extension_name = Catalog::AutoloadExtensionByConfigName(context, key);
         // If autoloader didn't throw, the config is now available
-        if (!context.TryGetCurrentSetting(key, val)) {
-            throw InternalException("Extension %s did not provide the '%s' config setting",
-                                    extension_name.ToStdString(), key);
-        }
+        context.TryGetCurrentSetting(key, val);
     }
 
     bound_function.return_type = val.type();
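The net effect: if an autoloaded extension still does not register the requested setting, binding no longer throws an InternalException; the lookup is simply retried and the value stays as-is. A minimal sketch of how this bind path is reached (not part of the diff; assumes the DuckDB C++ API from duckdb.hpp is linked):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    // Binding current_setting() runs CurrentSettingBind above; an unknown key
    // may first trigger extension autoloading by config name.
    auto result = con.Query("SELECT current_setting('threads');");
    result->Print();
    return 0;
}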
package/src/duckdb/extension/icu/icu-strptime.cpp
CHANGED
@@ -221,8 +221,9 @@ struct ICUStrptime : public ICUDateFunc {
             if (!error.empty()) {
                 throw InvalidInputException("Failed to parse format specifier %s: %s", format_string, error);
             }
-            // If any format has UTC offsets, then we have to produce TSTZ
+            // If any format has UTC offsets or names, then we have to produce TSTZ
             has_tz = has_tz || format.HasFormatSpecifier(StrTimeSpecifier::TZ_NAME);
+            has_tz = has_tz || format.HasFormatSpecifier(StrTimeSpecifier::UTC_OFFSET);
             formats.emplace_back(format);
         }
         if (has_tz) {
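The added line means a %z (UTC offset) specifier now forces a TIMESTAMP WITH TIME ZONE result, as %Z (timezone name) already did. A hedged usage sketch (assumes the ICU extension is available, since strptime's TSTZ handling lives there):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    // With this diff, the %z offset alone is enough to produce TIMESTAMPTZ.
    auto result = con.Query("SELECT strptime('2024-06-01 10:00:00+0200', '%Y-%m-%d %H:%M:%S%z');");
    result->Print();
    return 0;
}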
package/src/duckdb/extension/json/include/json_common.hpp
CHANGED
@@ -13,6 +13,7 @@
 #include "duckdb/common/operator/string_cast.hpp"
 #include "duckdb/planner/expression/bound_function_expression.hpp"
 #include "yyjson.hpp"
+#include "duckdb/common/types/blob.hpp"
 
 using namespace duckdb_yyjson; // NOLINT
 
@@ -228,11 +229,8 @@ public:
 
     static string FormatParseError(const char *data, idx_t length, yyjson_read_err &error, const string &extra = "") {
         D_ASSERT(error.code != YYJSON_READ_SUCCESS);
-        // Go to blob so we can have a better error message for weird strings
-        auto blob = Value::BLOB(string(data, length));
         // Truncate, so we don't print megabytes worth of JSON
-        auto input = blob.ToString();
-        input = input.length() > 50 ? string(input.c_str(), 47) + "..." : input;
+        auto input = length > 50 ? string(data, 47) + "..." : string(data, length);
         // Have to replace \r, otherwise output is unreadable
         input = StringUtil::Replace(input, "\r", "\\r");
         return StringUtil::Format("Malformed JSON at byte %lld of input: %s. %s Input: \"%s\"", error.pos, error.msg,
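The rewrite drops the intermediate BLOB value and truncates the raw bytes directly: inputs longer than 50 bytes are clipped to 47 bytes plus "...". A standalone illustration of the same rule using plain std::string (not DuckDB internals):

#include <iostream>
#include <string>

int main() {
    std::string data(60, 'x'); // 60 bytes of malformed "JSON"
    // Mirror of the new single-line truncation in FormatParseError.
    auto input = data.length() > 50 ? data.substr(0, 47) + "..." : data;
    std::cout << input << " (" << input.size() << " bytes)\n"; // prints 50 bytes
    return 0;
}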
package/src/duckdb/extension/json/json_functions.cpp
CHANGED
@@ -394,7 +394,11 @@ void JSONFunctions::RegisterSimpleCastFunctions(ExtensionLoader &loader) {
     loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalTypeId::VARCHAR, CastJSONListToVarchar,
                                 json_list_to_varchar_cost);
 
-    // VARCHAR to JSON[] (also needs a special case otherwise we get a VARCHAR -> VARCHAR[] cast first)
+    // JSON[] to JSON is allowed implicitly
+    loader.RegisterCastFunction(LogicalType::LIST(LogicalType::JSON()), LogicalType::JSON(), CastJSONListToVarchar,
+                                100);
+
+    // VARCHAR to JSON[] (also needs a special case otherwise we get a VARCHAR -> VARCHAR[] cast first)
     const auto varchar_to_json_list_cost =
         CastFunctionSet::ImplicitCastCost(db, LogicalType::VARCHAR, LogicalType::LIST(LogicalType::JSON())) - 1;
     BoundCastInfo varchar_to_json_list_info(CastVarcharToJSONList, nullptr, JSONFunctionLocalState::InitCastLocalState);
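The new registration lets a JSON[] value be used where a single JSON is expected, serialized through the same list-to-VARCHAR cast path at implicit cast cost 100. A usage sketch (assumes the bundled JSON extension is loaded, e.g. via autoloading):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    // A list of JSON values can now be cast to a single JSON value.
    auto result = con.Query("SELECT ['1', '{\"a\": 2}']::JSON[]::JSON;");
    result->Print();
    return 0;
}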
package/src/duckdb/extension/parquet/column_writer.cpp
CHANGED
@@ -187,9 +187,12 @@ void ColumnWriter::HandleRepeatLevels(ColumnWriterState &state, ColumnWriterStat
         // no repeat levels without a parent node
         return;
     }
-    while (state.repetition_levels.size() < parent->repetition_levels.size()) {
-        state.repetition_levels.push_back(parent->repetition_levels[state.repetition_levels.size()]);
+    if (state.repetition_levels.size() >= parent->repetition_levels.size()) {
+        return;
     }
+    state.repetition_levels.insert(state.repetition_levels.end(),
+                                   parent->repetition_levels.begin() + state.repetition_levels.size(),
+                                   parent->repetition_levels.end());
 }
 
 void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, const ValidityMask &validity,
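The rewritten HandleRepeatLevels replaces a per-element push_back loop with a single bulk insert of the parent's trailing repetition levels. The same pattern in isolation (standalone sketch, not DuckDB code):

#include <cstdint>
#include <vector>

int main() {
    std::vector<uint16_t> parent_levels {0, 1, 1, 0, 1};
    std::vector<uint16_t> state_levels {0, 1};
    // Append everything the child state is missing in one insert() call.
    if (state_levels.size() < parent_levels.size()) {
        state_levels.insert(state_levels.end(), parent_levels.begin() + state_levels.size(), parent_levels.end());
    }
    return state_levels.size() == parent_levels.size() ? 0 : 1;
}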
@@ -200,36 +203,41 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat
         while (state.definition_levels.size() < parent->definition_levels.size()) {
             idx_t current_index = state.definition_levels.size();
             if (parent->definition_levels[current_index] != PARQUET_DEFINE_VALID) {
+                //! Inherit nulls from parent
                 state.definition_levels.push_back(parent->definition_levels[current_index]);
                 state.parent_null_count++;
             } else if (validity.RowIsValid(vector_index)) {
+                //! Produce a non-null define
                 state.definition_levels.push_back(define_value);
             } else {
+                //! Produce a null define
                 if (!can_have_nulls) {
                     throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
                 }
                 state.null_count++;
                 state.definition_levels.push_back(null_value);
             }
+            D_ASSERT(parent->is_empty.empty() || current_index < parent->is_empty.size());
             if (parent->is_empty.empty() || !parent->is_empty[current_index]) {
                 vector_index++;
             }
         }
+        return;
+    }
+
+    // no parent: set definition levels only from this validity mask
+    if (validity.AllValid()) {
+        state.definition_levels.insert(state.definition_levels.end(), count, define_value);
     } else {
-        // no parent: set definition levels only from this validity mask
-        if (validity.AllValid()) {
-            state.definition_levels.insert(state.definition_levels.end(), count, define_value);
-        } else {
-            for (idx_t i = 0; i < count; i++) {
-                const auto is_null = !validity.RowIsValid(i);
-                state.definition_levels.emplace_back(is_null ? null_value : define_value);
-                state.null_count += is_null;
-            }
-        }
-        if (!can_have_nulls && state.null_count != 0) {
-            throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
+        for (idx_t i = 0; i < count; i++) {
+            const auto is_null = !validity.RowIsValid(i);
+            state.definition_levels.emplace_back(is_null ? null_value : define_value);
+            state.null_count += is_null;
         }
     }
+    if (!can_have_nulls && state.null_count != 0) {
+        throw IOException("Parquet writer: map key column is not allowed to contain NULL values");
+    }
 }
 
 //===--------------------------------------------------------------------===//
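Besides the comments and the D_ASSERT, the restructuring turns the if/else into an early return and gives the no-parent case an all-valid fast path that bulk-inserts count copies of define_value. That fast path in isolation (standalone sketch):

#include <cstdint>
#include <vector>

int main() {
    std::vector<uint16_t> definition_levels;
    const uint16_t define_value = 1;
    const std::size_t count = 2048;
    // All-valid fast path: one bulk insert instead of a per-row loop.
    definition_levels.insert(definition_levels.end(), count, define_value);
    return definition_levels.size() == count ? 0 : 1;
}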
@@ -237,7 +245,7 @@ void ColumnWriter::HandleDefineLevels(ColumnWriterState &state, ColumnWriterStat
 //===--------------------------------------------------------------------===//
 
 ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
-                                                    const LogicalType &type, const string &name,
+                                                    const LogicalType &type, const string &name, bool allow_geometry,
                                                     optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat,
                                                     idx_t max_define, bool can_have_nulls) {
     auto null_type = can_have_nulls ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
@@ -277,7 +285,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         struct_column.children.reserve(child_types.size());
         for (auto &child_type : child_types) {
             struct_column.children.emplace_back(FillParquetSchema(schemas, child_type.second, child_type.first,
-                                                                  child_field_ids, max_repeat, max_define + 1));
+                                                                  allow_geometry, child_field_ids, max_repeat,
+                                                                  max_define + 1));
         }
         return struct_column;
     }
@@ -313,8 +322,8 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         schemas.push_back(std::move(repeated_element));
 
         ParquetColumnSchema list_column(name, type, max_define, max_repeat, schema_idx, 0);
-        list_column.children.push_back(
-            FillParquetSchema(schemas, child_type, "element", child_field_ids, max_repeat + 1, max_define + 2));
+        list_column.children.push_back(FillParquetSchema(schemas, child_type, "element", allow_geometry,
+                                                         child_field_ids, max_repeat + 1, max_define + 2));
         return list_column;
     }
     if (type.id() == LogicalTypeId::MAP) {
@@ -361,13 +370,14 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         for (idx_t i = 0; i < 2; i++) {
             // key needs to be marked as REQUIRED
             bool is_key = i == 0;
-            auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], child_field_ids, max_repeat + 1,
-                                                  max_define + 2, !is_key);
+            auto child_schema = FillParquetSchema(schemas, kv_types[i], kv_names[i], allow_geometry, child_field_ids,
+                                                  max_repeat + 1, max_define + 2, !is_key);
 
             map_column.children.push_back(std::move(child_schema));
         }
         return map_column;
     }
+
     duckdb_parquet::SchemaElement schema_element;
     schema_element.type = ParquetWriter::DuckDBTypeToParquetType(type);
     schema_element.repetition_type = null_type;
@@ -379,7 +389,7 @@ ParquetColumnSchema ColumnWriter::FillParquetSchema(vector<duckdb_parquet::Schem
         schema_element.__isset.field_id = true;
         schema_element.field_id = field_id->field_id;
     }
-    ParquetWriter::SetSchemaProperties(type, schema_element);
+    ParquetWriter::SetSchemaProperties(type, schema_element, allow_geometry);
     schemas.push_back(std::move(schema_element));
     return ParquetColumnSchema(name, type, max_define, max_repeat, schema_idx, 0);
 }
package/src/duckdb/extension/parquet/geo_parquet.cpp
CHANGED
@@ -208,17 +208,19 @@ unique_ptr<GeoParquetFileMetadata> GeoParquetFileMetadata::TryRead(const duckdb_
         throw InvalidInputException("Geoparquet metadata is not an object");
     }
 
-    auto result = make_uniq<GeoParquetFileMetadata>();
+    // We dont actually care about the version for now, as we only support V1+native
+    auto result = make_uniq<GeoParquetFileMetadata>(GeoParquetVersion::BOTH);
 
     // Check and parse the version
     const auto version_val = yyjson_obj_get(root, "version");
     if (!yyjson_is_str(version_val)) {
         throw InvalidInputException("Geoparquet metadata does not have a version");
     }
-    result->version = yyjson_get_str(version_val);
-    if (StringUtil::StartsWith(result->version, "2")) {
-        // Guard against a breaking future 2.0 version
-        throw InvalidInputException("Geoparquet version %s is not supported", result->version);
+
+    auto version = yyjson_get_str(version_val);
+    if (StringUtil::StartsWith(version, "3")) {
+        // Guard against a breaking future 3.0 version
+        throw InvalidInputException("Geoparquet version %s is not supported", version);
     }
 
     // Check and parse the geometry columns
@@ -344,7 +346,20 @@ void GeoParquetFileMetadata::Write(duckdb_parquet::FileMetaData &file_meta_data)
     yyjson_mut_doc_set_root(doc, root);
 
     // Add the version
-    yyjson_mut_obj_add_strncpy(doc, root, "version", version.c_str(), version.size());
+    switch (version) {
+    case GeoParquetVersion::V1:
+    case GeoParquetVersion::BOTH:
+        yyjson_mut_obj_add_strcpy(doc, root, "version", "1.0.0");
+        break;
+    case GeoParquetVersion::V2:
+        yyjson_mut_obj_add_strcpy(doc, root, "version", "2.0.0");
+        break;
+    case GeoParquetVersion::NONE:
+    default:
+        // Should never happen, we should not be writing anything
+        yyjson_mut_doc_free(doc);
+        throw InternalException("GeoParquetVersion::NONE should not write metadata");
+    }
 
     // Add the primary column
     yyjson_mut_obj_add_strncpy(doc, root, "primary_column", primary_geometry_column.c_str(),
package/src/duckdb/extension/parquet/include/column_writer.hpp
CHANGED
@@ -27,7 +27,7 @@ public:
 
     unsafe_vector<uint16_t> definition_levels;
     unsafe_vector<uint16_t> repetition_levels;
-    vector<bool> is_empty;
+    unsafe_vector<uint8_t> is_empty;
     idx_t parent_null_count = 0;
     idx_t null_count = 0;
 
@@ -94,7 +94,7 @@ public:
     }
 
     static ParquetColumnSchema FillParquetSchema(vector<duckdb_parquet::SchemaElement> &schemas,
-                                                 const LogicalType &type, const string &name,
+                                                 const LogicalType &type, const string &name, bool allow_geometry,
                                                  optional_ptr<const ChildFieldIDs> field_ids, idx_t max_repeat = 0,
                                                  idx_t max_define = 1, bool can_have_nulls = true);
     //! Create the column writer for a specific type recursively
package/src/duckdb/extension/parquet/include/geo_parquet.hpp
CHANGED
@@ -199,6 +199,31 @@ enum class GeoParquetColumnEncoding : uint8_t {
     MULTIPOLYGON,
 };
 
+enum class GeoParquetVersion : uint8_t {
+    // Write GeoParquet 1.0 metadata
+    // GeoParquet 1.0 has the widest support among readers and writers
+    V1,
+
+    // Write GeoParquet 2.0
+    // The GeoParquet 2.0 options is identical to GeoParquet 1.0 except the underlying storage
+    // of spatial columns is Parquet native geometry, where the Parquet writer will include
+    // native statistics according to the underlying Parquet options. Compared to 'BOTH', this will
+    // actually write the metadata as containing GeoParquet version 2.0.0
+    // However, V2 isnt standardized yet, so this option is still a bit experimental
+    V2,
+
+    // Write GeoParquet 1.0 metadata, with native Parquet geometry types
+    // This is a bit of a hold-over option for compatibility with systems that
+    // reject GeoParquet 2.0 metadata, but can read Parquet native geometry types as they simply ignore the extra
+    // logical type. DuckDB v1.4.0 falls into this category.
+    BOTH,
+
+    // Do not write GeoParquet metadata
+    // This option suppresses GeoParquet metadata; however, spatial types will be written as
+    // Parquet native Geometry/Geography.
+    NONE,
+};
+
 struct GeoParquetColumnMetadata {
     // The encoding of the geometry column
     GeoParquetColumnEncoding geometry_encoding;
@@ -215,6 +240,8 @@ struct GeoParquetColumnMetadata {
 
 class GeoParquetFileMetadata {
 public:
+    GeoParquetFileMetadata(GeoParquetVersion geo_parquet_version) : version(geo_parquet_version) {
+    }
     void AddGeoParquetStats(const string &column_name, const LogicalType &type, const GeometryStats &stats);
     void Write(duckdb_parquet::FileMetaData &file_meta_data);
 
@@ -234,8 +261,8 @@ public:
 
 private:
     mutex write_lock;
-    string version = "1.1.0";
     unordered_map<string, GeoParquetColumnMetadata> geometry_columns;
+    GeoParquetVersion version;
 };
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/include/parquet_writer.hpp
CHANGED
@@ -85,7 +85,7 @@ public:
                   shared_ptr<ParquetEncryptionConfig> encryption_config, optional_idx dictionary_size_limit,
                   idx_t string_dictionary_page_size_limit, bool enable_bloom_filters,
                   double bloom_filter_false_positive_ratio, int64_t compression_level, bool debug_use_openssl,
-                  ParquetVersion parquet_version);
+                  ParquetVersion parquet_version, GeoParquetVersion geoparquet_version);
     ~ParquetWriter();
 
 public:
@@ -95,7 +95,8 @@ public:
     void Finalize();
 
     static duckdb_parquet::Type::type DuckDBTypeToParquetType(const LogicalType &duckdb_type);
-    static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele);
+    static void SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
+                                    bool allow_geometry);
 
     ClientContext &GetContext() {
         return context;
@@ -139,6 +140,9 @@ public:
     ParquetVersion GetParquetVersion() const {
         return parquet_version;
     }
+    GeoParquetVersion GetGeoParquetVersion() const {
+        return geoparquet_version;
+    }
     const string &GetFileName() const {
         return file_name;
     }
@@ -175,6 +179,7 @@ private:
     bool debug_use_openssl;
     shared_ptr<EncryptionUtil> encryption_util;
     ParquetVersion parquet_version;
+    GeoParquetVersion geoparquet_version;
     vector<ParquetColumnSchema> column_schemas;
 
     unique_ptr<BufferedFileWriter> writer;
package/src/duckdb/extension/parquet/include/reader/string_column_reader.hpp
CHANGED
@@ -14,12 +14,25 @@
 namespace duckdb {
 
 class StringColumnReader : public ColumnReader {
+    enum class StringColumnType : uint8_t { VARCHAR, JSON, OTHER };
+
+    static StringColumnType GetStringColumnType(const LogicalType &type) {
+        if (type.IsJSONType()) {
+            return StringColumnType::JSON;
+        }
+        if (type.id() == LogicalTypeId::VARCHAR) {
+            return StringColumnType::VARCHAR;
+        }
+        return StringColumnType::OTHER;
+    }
+
 public:
     static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
 
 public:
     StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema);
     idx_t fixed_width_string_length;
+    const StringColumnType string_column_type;
 
 public:
     static void VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
package/src/duckdb/extension/parquet/include/writer/array_column_writer.hpp
CHANGED
@@ -25,6 +25,10 @@ public:
     void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count,
                  bool vector_can_span_multiple_pages) override;
     void Write(ColumnWriterState &state, Vector &vector, idx_t count) override;
+
+protected:
+    void WriteArrayState(ListColumnWriterState &state, idx_t array_size, uint16_t first_repeat_level,
+                         idx_t define_value, const bool is_empty = false);
 };
 
 } // namespace duckdb
package/src/duckdb/extension/parquet/parquet_extension.cpp
CHANGED
@@ -238,6 +238,9 @@ struct ParquetWriteBindData : public TableFunctionData {
 
     //! Which encodings to include when writing
     ParquetVersion parquet_version = ParquetVersion::V1;
+
+    //! Which geo-parquet version to use when writing
+    GeoParquetVersion geoparquet_version = GeoParquetVersion::V1;
 };
 
 struct ParquetWriteGlobalState : public GlobalFunctionData {
@@ -291,6 +294,7 @@ static void ParquetListCopyOptions(ClientContext &context, CopyOptionsInput &inp
     copy_options["binary_as_string"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
     copy_options["file_row_number"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
     copy_options["can_have_nan"] = CopyOption(LogicalType::BOOLEAN, CopyOptionMode::READ_ONLY);
+    copy_options["geoparquet_version"] = CopyOption(LogicalType::VARCHAR, CopyOptionMode::WRITE_ONLY);
 }
 
 static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFunctionBindInput &input,
@@ -426,6 +430,19 @@ static unique_ptr<FunctionData> ParquetWriteBind(ClientContext &context, CopyFun
         } else {
             throw BinderException("Expected parquet_version 'V1' or 'V2'");
         }
+    } else if (loption == "geoparquet_version") {
+        const auto roption = StringUtil::Upper(option.second[0].ToString());
+        if (roption == "NONE") {
+            bind_data->geoparquet_version = GeoParquetVersion::NONE;
+        } else if (roption == "V1") {
+            bind_data->geoparquet_version = GeoParquetVersion::V1;
+        } else if (roption == "V2") {
+            bind_data->geoparquet_version = GeoParquetVersion::V2;
+        } else if (roption == "BOTH") {
+            bind_data->geoparquet_version = GeoParquetVersion::BOTH;
+        } else {
+            throw BinderException("Expected geoparquet_version 'NONE', 'V1' or 'BOTH'");
+        }
     } else {
         throw InternalException("Unrecognized option for PARQUET: %s", option.first.c_str());
     }
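Together with the WRITE_ONLY copy option registered earlier, this parsing block lets users pick the GeoParquet flavor per COPY statement. A hedged usage sketch (assumes the spatial extension can be installed and loaded to create a geometry column):

#include "duckdb.hpp"

int main() {
    duckdb::DuckDB db(nullptr);
    duckdb::Connection con(db);
    con.Query("INSTALL spatial; LOAD spatial;");
    con.Query("CREATE TABLE pts AS SELECT ST_Point(1, 2) AS geom;");
    // New option from this diff: 'NONE', 'V1' (default), 'V2' or 'BOTH'.
    con.Query("COPY pts TO 'pts.parquet' (FORMAT parquet, GEOPARQUET_VERSION 'V2');");
    return 0;
}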
@@ -457,7 +474,8 @@ static unique_ptr<GlobalFunctionData> ParquetWriteInitializeGlobal(ClientContext
         parquet_bind.field_ids.Copy(), parquet_bind.kv_metadata, parquet_bind.encryption_config,
         parquet_bind.dictionary_size_limit, parquet_bind.string_dictionary_page_size_limit,
         parquet_bind.enable_bloom_filters, parquet_bind.bloom_filter_false_positive_ratio,
-        parquet_bind.compression_level, parquet_bind.debug_use_openssl, parquet_bind.parquet_version);
+        parquet_bind.compression_level, parquet_bind.debug_use_openssl, parquet_bind.parquet_version,
+        parquet_bind.geoparquet_version);
     return std::move(global_state);
 }
 
@@ -626,6 +644,39 @@ ParquetVersion EnumUtil::FromString<ParquetVersion>(const char *value) {
     throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
 }
 
+template <>
+const char *EnumUtil::ToChars<GeoParquetVersion>(GeoParquetVersion value) {
+    switch (value) {
+    case GeoParquetVersion::NONE:
+        return "NONE";
+    case GeoParquetVersion::V1:
+        return "V1";
+    case GeoParquetVersion::V2:
+        return "V2";
+    case GeoParquetVersion::BOTH:
+        return "BOTH";
+    default:
+        throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
+    }
+}
+
+template <>
+GeoParquetVersion EnumUtil::FromString<GeoParquetVersion>(const char *value) {
+    if (StringUtil::Equals(value, "NONE")) {
+        return GeoParquetVersion::NONE;
+    }
+    if (StringUtil::Equals(value, "V1")) {
+        return GeoParquetVersion::V1;
+    }
+    if (StringUtil::Equals(value, "V2")) {
+        return GeoParquetVersion::V2;
+    }
+    if (StringUtil::Equals(value, "BOTH")) {
+        return GeoParquetVersion::BOTH;
+    }
+    throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
+}
+
 static optional_idx SerializeCompressionLevel(const int64_t compression_level) {
     return compression_level < 0 ? NumericLimits<idx_t>::Maximum() - NumericCast<idx_t>(AbsValue(compression_level))
                                  : NumericCast<idx_t>(compression_level);
@@ -679,6 +730,8 @@ static void ParquetCopySerialize(Serializer &serializer, const FunctionData &bin
     serializer.WritePropertyWithDefault(115, "string_dictionary_page_size_limit",
                                         bind_data.string_dictionary_page_size_limit,
                                         default_value.string_dictionary_page_size_limit);
+    serializer.WritePropertyWithDefault(116, "geoparquet_version", bind_data.geoparquet_version,
+                                        default_value.geoparquet_version);
 }
 
 static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserializer, CopyFunction &function) {
@@ -711,6 +764,8 @@ static unique_ptr<FunctionData> ParquetCopyDeserialize(Deserializer &deserialize
         deserializer.ReadPropertyWithExplicitDefault(114, "parquet_version", default_value.parquet_version);
     data->string_dictionary_page_size_limit = deserializer.ReadPropertyWithExplicitDefault(
         115, "string_dictionary_page_size_limit", default_value.string_dictionary_page_size_limit);
+    data->geoparquet_version =
+        deserializer.ReadPropertyWithExplicitDefault(116, "geoparquet_version", default_value.geoparquet_version);
 
     return std::move(data);
 }
package/src/duckdb/extension/parquet/parquet_reader.cpp
CHANGED
@@ -570,7 +570,10 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
 
     auto file_meta_data = GetFileMetadata();
     D_ASSERT(file_meta_data);
-    D_ASSERT(next_schema_idx < file_meta_data->schema.size());
+    if (next_schema_idx >= file_meta_data->schema.size()) {
+        throw InvalidInputException("Malformed Parquet schema in file \"%s\": invalid schema index %d", file.path,
+                                    next_schema_idx);
+    }
     auto &s_ele = file_meta_data->schema[next_schema_idx];
     auto this_idx = next_schema_idx;
 
package/src/duckdb/extension/parquet/parquet_statistics.cpp
CHANGED
@@ -395,23 +395,21 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
         }
         break;
     case LogicalTypeId::VARCHAR: {
-        auto string_stats = StringStats::CreateEmpty(type);
+        auto string_stats = StringStats::CreateUnknown(type);
         if (parquet_stats.__isset.min_value) {
             StringColumnReader::VerifyString(parquet_stats.min_value.c_str(), parquet_stats.min_value.size(), true);
-            StringStats::Update(string_stats, parquet_stats.min_value);
+            StringStats::SetMin(string_stats, parquet_stats.min_value);
         } else if (parquet_stats.__isset.min) {
             StringColumnReader::VerifyString(parquet_stats.min.c_str(), parquet_stats.min.size(), true);
-            StringStats::Update(string_stats, parquet_stats.min);
+            StringStats::SetMin(string_stats, parquet_stats.min);
         }
         if (parquet_stats.__isset.max_value) {
             StringColumnReader::VerifyString(parquet_stats.max_value.c_str(), parquet_stats.max_value.size(), true);
-            StringStats::Update(string_stats, parquet_stats.max_value);
+            StringStats::SetMax(string_stats, parquet_stats.max_value);
         } else if (parquet_stats.__isset.max) {
             StringColumnReader::VerifyString(parquet_stats.max.c_str(), parquet_stats.max.size(), true);
-            StringStats::Update(string_stats, parquet_stats.max);
+            StringStats::SetMax(string_stats, parquet_stats.max);
         }
-        StringStats::SetContainsUnicode(string_stats);
-        StringStats::ResetMaxStringLength(string_stats);
         row_group_stats = string_stats.ToUnique();
         break;
     }
package/src/duckdb/extension/parquet/parquet_writer.cpp
CHANGED
@@ -166,7 +166,8 @@ Type::type ParquetWriter::DuckDBTypeToParquetType(const LogicalType &duckdb_type
     throw NotImplementedException("Unimplemented type for Parquet \"%s\"", duckdb_type.ToString());
 }
 
-void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele) {
+void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_parquet::SchemaElement &schema_ele,
+                                        bool allow_geometry) {
     if (duckdb_type.IsJSONType()) {
         schema_ele.converted_type = ConvertedType::JSON;
         schema_ele.__isset.converted_type = true;
@@ -174,7 +175,7 @@ void ParquetWriter::SetSchemaProperties(const LogicalType &duckdb_type, duckdb_p
         schema_ele.logicalType.__set_JSON(duckdb_parquet::JsonType());
         return;
     }
-    if (duckdb_type.GetAlias() == "WKB_BLOB") {
+    if (duckdb_type.GetAlias() == "WKB_BLOB" && allow_geometry) {
         schema_ele.__isset.logicalType = true;
         schema_ele.logicalType.__isset.GEOMETRY = true;
         // TODO: Set CRS in the future
@@ -356,14 +357,16 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
                              shared_ptr<ParquetEncryptionConfig> encryption_config_p,
                              optional_idx dictionary_size_limit_p, idx_t string_dictionary_page_size_limit_p,
                              bool enable_bloom_filters_p, double bloom_filter_false_positive_ratio_p,
-                             int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version)
+                             int64_t compression_level_p, bool debug_use_openssl_p, ParquetVersion parquet_version,
+                             GeoParquetVersion geoparquet_version)
     : context(context), file_name(std::move(file_name_p)), sql_types(std::move(types_p)),
       column_names(std::move(names_p)), codec(codec), field_ids(std::move(field_ids_p)),
       encryption_config(std::move(encryption_config_p)), dictionary_size_limit(dictionary_size_limit_p),
       string_dictionary_page_size_limit(string_dictionary_page_size_limit_p),
       enable_bloom_filters(enable_bloom_filters_p),
       bloom_filter_false_positive_ratio(bloom_filter_false_positive_ratio_p), compression_level(compression_level_p),
-      debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version), total_written(0), num_row_groups(0) {
+      debug_use_openssl(debug_use_openssl_p), parquet_version(parquet_version), geoparquet_version(geoparquet_version),
+      total_written(0), num_row_groups(0) {
 
     // initialize the file writer
     writer = make_uniq<BufferedFileWriter>(fs, file_name.c_str(),
@@ -416,10 +419,13 @@ ParquetWriter::ParquetWriter(ClientContext &context, FileSystem &fs, string file
     auto &unique_names = column_names;
     VerifyUniqueNames(unique_names);
 
+    // V1 GeoParquet stores geometries as blobs, no logical type
+    auto allow_geometry = geoparquet_version != GeoParquetVersion::V1;
+
     // construct the child schemas
     for (idx_t i = 0; i < sql_types.size(); i++) {
-        auto child_schema =
-            ColumnWriter::FillParquetSchema(file_meta_data.schema, sql_types[i], unique_names[i], &field_ids);
+        auto child_schema = ColumnWriter::FillParquetSchema(file_meta_data.schema, sql_types[i], unique_names[i],
+                                                            allow_geometry, &field_ids);
         column_schemas.push_back(std::move(child_schema));
     }
     // now construct the writers based on the schemas
@@ -975,7 +981,8 @@ void ParquetWriter::Finalize() {
     }
 
     // Add geoparquet metadata to the file metadata
-    if (geoparquet_data && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context)) {
+    if (geoparquet_data && GeoParquetFileMetadata::IsGeoParquetConversionEnabled(context) &&
+        geoparquet_version != GeoParquetVersion::NONE) {
         geoparquet_data->Write(file_meta_data);
     }
 
|
|
1005
1012
|
|
1006
1013
|
GeoParquetFileMetadata &ParquetWriter::GetGeoParquetData() {
|
1007
1014
|
if (!geoparquet_data) {
|
1008
|
-
geoparquet_data = make_uniq<GeoParquetFileMetadata>();
|
1015
|
+
geoparquet_data = make_uniq<GeoParquetFileMetadata>(geoparquet_version);
|
1009
1016
|
}
|
1010
1017
|
return *geoparquet_data;
|
1011
1018
|
}
|
package/src/duckdb/extension/parquet/reader/string_column_reader.cpp
CHANGED
@@ -9,7 +9,7 @@ namespace duckdb {
 // String Column Reader
 //===--------------------------------------------------------------------===//
 StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
-    : ColumnReader(reader, schema) {
+    : ColumnReader(reader, schema), string_column_type(GetStringColumnType(Type())) {
     fixed_width_string_length = 0;
     if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
         fixed_width_string_length = schema.type_length;
@@ -26,13 +26,26 @@ void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, co
     size_t pos;
     auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
     if (utf_type == UnicodeType::INVALID) {
-        throw InvalidInputException("Invalid string encoding found in Parquet file: value \"" +
-                                    Blob::ToString(string_t(str_data, str_len)) + "\" is not valid UTF8!");
+        throw InvalidInputException("Invalid string encoding found in Parquet file: value \"%s\" is not valid UTF8!",
+                                    Blob::ToString(string_t(str_data, str_len)));
     }
 }
 
 void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
-    VerifyString(str_data, str_len, Type().id() == LogicalTypeId::VARCHAR);
+    switch (string_column_type) {
+    case StringColumnType::VARCHAR:
+        VerifyString(str_data, str_len, true);
+        break;
+    case StringColumnType::JSON: {
+        const auto error = StringUtil::ValidateJSON(str_data, str_len);
+        if (!error.empty()) {
+            throw InvalidInputException("Invalid JSON found in Parquet file: %s", error);
+        }
+        break;
+    }
+    default:
+        break;
+    }
 }
 
 class ParquetStringVectorBuffer : public VectorBuffer {
|