duckdb 1.1.4-dev9.0 → 1.2.1-dev4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/binding.gyp +1 -0
- package/package.json +2 -2
- package/src/connection.cpp +42 -15
- package/src/duckdb/extension/core_functions/function_list.cpp +1 -0
- package/src/duckdb/extension/core_functions/include/core_functions/scalar/map_functions.hpp +9 -0
- package/src/duckdb/extension/core_functions/scalar/date/current.cpp +1 -0
- package/src/duckdb/extension/core_functions/scalar/generic/can_implicitly_cast.cpp +2 -2
- package/src/duckdb/extension/core_functions/scalar/generic/typeof.cpp +1 -1
- package/src/duckdb/extension/core_functions/scalar/list/flatten.cpp +91 -61
- package/src/duckdb/extension/core_functions/scalar/map/map_extract.cpp +89 -8
- package/src/duckdb/extension/icu/icu-current.cpp +63 -0
- package/src/duckdb/extension/icu/icu-makedate.cpp +43 -39
- package/src/duckdb/extension/icu/icu-timezone.cpp +63 -63
- package/src/duckdb/extension/icu/icu_extension.cpp +2 -0
- package/src/duckdb/extension/icu/include/icu-casts.hpp +39 -0
- package/src/duckdb/extension/icu/include/icu-current.hpp +17 -0
- package/src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +3 -1
- package/src/duckdb/extension/parquet/column_writer.cpp +26 -18
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +0 -6
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +15 -1
- package/src/duckdb/extension/parquet/include/resizable_buffer.hpp +1 -0
- package/src/duckdb/extension/parquet/parquet_extension.cpp +67 -15
- package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -3
- package/src/duckdb/extension/parquet/parquet_writer.cpp +5 -6
- package/src/duckdb/src/catalog/catalog.cpp +21 -8
- package/src/duckdb/src/catalog/catalog_search_path.cpp +17 -1
- package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
- package/src/duckdb/src/catalog/default/default_functions.cpp +0 -3
- package/src/duckdb/src/catalog/dependency_list.cpp +7 -0
- package/src/duckdb/src/common/adbc/adbc.cpp +1 -56
- package/src/duckdb/src/common/arrow/arrow_converter.cpp +3 -2
- package/src/duckdb/src/common/arrow/arrow_type_extension.cpp +58 -28
- package/src/duckdb/src/common/arrow/schema_metadata.cpp +1 -1
- package/src/duckdb/src/common/compressed_file_system.cpp +6 -2
- package/src/duckdb/src/common/enum_util.cpp +26 -22
- package/src/duckdb/src/common/error_data.cpp +3 -2
- package/src/duckdb/src/common/gzip_file_system.cpp +8 -8
- package/src/duckdb/src/common/local_file_system.cpp +2 -2
- package/src/duckdb/src/common/multi_file_reader.cpp +1 -1
- package/src/duckdb/src/common/random_engine.cpp +4 -1
- package/src/duckdb/src/common/serializer/memory_stream.cpp +23 -19
- package/src/duckdb/src/common/serializer/serializer.cpp +1 -1
- package/src/duckdb/src/common/types/bit.cpp +1 -1
- package/src/duckdb/src/common/types/column/column_data_allocator.cpp +0 -5
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +4 -1
- package/src/duckdb/src/common/types/data_chunk.cpp +2 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +0 -4
- package/src/duckdb/src/common/types.cpp +1 -1
- package/src/duckdb/src/execution/index/art/art.cpp +52 -42
- package/src/duckdb/src/execution/index/art/leaf.cpp +4 -9
- package/src/duckdb/src/execution/index/art/node.cpp +13 -13
- package/src/duckdb/src/execution/index/art/prefix.cpp +21 -16
- package/src/duckdb/src/execution/index/bound_index.cpp +6 -8
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +39 -34
- package/src/duckdb/src/execution/index/fixed_size_buffer.cpp +2 -1
- package/src/duckdb/src/execution/index/unbound_index.cpp +10 -0
- package/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp +62 -44
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +26 -0
- package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +69 -40
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +3 -7
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +11 -5
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +8 -8
- package/src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp +36 -12
- package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +12 -9
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +0 -1
- package/src/duckdb/src/execution/operator/persistent/physical_copy_database.cpp +29 -1
- package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +58 -10
- package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +58 -35
- package/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +2 -1
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +9 -4
- package/src/duckdb/src/execution/sample/reservoir_sample.cpp +7 -6
- package/src/duckdb/src/function/compression_config.cpp +4 -0
- package/src/duckdb/src/function/function_binder.cpp +1 -1
- package/src/duckdb/src/function/scalar/system/write_log.cpp +2 -2
- package/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp +15 -2
- package/src/duckdb/src/function/table/arrow_conversion.cpp +10 -10
- package/src/duckdb/src/function/table/copy_csv.cpp +8 -5
- package/src/duckdb/src/function/table/read_csv.cpp +21 -4
- package/src/duckdb/src/function/table/sniff_csv.cpp +7 -0
- package/src/duckdb/src/function/table/system/duckdb_extensions.cpp +4 -0
- package/src/duckdb/src/function/table/system/duckdb_secret_types.cpp +71 -0
- package/src/duckdb/src/function/table/system_functions.cpp +1 -0
- package/src/duckdb/src/function/table/table_scan.cpp +120 -36
- package/src/duckdb/src/function/table/version/pragma_version.cpp +4 -4
- package/src/duckdb/src/function/window/window_aggregate_function.cpp +6 -1
- package/src/duckdb/src/function/window/window_boundaries_state.cpp +135 -11
- package/src/duckdb/src/function/window/window_segment_tree.cpp +50 -22
- package/src/duckdb/src/function/window/window_token_tree.cpp +4 -3
- package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +4 -0
- package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +2 -0
- package/src/duckdb/src/include/duckdb/catalog/dependency_list.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/arrow/arrow_type_extension.hpp +4 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -8
- package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +0 -2
- package/src/duckdb/src/include/duckdb/common/serializer/deserializer.hpp +8 -3
- package/src/duckdb/src/include/duckdb/common/serializer/memory_stream.hpp +6 -1
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_data.hpp +25 -0
- package/src/duckdb/src/include/duckdb/common/serializer/serializer.hpp +9 -3
- package/src/duckdb/src/include/duckdb/common/types/selection_vector.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +11 -14
- package/src/duckdb/src/include/duckdb/execution/index/art/prefix.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +21 -10
- package/src/duckdb/src/include/duckdb/execution/index/fixed_size_allocator.hpp +6 -5
- package/src/duckdb/src/include/duckdb/execution/index/fixed_size_buffer.hpp +37 -32
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +36 -1
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/column_count_scanner.hpp +3 -0
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/state_machine_options.hpp +5 -5
- package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +5 -30
- package/src/duckdb/src/include/duckdb/execution/reservoir_sample.hpp +7 -1
- package/src/duckdb/src/include/duckdb/function/scalar_function.hpp +3 -3
- package/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp +1 -0
- package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
- package/src/duckdb/src/include/duckdb/function/window/window_boundaries_state.hpp +2 -2
- package/src/duckdb/src/include/duckdb/logging/logger.hpp +40 -119
- package/src/duckdb/src/include/duckdb/logging/logging.hpp +0 -2
- package/src/duckdb/src/include/duckdb/main/config.hpp +5 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +0 -8
- package/src/duckdb/src/include/duckdb/main/connection_manager.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/extension.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +11 -7
- package/src/duckdb/src/include/duckdb/main/extension_helper.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/secret/secret.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/secret/secret_manager.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/settings.hpp +10 -0
- package/src/duckdb/src/include/duckdb/parser/constraint.hpp +9 -0
- package/src/duckdb/src/include/duckdb/parser/expression/window_expression.hpp +36 -9
- package/src/duckdb/src/include/duckdb/parser/parsed_data/create_view_info.hpp +2 -1
- package/src/duckdb/src/include/duckdb/parser/query_node/set_operation_node.hpp +8 -2
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +4 -0
- package/src/duckdb/src/include/duckdb/planner/expression/bound_parameter_data.hpp +9 -1
- package/src/duckdb/src/include/duckdb/planner/filter/constant_filter.hpp +1 -0
- package/src/duckdb/src/include/duckdb/planner/filter/in_filter.hpp +0 -2
- package/src/duckdb/src/include/duckdb/planner/filter/optional_filter.hpp +4 -4
- package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/data_table.hpp +14 -10
- package/src/duckdb/src/include/duckdb/storage/index_storage_info.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/single_file_block_manager.hpp +6 -1
- package/src/duckdb/src/include/duckdb/storage/storage_info.hpp +7 -2
- package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +9 -0
- package/src/duckdb/src/include/duckdb/storage/storage_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +4 -3
- package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +6 -4
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +1 -1
- package/src/duckdb/src/include/duckdb/storage/write_ahead_log.hpp +2 -0
- package/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +2 -0
- package/src/duckdb/src/include/duckdb/transaction/meta_transaction.hpp +1 -1
- package/src/duckdb/src/logging/logger.cpp +8 -66
- package/src/duckdb/src/main/attached_database.cpp +3 -1
- package/src/duckdb/src/main/client_context.cpp +4 -2
- package/src/duckdb/src/main/config.cpp +20 -2
- package/src/duckdb/src/main/connection.cpp +2 -29
- package/src/duckdb/src/main/connection_manager.cpp +5 -3
- package/src/duckdb/src/main/database.cpp +2 -2
- package/src/duckdb/src/main/extension/extension_helper.cpp +4 -5
- package/src/duckdb/src/main/extension/extension_install.cpp +23 -10
- package/src/duckdb/src/main/extension/extension_load.cpp +6 -7
- package/src/duckdb/src/main/extension.cpp +27 -9
- package/src/duckdb/src/main/secret/secret_manager.cpp +11 -0
- package/src/duckdb/src/main/settings/custom_settings.cpp +44 -0
- package/src/duckdb/src/optimizer/column_lifetime_analyzer.cpp +6 -0
- package/src/duckdb/src/optimizer/filter_combiner.cpp +13 -3
- package/src/duckdb/src/optimizer/filter_pushdown.cpp +33 -6
- package/src/duckdb/src/optimizer/late_materialization.cpp +14 -3
- package/src/duckdb/src/optimizer/remove_unused_columns.cpp +0 -3
- package/src/duckdb/src/parser/parsed_data/attach_info.cpp +5 -1
- package/src/duckdb/src/parser/parsed_data/create_view_info.cpp +6 -3
- package/src/duckdb/src/parser/query_node/set_operation_node.cpp +49 -0
- package/src/duckdb/src/parser/transform/expression/transform_columnref.cpp +1 -0
- package/src/duckdb/src/parser/transform/expression/transform_function.cpp +50 -12
- package/src/duckdb/src/planner/binder/expression/bind_columnref_expression.cpp +7 -5
- package/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +1 -0
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +2 -2
- package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +12 -2
- package/src/duckdb/src/planner/binder/statement/bind_copy_database.cpp +0 -1
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +55 -39
- package/src/duckdb/src/planner/binder/statement/bind_execute.cpp +2 -1
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +15 -7
- package/src/duckdb/src/planner/binder/tableref/bind_showref.cpp +13 -8
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +8 -3
- package/src/duckdb/src/planner/expression/bound_function_expression.cpp +17 -1
- package/src/duckdb/src/planner/expression_binder/index_binder.cpp +1 -0
- package/src/duckdb/src/planner/filter/conjunction_filter.cpp +1 -0
- package/src/duckdb/src/planner/filter/constant_filter.cpp +21 -0
- package/src/duckdb/src/planner/filter/in_filter.cpp +4 -7
- package/src/duckdb/src/planner/logical_operator.cpp +5 -3
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/planner/subquery/flatten_dependent_join.cpp +2 -0
- package/src/duckdb/src/storage/checkpoint/table_data_writer.cpp +3 -4
- package/src/duckdb/src/storage/checkpoint_manager.cpp +3 -5
- package/src/duckdb/src/storage/compression/dictionary/decompression.cpp +4 -4
- package/src/duckdb/src/storage/compression/fsst.cpp +2 -2
- package/src/duckdb/src/storage/compression/roaring/common.cpp +10 -1
- package/src/duckdb/src/storage/compression/string_uncompressed.cpp +11 -6
- package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +4 -0
- package/src/duckdb/src/storage/compression/zstd.cpp +6 -0
- package/src/duckdb/src/storage/data_table.cpp +104 -109
- package/src/duckdb/src/storage/local_storage.cpp +8 -6
- package/src/duckdb/src/storage/magic_bytes.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_dependency.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +7 -5
- package/src/duckdb/src/storage/single_file_block_manager.cpp +95 -28
- package/src/duckdb/src/storage/storage_info.cpp +38 -0
- package/src/duckdb/src/storage/storage_manager.cpp +11 -0
- package/src/duckdb/src/storage/table/column_data.cpp +4 -0
- package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +3 -3
- package/src/duckdb/src/storage/table/row_group_collection.cpp +67 -68
- package/src/duckdb/src/storage/table/table_statistics.cpp +4 -4
- package/src/duckdb/src/storage/table_index_list.cpp +41 -15
- package/src/duckdb/src/storage/wal_replay.cpp +3 -1
- package/src/duckdb/src/storage/write_ahead_log.cpp +11 -4
- package/src/duckdb/src/transaction/meta_transaction.cpp +1 -1
- package/src/duckdb/src/verification/deserialized_statement_verifier.cpp +2 -1
- package/src/duckdb/third_party/httplib/httplib.hpp +0 -1
- package/src/duckdb/third_party/re2/util/logging.h +10 -10
- package/src/duckdb/ub_src_function_table_system.cpp +2 -0
@@ -126,6 +126,10 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
|
|
126
126
|
SkipBOM();
|
127
127
|
}
|
128
128
|
}
|
129
|
+
ignore_empty_values = state_machine.dialect_options.state_machine_options.delimiter.GetValue()[0] != ' ' &&
|
130
|
+
state_machine.dialect_options.state_machine_options.quote != ' ' &&
|
131
|
+
state_machine.dialect_options.state_machine_options.escape != ' ' &&
|
132
|
+
state_machine.dialect_options.state_machine_options.comment != ' ';
|
129
133
|
}
|
130
134
|
|
131
135
|
StringValueResult::~StringValueResult() {
|
@@ -148,7 +152,7 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
|
|
148
152
|
}
|
149
153
|
|
150
154
|
bool StringValueResult::HandleTooManyColumnsError(const char *value_ptr, const idx_t size) {
|
151
|
-
if (cur_col_id >= number_of_columns) {
|
155
|
+
if (cur_col_id >= number_of_columns && state_machine.state_machine_options.strict_mode.GetValue()) {
|
152
156
|
bool error = true;
|
153
157
|
if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
|
154
158
|
// we make an exception if the first over-value is null
|
@@ -220,6 +224,9 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
|
|
220
224
|
return;
|
221
225
|
}
|
222
226
|
if (cur_col_id >= number_of_columns) {
|
227
|
+
if (!state_machine.state_machine_options.strict_mode.GetValue()) {
|
228
|
+
return;
|
229
|
+
}
|
223
230
|
bool error = true;
|
224
231
|
if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
|
225
232
|
// we make an exception if the first over-value is null
|
@@ -245,9 +252,9 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
|
|
245
252
|
}
|
246
253
|
|
247
254
|
if (((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
|
248
|
-
// Check for the occurrence of escaped null string like \N only if
|
255
|
+
// Check for the occurrence of escaped null string like \N only if strict_mode is disabled
|
249
256
|
const bool check_unquoted_escaped_null =
|
250
|
-
state_machine.state_machine_options.
|
257
|
+
state_machine.state_machine_options.strict_mode.GetValue() == false && escaped && !quoted && size == 1;
|
251
258
|
for (idx_t i = 0; i < null_str_count; i++) {
|
252
259
|
bool is_null = false;
|
253
260
|
if (null_str_size[i] == 2 && null_str_ptr[i][0] == state_machine.state_machine_options.escape.GetValue()) {
|
@@ -485,19 +492,30 @@ void StringValueResult::Reset() {
|
|
485
492
|
cur_buffer = buffer_handles[iterator.GetBufferIdx()];
|
486
493
|
}
|
487
494
|
buffer_handles.clear();
|
495
|
+
idx_t actual_size = 0;
|
488
496
|
if (cur_buffer) {
|
489
497
|
buffer_handles[cur_buffer->buffer_idx] = cur_buffer;
|
498
|
+
actual_size = cur_buffer->actual_size;
|
490
499
|
}
|
491
500
|
current_errors.Reset();
|
492
501
|
borked_rows.clear();
|
502
|
+
current_line_position.begin = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, actual_size};
|
503
|
+
current_line_position.end = current_line_position.begin;
|
493
504
|
}
|
494
505
|
|
495
506
|
void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t buffer_pos) {
|
496
507
|
if (!result.unquoted) {
|
497
508
|
result.current_errors.Insert(UNTERMINATED_QUOTES, result.cur_col_id, result.chunk_col_id, result.last_position);
|
498
509
|
}
|
499
|
-
|
500
|
-
|
510
|
+
// remove potential empty values
|
511
|
+
idx_t length = buffer_pos - result.quoted_position - 1;
|
512
|
+
while (length > 0 && result.ignore_empty_values &&
|
513
|
+
result.buffer_ptr[result.quoted_position + 1 + length - 1] == ' ') {
|
514
|
+
length--;
|
515
|
+
}
|
516
|
+
length--;
|
517
|
+
AddPossiblyEscapedValue(result, buffer_pos, result.buffer_ptr + result.quoted_position + 1, length,
|
518
|
+
buffer_pos < result.last_position.buffer_pos + 2);
|
501
519
|
result.quoted = false;
|
502
520
|
}
|
503
521
|
|
@@ -511,6 +529,10 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
|
|
511
529
|
return;
|
512
530
|
}
|
513
531
|
}
|
532
|
+
if (result.cur_col_id >= result.number_of_columns &&
|
533
|
+
!result.state_machine.state_machine_options.strict_mode.GetValue()) {
|
534
|
+
return;
|
535
|
+
}
|
514
536
|
if (!result.HandleTooManyColumnsError(value_ptr, length)) {
|
515
537
|
// If it's an escaped value we have to remove all the escapes, this is not really great
|
516
538
|
// If we are going to escape, this vector must be a varchar vector
|
@@ -520,7 +542,6 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
|
|
520
542
|
// We have to write the cast error message.
|
521
543
|
std::ostringstream error;
|
522
544
|
// Casting Error Message
|
523
|
-
|
524
545
|
error << "Could not convert string \"" << std::string(value_ptr, length) << "\" to \'"
|
525
546
|
<< LogicalTypeIdToString(result.parse_types[result.chunk_col_id].type_id) << "\'";
|
526
547
|
auto error_string = error.str();
|
@@ -533,6 +554,7 @@ void StringValueResult::AddPossiblyEscapedValue(StringValueResult &result, const
|
|
533
554
|
auto value = StringValueScanner::RemoveEscape(
|
534
555
|
value_ptr, length, result.state_machine.dialect_options.state_machine_options.escape.GetValue(),
|
535
556
|
result.state_machine.dialect_options.state_machine_options.quote.GetValue(),
|
557
|
+
result.state_machine.dialect_options.state_machine_options.strict_mode.GetValue(),
|
536
558
|
result.parse_chunk.data[result.chunk_col_id]);
|
537
559
|
result.AddValueToVector(value.GetData(), value.GetSize());
|
538
560
|
}
|
@@ -806,7 +828,7 @@ bool StringValueResult::AddRowInternal() {
|
|
806
828
|
quoted_new_line = false;
|
807
829
|
// We need to check if we are getting the correct number of columns here.
|
808
830
|
// If columns are correct, we add it, and that's it.
|
809
|
-
if (cur_col_id
|
831
|
+
if (cur_col_id < number_of_columns) {
|
810
832
|
// We have too few columns:
|
811
833
|
if (null_padding) {
|
812
834
|
while (cur_col_id < number_of_columns) {
|
@@ -1231,7 +1253,8 @@ void StringValueScanner::ProcessExtraRow() {
|
|
1231
1253
|
}
|
1232
1254
|
}
|
1233
1255
|
|
1234
|
-
string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char escape, char quote,
|
1256
|
+
string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char escape, char quote, bool strict_mode,
|
1257
|
+
Vector &vector) {
|
1235
1258
|
// Figure out the exact size
|
1236
1259
|
idx_t str_pos = 0;
|
1237
1260
|
bool just_escaped = false;
|
@@ -1239,7 +1262,7 @@ string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char e
|
|
1239
1262
|
if (str_ptr[cur_pos] == escape && !just_escaped) {
|
1240
1263
|
just_escaped = true;
|
1241
1264
|
} else if (str_ptr[cur_pos] == quote) {
|
1242
|
-
if (just_escaped) {
|
1265
|
+
if (just_escaped || !strict_mode) {
|
1243
1266
|
str_pos++;
|
1244
1267
|
}
|
1245
1268
|
just_escaped = false;
|
@@ -1259,7 +1282,7 @@ string_t StringValueScanner::RemoveEscape(const char *str_ptr, idx_t end, char e
|
|
1259
1282
|
if (c == escape && !just_escaped) {
|
1260
1283
|
just_escaped = true;
|
1261
1284
|
} else if (str_ptr[cur_pos] == quote) {
|
1262
|
-
if (just_escaped) {
|
1285
|
+
if (just_escaped || !strict_mode) {
|
1263
1286
|
removed_escapes_ptr[str_pos++] = c;
|
1264
1287
|
}
|
1265
1288
|
just_escaped = false;
|
@@ -1289,10 +1312,8 @@ void StringValueScanner::ProcessOverBufferValue() {
|
|
1289
1312
|
}
|
1290
1313
|
if (states.NewRow() || states.NewValue()) {
|
1291
1314
|
break;
|
1292
|
-
} else {
|
1293
|
-
|
1294
|
-
over_buffer_string += previous_buffer[i];
|
1295
|
-
}
|
1315
|
+
} else if (!result.comment) {
|
1316
|
+
over_buffer_string += previous_buffer[i];
|
1296
1317
|
}
|
1297
1318
|
if (states.IsQuoted()) {
|
1298
1319
|
result.SetQuoted(result, j);
|
@@ -1323,16 +1344,13 @@ void StringValueScanner::ProcessOverBufferValue() {
|
|
1323
1344
|
if (states.EmptyLine()) {
|
1324
1345
|
if (state_machine->dialect_options.num_cols == 1) {
|
1325
1346
|
break;
|
1326
|
-
} else {
|
1327
|
-
continue;
|
1328
1347
|
}
|
1348
|
+
continue;
|
1329
1349
|
}
|
1330
1350
|
if (states.NewRow() || states.NewValue()) {
|
1331
1351
|
break;
|
1332
|
-
} else {
|
1333
|
-
|
1334
|
-
over_buffer_string += buffer_handle_ptr[iterator.pos.buffer_pos];
|
1335
|
-
}
|
1352
|
+
} else if (!result.comment && !states.IsComment()) {
|
1353
|
+
over_buffer_string += buffer_handle_ptr[iterator.pos.buffer_pos];
|
1336
1354
|
}
|
1337
1355
|
if (states.IsQuoted()) {
|
1338
1356
|
result.SetQuoted(result, j);
|
@@ -1357,26 +1375,34 @@ void StringValueScanner::ProcessOverBufferValue() {
|
|
1357
1375
|
}
|
1358
1376
|
if (!skip_value) {
|
1359
1377
|
string_t value;
|
1360
|
-
if (result.quoted) {
|
1361
|
-
|
1362
|
-
|
1378
|
+
if (result.quoted && !result.comment) {
|
1379
|
+
idx_t length = over_buffer_string.size() - 1 - result.quoted_position;
|
1380
|
+
while (length > 0 && result.ignore_empty_values &&
|
1381
|
+
over_buffer_string.c_str()[result.quoted_position + length] == ' ') {
|
1382
|
+
length--;
|
1383
|
+
}
|
1384
|
+
value = string_t(over_buffer_string.c_str() + result.quoted_position, UnsafeNumericCast<uint32_t>(length));
|
1363
1385
|
if (result.escaped) {
|
1364
1386
|
if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) {
|
1365
1387
|
const auto str_ptr = over_buffer_string.c_str() + result.quoted_position;
|
1366
|
-
value =
|
1367
|
-
|
1368
|
-
|
1369
|
-
|
1388
|
+
value =
|
1389
|
+
RemoveEscape(str_ptr, over_buffer_string.size() - 2,
|
1390
|
+
state_machine->dialect_options.state_machine_options.escape.GetValue(),
|
1391
|
+
state_machine->dialect_options.state_machine_options.quote.GetValue(),
|
1392
|
+
result.state_machine.dialect_options.state_machine_options.strict_mode.GetValue(),
|
1393
|
+
result.parse_chunk.data[result.chunk_col_id]);
|
1370
1394
|
}
|
1371
1395
|
}
|
1372
1396
|
} else {
|
1373
1397
|
value = string_t(over_buffer_string.c_str(), UnsafeNumericCast<uint32_t>(over_buffer_string.size()));
|
1374
1398
|
if (result.escaped) {
|
1375
1399
|
if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) {
|
1376
|
-
value =
|
1377
|
-
|
1378
|
-
|
1379
|
-
|
1400
|
+
value =
|
1401
|
+
RemoveEscape(over_buffer_string.c_str(), over_buffer_string.size(),
|
1402
|
+
state_machine->dialect_options.state_machine_options.escape.GetValue(),
|
1403
|
+
state_machine->dialect_options.state_machine_options.quote.GetValue(),
|
1404
|
+
result.state_machine.dialect_options.state_machine_options.strict_mode.GetValue(),
|
1405
|
+
result.parse_chunk.data[result.chunk_col_id]);
|
1380
1406
|
}
|
1381
1407
|
}
|
1382
1408
|
}
|
@@ -1436,7 +1462,7 @@ bool StringValueScanner::MoveToNextBuffer() {
|
|
1436
1462
|
// This means we reached the end of the file, we must add a last line if there is any to be added
|
1437
1463
|
if (states.EmptyLine() || states.NewRow() || result.added_last_line || states.IsCurrentNewRow() ||
|
1438
1464
|
states.IsNotSet()) {
|
1439
|
-
if (result.cur_col_id == result.number_of_columns) {
|
1465
|
+
if (result.cur_col_id == result.number_of_columns && !result.IsStateCurrent(CSVState::INVALID)) {
|
1440
1466
|
result.number_of_rows++;
|
1441
1467
|
}
|
1442
1468
|
result.cur_col_id = 0;
|
@@ -1453,7 +1479,7 @@ bool StringValueScanner::MoveToNextBuffer() {
|
|
1453
1479
|
}
|
1454
1480
|
lines_read++;
|
1455
1481
|
} else if (states.IsQuotedCurrent() &&
|
1456
|
-
state_machine->dialect_options.state_machine_options.
|
1482
|
+
state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
|
1457
1483
|
// Unterminated quote
|
1458
1484
|
LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos,
|
1459
1485
|
result.buffer_size};
|
@@ -1465,7 +1491,7 @@ bool StringValueScanner::MoveToNextBuffer() {
|
|
1465
1491
|
result.UnsetComment(result, iterator.pos.buffer_pos);
|
1466
1492
|
} else {
|
1467
1493
|
if (result.quoted && states.IsDelimiterBytes() &&
|
1468
|
-
state_machine->dialect_options.state_machine_options.
|
1494
|
+
state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
|
1469
1495
|
result.current_errors.Insert(UNTERMINATED_QUOTES, result.cur_col_id, result.chunk_col_id,
|
1470
1496
|
result.last_position);
|
1471
1497
|
}
|
@@ -1519,8 +1545,8 @@ bool StringValueScanner::FirstValueEndsOnQuote(CSVIterator iterator) const {
|
|
1519
1545
|
const idx_t to_pos = iterator.GetEndPos();
|
1520
1546
|
while (iterator.pos.buffer_pos < to_pos) {
|
1521
1547
|
state_machine->Transition(current_state, buffer_handle_ptr[iterator.pos.buffer_pos++]);
|
1522
|
-
if (
|
1523
|
-
|
1548
|
+
if (current_state.IsState(CSVState::DELIMITER) || current_state.IsState(CSVState::CARRIAGE_RETURN) ||
|
1549
|
+
current_state.IsState(CSVState::RECORD_SEPARATOR)) {
|
1524
1550
|
return buffer_handle_ptr[iterator.pos.buffer_pos - 2] ==
|
1525
1551
|
state_machine->dialect_options.state_machine_options.quote.GetValue();
|
1526
1552
|
}
|
@@ -1675,9 +1701,9 @@ void StringValueScanner::SetStart() {
|
|
1675
1701
|
// We need to initialize our strict state machine
|
1676
1702
|
auto &state_machine_cache = CSVStateMachineCache::Get(buffer_manager->context);
|
1677
1703
|
auto state_options = state_machine->state_machine_options;
|
1678
|
-
// To set the state machine to be strict we ensure that
|
1679
|
-
if (!state_options.
|
1680
|
-
state_options.
|
1704
|
+
// To set the state machine to be strict we ensure that strict_mode is set to true
|
1705
|
+
if (!state_options.strict_mode.IsSetByUser()) {
|
1706
|
+
state_options.strict_mode = true;
|
1681
1707
|
}
|
1682
1708
|
state_machine_strict =
|
1683
1709
|
make_shared_ptr<CSVStateMachine>(state_machine_cache.Get(state_options), state_machine->options);
|
@@ -1699,6 +1725,9 @@ void StringValueScanner::SetStart() {
|
|
1699
1725
|
if (!best_row.is_valid && !quoted_row.is_valid && best_row.start_pos < quoted_row.start_pos) {
|
1700
1726
|
best_row = quoted_row;
|
1701
1727
|
}
|
1728
|
+
if (quoted_row.is_valid && quoted_row.start_pos < best_row.start_pos) {
|
1729
|
+
best_row = quoted_row;
|
1730
|
+
}
|
1702
1731
|
}
|
1703
1732
|
// 3. We are in an escaped value
|
1704
1733
|
if (!best_row.is_valid && state_machine->dialect_options.state_machine_options.escape.GetValue() != '\0' &&
|
@@ -1794,7 +1823,7 @@ void StringValueScanner::FinalizeChunkProcess() {
|
|
1794
1823
|
}
|
1795
1824
|
}
|
1796
1825
|
if (states.IsQuotedCurrent() && !found_error &&
|
1797
|
-
state_machine->dialect_options.state_machine_options.
|
1826
|
+
state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
|
1798
1827
|
// If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated
|
1799
1828
|
// quotes
|
1800
1829
|
result.current_errors.Insert(type, result.cur_col_id, result.chunk_col_id, result.last_position);
|
@@ -156,11 +156,6 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc
|
|
156
156
|
} else {
|
157
157
|
new_line_id = DetectNewLineDelimiter(*buffer_manager);
|
158
158
|
}
|
159
|
-
// We only sniff RFC 4180 rules, unless manually set by user.
|
160
|
-
bool rfc_4180 = true;
|
161
|
-
if (options.dialect_options.state_machine_options.rfc_4180.IsSetByUser()) {
|
162
|
-
rfc_4180 = options.dialect_options.state_machine_options.rfc_4180.GetValue();
|
163
|
-
}
|
164
159
|
CSVIterator first_iterator;
|
165
160
|
bool iterator_set = false;
|
166
161
|
for (const auto quote_rule : dialect_candidates.quote_rule_candidates) {
|
@@ -172,8 +167,9 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<ColumnCountSc
|
|
172
167
|
for (const auto &escape : escape_candidates) {
|
173
168
|
for (const auto &comment : dialect_candidates.comment_candidates) {
|
174
169
|
D_ASSERT(buffer_manager);
|
175
|
-
CSVStateMachineOptions state_machine_options(
|
176
|
-
|
170
|
+
CSVStateMachineOptions state_machine_options(
|
171
|
+
delimiter, quote, escape, comment, new_line_id,
|
172
|
+
options.dialect_options.state_machine_options.strict_mode.GetValue());
|
177
173
|
auto sniffing_state_machine =
|
178
174
|
make_shared_ptr<CSVStateMachine>(options, state_machine_options, state_machine_cache);
|
179
175
|
if (options.dialect_options.skip_rows.IsSetByUser()) {
|
@@ -117,9 +117,7 @@ static void ReplaceNames(vector<string> &detected_names, CSVStateMachine &state_
|
|
117
117
|
detected_names.push_back(GenerateColumnName(options.name_list.size(), col++));
|
118
118
|
best_sql_types_candidates_per_column_idx[i] = {LogicalType::VARCHAR};
|
119
119
|
}
|
120
|
-
|
121
120
|
dialect_options.num_cols = options.name_list.size();
|
122
|
-
|
123
121
|
} else {
|
124
122
|
// we throw an error
|
125
123
|
const auto error = CSVError::HeaderSniffingError(
|
@@ -128,8 +126,16 @@ static void ReplaceNames(vector<string> &detected_names, CSVStateMachine &state_
|
|
128
126
|
error_handler.Error(error);
|
129
127
|
}
|
130
128
|
}
|
131
|
-
|
132
|
-
|
129
|
+
if (options.name_list.size() > detected_names.size()) {
|
130
|
+
// we throw an error
|
131
|
+
const auto error =
|
132
|
+
CSVError::HeaderSniffingError(options, best_header_row, options.name_list.size(),
|
133
|
+
state_machine.dialect_options.state_machine_options.delimiter.GetValue());
|
134
|
+
error_handler.Error(error);
|
135
|
+
} else {
|
136
|
+
for (idx_t i = 0; i < options.name_list.size(); i++) {
|
137
|
+
detected_names[i] = options.name_list[i];
|
138
|
+
}
|
133
139
|
}
|
134
140
|
}
|
135
141
|
}
|
@@ -335,7 +341,7 @@ void CSVSniffer::DetectHeader() {
|
|
335
341
|
auto &sniffer_state_machine = best_candidate->GetStateMachine();
|
336
342
|
names = DetectHeaderInternal(buffer_manager->context, best_header_row, sniffer_state_machine, set_columns,
|
337
343
|
best_sql_types_candidates_per_column_idx, options, *error_handler);
|
338
|
-
if (
|
344
|
+
if (EmptyOrOnlyHeader()) {
|
339
345
|
// This file only contains a header, lets default to the lowest type of all.
|
340
346
|
detected_types.clear();
|
341
347
|
for (idx_t i = 0; i < names.size(); i++) {
|
@@ -99,6 +99,10 @@ idx_t CSVSniffer::LinesSniffed() const {
|
|
99
99
|
return lines_sniffed;
|
100
100
|
}
|
101
101
|
|
102
|
+
bool CSVSniffer::EmptyOrOnlyHeader() const {
|
103
|
+
return (single_row_file && best_candidate->state_machine->dialect_options.header.GetValue()) || lines_sniffed == 0;
|
104
|
+
}
|
105
|
+
|
102
106
|
bool CSVSniffer::CanYouCastIt(ClientContext &context, const string_t value, const LogicalType &type,
|
103
107
|
const DialectOptions &dialect_options, const bool is_null, const char decimal_separator) {
|
104
108
|
if (is_null) {
|
package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp
CHANGED
@@ -31,7 +31,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
|
|
31
31
|
InitializeTransitionArray(transition_array, cur_state, CSVState::QUOTED);
|
32
32
|
break;
|
33
33
|
case CSVState::UNQUOTED:
|
34
|
-
if (state_machine_options.
|
34
|
+
if (state_machine_options.strict_mode.GetValue()) {
|
35
35
|
// If we have an unquoted state, following rfc 4180, our base state is invalid
|
36
36
|
InitializeTransitionArray(transition_array, cur_state, CSVState::INVALID);
|
37
37
|
} else {
|
@@ -58,7 +58,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
|
|
58
58
|
|
59
59
|
const bool multi_byte_delimiter = delimiter_value.size() != 1;
|
60
60
|
|
61
|
-
const bool enable_unquoted_escape = state_machine_options.
|
61
|
+
const bool enable_unquoted_escape = state_machine_options.strict_mode.GetValue() == false &&
|
62
62
|
state_machine_options.quote != state_machine_options.escape &&
|
63
63
|
state_machine_options.escape != '\0';
|
64
64
|
// Now set values depending on configuration
|
@@ -75,7 +75,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
|
|
75
75
|
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::CARRIAGE_RETURN;
|
76
76
|
if (state == static_cast<uint8_t>(CSVState::STANDARD_NEWLINE)) {
|
77
77
|
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::STANDARD;
|
78
|
-
} else if (!state_machine_options.
|
78
|
+
} else if (!state_machine_options.strict_mode.GetValue()) {
|
79
79
|
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
|
80
80
|
} else {
|
81
81
|
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::INVALID;
|
@@ -227,7 +227,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
|
|
227
227
|
if (state_machine_options.quote == state_machine_options.escape) {
|
228
228
|
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
|
229
229
|
}
|
230
|
-
if (state_machine_options.
|
230
|
+
if (state_machine_options.strict_mode == false) {
|
231
231
|
if (escape == '\0') {
|
232
232
|
// If escape is defined, it limits a bit how relaxed quotes can be in a reliable way.
|
233
233
|
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::MAYBE_QUOTED;
|
@@ -413,10 +413,10 @@ CSVStateMachineCache::CSVStateMachineCache() {
|
|
413
413
|
const auto &escape_candidates = default_escape[static_cast<uint8_t>(quote_rule)];
|
414
414
|
for (const auto &escape : escape_candidates) {
|
415
415
|
for (const auto &comment : default_comment) {
|
416
|
-
for (const bool
|
417
|
-
Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_N,
|
418
|
-
Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_R,
|
419
|
-
Insert({delimiter, quote, escape, comment, NewLineIdentifier::CARRY_ON,
|
416
|
+
for (const bool strict_mode : {true, false}) {
|
417
|
+
Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_N, strict_mode});
|
418
|
+
Insert({delimiter, quote, escape, comment, NewLineIdentifier::SINGLE_R, strict_mode});
|
419
|
+
Insert({delimiter, quote, escape, comment, NewLineIdentifier::CARRY_ON, strict_mode});
|
420
420
|
}
|
421
421
|
}
|
422
422
|
}
|
@@ -194,11 +194,18 @@ void CSVErrorHandler::FillRejectsTable(InternalAppender &errors_appender, const
|
|
194
194
|
errors_appender.Append(Value());
|
195
195
|
break;
|
196
196
|
case CSVErrorType::TOO_FEW_COLUMNS:
|
197
|
-
|
198
|
-
|
197
|
+
if (col_idx + 1 < bind_data.return_names.size()) {
|
198
|
+
errors_appender.Append(string_t(bind_data.return_names[col_idx + 1]));
|
199
|
+
} else {
|
200
|
+
errors_appender.Append(Value());
|
201
|
+
}
|
199
202
|
break;
|
200
203
|
default:
|
201
|
-
|
204
|
+
if (col_idx < bind_data.return_names.size()) {
|
205
|
+
errors_appender.Append(string_t(bind_data.return_names[col_idx]));
|
206
|
+
} else {
|
207
|
+
errors_appender.Append(Value());
|
208
|
+
}
|
202
209
|
}
|
203
210
|
// 8. Error Type
|
204
211
|
errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type)));
|
@@ -321,11 +328,13 @@ CSVError CSVError::InvalidState(const CSVReaderOptions &options, idx_t current_c
|
|
321
328
|
std::ostringstream error;
|
322
329
|
error << "The CSV Parser state machine reached an invalid state.\nThis can happen when is not possible to parse "
|
323
330
|
"your CSV File with the given options, or the CSV File is not RFC 4180 compliant ";
|
324
|
-
|
325
331
|
std::ostringstream how_to_fix_it;
|
326
|
-
|
327
|
-
|
328
|
-
|
332
|
+
if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
|
333
|
+
how_to_fix_it << "Possible fixes:" << '\n';
|
334
|
+
how_to_fix_it << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not "
|
335
|
+
"comply with the CSV standard."
|
336
|
+
<< '\n';
|
337
|
+
}
|
329
338
|
return CSVError(error.str(), INVALID_STATE, current_column, csv_row, error_info, row_byte_position, byte_position,
|
330
339
|
options, how_to_fix_it.str(), current_path);
|
331
340
|
}
|
@@ -356,6 +365,11 @@ CSVError CSVError::HeaderSniffingError(const CSVReaderOptions &options, const ve
|
|
356
365
|
|
357
366
|
// 3. Suggest how to fix it!
|
358
367
|
error << "Possible fixes:" << '\n';
|
368
|
+
if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
|
369
|
+
error << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not comply with "
|
370
|
+
"the CSV standard."
|
371
|
+
<< '\n';
|
372
|
+
}
|
359
373
|
// header
|
360
374
|
if (!options.dialect_options.header.IsSetByUser()) {
|
361
375
|
error << "* Set header (header = true) if your CSV has a header, or (header = false) if it doesn't" << '\n';
|
@@ -395,6 +409,11 @@ CSVError CSVError::SniffingError(const CSVReaderOptions &options, const string &
|
|
395
409
|
// 3. Suggest how to fix it!
|
396
410
|
error << "Possible fixes:" << '\n';
|
397
411
|
// 3.1 Inform the reader of the dialect
|
412
|
+
if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
|
413
|
+
error << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not comply with "
|
414
|
+
"the CSV standard."
|
415
|
+
<< '\n';
|
416
|
+
}
|
398
417
|
// delimiter
|
399
418
|
if (!options.dialect_options.state_machine_options.delimiter.IsSetByUser()) {
|
400
419
|
error << "* Set delimiter (e.g., delim=\',\')" << '\n';
|
@@ -440,11 +459,6 @@ CSVError CSVError::SniffingError(const CSVReaderOptions &options, const string &
|
|
440
459
|
error << "* Be sure that the maximum line size is set to an appropriate value, otherwise set it (e.g., "
|
441
460
|
"max_line_size=10000000)"
|
442
461
|
<< "\n";
|
443
|
-
|
444
|
-
if (options.dialect_options.state_machine_options.rfc_4180.GetValue() != false ||
|
445
|
-
!options.dialect_options.state_machine_options.rfc_4180.IsSetByUser()) {
|
446
|
-
error << "* Enable scanning files that are not RFC 4180 compliant (rfc_4180=false). " << '\n';
|
447
|
-
}
|
448
462
|
return CSVError(error.str(), SNIFFING, {});
|
449
463
|
}
|
450
464
|
|
@@ -466,6 +480,11 @@ CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_
|
|
466
480
|
error << "Value with unterminated quote found." << '\n';
|
467
481
|
std::ostringstream how_to_fix_it;
|
468
482
|
how_to_fix_it << "Possible fixes:" << '\n';
|
483
|
+
if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
|
484
|
+
how_to_fix_it << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not "
|
485
|
+
"comply with the CSV standard."
|
486
|
+
<< '\n';
|
487
|
+
}
|
469
488
|
how_to_fix_it << "* Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
|
470
489
|
how_to_fix_it << "* Set quote to empty or to a different value (e.g., quote=\'\')" << '\n';
|
471
490
|
return CSVError(error.str(), UNTERMINATED_QUOTES, current_column, csv_row, error_info, row_byte_position,
|
@@ -479,6 +498,11 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i
|
|
479
498
|
// We don't have a fix for this
|
480
499
|
std::ostringstream how_to_fix_it;
|
481
500
|
how_to_fix_it << "Possible fixes:" << '\n';
|
501
|
+
if (options.dialect_options.state_machine_options.strict_mode.GetValue()) {
|
502
|
+
how_to_fix_it << "* Disable the parser's strict mode (strict_mode=false) to allow reading rows that do not "
|
503
|
+
"comply with the CSV standard."
|
504
|
+
<< '\n';
|
505
|
+
}
|
482
506
|
if (!options.null_padding) {
|
483
507
|
how_to_fix_it << "* Enable null padding (null_padding=true) to replace missing values with NULL" << '\n';
|
484
508
|
}
|
@@ -189,11 +189,11 @@ void CSVReaderOptions::SetNewline(const string &input) {
|
|
189
189
|
}
|
190
190
|
|
191
191
|
bool CSVReaderOptions::GetRFC4180() const {
|
192
|
-
return this->dialect_options.state_machine_options.
|
192
|
+
return this->dialect_options.state_machine_options.strict_mode.GetValue();
|
193
193
|
}
|
194
194
|
|
195
195
|
void CSVReaderOptions::SetRFC4180(bool input) {
|
196
|
-
this->dialect_options.state_machine_options.
|
196
|
+
this->dialect_options.state_machine_options.strict_mode.Set(input);
|
197
197
|
}
|
198
198
|
|
199
199
|
bool CSVReaderOptions::IgnoreErrors() const {
|
@@ -413,7 +413,7 @@ bool CSVReaderOptions::SetBaseOption(const string &loption, const Value &value,
|
|
413
413
|
|
414
414
|
} else if (loption == "compression") {
|
415
415
|
SetCompression(ParseString(value, loption));
|
416
|
-
} else if (loption == "
|
416
|
+
} else if (loption == "strict_mode") {
|
417
417
|
SetRFC4180(ParseBoolean(value, loption));
|
418
418
|
} else {
|
419
419
|
// unrecognized option in base CSV
|
@@ -440,7 +440,7 @@ string CSVReaderOptions::ToString(const string ¤t_file_path) const {
|
|
440
440
|
auto &escape = dialect_options.state_machine_options.escape;
|
441
441
|
auto &comment = dialect_options.state_machine_options.comment;
|
442
442
|
auto &new_line = dialect_options.state_machine_options.new_line;
|
443
|
-
auto &
|
443
|
+
auto &strict_mode = dialect_options.state_machine_options.strict_mode;
|
444
444
|
auto &skip_rows = dialect_options.skip_rows;
|
445
445
|
|
446
446
|
auto &header = dialect_options.header;
|
@@ -460,8 +460,8 @@ string CSVReaderOptions::ToString(const string ¤t_file_path) const {
|
|
460
460
|
error += FormatOptionLine("skip_rows", skip_rows);
|
461
461
|
// comment
|
462
462
|
error += FormatOptionLine("comment", comment);
|
463
|
-
//
|
464
|
-
error += FormatOptionLine("
|
463
|
+
// strict_mode
|
464
|
+
error += FormatOptionLine("strict_mode", strict_mode);
|
465
465
|
// date format
|
466
466
|
error += FormatOptionLine("date_format", dialect_options.date_format.at(LogicalType::DATE));
|
467
467
|
// timestamp format
|
@@ -638,6 +638,9 @@ void CSVReaderOptions::FromNamedParameters(const named_parameter_map_t &in, Clie
|
|
638
638
|
}
|
639
639
|
auto &children = ListValue::GetChildren(kv.second);
|
640
640
|
for (auto &child : children) {
|
641
|
+
if (child.IsNull()) {
|
642
|
+
throw BinderException("read_csv %s parameter cannot have a NULL value", kv.first);
|
643
|
+
}
|
641
644
|
name_list.push_back(StringValue::Get(child));
|
642
645
|
}
|
643
646
|
for (auto &name : name_list) {
|
@@ -716,7 +719,7 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) co
|
|
716
719
|
auto "e = dialect_options.state_machine_options.quote;
|
717
720
|
auto &escape = dialect_options.state_machine_options.escape;
|
718
721
|
auto &comment = dialect_options.state_machine_options.comment;
|
719
|
-
auto &
|
722
|
+
auto &strict_mode = dialect_options.state_machine_options.strict_mode;
|
720
723
|
auto &header = dialect_options.header;
|
721
724
|
if (delimiter.IsSetByUser()) {
|
722
725
|
named_params["delim"] = Value(GetDelimiter());
|
@@ -736,8 +739,8 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) co
|
|
736
739
|
if (header.IsSetByUser()) {
|
737
740
|
named_params["header"] = Value(GetHeader());
|
738
741
|
}
|
739
|
-
if (
|
740
|
-
named_params["
|
742
|
+
if (strict_mode.IsSetByUser()) {
|
743
|
+
named_params["strict_mode"] = Value(GetRFC4180());
|
741
744
|
}
|
742
745
|
named_params["max_line_size"] = Value::BIGINT(NumericCast<int64_t>(maximum_line_size.GetValue()));
|
743
746
|
if (dialect_options.skip_rows.IsSetByUser()) {
|
@@ -638,7 +638,6 @@ void JoinFilterPushdownInfo::PushInFilter(const JoinFilterPushdownFilter &info,
|
|
638
638
|
|
639
639
|
// generate the OR filter
|
640
640
|
auto in_filter = make_uniq<InFilter>(std::move(in_list));
|
641
|
-
in_filter->origin_is_hash_join = true;
|
642
641
|
|
643
642
|
// we push the OR filter as an OptionalFilter so that we can use it for zonemap pruning only
|
644
643
|
// the IN-list is expensive to execute otherwise
|
@@ -1,6 +1,8 @@
|
|
1
1
|
#include "duckdb/execution/operator/persistent/physical_copy_database.hpp"
|
2
|
+
|
2
3
|
#include "duckdb/catalog/catalog.hpp"
|
3
4
|
#include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp"
|
5
|
+
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
|
4
6
|
#include "duckdb/planner/binder.hpp"
|
5
7
|
#include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
|
6
8
|
#include "duckdb/parser/parsed_data/create_schema_info.hpp"
|
@@ -9,6 +11,8 @@
|
|
9
11
|
#include "duckdb/parser/parsed_data/create_type_info.hpp"
|
10
12
|
#include "duckdb/parser/parsed_data/create_view_info.hpp"
|
11
13
|
#include "duckdb/parser/parsed_data/create_index_info.hpp"
|
14
|
+
#include "duckdb/execution/index/unbound_index.hpp"
|
15
|
+
#include "duckdb/storage/data_table.hpp"
|
12
16
|
|
13
17
|
namespace duckdb {
|
14
18
|
|
@@ -52,7 +56,7 @@ SourceResultType PhysicalCopyDatabase::GetData(ExecutionContext &context, DataCh
|
|
52
56
|
break;
|
53
57
|
}
|
54
58
|
case CatalogType::INDEX_ENTRY: {
|
55
|
-
|
59
|
+
// Skip for now.
|
56
60
|
break;
|
57
61
|
}
|
58
62
|
default:
|
@@ -60,6 +64,30 @@ SourceResultType PhysicalCopyDatabase::GetData(ExecutionContext &context, DataCh
|
|
60
64
|
CatalogTypeToString(create_info->type));
|
61
65
|
}
|
62
66
|
}
|
67
|
+
|
68
|
+
// Create the indexes after table creation.
|
69
|
+
for (auto &create_info : info->entries) {
|
70
|
+
if (!create_info || create_info->type != CatalogType::INDEX_ENTRY) {
|
71
|
+
continue;
|
72
|
+
}
|
73
|
+
catalog.CreateIndex(context.client, create_info->Cast<CreateIndexInfo>());
|
74
|
+
|
75
|
+
auto &create_index_info = create_info->Cast<CreateIndexInfo>();
|
76
|
+
auto &catalog_table = catalog.GetEntry(context.client, CatalogType::TABLE_ENTRY, create_index_info.schema,
|
77
|
+
create_index_info.table);
|
78
|
+
auto &table_entry = catalog_table.Cast<TableCatalogEntry>();
|
79
|
+
auto &data_table = table_entry.GetStorage();
|
80
|
+
|
81
|
+
IndexStorageInfo storage_info(create_index_info.index_name);
|
82
|
+
storage_info.options.emplace("v1_0_0_storage", false);
|
83
|
+
auto unbound_index = make_uniq<UnboundIndex>(create_index_info.Copy(), storage_info,
|
84
|
+
data_table.GetTableIOManager(), catalog.GetAttached());
|
85
|
+
|
86
|
+
data_table.AddIndex(std::move(unbound_index));
|
87
|
+
auto &data_table_info = *data_table.GetDataTableInfo();
|
88
|
+
data_table_info.GetIndexes().InitializeIndexes(context.client, data_table_info);
|
89
|
+
}
|
90
|
+
|
63
91
|
return SourceResultType::FINISHED;
|
64
92
|
}
|
65
93
|
|