duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -344,13 +344,14 @@ void WindowBoundariesState::Update(const idx_t row_idx, const WindowInputColumn
|
|
344
344
|
|
345
345
|
// when the partition changes, recompute the boundaries
|
346
346
|
if (!is_same_partition || is_jump) {
|
347
|
-
partition_start = row_idx;
|
348
|
-
peer_start = row_idx;
|
349
|
-
|
350
347
|
if (is_jump) {
|
351
|
-
// Go back as far as the previous partition start
|
352
348
|
idx_t n = 1;
|
353
|
-
partition_start = FindPrevStart(partition_mask,
|
349
|
+
partition_start = FindPrevStart(partition_mask, 0, row_idx + 1, n);
|
350
|
+
n = 1;
|
351
|
+
peer_start = FindPrevStart(order_mask, 0, row_idx + 1, n);
|
352
|
+
} else {
|
353
|
+
partition_start = row_idx;
|
354
|
+
peer_start = row_idx;
|
354
355
|
}
|
355
356
|
|
356
357
|
// find end of partition
|
@@ -668,7 +668,7 @@ int StrpTimeFormat::NumericSpecifierWidth(StrTimeSpecifier specifier) {
|
|
668
668
|
enum class TimeSpecifierAMOrPM : uint8_t { TIME_SPECIFIER_NONE = 0, TIME_SPECIFIER_AM = 1, TIME_SPECIFIER_PM = 2 };
|
669
669
|
|
670
670
|
int32_t StrpTimeFormat::TryParseCollection(const char *data, idx_t &pos, idx_t size, const string_t collection[],
|
671
|
-
idx_t collection_count) {
|
671
|
+
idx_t collection_count) const {
|
672
672
|
for (idx_t c = 0; c < collection_count; c++) {
|
673
673
|
auto &entry = collection[c];
|
674
674
|
auto entry_data = entry.GetData();
|
@@ -695,7 +695,7 @@ int32_t StrpTimeFormat::TryParseCollection(const char *data, idx_t &pos, idx_t s
|
|
695
695
|
}
|
696
696
|
|
697
697
|
//! Parses a timestamp using the given specifier
|
698
|
-
bool StrpTimeFormat::Parse(string_t str, ParseResult &result) {
|
698
|
+
bool StrpTimeFormat::Parse(string_t str, ParseResult &result) const {
|
699
699
|
auto &result_data = result.data;
|
700
700
|
auto &error_message = result.error_message;
|
701
701
|
auto &error_position = result.error_position;
|
@@ -1146,7 +1146,7 @@ string StrpTimeFormat::ParseResult::FormatError(string_t input, const string &fo
|
|
1146
1146
|
FormatStrpTimeError(input.GetString(), error_position), error_message);
|
1147
1147
|
}
|
1148
1148
|
|
1149
|
-
bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_message) {
|
1149
|
+
bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_message) const {
|
1150
1150
|
ParseResult parse_result;
|
1151
1151
|
if (!Parse(input, parse_result)) {
|
1152
1152
|
error_message = parse_result.FormatError(input, format_specifier);
|
@@ -1155,7 +1155,7 @@ bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_
|
|
1155
1155
|
return parse_result.TryToDate(result);
|
1156
1156
|
}
|
1157
1157
|
|
1158
|
-
bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) {
|
1158
|
+
bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const {
|
1159
1159
|
ParseResult parse_result;
|
1160
1160
|
if (!Parse(input, parse_result)) {
|
1161
1161
|
error_message = parse_result.FormatError(input, format_specifier);
|
@@ -6,6 +6,7 @@
|
|
6
6
|
#include "duckdb/common/types/column/column_data_collection.hpp"
|
7
7
|
#include "duckdb/common/types/string_type.hpp"
|
8
8
|
#include "duckdb/common/vector_operations/vector_operations.hpp"
|
9
|
+
#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
|
9
10
|
#include "duckdb/function/copy_function.hpp"
|
10
11
|
#include "duckdb/function/scalar/string_functions.hpp"
|
11
12
|
#include "duckdb/function/table/read_csv.hpp"
|
@@ -15,11 +16,20 @@
|
|
15
16
|
|
16
17
|
namespace duckdb {
|
17
18
|
|
18
|
-
void
|
19
|
-
if (str_1
|
19
|
+
void AreOptionsEqual(char &str_1, char &str_2, const string &name_str_1, const string &name_str_2) {
|
20
|
+
if (str_1 == '\0' || str_2 == '\0') {
|
20
21
|
return;
|
21
22
|
}
|
22
|
-
if (
|
23
|
+
if (str_1 == str_2) {
|
24
|
+
throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
void SubstringDetection(char &str_1, string &str_2, const string &name_str_1, const string &name_str_2) {
|
29
|
+
if (str_1 == '\0' || str_2.empty()) {
|
30
|
+
return;
|
31
|
+
}
|
32
|
+
if (str_2.find(str_1) != string::npos) {
|
23
33
|
throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
|
24
34
|
}
|
25
35
|
}
|
@@ -28,34 +38,46 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
|
|
28
38
|
// Bind
|
29
39
|
//===--------------------------------------------------------------------===//
|
30
40
|
|
41
|
+
void WriteQuoteOrEscape(Serializer &serializer, char quote_or_escape) {
|
42
|
+
if (quote_or_escape != '\0') {
|
43
|
+
serializer.Write(quote_or_escape);
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
31
47
|
void BaseCSVData::Finalize() {
|
32
48
|
// verify that the options are correct in the final pass
|
33
|
-
if (options.escape
|
34
|
-
options.escape = options.quote;
|
49
|
+
if (options.dialect_options.state_machine_options.escape == '\0') {
|
50
|
+
options.dialect_options.state_machine_options.escape = options.dialect_options.state_machine_options.quote;
|
35
51
|
}
|
36
52
|
// escape and delimiter must not be substrings of each other
|
37
53
|
if (options.has_delimiter && options.has_escape) {
|
38
|
-
|
54
|
+
AreOptionsEqual(options.dialect_options.state_machine_options.delimiter,
|
55
|
+
options.dialect_options.state_machine_options.escape, "DELIMITER", "ESCAPE");
|
39
56
|
}
|
40
57
|
// delimiter and quote must not be substrings of each other
|
41
58
|
if (options.has_quote && options.has_delimiter) {
|
42
|
-
|
59
|
+
AreOptionsEqual(options.dialect_options.state_machine_options.quote,
|
60
|
+
options.dialect_options.state_machine_options.delimiter, "DELIMITER", "QUOTE");
|
43
61
|
}
|
44
62
|
// escape and quote must not be substrings of each other (but can be the same)
|
45
|
-
if (options.quote != options.escape &&
|
46
|
-
|
63
|
+
if (options.dialect_options.state_machine_options.quote != options.dialect_options.state_machine_options.escape &&
|
64
|
+
options.has_quote && options.has_escape) {
|
65
|
+
AreOptionsEqual(options.dialect_options.state_machine_options.quote,
|
66
|
+
options.dialect_options.state_machine_options.escape, "QUOTE", "ESCAPE");
|
47
67
|
}
|
48
68
|
if (!options.null_str.empty()) {
|
49
69
|
// null string and delimiter must not be substrings of each other
|
50
70
|
if (options.has_delimiter) {
|
51
|
-
SubstringDetection(options.delimiter, options.null_str, "DELIMITER",
|
71
|
+
SubstringDetection(options.dialect_options.state_machine_options.delimiter, options.null_str, "DELIMITER",
|
72
|
+
"NULL");
|
52
73
|
}
|
53
74
|
// quote/escape and nullstr must not be substrings of each other
|
54
75
|
if (options.has_quote) {
|
55
|
-
SubstringDetection(options.quote, options.null_str, "QUOTE", "NULL");
|
76
|
+
SubstringDetection(options.dialect_options.state_machine_options.quote, options.null_str, "QUOTE", "NULL");
|
56
77
|
}
|
57
78
|
if (options.has_escape) {
|
58
|
-
SubstringDetection(options.escape, options.null_str, "ESCAPE",
|
79
|
+
SubstringDetection(options.dialect_options.state_machine_options.escape, options.null_str, "ESCAPE",
|
80
|
+
"NULL");
|
59
81
|
}
|
60
82
|
}
|
61
83
|
|
@@ -63,7 +85,7 @@ void BaseCSVData::Finalize() {
|
|
63
85
|
if (options.prefix.empty() || options.suffix.empty()) {
|
64
86
|
throw BinderException("COPY ... (FORMAT CSV) must have both PREFIX and SUFFIX, or none at all");
|
65
87
|
}
|
66
|
-
if (options.header) {
|
88
|
+
if (options.dialect_options.header) {
|
67
89
|
throw BinderException("COPY ... (FORMAT CSV)'s HEADER cannot be combined with PREFIX/SUFFIX");
|
68
90
|
}
|
69
91
|
}
|
@@ -85,16 +107,14 @@ static unique_ptr<FunctionData> WriteCSVBind(ClientContext &context, CopyInfo &i
|
|
85
107
|
bind_data->options.force_quote.resize(names.size(), false);
|
86
108
|
}
|
87
109
|
bind_data->Finalize();
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
bind_data->requires_quotes[bind_data->options.quote[0]] = true;
|
97
|
-
}
|
110
|
+
|
111
|
+
bind_data->requires_quotes = make_unsafe_uniq_array<bool>(256);
|
112
|
+
memset(bind_data->requires_quotes.get(), 0, sizeof(bool) * 256);
|
113
|
+
bind_data->requires_quotes['\n'] = true;
|
114
|
+
bind_data->requires_quotes['\r'] = true;
|
115
|
+
bind_data->requires_quotes[bind_data->options.dialect_options.state_machine_options.delimiter] = true;
|
116
|
+
bind_data->requires_quotes[bind_data->options.dialect_options.state_machine_options.quote] = true;
|
117
|
+
|
98
118
|
if (!bind_data->options.write_newline.empty()) {
|
99
119
|
bind_data->newline = bind_data->options.write_newline;
|
100
120
|
}
|
@@ -129,13 +149,24 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
|
|
129
149
|
for (auto &option : info.options) {
|
130
150
|
options_map[option.first] = ConvertVectorToValue(std::move(option.second));
|
131
151
|
}
|
152
|
+
options.file_path = bind_data->files[0];
|
153
|
+
options.name_list = expected_names;
|
154
|
+
options.sql_type_list = expected_types;
|
155
|
+
for (idx_t i = 0; i < expected_types.size(); i++) {
|
156
|
+
options.sql_types_per_column[expected_names[i]] = i;
|
157
|
+
}
|
132
158
|
|
133
159
|
bind_data->FinalizeRead(context);
|
134
|
-
if (
|
135
|
-
|
136
|
-
|
137
|
-
auto
|
138
|
-
options
|
160
|
+
if (options.auto_detect) {
|
161
|
+
// We must run the sniffer.
|
162
|
+
auto file_handle = BaseCSVReader::OpenCSV(context, options);
|
163
|
+
auto buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
|
164
|
+
CSVSniffer sniffer(options, buffer_manager, bind_data->state_machine_cache);
|
165
|
+
auto sniffer_result = sniffer.SniffCSV();
|
166
|
+
bind_data->csv_types = sniffer_result.return_types;
|
167
|
+
bind_data->csv_names = sniffer_result.names;
|
168
|
+
bind_data->return_types = sniffer_result.return_types;
|
169
|
+
bind_data->return_names = sniffer_result.names;
|
139
170
|
}
|
140
171
|
return std::move(bind_data);
|
141
172
|
}
|
@@ -143,7 +174,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
|
|
143
174
|
//===--------------------------------------------------------------------===//
|
144
175
|
// Helper writing functions
|
145
176
|
//===--------------------------------------------------------------------===//
|
146
|
-
static string AddEscapes(
|
177
|
+
static string AddEscapes(char &to_be_escaped, const char &escape, const string &val) {
|
147
178
|
idx_t i = 0;
|
148
179
|
string new_val = "";
|
149
180
|
idx_t found = val.find(to_be_escaped);
|
@@ -153,8 +184,10 @@ static string AddEscapes(string &to_be_escaped, const string &escape, const stri
|
|
153
184
|
new_val += val[i];
|
154
185
|
i++;
|
155
186
|
}
|
156
|
-
|
157
|
-
|
187
|
+
if (escape != '\0') {
|
188
|
+
new_val += escape;
|
189
|
+
found = val.find(to_be_escaped, found + 1);
|
190
|
+
}
|
158
191
|
}
|
159
192
|
while (i < val.length()) {
|
160
193
|
new_val += val[i];
|
@@ -169,43 +202,16 @@ static bool RequiresQuotes(WriteCSVData &csv_data, const char *str, idx_t len) {
|
|
169
202
|
if (len == options.null_str.size() && memcmp(str, options.null_str.c_str(), len) == 0) {
|
170
203
|
return true;
|
171
204
|
}
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
if (csv_data.requires_quotes[str_data[i]]) {
|
177
|
-
// this byte requires quotes - write a quoted string
|
178
|
-
return true;
|
179
|
-
}
|
180
|
-
}
|
181
|
-
// no newline, quote or delimiter in the string
|
182
|
-
// no quoting or escaping necessary
|
183
|
-
return false;
|
184
|
-
} else {
|
185
|
-
// CSV with complex quotes/delimiter (multiple bytes)
|
186
|
-
|
187
|
-
// first check for \n, \r, \n\r in string
|
188
|
-
for (idx_t i = 0; i < len; i++) {
|
189
|
-
if (str[i] == '\n' || str[i] == '\r') {
|
190
|
-
// newline, write a quoted string
|
191
|
-
return true;
|
192
|
-
}
|
193
|
-
}
|
194
|
-
|
195
|
-
// check for delimiter
|
196
|
-
if (options.delimiter.length() != 0 &&
|
197
|
-
ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.delimiter.c_str()),
|
198
|
-
options.delimiter.size()) != DConstants::INVALID_INDEX) {
|
205
|
+
auto str_data = reinterpret_cast<const_data_ptr_t>(str);
|
206
|
+
for (idx_t i = 0; i < len; i++) {
|
207
|
+
if (csv_data.requires_quotes[str_data[i]]) {
|
208
|
+
// this byte requires quotes - write a quoted string
|
199
209
|
return true;
|
200
210
|
}
|
201
|
-
// check for quote
|
202
|
-
if (options.quote.length() != 0 &&
|
203
|
-
ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.quote.c_str()),
|
204
|
-
options.quote.size()) != DConstants::INVALID_INDEX) {
|
205
|
-
return true;
|
206
|
-
}
|
207
|
-
return false;
|
208
211
|
}
|
212
|
+
// no newline, quote or delimiter in the string
|
213
|
+
// no quoting or escaping necessary
|
214
|
+
return false;
|
209
215
|
}
|
210
216
|
|
211
217
|
static void WriteQuotedString(Serializer &serializer, WriteCSVData &csv_data, const char *str, idx_t len,
|
@@ -218,46 +224,37 @@ static void WriteQuotedString(Serializer &serializer, WriteCSVData &csv_data, co
|
|
218
224
|
if (force_quote) {
|
219
225
|
// quoting is enabled: we might need to escape things in the string
|
220
226
|
bool requires_escape = false;
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
requires_escape = true;
|
227
|
-
break;
|
228
|
-
}
|
229
|
-
}
|
230
|
-
} else {
|
231
|
-
// complex CSV
|
232
|
-
// check for quote or escape separately
|
233
|
-
if (options.quote.length() != 0 &&
|
234
|
-
ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.quote.c_str()),
|
235
|
-
options.quote.size()) != DConstants::INVALID_INDEX) {
|
236
|
-
requires_escape = true;
|
237
|
-
} else if (options.escape.length() != 0 &&
|
238
|
-
ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.escape.c_str()),
|
239
|
-
options.escape.size()) != DConstants::INVALID_INDEX) {
|
227
|
+
// simple CSV
|
228
|
+
// do a single loop to check for a quote or escape value
|
229
|
+
for (idx_t i = 0; i < len; i++) {
|
230
|
+
if (str[i] == options.dialect_options.state_machine_options.quote ||
|
231
|
+
str[i] == options.dialect_options.state_machine_options.escape) {
|
240
232
|
requires_escape = true;
|
233
|
+
break;
|
241
234
|
}
|
242
235
|
}
|
236
|
+
|
243
237
|
if (!requires_escape) {
|
244
238
|
// fast path: no need to escape anything
|
245
|
-
serializer
|
239
|
+
WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
|
246
240
|
serializer.WriteData(const_data_ptr_cast(str), len);
|
247
|
-
serializer
|
241
|
+
WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
|
248
242
|
return;
|
249
243
|
}
|
250
244
|
|
251
245
|
// slow path: need to add escapes
|
252
246
|
string new_val(str, len);
|
253
|
-
new_val = AddEscapes(options.
|
254
|
-
|
247
|
+
new_val = AddEscapes(options.dialect_options.state_machine_options.escape,
|
248
|
+
options.dialect_options.state_machine_options.escape, new_val);
|
249
|
+
if (options.dialect_options.state_machine_options.escape !=
|
250
|
+
options.dialect_options.state_machine_options.quote) {
|
255
251
|
// need to escape quotes separately
|
256
|
-
new_val = AddEscapes(options.quote,
|
252
|
+
new_val = AddEscapes(options.dialect_options.state_machine_options.quote,
|
253
|
+
options.dialect_options.state_machine_options.escape, new_val);
|
257
254
|
}
|
258
|
-
serializer
|
255
|
+
WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
|
259
256
|
serializer.WriteBufferData(new_val);
|
260
|
-
serializer
|
257
|
+
WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
|
261
258
|
} else {
|
262
259
|
serializer.WriteData(const_data_ptr_cast(str), len);
|
263
260
|
}
|
@@ -335,12 +332,12 @@ static unique_ptr<GlobalFunctionData> WriteCSVInitializeGlobal(ClientContext &co
|
|
335
332
|
global_data->WriteData(options.prefix.c_str(), options.prefix.size());
|
336
333
|
}
|
337
334
|
|
338
|
-
if (options.header) {
|
335
|
+
if (options.dialect_options.header) {
|
339
336
|
BufferedSerializer serializer;
|
340
337
|
// write the header line to the file
|
341
338
|
for (idx_t i = 0; i < csv_data.options.name_list.size(); i++) {
|
342
339
|
if (i != 0) {
|
343
|
-
serializer
|
340
|
+
WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.delimiter);
|
344
341
|
}
|
345
342
|
WriteQuotedString(serializer, csv_data, csv_data.options.name_list[i].c_str(),
|
346
343
|
csv_data.options.name_list[i].size(), false);
|
@@ -365,11 +362,12 @@ static void WriteCSVChunkInternal(ClientContext &context, FunctionData &bind_dat
|
|
365
362
|
if (csv_data.sql_types[col_idx].id() == LogicalTypeId::VARCHAR) {
|
366
363
|
// VARCHAR, just reinterpret (cannot reference, because LogicalTypeId::VARCHAR is used by the JSON type too)
|
367
364
|
cast_chunk.data[col_idx].Reinterpret(input.data[col_idx]);
|
368
|
-
} else if (options.has_format[LogicalTypeId::DATE] &&
|
365
|
+
} else if (options.dialect_options.has_format[LogicalTypeId::DATE] &&
|
366
|
+
csv_data.sql_types[col_idx].id() == LogicalTypeId::DATE) {
|
369
367
|
// use the date format to cast the chunk
|
370
368
|
csv_data.options.write_date_format[LogicalTypeId::DATE].ConvertDateVector(
|
371
369
|
input.data[col_idx], cast_chunk.data[col_idx], input.size());
|
372
|
-
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
370
|
+
} else if (options.dialect_options.has_format[LogicalTypeId::TIMESTAMP] &&
|
373
371
|
(csv_data.sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP ||
|
374
372
|
csv_data.sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP_TZ)) {
|
375
373
|
// use the timestamp format to cast the chunk
|
@@ -392,7 +390,7 @@ static void WriteCSVChunkInternal(ClientContext &context, FunctionData &bind_dat
|
|
392
390
|
// write values
|
393
391
|
for (idx_t col_idx = 0; col_idx < cast_chunk.ColumnCount(); col_idx++) {
|
394
392
|
if (col_idx != 0) {
|
395
|
-
writer
|
393
|
+
WriteQuoteOrEscape(writer, options.dialect_options.state_machine_options.delimiter);
|
396
394
|
}
|
397
395
|
if (FlatVector::IsNull(cast_chunk.data[col_idx], row_idx)) {
|
398
396
|
// write null value
|