duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,4 +1,4 @@
|
|
1
|
-
#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
2
2
|
#include "duckdb/common/bind_helpers.hpp"
|
3
3
|
#include "duckdb/common/vector_size.hpp"
|
4
4
|
#include "duckdb/common/string_util.hpp"
|
@@ -60,60 +60,77 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
|
|
60
60
|
return value.GetValue<int64_t>();
|
61
61
|
}
|
62
62
|
|
63
|
-
void
|
64
|
-
this->header = input;
|
63
|
+
void CSVReaderOptions::SetHeader(bool input) {
|
64
|
+
this->dialect_options.header = input;
|
65
65
|
this->has_header = true;
|
66
66
|
}
|
67
67
|
|
68
|
-
void
|
68
|
+
void CSVReaderOptions::SetCompression(const string &compression_p) {
|
69
69
|
this->compression = FileCompressionTypeFromString(compression_p);
|
70
70
|
}
|
71
71
|
|
72
|
-
void
|
73
|
-
|
72
|
+
void CSVReaderOptions::SetEscape(const string &input) {
|
73
|
+
auto escape_str = input;
|
74
|
+
if (escape_str.size() > 1) {
|
75
|
+
throw InvalidInputException("The escape option cannot exceed a size of 1 byte.");
|
76
|
+
}
|
77
|
+
if (escape_str.empty()) {
|
78
|
+
escape_str = string("\0", 1);
|
79
|
+
}
|
80
|
+
this->dialect_options.state_machine_options.escape = escape_str[0];
|
74
81
|
this->has_escape = true;
|
75
82
|
}
|
76
83
|
|
77
|
-
void
|
78
|
-
|
84
|
+
void CSVReaderOptions::SetDelimiter(const string &input) {
|
85
|
+
auto delim_str = StringUtil::Replace(input, "\\t", "\t");
|
86
|
+
if (delim_str.size() > 1) {
|
87
|
+
throw InvalidInputException("The delimiter option cannot exceed a size of 1 byte.");
|
88
|
+
}
|
79
89
|
this->has_delimiter = true;
|
80
90
|
if (input.empty()) {
|
81
|
-
|
91
|
+
delim_str = string("\0", 1);
|
82
92
|
}
|
93
|
+
this->dialect_options.state_machine_options.delimiter = delim_str[0];
|
83
94
|
}
|
84
95
|
|
85
|
-
void
|
86
|
-
|
96
|
+
void CSVReaderOptions::SetQuote(const string &quote_p) {
|
97
|
+
auto quote_str = quote_p;
|
98
|
+
if (quote_str.size() > 1) {
|
99
|
+
throw InvalidInputException("The quote option cannot exceed a size of 1 byte.");
|
100
|
+
}
|
101
|
+
if (quote_str.empty()) {
|
102
|
+
quote_str = string("\0", 1);
|
103
|
+
}
|
104
|
+
this->dialect_options.state_machine_options.quote = quote_str[0];
|
87
105
|
this->has_quote = true;
|
88
106
|
}
|
89
107
|
|
90
|
-
void
|
108
|
+
void CSVReaderOptions::SetNewline(const string &input) {
|
91
109
|
if (input == "\\n" || input == "\\r") {
|
92
|
-
new_line = NewLineIdentifier::SINGLE;
|
110
|
+
dialect_options.new_line = NewLineIdentifier::SINGLE;
|
93
111
|
} else if (input == "\\r\\n") {
|
94
|
-
new_line = NewLineIdentifier::CARRY_ON;
|
112
|
+
dialect_options.new_line = NewLineIdentifier::CARRY_ON;
|
95
113
|
} else {
|
96
114
|
throw InvalidInputException("This is not accepted as a newline: " + input);
|
97
115
|
}
|
98
116
|
has_newline = true;
|
99
117
|
}
|
100
118
|
|
101
|
-
void
|
119
|
+
void CSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
|
102
120
|
string error;
|
103
121
|
if (read_format) {
|
104
|
-
error = StrTimeFormat::ParseFormatSpecifier(format, date_format[type]);
|
105
|
-
date_format[type].format_specifier = format;
|
122
|
+
error = StrTimeFormat::ParseFormatSpecifier(format, dialect_options.date_format[type]);
|
123
|
+
dialect_options.date_format[type].format_specifier = format;
|
106
124
|
} else {
|
107
125
|
error = StrTimeFormat::ParseFormatSpecifier(format, write_date_format[type]);
|
108
126
|
}
|
109
127
|
if (!error.empty()) {
|
110
128
|
throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
|
111
129
|
}
|
112
|
-
has_format[type] = true;
|
130
|
+
dialect_options.has_format[type] = true;
|
113
131
|
}
|
114
132
|
|
115
|
-
void
|
116
|
-
vector<string> &expected_names) {
|
133
|
+
void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, vector<string> &expected_names) {
|
117
134
|
if (SetBaseOption(loption, value)) {
|
118
135
|
return;
|
119
136
|
}
|
@@ -135,7 +152,7 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
|
|
135
152
|
sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
|
136
153
|
}
|
137
154
|
} else if (loption == "skip") {
|
138
|
-
skip_rows = ParseInteger(value, loption);
|
155
|
+
dialect_options.skip_rows = ParseInteger(value, loption);
|
139
156
|
skip_rows_set = true;
|
140
157
|
} else if (loption == "max_line_size" || loption == "maximum_line_size") {
|
141
158
|
maximum_line_size = ParseInteger(value, loption);
|
@@ -204,7 +221,7 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
|
|
204
221
|
}
|
205
222
|
}
|
206
223
|
|
207
|
-
void
|
224
|
+
void CSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
|
208
225
|
if (loption == "new_line") {
|
209
226
|
// Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY)
|
210
227
|
write_newline = ParseString(value, loption);
|
@@ -236,7 +253,7 @@ void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value
|
|
236
253
|
}
|
237
254
|
}
|
238
255
|
|
239
|
-
bool
|
256
|
+
bool CSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
|
240
257
|
// Make sure this function was only called after the option was turned into lowercase
|
241
258
|
D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
|
242
259
|
|
@@ -266,12 +283,14 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
|
|
266
283
|
return true;
|
267
284
|
}
|
268
285
|
|
269
|
-
|
270
|
-
return " file=" + file_path + "\n delimiter='" + delimiter +
|
271
|
-
(has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n quote='" +
|
272
|
-
|
286
|
+
string CSVReaderOptions::ToString() const {
|
287
|
+
return " file=" + file_path + "\n delimiter='" + dialect_options.state_machine_options.delimiter +
|
288
|
+
(has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n quote='" +
|
289
|
+
dialect_options.state_machine_options.quote +
|
290
|
+
(has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n escape='" +
|
291
|
+
dialect_options.state_machine_options.escape +
|
273
292
|
(has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
|
274
|
-
"\n header=" + std::to_string(header) +
|
293
|
+
"\n header=" + std::to_string(dialect_options.header) +
|
275
294
|
(has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
|
276
295
|
"\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
|
277
296
|
"\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
|
2
|
+
#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
|
3
|
+
#include "utf8proc_wrapper.hpp"
|
4
|
+
#include "duckdb/main/error_manager.hpp"
|
5
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
|
6
|
+
|
7
|
+
namespace duckdb {
|
8
|
+
|
9
|
+
CSVStateMachine::CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
|
10
|
+
shared_ptr<CSVBufferManager> buffer_manager_p,
|
11
|
+
CSVStateMachineCache &csv_state_machine_cache_p)
|
12
|
+
: csv_state_machine_cache(csv_state_machine_cache_p), options(options_p),
|
13
|
+
csv_buffer_iterator(std::move(buffer_manager_p)),
|
14
|
+
transition_array(csv_state_machine_cache.Get(state_machine_options)) {
|
15
|
+
dialect_options.state_machine_options = state_machine_options;
|
16
|
+
dialect_options.has_format = options.dialect_options.has_format;
|
17
|
+
dialect_options.date_format = options.dialect_options.date_format;
|
18
|
+
dialect_options.skip_rows = options.dialect_options.skip_rows;
|
19
|
+
}
|
20
|
+
|
21
|
+
void CSVStateMachine::Reset() {
|
22
|
+
csv_buffer_iterator.Reset();
|
23
|
+
}
|
24
|
+
|
25
|
+
void CSVStateMachine::VerifyUTF8() {
|
26
|
+
auto utf_type = Utf8Proc::Analyze(value.c_str(), value.size());
|
27
|
+
if (utf_type == UnicodeType::INVALID) {
|
28
|
+
int64_t error_line = cur_rows;
|
29
|
+
throw InvalidInputException("Error in file \"%s\" at line %llu: "
|
30
|
+
"%s. Parser options:\n%s",
|
31
|
+
options.file_path, error_line, ErrorManager::InvalidUnicodeError(value, "CSV file"),
|
32
|
+
options.ToString());
|
33
|
+
}
|
34
|
+
}
|
35
|
+
} // namespace duckdb
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
|
2
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) {
|
7
|
+
for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
|
8
|
+
transition_array[i] = state;
|
9
|
+
}
|
10
|
+
}
|
11
|
+
|
12
|
+
void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_options) {
|
13
|
+
D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
|
14
|
+
// Initialize transition array with default values to the Standard option
|
15
|
+
auto &transition_array = state_machine_cache[state_machine_options];
|
16
|
+
const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
|
17
|
+
const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
|
18
|
+
const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
|
19
|
+
const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
|
20
|
+
const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
|
21
|
+
const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
|
22
|
+
const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
|
23
|
+
const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
|
24
|
+
const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
|
25
|
+
|
26
|
+
for (uint32_t i = 0; i < NUM_STATES; i++) {
|
27
|
+
switch (i) {
|
28
|
+
case quoted_state:
|
29
|
+
InitializeTransitionArray(transition_array[i], quoted_state);
|
30
|
+
break;
|
31
|
+
case unquoted_state:
|
32
|
+
InitializeTransitionArray(transition_array[i], invalid_state);
|
33
|
+
break;
|
34
|
+
case escape_state:
|
35
|
+
InitializeTransitionArray(transition_array[i], invalid_state);
|
36
|
+
break;
|
37
|
+
default:
|
38
|
+
InitializeTransitionArray(transition_array[i], standard_state);
|
39
|
+
break;
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
// Now set values depending on configuration
|
44
|
+
// 1) Standard State
|
45
|
+
transition_array[standard_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
|
46
|
+
transition_array[standard_state][static_cast<uint8_t>('\n')] = record_separator_state;
|
47
|
+
transition_array[standard_state][static_cast<uint8_t>('\r')] = carriage_return_state;
|
48
|
+
transition_array[standard_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
|
49
|
+
// 2) Field Separator State
|
50
|
+
transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
|
51
|
+
field_separator_state;
|
52
|
+
transition_array[field_separator_state][static_cast<uint8_t>('\n')] = record_separator_state;
|
53
|
+
transition_array[field_separator_state][static_cast<uint8_t>('\r')] = carriage_return_state;
|
54
|
+
transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
|
55
|
+
// 3) Record Separator State
|
56
|
+
transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
|
57
|
+
field_separator_state;
|
58
|
+
transition_array[record_separator_state][static_cast<uint8_t>('\n')] = empty_line_state;
|
59
|
+
transition_array[record_separator_state][static_cast<uint8_t>('\r')] = empty_line_state;
|
60
|
+
transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
|
61
|
+
// 4) Carriage Return State
|
62
|
+
transition_array[carriage_return_state][static_cast<uint8_t>('\n')] = record_separator_state;
|
63
|
+
transition_array[carriage_return_state][static_cast<uint8_t>('\r')] = empty_line_state;
|
64
|
+
transition_array[carriage_return_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
|
65
|
+
// 5) Quoted State
|
66
|
+
transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.quote)] = unquoted_state;
|
67
|
+
if (state_machine_options.quote != state_machine_options.escape) {
|
68
|
+
transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
|
69
|
+
}
|
70
|
+
// 6) Unquoted State
|
71
|
+
transition_array[unquoted_state][static_cast<uint8_t>('\n')] = record_separator_state;
|
72
|
+
transition_array[unquoted_state][static_cast<uint8_t>('\r')] = carriage_return_state;
|
73
|
+
transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
|
74
|
+
if (state_machine_options.quote == state_machine_options.escape) {
|
75
|
+
transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
|
76
|
+
}
|
77
|
+
// 7) Escaped State
|
78
|
+
transition_array[escape_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
|
79
|
+
transition_array[escape_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
|
80
|
+
// 8) Empty Line State
|
81
|
+
transition_array[empty_line_state][static_cast<uint8_t>('\r')] = empty_line_state;
|
82
|
+
transition_array[empty_line_state][static_cast<uint8_t>('\n')] = empty_line_state;
|
83
|
+
}
|
84
|
+
|
85
|
+
CSVStateMachineCache::CSVStateMachineCache() {
|
86
|
+
for (auto quoterule : default_quote_rule) {
|
87
|
+
const auto &quote_candidates = default_quote[static_cast<uint8_t>(quoterule)];
|
88
|
+
for (const auto &quote : quote_candidates) {
|
89
|
+
for (const auto &delimiter : default_delimiter) {
|
90
|
+
const auto &escape_candidates = default_escape[static_cast<uint8_t>(quoterule)];
|
91
|
+
for (const auto &escape : escape_candidates) {
|
92
|
+
Insert({delimiter, quote, escape});
|
93
|
+
}
|
94
|
+
}
|
95
|
+
}
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
|
100
|
+
//! Custom State Machine, we need to create it and cache it first
|
101
|
+
if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
|
102
|
+
Insert(state_machine_options);
|
103
|
+
}
|
104
|
+
const auto &transition_array = state_machine_cache[state_machine_options];
|
105
|
+
return transition_array;
|
106
|
+
}
|
107
|
+
} // namespace duckdb
|
package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
|
1
|
+
#include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
|
2
2
|
|
3
3
|
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
|
4
4
|
#include "duckdb/common/file_system.hpp"
|
@@ -15,7 +15,7 @@
|
|
15
15
|
#include "utf8proc.hpp"
|
16
16
|
#include "duckdb/parser/keyword_helper.hpp"
|
17
17
|
#include "duckdb/function/table/read_csv.hpp"
|
18
|
-
#include "duckdb/execution/operator/persistent/csv_line_info.hpp"
|
18
|
+
#include "duckdb/execution/operator/scan/csv/csv_line_info.hpp"
|
19
19
|
|
20
20
|
#include <algorithm>
|
21
21
|
#include <cctype>
|
@@ -24,16 +24,13 @@
|
|
24
24
|
|
25
25
|
namespace duckdb {
|
26
26
|
|
27
|
-
ParallelCSVReader::ParallelCSVReader(ClientContext &context,
|
27
|
+
ParallelCSVReader::ParallelCSVReader(ClientContext &context, CSVReaderOptions options_p,
|
28
28
|
unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
|
29
29
|
const vector<LogicalType> &requested_types, idx_t file_idx_p)
|
30
30
|
: BaseCSVReader(context, std::move(options_p), requested_types), file_idx(file_idx_p),
|
31
31
|
first_pos_first_buffer(first_pos_first_buffer_p) {
|
32
32
|
Initialize(requested_types);
|
33
33
|
SetBufferRead(std::move(buffer_p));
|
34
|
-
if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
|
35
|
-
throw InternalException("Parallel CSV reader cannot handle CSVs with multi-byte delimiters/escapes/quotes");
|
36
|
-
}
|
37
34
|
}
|
38
35
|
|
39
36
|
void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
@@ -44,8 +41,9 @@ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
|
44
41
|
bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) {
|
45
42
|
// Set the delimiter if not set yet.
|
46
43
|
SetNewLineDelimiter(carry, carry_followed_by_nl);
|
47
|
-
D_ASSERT(options.new_line == NewLineIdentifier::SINGLE ||
|
48
|
-
|
44
|
+
D_ASSERT(options.dialect_options.new_line == NewLineIdentifier::SINGLE ||
|
45
|
+
options.dialect_options.new_line == NewLineIdentifier::CARRY_ON);
|
46
|
+
if (options.dialect_options.new_line == NewLineIdentifier::SINGLE) {
|
49
47
|
return (!carry) || (carry && !carry_followed_by_nl);
|
50
48
|
}
|
51
49
|
return (carry && carry_followed_by_nl) || (!carry && first_char);
|
@@ -75,15 +73,14 @@ void ParallelCSVReader::SkipEmptyLines() {
|
|
75
73
|
}
|
76
74
|
|
77
75
|
bool ParallelCSVReader::SetPosition() {
|
78
|
-
if (buffer->buffer->
|
79
|
-
|
80
|
-
start_buffer = buffer->buffer->GetStart();
|
76
|
+
if (buffer->buffer->is_first_buffer && start_buffer == position_buffer && start_buffer == first_pos_first_buffer) {
|
77
|
+
start_buffer = buffer->buffer->start_position;
|
81
78
|
position_buffer = start_buffer;
|
82
79
|
verification_positions.beginning_of_first_line = position_buffer;
|
83
80
|
verification_positions.end_of_last_line = position_buffer;
|
84
81
|
// First buffer doesn't need any setting
|
85
82
|
|
86
|
-
if (options.header) {
|
83
|
+
if (options.dialect_options.header) {
|
87
84
|
for (; position_buffer < end_buffer; position_buffer++) {
|
88
85
|
if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
|
89
86
|
bool carrier_return = (*buffer)[position_buffer] == '\r';
|
@@ -150,7 +147,7 @@ bool ParallelCSVReader::SetPosition() {
|
|
150
147
|
break;
|
151
148
|
}
|
152
149
|
|
153
|
-
if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON &&
|
150
|
+
if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
|
154
151
|
(*buffer)[position_buffer - 1] == '\n') {
|
155
152
|
break;
|
156
153
|
}
|
@@ -199,9 +196,9 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
|
|
199
196
|
start_buffer = buffer_read_p->buffer_start;
|
200
197
|
end_buffer = buffer_read_p->buffer_end;
|
201
198
|
if (buffer_read_p->next_buffer) {
|
202
|
-
buffer_size = buffer_read_p->buffer->
|
199
|
+
buffer_size = buffer_read_p->buffer->actual_size + buffer_read_p->next_buffer->actual_size;
|
203
200
|
} else {
|
204
|
-
buffer_size = buffer_read_p->buffer->
|
201
|
+
buffer_size = buffer_read_p->buffer->actual_size;
|
205
202
|
}
|
206
203
|
buffer = std::move(buffer_read_p);
|
207
204
|
|
@@ -213,8 +210,8 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
|
|
213
210
|
}
|
214
211
|
|
215
212
|
VerificationPositions ParallelCSVReader::GetVerificationPositions() {
|
216
|
-
verification_positions.beginning_of_first_line += buffer->buffer->
|
217
|
-
verification_positions.end_of_last_line += buffer->buffer->
|
213
|
+
verification_positions.beginning_of_first_line += buffer->buffer->csv_global_start;
|
214
|
+
verification_positions.end_of_last_line += buffer->buffer->csv_global_start;
|
218
215
|
return verification_positions;
|
219
216
|
}
|
220
217
|
|
@@ -235,15 +232,6 @@ bool ParallelCSVReader::BufferRemainder() {
|
|
235
232
|
return true;
|
236
233
|
}
|
237
234
|
|
238
|
-
void ParallelCSVReader::VerifyLineLength(idx_t line_size) {
|
239
|
-
if (line_size > options.maximum_line_size) {
|
240
|
-
throw InvalidInputException("Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!",
|
241
|
-
options.file_path,
|
242
|
-
GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer->batch_index).c_str(),
|
243
|
-
options.maximum_line_size);
|
244
|
-
}
|
245
|
-
}
|
246
|
-
|
247
235
|
bool AllNewLine(string_t value, idx_t column_amount) {
|
248
236
|
auto value_str = value.GetString();
|
249
237
|
if (value_str.empty() && column_amount == 1) {
|
@@ -260,7 +248,7 @@ bool AllNewLine(string_t value, idx_t column_amount) {
|
|
260
248
|
|
261
249
|
bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
|
262
250
|
// If line is not set, we have to figure it out, we assume whatever is in the first line
|
263
|
-
if (options.new_line == NewLineIdentifier::NOT_SET) {
|
251
|
+
if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
|
264
252
|
idx_t cur_pos = position_buffer;
|
265
253
|
// we can start in the middle of a new line, so move a bit forward.
|
266
254
|
while (cur_pos < end_buffer) {
|
@@ -324,7 +312,7 @@ value_start : {
|
|
324
312
|
offset = 0;
|
325
313
|
|
326
314
|
// this state parses the first character of a value
|
327
|
-
if ((*buffer)[position_buffer] == options.quote
|
315
|
+
if ((*buffer)[position_buffer] == options.dialect_options.state_machine_options.quote) {
|
328
316
|
// quote: actual value starts in the next position
|
329
317
|
// move to in_quotes state
|
330
318
|
start_buffer = position_buffer + 1;
|
@@ -341,10 +329,10 @@ normal : {
|
|
341
329
|
// this state parses the remainder of a non-quoted value until we reach a delimiter or newline
|
342
330
|
for (; position_buffer < end_buffer; position_buffer++) {
|
343
331
|
auto c = (*buffer)[position_buffer];
|
344
|
-
if (c == options.delimiter
|
332
|
+
if (c == options.dialect_options.state_machine_options.delimiter) {
|
345
333
|
// delimiter: end the value and add it to the chunk
|
346
334
|
goto add_value;
|
347
|
-
} else if (c == options.quote
|
335
|
+
} else if (c == options.dialect_options.state_machine_options.quote && try_add_line) {
|
348
336
|
return false;
|
349
337
|
} else if (StringUtil::CharacterIsNewline(c)) {
|
350
338
|
// newline: add row
|
@@ -396,7 +384,7 @@ add_row : {
|
|
396
384
|
parse_chunk.Reset();
|
397
385
|
return success;
|
398
386
|
} else {
|
399
|
-
VerifyLineLength(position_buffer - line_start);
|
387
|
+
VerifyLineLength(position_buffer - line_start, buffer->batch_index);
|
400
388
|
line_start = position_buffer;
|
401
389
|
finished_chunk = AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
|
402
390
|
}
|
@@ -413,7 +401,7 @@ add_row : {
|
|
413
401
|
goto final_state;
|
414
402
|
}
|
415
403
|
if ((*buffer)[position_buffer] == '\n') {
|
416
|
-
if (options.new_line == NewLineIdentifier::SINGLE) {
|
404
|
+
if (options.dialect_options.new_line == NewLineIdentifier::SINGLE) {
|
417
405
|
error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
|
418
406
|
return false;
|
419
407
|
}
|
@@ -428,7 +416,7 @@ add_row : {
|
|
428
416
|
goto final_state;
|
429
417
|
}
|
430
418
|
} else {
|
431
|
-
if (options.new_line == NewLineIdentifier::CARRY_ON) {
|
419
|
+
if (options.dialect_options.new_line == NewLineIdentifier::CARRY_ON) {
|
432
420
|
error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
|
433
421
|
return false;
|
434
422
|
}
|
@@ -441,7 +429,7 @@ add_row : {
|
|
441
429
|
}
|
442
430
|
goto value_start;
|
443
431
|
} else {
|
444
|
-
if (options.new_line == NewLineIdentifier::CARRY_ON) {
|
432
|
+
if (options.dialect_options.new_line == NewLineIdentifier::CARRY_ON) {
|
445
433
|
error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
|
446
434
|
return false;
|
447
435
|
}
|
@@ -452,6 +440,10 @@ add_row : {
|
|
452
440
|
goto final_state;
|
453
441
|
}
|
454
442
|
SkipEmptyLines();
|
443
|
+
if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
|
444
|
+
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
445
|
+
return false;
|
446
|
+
}
|
455
447
|
verification_positions.end_of_last_line = position_buffer;
|
456
448
|
start_buffer = position_buffer;
|
457
449
|
// \n newline, move to value start
|
@@ -467,17 +459,17 @@ in_quotes:
|
|
467
459
|
position_buffer++;
|
468
460
|
for (; position_buffer < end_buffer; position_buffer++) {
|
469
461
|
auto c = (*buffer)[position_buffer];
|
470
|
-
if (c == options.quote
|
462
|
+
if (c == options.dialect_options.state_machine_options.quote) {
|
471
463
|
// quote: move to unquoted state
|
472
464
|
goto unquote;
|
473
|
-
} else if (c == options.escape
|
465
|
+
} else if (c == options.dialect_options.state_machine_options.escape) {
|
474
466
|
// escape: store the escaped position and move to handle_escape state
|
475
467
|
escape_positions.push_back(position_buffer - start_buffer);
|
476
468
|
goto handle_escape;
|
477
469
|
}
|
478
470
|
}
|
479
471
|
if (!BufferRemainder()) {
|
480
|
-
if (buffer->buffer->
|
472
|
+
if (buffer->buffer->is_last_buffer) {
|
481
473
|
if (try_add_line) {
|
482
474
|
return false;
|
483
475
|
}
|
@@ -504,11 +496,13 @@ unquote : {
|
|
504
496
|
goto final_state;
|
505
497
|
}
|
506
498
|
auto c = (*buffer)[position_buffer];
|
507
|
-
if (c == options.
|
499
|
+
if (c == options.dialect_options.state_machine_options.quote &&
|
500
|
+
(options.dialect_options.state_machine_options.escape == '\0' ||
|
501
|
+
options.dialect_options.state_machine_options.escape == options.dialect_options.state_machine_options.quote)) {
|
508
502
|
// escaped quote, return to quoted state and store escape position
|
509
503
|
escape_positions.push_back(position_buffer - start_buffer);
|
510
504
|
goto in_quotes;
|
511
|
-
} else if (c == options.delimiter
|
505
|
+
} else if (c == options.dialect_options.state_machine_options.delimiter) {
|
512
506
|
// delimiter, add value
|
513
507
|
offset = 1;
|
514
508
|
goto add_value;
|
@@ -537,13 +531,14 @@ handle_escape : {
|
|
537
531
|
if (!BufferRemainder()) {
|
538
532
|
goto final_state;
|
539
533
|
}
|
540
|
-
if (position_buffer >= buffer_size && buffer->buffer->
|
534
|
+
if (position_buffer >= buffer_size && buffer->buffer->is_last_buffer) {
|
541
535
|
error_message = StringUtil::Format(
|
542
536
|
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
543
537
|
GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
|
544
538
|
return false;
|
545
539
|
}
|
546
|
-
if ((*buffer)[position_buffer] != options.quote
|
540
|
+
if ((*buffer)[position_buffer] != options.dialect_options.state_machine_options.quote &&
|
541
|
+
(*buffer)[position_buffer] != options.dialect_options.state_machine_options.escape) {
|
547
542
|
error_message = StringUtil::Format(
|
548
543
|
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
549
544
|
GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
|
@@ -573,7 +568,8 @@ final_state : {
|
|
573
568
|
return true;
|
574
569
|
}
|
575
570
|
// If this is the last buffer, we have to read the last value
|
576
|
-
if (buffer->buffer->
|
571
|
+
if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
|
572
|
+
(buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
|
577
573
|
if (column > 0 || start_buffer != position_buffer || try_add_line ||
|
578
574
|
(insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
|
579
575
|
// remaining values to be added to the chunk
|
@@ -592,9 +588,13 @@ final_state : {
|
|
592
588
|
reached_remainder_state = false;
|
593
589
|
return success;
|
594
590
|
} else {
|
595
|
-
VerifyLineLength(position_buffer - line_start);
|
591
|
+
VerifyLineLength(position_buffer - line_start, buffer->batch_index);
|
596
592
|
line_start = position_buffer;
|
597
593
|
AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
|
594
|
+
if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
|
595
|
+
error_message = "Line does not fit in one buffer. Increase the buffer size.";
|
596
|
+
return false;
|
597
|
+
}
|
598
598
|
verification_positions.end_of_last_line = position_buffer;
|
599
599
|
}
|
600
600
|
}
|
@@ -638,7 +638,7 @@ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
|
|
638
638
|
idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first) {
|
639
639
|
while (true) {
|
640
640
|
if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) {
|
641
|
-
auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->
|
641
|
+
auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->csv_global_start;
|
642
642
|
return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false, stop_at_first);
|
643
643
|
}
|
644
644
|
}
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
|
2
|
+
|
3
|
+
namespace duckdb {
|
4
|
+
|
5
|
+
CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
|
6
|
+
CSVStateMachineCache &state_machine_cache_p)
|
7
|
+
: state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
|
8
|
+
|
9
|
+
// Check if any type is BLOB
|
10
|
+
for (auto &type : options.sql_type_list) {
|
11
|
+
if (type.id() == LogicalTypeId::BLOB) {
|
12
|
+
throw InvalidInputException(
|
13
|
+
"CSV auto-detect for blobs not supported: there may be invalid UTF-8 in the file");
|
14
|
+
}
|
15
|
+
}
|
16
|
+
|
17
|
+
// Initialize Format Candidates
|
18
|
+
for (const auto &format_template : format_template_candidates) {
|
19
|
+
auto &logical_type = format_template.first;
|
20
|
+
best_format_candidates[logical_type].clear();
|
21
|
+
}
|
22
|
+
}
|
23
|
+
|
24
|
+
SnifferResult CSVSniffer::SniffCSV() {
|
25
|
+
// 1. Dialect Detection
|
26
|
+
DetectDialect();
|
27
|
+
// 2. Type Detection
|
28
|
+
DetectTypes();
|
29
|
+
// 3. Header Detection
|
30
|
+
DetectHeader();
|
31
|
+
D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
|
32
|
+
// 4. Type Replacement
|
33
|
+
ReplaceTypes();
|
34
|
+
// 5. Type Refinement
|
35
|
+
RefineTypes();
|
36
|
+
// We are done, construct and return the result.
|
37
|
+
|
38
|
+
// Set the CSV Options in the reference
|
39
|
+
options.dialect_options = best_candidate->dialect_options;
|
40
|
+
options.has_header = best_candidate->dialect_options.header;
|
41
|
+
options.skip_rows_set = options.dialect_options.skip_rows > 0;
|
42
|
+
if (options.has_header) {
|
43
|
+
options.dialect_options.true_start = best_start_with_header;
|
44
|
+
} else {
|
45
|
+
options.dialect_options.true_start = best_start_without_header;
|
46
|
+
}
|
47
|
+
|
48
|
+
// Return the types and names
|
49
|
+
return SnifferResult(detected_types, names);
|
50
|
+
}
|
51
|
+
|
52
|
+
} // namespace duckdb
|