duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,4 +1,5 @@
|
|
1
|
-
#include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
|
1
|
+
#include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
|
2
|
+
|
2
3
|
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
|
3
4
|
#include "duckdb/common/file_system.hpp"
|
4
5
|
#include "duckdb/common/string_util.hpp"
|
@@ -17,7 +18,7 @@
|
|
17
18
|
#include "utf8proc.hpp"
|
18
19
|
#include "duckdb/parser/keyword_helper.hpp"
|
19
20
|
#include "duckdb/main/error_manager.hpp"
|
20
|
-
#include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
|
21
|
+
#include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
|
21
22
|
#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
|
22
23
|
#include "duckdb/main/client_data.hpp"
|
23
24
|
#include <algorithm>
|
@@ -33,7 +34,7 @@ string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated,
|
|
33
34
|
return to_string(GetLineError(line_error, buffer_idx)) + estimated;
|
34
35
|
}
|
35
36
|
|
36
|
-
BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p,
|
37
|
+
BaseCSVReader::BaseCSVReader(ClientContext &context_p, CSVReaderOptions options_p,
|
37
38
|
const vector<LogicalType> &requested_types)
|
38
39
|
: context(context_p), fs(FileSystem::GetFileSystem(context)), allocator(BufferAllocator::Get(context)),
|
39
40
|
options(std::move(options_p)) {
|
@@ -42,8 +43,9 @@ BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions
|
|
42
43
|
BaseCSVReader::~BaseCSVReader() {
|
43
44
|
}
|
44
45
|
|
45
|
-
unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const
|
46
|
-
return CSVFileHandle::OpenFile(
|
46
|
+
unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(ClientContext &context, const CSVReaderOptions &options_p) {
|
47
|
+
return CSVFileHandle::OpenFile(FileSystem::GetFileSystem(context), BufferAllocator::Get(context),
|
48
|
+
options_p.file_path, options_p.compression);
|
47
49
|
}
|
48
50
|
|
49
51
|
void BaseCSVReader::InitParseChunk(idx_t num_cols) {
|
@@ -69,101 +71,9 @@ void BaseCSVReader::InitializeProjection() {
|
|
69
71
|
}
|
70
72
|
}
|
71
73
|
|
72
|
-
void BaseCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
|
73
|
-
options.has_format[sql_type] = true;
|
74
|
-
auto &date_format = options.date_format[sql_type];
|
75
|
-
date_format.format_specifier = format_specifier;
|
76
|
-
StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
|
77
|
-
}
|
78
|
-
|
79
|
-
struct TryCastDecimalOperator {
|
80
|
-
template <class OP, class T>
|
81
|
-
static bool Operation(string_t input, uint8_t width, uint8_t scale) {
|
82
|
-
T result;
|
83
|
-
string error_message;
|
84
|
-
return OP::Operation(input, result, &error_message, width, scale);
|
85
|
-
}
|
86
|
-
};
|
87
|
-
|
88
|
-
struct TryCastFloatingOperator {
|
89
|
-
template <class OP, class T>
|
90
|
-
static bool Operation(string_t input) {
|
91
|
-
T result;
|
92
|
-
string error_message;
|
93
|
-
return OP::Operation(input, result, &error_message);
|
94
|
-
}
|
95
|
-
};
|
96
|
-
|
97
|
-
bool TryCastDecimalValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) {
|
98
|
-
auto width = DecimalType::GetWidth(sql_type);
|
99
|
-
auto scale = DecimalType::GetScale(sql_type);
|
100
|
-
switch (sql_type.InternalType()) {
|
101
|
-
case PhysicalType::INT16:
|
102
|
-
return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int16_t>(value_str, width, scale);
|
103
|
-
case PhysicalType::INT32:
|
104
|
-
return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int32_t>(value_str, width, scale);
|
105
|
-
case PhysicalType::INT64:
|
106
|
-
return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int64_t>(value_str, width, scale);
|
107
|
-
case PhysicalType::INT128:
|
108
|
-
return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, hugeint_t>(value_str, width, scale);
|
109
|
-
default:
|
110
|
-
throw InternalException("Unimplemented physical type for decimal");
|
111
|
-
}
|
112
|
-
}
|
113
|
-
|
114
|
-
bool TryCastFloatingValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) {
|
115
|
-
switch (sql_type.InternalType()) {
|
116
|
-
case PhysicalType::DOUBLE:
|
117
|
-
return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, double>(value_str);
|
118
|
-
case PhysicalType::FLOAT:
|
119
|
-
return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, float>(value_str);
|
120
|
-
default:
|
121
|
-
throw InternalException("Unimplemented physical type for floating");
|
122
|
-
}
|
123
|
-
}
|
124
|
-
|
125
|
-
bool BaseCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
|
126
|
-
if (value.IsNull()) {
|
127
|
-
return true;
|
128
|
-
}
|
129
|
-
if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
|
130
|
-
date_t result;
|
131
|
-
string error_message;
|
132
|
-
return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
|
133
|
-
error_message);
|
134
|
-
} else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
|
135
|
-
timestamp_t result;
|
136
|
-
string error_message;
|
137
|
-
return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
|
138
|
-
result, error_message);
|
139
|
-
} else if (options.decimal_separator != "." && sql_type.id() == LogicalTypeId::DECIMAL) {
|
140
|
-
return TryCastDecimalValueCommaSeparated(string_t(StringValue::Get(value)), sql_type);
|
141
|
-
} else if (options.decimal_separator != "." &&
|
142
|
-
((sql_type.id() == LogicalTypeId::FLOAT) || (sql_type.id() == LogicalTypeId::DOUBLE))) {
|
143
|
-
return TryCastFloatingValueCommaSeparated(string_t(StringValue::Get(value)), sql_type);
|
144
|
-
} else {
|
145
|
-
Value new_value;
|
146
|
-
string error_message;
|
147
|
-
return value.TryCastAs(context, sql_type, new_value, &error_message, true);
|
148
|
-
}
|
149
|
-
}
|
150
|
-
|
151
|
-
struct TryCastDateOperator {
|
152
|
-
static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
|
153
|
-
return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
|
154
|
-
}
|
155
|
-
};
|
156
|
-
|
157
|
-
struct TryCastTimestampOperator {
|
158
|
-
static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
|
159
|
-
string &error_message) {
|
160
|
-
return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
|
161
|
-
}
|
162
|
-
};
|
163
|
-
|
164
74
|
template <class OP, class T>
|
165
|
-
static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
166
|
-
idx_t count, string &error_message, idx_t &line_error) {
|
75
|
+
static bool TemplatedTryCastDateVector(map<LogicalTypeId, StrpTimeFormat> &options, Vector &input_vector,
|
76
|
+
Vector &result_vector, idx_t count, string &error_message, idx_t &line_error) {
|
167
77
|
D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
|
168
78
|
bool all_converted = true;
|
169
79
|
idx_t cur_line = 0;
|
@@ -179,22 +89,44 @@ static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector
|
|
179
89
|
return all_converted;
|
180
90
|
}
|
181
91
|
|
182
|
-
|
183
|
-
|
92
|
+
struct TryCastDateOperator {
|
93
|
+
static bool Operation(map<LogicalTypeId, StrpTimeFormat> &options, string_t input, date_t &result,
|
94
|
+
string &error_message) {
|
95
|
+
return options[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
|
96
|
+
}
|
97
|
+
};
|
98
|
+
|
99
|
+
struct TryCastTimestampOperator {
|
100
|
+
static bool Operation(map<LogicalTypeId, StrpTimeFormat> &options, string_t input, timestamp_t &result,
|
101
|
+
string &error_message) {
|
102
|
+
return options[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
|
103
|
+
}
|
104
|
+
};
|
105
|
+
|
106
|
+
bool BaseCSVReader::TryCastDateVector(map<LogicalTypeId, StrpTimeFormat> &options, Vector &input_vector,
|
107
|
+
Vector &result_vector, idx_t count, string &error_message, idx_t &line_error) {
|
184
108
|
return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
|
185
109
|
error_message, line_error);
|
186
110
|
}
|
187
111
|
|
188
|
-
bool TryCastTimestampVector(
|
189
|
-
|
112
|
+
bool BaseCSVReader::TryCastTimestampVector(map<LogicalTypeId, StrpTimeFormat> &options, Vector &input_vector,
|
113
|
+
Vector &result_vector, idx_t count, string &error_message) {
|
190
114
|
idx_t line_error;
|
191
115
|
return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
|
192
116
|
count, error_message, line_error);
|
193
117
|
}
|
194
118
|
|
119
|
+
void BaseCSVReader::VerifyLineLength(idx_t line_size, idx_t buffer_idx) {
|
120
|
+
if (line_size > options.maximum_line_size) {
|
121
|
+
throw InvalidInputException(
|
122
|
+
"Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!", options.file_path,
|
123
|
+
GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer_idx).c_str(), options.maximum_line_size);
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
195
127
|
template <class OP, class T>
|
196
|
-
bool TemplatedTryCastFloatingVector(
|
197
|
-
|
128
|
+
bool TemplatedTryCastFloatingVector(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
|
129
|
+
string &error_message, idx_t &line_error) {
|
198
130
|
D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
|
199
131
|
bool all_converted = true;
|
200
132
|
idx_t row = 0;
|
@@ -212,8 +144,8 @@ bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &i
|
|
212
144
|
}
|
213
145
|
|
214
146
|
template <class OP, class T>
|
215
|
-
bool TemplatedTryCastDecimalVector(
|
216
|
-
|
147
|
+
bool TemplatedTryCastDecimalVector(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
|
148
|
+
string &error_message, uint8_t width, uint8_t scale) {
|
217
149
|
D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
|
218
150
|
bool all_converted = true;
|
219
151
|
UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
|
@@ -226,25 +158,6 @@ bool TemplatedTryCastDecimalVector(BufferedCSVReaderOptions &options, Vector &in
|
|
226
158
|
return all_converted;
|
227
159
|
}
|
228
160
|
|
229
|
-
bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
|
230
|
-
// try vector-cast from string to sql_type
|
231
|
-
Vector dummy_result(sql_type);
|
232
|
-
if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
|
233
|
-
// use the date format to cast the chunk
|
234
|
-
string error_message;
|
235
|
-
idx_t line_error;
|
236
|
-
return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message, line_error);
|
237
|
-
} else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
|
238
|
-
// use the timestamp format to cast the chunk
|
239
|
-
string error_message;
|
240
|
-
return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
|
241
|
-
} else {
|
242
|
-
// target type is not varchar: perform a cast
|
243
|
-
string error_message;
|
244
|
-
return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
|
245
|
-
}
|
246
|
-
}
|
247
|
-
|
248
161
|
void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
|
249
162
|
idx_t buffer_idx) {
|
250
163
|
auto length = str_val.GetSize();
|
@@ -257,10 +170,6 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
257
170
|
// skip a single trailing delimiter in last column
|
258
171
|
return;
|
259
172
|
}
|
260
|
-
if (mode == ParserMode::SNIFFING_DIALECT) {
|
261
|
-
column++;
|
262
|
-
return;
|
263
|
-
}
|
264
173
|
if (column >= return_types.size()) {
|
265
174
|
if (options.ignore_errors) {
|
266
175
|
error_column_overflow = true;
|
@@ -291,12 +200,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
|
|
291
200
|
for (idx_t i = 0; i < escape_positions.size(); i++) {
|
292
201
|
idx_t next_pos = escape_positions[i];
|
293
202
|
new_val += old_val.substr(prev_pos, next_pos - prev_pos);
|
294
|
-
|
295
|
-
if (options.escape.empty() || options.escape == options.quote) {
|
296
|
-
prev_pos = next_pos + options.quote.size();
|
297
|
-
} else {
|
298
|
-
prev_pos = next_pos + options.escape.size();
|
299
|
-
}
|
203
|
+
prev_pos = ++next_pos;
|
300
204
|
}
|
301
205
|
new_val += old_val.substr(prev_pos, old_val.size() - prev_pos);
|
302
206
|
escape_positions.clear();
|
@@ -332,7 +236,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
332
236
|
return false;
|
333
237
|
}
|
334
238
|
|
335
|
-
if (column < return_types.size()
|
239
|
+
if (column < return_types.size()) {
|
336
240
|
if (options.null_padding) {
|
337
241
|
for (; column < return_types.size(); column++) {
|
338
242
|
FlatVector::SetNull(parse_chunk.data[column], parse_chunk.size(), true);
|
@@ -353,15 +257,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
|
|
353
257
|
}
|
354
258
|
}
|
355
259
|
|
356
|
-
|
357
|
-
sniffed_column_counts.push_back(column);
|
358
|
-
|
359
|
-
if (sniffed_column_counts.size() == options.sample_chunk_size) {
|
360
|
-
return true;
|
361
|
-
}
|
362
|
-
} else {
|
363
|
-
parse_chunk.SetCardinality(parse_chunk.size() + 1);
|
364
|
-
}
|
260
|
+
parse_chunk.SetCardinality(parse_chunk.size() + 1);
|
365
261
|
|
366
262
|
if (mode == ParserMode::PARSING_HEADER) {
|
367
263
|
return true;
|
@@ -412,7 +308,7 @@ void BaseCSVReader::VerifyUTF8(idx_t col_idx) {
|
|
412
308
|
}
|
413
309
|
}
|
414
310
|
|
415
|
-
bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
311
|
+
bool TryCastDecimalVectorCommaSeparated(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
416
312
|
idx_t count, string &error_message, const LogicalType &result_type) {
|
417
313
|
auto width = DecimalType::GetWidth(result_type);
|
418
314
|
auto scale = DecimalType::GetScale(result_type);
|
@@ -434,7 +330,7 @@ bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vecto
|
|
434
330
|
}
|
435
331
|
}
|
436
332
|
|
437
|
-
bool TryCastFloatingVectorCommaSeparated(
|
333
|
+
bool TryCastFloatingVectorCommaSeparated(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
|
438
334
|
idx_t count, string &error_message, const LogicalType &result_type,
|
439
335
|
idx_t &line_error) {
|
440
336
|
switch (result_type.InternalType()) {
|
@@ -491,14 +387,15 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad
|
|
491
387
|
bool success;
|
492
388
|
idx_t line_error = 0;
|
493
389
|
bool target_type_not_varchar = false;
|
494
|
-
if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
|
390
|
+
if (options.dialect_options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
|
495
391
|
// use the date format to cast the chunk
|
496
|
-
success = TryCastDateVector(options, parse_vector, result_vector,
|
497
|
-
line_error);
|
498
|
-
} else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
|
392
|
+
success = TryCastDateVector(options.dialect_options.date_format, parse_vector, result_vector,
|
393
|
+
parse_chunk.size(), error_message, line_error);
|
394
|
+
} else if (options.dialect_options.has_format[LogicalTypeId::TIMESTAMP] &&
|
395
|
+
type.id() == LogicalTypeId::TIMESTAMP) {
|
499
396
|
// use the date format to cast the chunk
|
500
|
-
success =
|
501
|
-
|
397
|
+
success = TryCastTimestampVector(options.dialect_options.date_format, parse_vector, result_vector,
|
398
|
+
parse_chunk.size(), error_message);
|
502
399
|
} else if (options.decimal_separator != "." &&
|
503
400
|
(type.id() == LogicalTypeId::FLOAT || type.id() == LogicalTypeId::DOUBLE)) {
|
504
401
|
success = TryCastFloatingVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
|
@@ -666,9 +563,8 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad
|
|
666
563
|
}
|
667
564
|
|
668
565
|
void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
|
669
|
-
if (
|
670
|
-
|
671
|
-
if (options.new_line == NewLineIdentifier::MIX) {
|
566
|
+
if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
|
567
|
+
if (options.dialect_options.new_line == NewLineIdentifier::MIX) {
|
672
568
|
return;
|
673
569
|
}
|
674
570
|
NewLineIdentifier this_line_identifier;
|
@@ -681,15 +577,15 @@ void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
|
|
681
577
|
} else {
|
682
578
|
this_line_identifier = NewLineIdentifier::SINGLE;
|
683
579
|
}
|
684
|
-
if (options.new_line == NewLineIdentifier::NOT_SET) {
|
685
|
-
options.new_line = this_line_identifier;
|
580
|
+
if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
|
581
|
+
options.dialect_options.new_line = this_line_identifier;
|
686
582
|
return;
|
687
583
|
}
|
688
|
-
if (options.new_line != this_line_identifier) {
|
689
|
-
options.new_line = NewLineIdentifier::MIX;
|
584
|
+
if (options.dialect_options.new_line != this_line_identifier) {
|
585
|
+
options.dialect_options.new_line = NewLineIdentifier::MIX;
|
690
586
|
return;
|
691
587
|
}
|
692
|
-
options.new_line = this_line_identifier;
|
588
|
+
options.dialect_options.new_line = this_line_identifier;
|
693
589
|
}
|
694
590
|
}
|
695
591
|
} // namespace duckdb
|