duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,4 +1,5 @@
1
- #include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
1
+ #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
2
+
2
3
  #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
3
4
  #include "duckdb/common/file_system.hpp"
4
5
  #include "duckdb/common/string_util.hpp"
@@ -17,7 +18,7 @@
17
18
  #include "utf8proc.hpp"
18
19
  #include "duckdb/parser/keyword_helper.hpp"
19
20
  #include "duckdb/main/error_manager.hpp"
20
- #include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
21
+ #include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
21
22
  #include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
22
23
  #include "duckdb/main/client_data.hpp"
23
24
  #include <algorithm>
@@ -33,7 +34,7 @@ string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated,
33
34
  return to_string(GetLineError(line_error, buffer_idx)) + estimated;
34
35
  }
35
36
 
36
- BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p,
37
+ BaseCSVReader::BaseCSVReader(ClientContext &context_p, CSVReaderOptions options_p,
37
38
  const vector<LogicalType> &requested_types)
38
39
  : context(context_p), fs(FileSystem::GetFileSystem(context)), allocator(BufferAllocator::Get(context)),
39
40
  options(std::move(options_p)) {
@@ -42,8 +43,9 @@ BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions
42
43
  BaseCSVReader::~BaseCSVReader() {
43
44
  }
44
45
 
45
- unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) {
46
- return CSVFileHandle::OpenFile(fs, allocator, options_p.file_path, options_p.compression, true);
46
+ unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(ClientContext &context, const CSVReaderOptions &options_p) {
47
+ return CSVFileHandle::OpenFile(FileSystem::GetFileSystem(context), BufferAllocator::Get(context),
48
+ options_p.file_path, options_p.compression);
47
49
  }
48
50
 
49
51
  void BaseCSVReader::InitParseChunk(idx_t num_cols) {
@@ -69,101 +71,9 @@ void BaseCSVReader::InitializeProjection() {
69
71
  }
70
72
  }
71
73
 
72
- void BaseCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
73
- options.has_format[sql_type] = true;
74
- auto &date_format = options.date_format[sql_type];
75
- date_format.format_specifier = format_specifier;
76
- StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
77
- }
78
-
79
- struct TryCastDecimalOperator {
80
- template <class OP, class T>
81
- static bool Operation(string_t input, uint8_t width, uint8_t scale) {
82
- T result;
83
- string error_message;
84
- return OP::Operation(input, result, &error_message, width, scale);
85
- }
86
- };
87
-
88
- struct TryCastFloatingOperator {
89
- template <class OP, class T>
90
- static bool Operation(string_t input) {
91
- T result;
92
- string error_message;
93
- return OP::Operation(input, result, &error_message);
94
- }
95
- };
96
-
97
- bool TryCastDecimalValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) {
98
- auto width = DecimalType::GetWidth(sql_type);
99
- auto scale = DecimalType::GetScale(sql_type);
100
- switch (sql_type.InternalType()) {
101
- case PhysicalType::INT16:
102
- return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int16_t>(value_str, width, scale);
103
- case PhysicalType::INT32:
104
- return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int32_t>(value_str, width, scale);
105
- case PhysicalType::INT64:
106
- return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int64_t>(value_str, width, scale);
107
- case PhysicalType::INT128:
108
- return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, hugeint_t>(value_str, width, scale);
109
- default:
110
- throw InternalException("Unimplemented physical type for decimal");
111
- }
112
- }
113
-
114
- bool TryCastFloatingValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) {
115
- switch (sql_type.InternalType()) {
116
- case PhysicalType::DOUBLE:
117
- return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, double>(value_str);
118
- case PhysicalType::FLOAT:
119
- return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, float>(value_str);
120
- default:
121
- throw InternalException("Unimplemented physical type for floating");
122
- }
123
- }
124
-
125
- bool BaseCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
126
- if (value.IsNull()) {
127
- return true;
128
- }
129
- if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
130
- date_t result;
131
- string error_message;
132
- return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
133
- error_message);
134
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
135
- timestamp_t result;
136
- string error_message;
137
- return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
138
- result, error_message);
139
- } else if (options.decimal_separator != "." && sql_type.id() == LogicalTypeId::DECIMAL) {
140
- return TryCastDecimalValueCommaSeparated(string_t(StringValue::Get(value)), sql_type);
141
- } else if (options.decimal_separator != "." &&
142
- ((sql_type.id() == LogicalTypeId::FLOAT) || (sql_type.id() == LogicalTypeId::DOUBLE))) {
143
- return TryCastFloatingValueCommaSeparated(string_t(StringValue::Get(value)), sql_type);
144
- } else {
145
- Value new_value;
146
- string error_message;
147
- return value.TryCastAs(context, sql_type, new_value, &error_message, true);
148
- }
149
- }
150
-
151
- struct TryCastDateOperator {
152
- static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
153
- return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
154
- }
155
- };
156
-
157
- struct TryCastTimestampOperator {
158
- static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
159
- string &error_message) {
160
- return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
161
- }
162
- };
163
-
164
74
  template <class OP, class T>
165
- static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
166
- idx_t count, string &error_message, idx_t &line_error) {
75
+ static bool TemplatedTryCastDateVector(map<LogicalTypeId, StrpTimeFormat> &options, Vector &input_vector,
76
+ Vector &result_vector, idx_t count, string &error_message, idx_t &line_error) {
167
77
  D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
168
78
  bool all_converted = true;
169
79
  idx_t cur_line = 0;
@@ -179,22 +89,44 @@ static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector
179
89
  return all_converted;
180
90
  }
181
91
 
182
- bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
183
- string &error_message, idx_t &line_error) {
92
+ struct TryCastDateOperator {
93
+ static bool Operation(map<LogicalTypeId, StrpTimeFormat> &options, string_t input, date_t &result,
94
+ string &error_message) {
95
+ return options[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
96
+ }
97
+ };
98
+
99
+ struct TryCastTimestampOperator {
100
+ static bool Operation(map<LogicalTypeId, StrpTimeFormat> &options, string_t input, timestamp_t &result,
101
+ string &error_message) {
102
+ return options[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
103
+ }
104
+ };
105
+
106
+ bool BaseCSVReader::TryCastDateVector(map<LogicalTypeId, StrpTimeFormat> &options, Vector &input_vector,
107
+ Vector &result_vector, idx_t count, string &error_message, idx_t &line_error) {
184
108
  return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
185
109
  error_message, line_error);
186
110
  }
187
111
 
188
- bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
189
- string &error_message) {
112
+ bool BaseCSVReader::TryCastTimestampVector(map<LogicalTypeId, StrpTimeFormat> &options, Vector &input_vector,
113
+ Vector &result_vector, idx_t count, string &error_message) {
190
114
  idx_t line_error;
191
115
  return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
192
116
  count, error_message, line_error);
193
117
  }
194
118
 
119
+ void BaseCSVReader::VerifyLineLength(idx_t line_size, idx_t buffer_idx) {
120
+ if (line_size > options.maximum_line_size) {
121
+ throw InvalidInputException(
122
+ "Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!", options.file_path,
123
+ GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer_idx).c_str(), options.maximum_line_size);
124
+ }
125
+ }
126
+
195
127
  template <class OP, class T>
196
- bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
197
- idx_t count, string &error_message, idx_t &line_error) {
128
+ bool TemplatedTryCastFloatingVector(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
129
+ string &error_message, idx_t &line_error) {
198
130
  D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
199
131
  bool all_converted = true;
200
132
  idx_t row = 0;
@@ -212,8 +144,8 @@ bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &i
212
144
  }
213
145
 
214
146
  template <class OP, class T>
215
- bool TemplatedTryCastDecimalVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
216
- idx_t count, string &error_message, uint8_t width, uint8_t scale) {
147
+ bool TemplatedTryCastDecimalVector(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
148
+ string &error_message, uint8_t width, uint8_t scale) {
217
149
  D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
218
150
  bool all_converted = true;
219
151
  UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
@@ -226,25 +158,6 @@ bool TemplatedTryCastDecimalVector(BufferedCSVReaderOptions &options, Vector &in
226
158
  return all_converted;
227
159
  }
228
160
 
229
- bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
230
- // try vector-cast from string to sql_type
231
- Vector dummy_result(sql_type);
232
- if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
233
- // use the date format to cast the chunk
234
- string error_message;
235
- idx_t line_error;
236
- return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message, line_error);
237
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
238
- // use the timestamp format to cast the chunk
239
- string error_message;
240
- return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
241
- } else {
242
- // target type is not varchar: perform a cast
243
- string error_message;
244
- return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
245
- }
246
- }
247
-
248
161
  void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
249
162
  idx_t buffer_idx) {
250
163
  auto length = str_val.GetSize();
@@ -257,10 +170,6 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
257
170
  // skip a single trailing delimiter in last column
258
171
  return;
259
172
  }
260
- if (mode == ParserMode::SNIFFING_DIALECT) {
261
- column++;
262
- return;
263
- }
264
173
  if (column >= return_types.size()) {
265
174
  if (options.ignore_errors) {
266
175
  error_column_overflow = true;
@@ -291,12 +200,7 @@ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &esc
291
200
  for (idx_t i = 0; i < escape_positions.size(); i++) {
292
201
  idx_t next_pos = escape_positions[i];
293
202
  new_val += old_val.substr(prev_pos, next_pos - prev_pos);
294
-
295
- if (options.escape.empty() || options.escape == options.quote) {
296
- prev_pos = next_pos + options.quote.size();
297
- } else {
298
- prev_pos = next_pos + options.escape.size();
299
- }
203
+ prev_pos = ++next_pos;
300
204
  }
301
205
  new_val += old_val.substr(prev_pos, old_val.size() - prev_pos);
302
206
  escape_positions.clear();
@@ -332,7 +236,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
332
236
  return false;
333
237
  }
334
238
 
335
- if (column < return_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
239
+ if (column < return_types.size()) {
336
240
  if (options.null_padding) {
337
241
  for (; column < return_types.size(); column++) {
338
242
  FlatVector::SetNull(parse_chunk.data[column], parse_chunk.size(), true);
@@ -353,15 +257,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
353
257
  }
354
258
  }
355
259
 
356
- if (mode == ParserMode::SNIFFING_DIALECT) {
357
- sniffed_column_counts.push_back(column);
358
-
359
- if (sniffed_column_counts.size() == options.sample_chunk_size) {
360
- return true;
361
- }
362
- } else {
363
- parse_chunk.SetCardinality(parse_chunk.size() + 1);
364
- }
260
+ parse_chunk.SetCardinality(parse_chunk.size() + 1);
365
261
 
366
262
  if (mode == ParserMode::PARSING_HEADER) {
367
263
  return true;
@@ -412,7 +308,7 @@ void BaseCSVReader::VerifyUTF8(idx_t col_idx) {
412
308
  }
413
309
  }
414
310
 
415
- bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
311
+ bool TryCastDecimalVectorCommaSeparated(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
416
312
  idx_t count, string &error_message, const LogicalType &result_type) {
417
313
  auto width = DecimalType::GetWidth(result_type);
418
314
  auto scale = DecimalType::GetScale(result_type);
@@ -434,7 +330,7 @@ bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vecto
434
330
  }
435
331
  }
436
332
 
437
- bool TryCastFloatingVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
333
+ bool TryCastFloatingVectorCommaSeparated(CSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
438
334
  idx_t count, string &error_message, const LogicalType &result_type,
439
335
  idx_t &line_error) {
440
336
  switch (result_type.InternalType()) {
@@ -491,14 +387,15 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad
491
387
  bool success;
492
388
  idx_t line_error = 0;
493
389
  bool target_type_not_varchar = false;
494
- if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
390
+ if (options.dialect_options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
495
391
  // use the date format to cast the chunk
496
- success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message,
497
- line_error);
498
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) {
392
+ success = TryCastDateVector(options.dialect_options.date_format, parse_vector, result_vector,
393
+ parse_chunk.size(), error_message, line_error);
394
+ } else if (options.dialect_options.has_format[LogicalTypeId::TIMESTAMP] &&
395
+ type.id() == LogicalTypeId::TIMESTAMP) {
499
396
  // use the date format to cast the chunk
500
- success =
501
- TryCastTimestampVector(options, parse_vector, result_vector, parse_chunk.size(), error_message);
397
+ success = TryCastTimestampVector(options.dialect_options.date_format, parse_vector, result_vector,
398
+ parse_chunk.size(), error_message);
502
399
  } else if (options.decimal_separator != "." &&
503
400
  (type.id() == LogicalTypeId::FLOAT || type.id() == LogicalTypeId::DOUBLE)) {
504
401
  success = TryCastFloatingVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
@@ -666,9 +563,8 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad
666
563
  }
667
564
 
668
565
  void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
669
- if ((mode == ParserMode::SNIFFING_DIALECT && !options.has_newline) ||
670
- options.new_line == NewLineIdentifier::NOT_SET) {
671
- if (options.new_line == NewLineIdentifier::MIX) {
566
+ if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
567
+ if (options.dialect_options.new_line == NewLineIdentifier::MIX) {
672
568
  return;
673
569
  }
674
570
  NewLineIdentifier this_line_identifier;
@@ -681,15 +577,15 @@ void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
681
577
  } else {
682
578
  this_line_identifier = NewLineIdentifier::SINGLE;
683
579
  }
684
- if (options.new_line == NewLineIdentifier::NOT_SET) {
685
- options.new_line = this_line_identifier;
580
+ if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
581
+ options.dialect_options.new_line = this_line_identifier;
686
582
  return;
687
583
  }
688
- if (options.new_line != this_line_identifier) {
689
- options.new_line = NewLineIdentifier::MIX;
584
+ if (options.dialect_options.new_line != this_line_identifier) {
585
+ options.dialect_options.new_line = NewLineIdentifier::MIX;
690
586
  return;
691
587
  }
692
- options.new_line = this_line_identifier;
588
+ options.dialect_options.new_line = this_line_identifier;
693
589
  }
694
590
  }
695
591
  } // namespace duckdb