duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -344,13 +344,14 @@ void WindowBoundariesState::Update(const idx_t row_idx, const WindowInputColumn
344
344
 
345
345
  // when the partition changes, recompute the boundaries
346
346
  if (!is_same_partition || is_jump) {
347
- partition_start = row_idx;
348
- peer_start = row_idx;
349
-
350
347
  if (is_jump) {
351
- // Go back as far as the previous partition start
352
348
  idx_t n = 1;
353
- partition_start = FindPrevStart(partition_mask, partition_start, row_idx + 1, n);
349
+ partition_start = FindPrevStart(partition_mask, 0, row_idx + 1, n);
350
+ n = 1;
351
+ peer_start = FindPrevStart(order_mask, 0, row_idx + 1, n);
352
+ } else {
353
+ partition_start = row_idx;
354
+ peer_start = row_idx;
354
355
  }
355
356
 
356
357
  // find end of partition
@@ -1,3 +1,4 @@
1
+
1
2
  #include "duckdb/function/cast/cast_function_set.hpp"
2
3
 
3
4
  #include "duckdb/common/pair.hpp"
@@ -668,7 +668,7 @@ int StrpTimeFormat::NumericSpecifierWidth(StrTimeSpecifier specifier) {
668
668
  enum class TimeSpecifierAMOrPM : uint8_t { TIME_SPECIFIER_NONE = 0, TIME_SPECIFIER_AM = 1, TIME_SPECIFIER_PM = 2 };
669
669
 
670
670
  int32_t StrpTimeFormat::TryParseCollection(const char *data, idx_t &pos, idx_t size, const string_t collection[],
671
- idx_t collection_count) {
671
+ idx_t collection_count) const {
672
672
  for (idx_t c = 0; c < collection_count; c++) {
673
673
  auto &entry = collection[c];
674
674
  auto entry_data = entry.GetData();
@@ -695,7 +695,7 @@ int32_t StrpTimeFormat::TryParseCollection(const char *data, idx_t &pos, idx_t s
695
695
  }
696
696
 
697
697
  //! Parses a timestamp using the given specifier
698
- bool StrpTimeFormat::Parse(string_t str, ParseResult &result) {
698
+ bool StrpTimeFormat::Parse(string_t str, ParseResult &result) const {
699
699
  auto &result_data = result.data;
700
700
  auto &error_message = result.error_message;
701
701
  auto &error_position = result.error_position;
@@ -1146,7 +1146,7 @@ string StrpTimeFormat::ParseResult::FormatError(string_t input, const string &fo
1146
1146
  FormatStrpTimeError(input.GetString(), error_position), error_message);
1147
1147
  }
1148
1148
 
1149
- bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_message) {
1149
+ bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_message) const {
1150
1150
  ParseResult parse_result;
1151
1151
  if (!Parse(input, parse_result)) {
1152
1152
  error_message = parse_result.FormatError(input, format_specifier);
@@ -1155,7 +1155,7 @@ bool StrpTimeFormat::TryParseDate(string_t input, date_t &result, string &error_
1155
1155
  return parse_result.TryToDate(result);
1156
1156
  }
1157
1157
 
1158
- bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) {
1158
+ bool StrpTimeFormat::TryParseTimestamp(string_t input, timestamp_t &result, string &error_message) const {
1159
1159
  ParseResult parse_result;
1160
1160
  if (!Parse(input, parse_result)) {
1161
1161
  error_message = parse_result.FormatError(input, format_specifier);
@@ -6,6 +6,7 @@
6
6
  #include "duckdb/common/types/column/column_data_collection.hpp"
7
7
  #include "duckdb/common/types/string_type.hpp"
8
8
  #include "duckdb/common/vector_operations/vector_operations.hpp"
9
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
9
10
  #include "duckdb/function/copy_function.hpp"
10
11
  #include "duckdb/function/scalar/string_functions.hpp"
11
12
  #include "duckdb/function/table/read_csv.hpp"
@@ -15,11 +16,20 @@
15
16
 
16
17
  namespace duckdb {
17
18
 
18
- void SubstringDetection(string &str_1, string &str_2, const string &name_str_1, const string &name_str_2) {
19
- if (str_1.empty() || str_2.empty()) {
19
+ void AreOptionsEqual(char &str_1, char &str_2, const string &name_str_1, const string &name_str_2) {
20
+ if (str_1 == '\0' || str_2 == '\0') {
20
21
  return;
21
22
  }
22
- if ((str_1.find(str_2) != string::npos || str_2.find(str_1) != std::string::npos)) {
23
+ if (str_1 == str_2) {
24
+ throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
25
+ }
26
+ }
27
+
28
+ void SubstringDetection(char &str_1, string &str_2, const string &name_str_1, const string &name_str_2) {
29
+ if (str_1 == '\0' || str_2.empty()) {
30
+ return;
31
+ }
32
+ if (str_2.find(str_1) != string::npos) {
23
33
  throw BinderException("%s must not appear in the %s specification and vice versa", name_str_1, name_str_2);
24
34
  }
25
35
  }
@@ -28,34 +38,46 @@ void SubstringDetection(string &str_1, string &str_2, const string &name_str_1,
28
38
  // Bind
29
39
  //===--------------------------------------------------------------------===//
30
40
 
41
+ void WriteQuoteOrEscape(Serializer &serializer, char quote_or_escape) {
42
+ if (quote_or_escape != '\0') {
43
+ serializer.Write(quote_or_escape);
44
+ }
45
+ }
46
+
31
47
  void BaseCSVData::Finalize() {
32
48
  // verify that the options are correct in the final pass
33
- if (options.escape.empty()) {
34
- options.escape = options.quote;
49
+ if (options.dialect_options.state_machine_options.escape == '\0') {
50
+ options.dialect_options.state_machine_options.escape = options.dialect_options.state_machine_options.quote;
35
51
  }
36
52
  // escape and delimiter must not be substrings of each other
37
53
  if (options.has_delimiter && options.has_escape) {
38
- SubstringDetection(options.delimiter, options.escape, "DELIMITER", "ESCAPE");
54
+ AreOptionsEqual(options.dialect_options.state_machine_options.delimiter,
55
+ options.dialect_options.state_machine_options.escape, "DELIMITER", "ESCAPE");
39
56
  }
40
57
  // delimiter and quote must not be substrings of each other
41
58
  if (options.has_quote && options.has_delimiter) {
42
- SubstringDetection(options.quote, options.delimiter, "DELIMITER", "QUOTE");
59
+ AreOptionsEqual(options.dialect_options.state_machine_options.quote,
60
+ options.dialect_options.state_machine_options.delimiter, "DELIMITER", "QUOTE");
43
61
  }
44
62
  // escape and quote must not be substrings of each other (but can be the same)
45
- if (options.quote != options.escape && options.has_quote && options.has_escape) {
46
- SubstringDetection(options.quote, options.escape, "QUOTE", "ESCAPE");
63
+ if (options.dialect_options.state_machine_options.quote != options.dialect_options.state_machine_options.escape &&
64
+ options.has_quote && options.has_escape) {
65
+ AreOptionsEqual(options.dialect_options.state_machine_options.quote,
66
+ options.dialect_options.state_machine_options.escape, "QUOTE", "ESCAPE");
47
67
  }
48
68
  if (!options.null_str.empty()) {
49
69
  // null string and delimiter must not be substrings of each other
50
70
  if (options.has_delimiter) {
51
- SubstringDetection(options.delimiter, options.null_str, "DELIMITER", "NULL");
71
+ SubstringDetection(options.dialect_options.state_machine_options.delimiter, options.null_str, "DELIMITER",
72
+ "NULL");
52
73
  }
53
74
  // quote/escape and nullstr must not be substrings of each other
54
75
  if (options.has_quote) {
55
- SubstringDetection(options.quote, options.null_str, "QUOTE", "NULL");
76
+ SubstringDetection(options.dialect_options.state_machine_options.quote, options.null_str, "QUOTE", "NULL");
56
77
  }
57
78
  if (options.has_escape) {
58
- SubstringDetection(options.escape, options.null_str, "ESCAPE", "NULL");
79
+ SubstringDetection(options.dialect_options.state_machine_options.escape, options.null_str, "ESCAPE",
80
+ "NULL");
59
81
  }
60
82
  }
61
83
 
@@ -63,7 +85,7 @@ void BaseCSVData::Finalize() {
63
85
  if (options.prefix.empty() || options.suffix.empty()) {
64
86
  throw BinderException("COPY ... (FORMAT CSV) must have both PREFIX and SUFFIX, or none at all");
65
87
  }
66
- if (options.header) {
88
+ if (options.dialect_options.header) {
67
89
  throw BinderException("COPY ... (FORMAT CSV)'s HEADER cannot be combined with PREFIX/SUFFIX");
68
90
  }
69
91
  }
@@ -85,16 +107,14 @@ static unique_ptr<FunctionData> WriteCSVBind(ClientContext &context, CopyInfo &i
85
107
  bind_data->options.force_quote.resize(names.size(), false);
86
108
  }
87
109
  bind_data->Finalize();
88
- bind_data->is_simple = bind_data->options.delimiter.size() == 1 && bind_data->options.escape.size() == 1 &&
89
- bind_data->options.quote.size() == 1;
90
- if (bind_data->is_simple) {
91
- bind_data->requires_quotes = make_unsafe_uniq_array<bool>(256);
92
- memset(bind_data->requires_quotes.get(), 0, sizeof(bool) * 256);
93
- bind_data->requires_quotes['\n'] = true;
94
- bind_data->requires_quotes['\r'] = true;
95
- bind_data->requires_quotes[bind_data->options.delimiter[0]] = true;
96
- bind_data->requires_quotes[bind_data->options.quote[0]] = true;
97
- }
110
+
111
+ bind_data->requires_quotes = make_unsafe_uniq_array<bool>(256);
112
+ memset(bind_data->requires_quotes.get(), 0, sizeof(bool) * 256);
113
+ bind_data->requires_quotes['\n'] = true;
114
+ bind_data->requires_quotes['\r'] = true;
115
+ bind_data->requires_quotes[bind_data->options.dialect_options.state_machine_options.delimiter] = true;
116
+ bind_data->requires_quotes[bind_data->options.dialect_options.state_machine_options.quote] = true;
117
+
98
118
  if (!bind_data->options.write_newline.empty()) {
99
119
  bind_data->newline = bind_data->options.write_newline;
100
120
  }
@@ -129,13 +149,24 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
129
149
  for (auto &option : info.options) {
130
150
  options_map[option.first] = ConvertVectorToValue(std::move(option.second));
131
151
  }
152
+ options.file_path = bind_data->files[0];
153
+ options.name_list = expected_names;
154
+ options.sql_type_list = expected_types;
155
+ for (idx_t i = 0; i < expected_types.size(); i++) {
156
+ options.sql_types_per_column[expected_names[i]] = i;
157
+ }
132
158
 
133
159
  bind_data->FinalizeRead(context);
134
- if (!bind_data->single_threaded && options.auto_detect) {
135
- options.file_path = bind_data->files[0];
136
- options.name_list = expected_names;
137
- auto initial_reader = make_uniq<BufferedCSVReader>(context, options, expected_types);
138
- options = initial_reader->options;
160
+ if (options.auto_detect) {
161
+ // We must run the sniffer.
162
+ auto file_handle = BaseCSVReader::OpenCSV(context, options);
163
+ auto buffer_manager = make_shared<CSVBufferManager>(context, std::move(file_handle), options);
164
+ CSVSniffer sniffer(options, buffer_manager, bind_data->state_machine_cache);
165
+ auto sniffer_result = sniffer.SniffCSV();
166
+ bind_data->csv_types = sniffer_result.return_types;
167
+ bind_data->csv_names = sniffer_result.names;
168
+ bind_data->return_types = sniffer_result.return_types;
169
+ bind_data->return_names = sniffer_result.names;
139
170
  }
140
171
  return std::move(bind_data);
141
172
  }
@@ -143,7 +174,7 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, CopyInfo &in
143
174
  //===--------------------------------------------------------------------===//
144
175
  // Helper writing functions
145
176
  //===--------------------------------------------------------------------===//
146
- static string AddEscapes(string &to_be_escaped, const string &escape, const string &val) {
177
+ static string AddEscapes(char &to_be_escaped, const char &escape, const string &val) {
147
178
  idx_t i = 0;
148
179
  string new_val = "";
149
180
  idx_t found = val.find(to_be_escaped);
@@ -153,8 +184,10 @@ static string AddEscapes(string &to_be_escaped, const string &escape, const stri
153
184
  new_val += val[i];
154
185
  i++;
155
186
  }
156
- new_val += escape;
157
- found = val.find(to_be_escaped, found + escape.length());
187
+ if (escape != '\0') {
188
+ new_val += escape;
189
+ found = val.find(to_be_escaped, found + 1);
190
+ }
158
191
  }
159
192
  while (i < val.length()) {
160
193
  new_val += val[i];
@@ -169,43 +202,16 @@ static bool RequiresQuotes(WriteCSVData &csv_data, const char *str, idx_t len) {
169
202
  if (len == options.null_str.size() && memcmp(str, options.null_str.c_str(), len) == 0) {
170
203
  return true;
171
204
  }
172
- if (csv_data.is_simple) {
173
- // simple CSV: check for newlines, quotes and delimiter all at once
174
- auto str_data = reinterpret_cast<const_data_ptr_t>(str);
175
- for (idx_t i = 0; i < len; i++) {
176
- if (csv_data.requires_quotes[str_data[i]]) {
177
- // this byte requires quotes - write a quoted string
178
- return true;
179
- }
180
- }
181
- // no newline, quote or delimiter in the string
182
- // no quoting or escaping necessary
183
- return false;
184
- } else {
185
- // CSV with complex quotes/delimiter (multiple bytes)
186
-
187
- // first check for \n, \r, \n\r in string
188
- for (idx_t i = 0; i < len; i++) {
189
- if (str[i] == '\n' || str[i] == '\r') {
190
- // newline, write a quoted string
191
- return true;
192
- }
193
- }
194
-
195
- // check for delimiter
196
- if (options.delimiter.length() != 0 &&
197
- ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.delimiter.c_str()),
198
- options.delimiter.size()) != DConstants::INVALID_INDEX) {
205
+ auto str_data = reinterpret_cast<const_data_ptr_t>(str);
206
+ for (idx_t i = 0; i < len; i++) {
207
+ if (csv_data.requires_quotes[str_data[i]]) {
208
+ // this byte requires quotes - write a quoted string
199
209
  return true;
200
210
  }
201
- // check for quote
202
- if (options.quote.length() != 0 &&
203
- ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.quote.c_str()),
204
- options.quote.size()) != DConstants::INVALID_INDEX) {
205
- return true;
206
- }
207
- return false;
208
211
  }
212
+ // no newline, quote or delimiter in the string
213
+ // no quoting or escaping necessary
214
+ return false;
209
215
  }
210
216
 
211
217
  static void WriteQuotedString(Serializer &serializer, WriteCSVData &csv_data, const char *str, idx_t len,
@@ -218,46 +224,37 @@ static void WriteQuotedString(Serializer &serializer, WriteCSVData &csv_data, co
218
224
  if (force_quote) {
219
225
  // quoting is enabled: we might need to escape things in the string
220
226
  bool requires_escape = false;
221
- if (csv_data.is_simple) {
222
- // simple CSV
223
- // do a single loop to check for a quote or escape value
224
- for (idx_t i = 0; i < len; i++) {
225
- if (str[i] == options.quote[0] || str[i] == options.escape[0]) {
226
- requires_escape = true;
227
- break;
228
- }
229
- }
230
- } else {
231
- // complex CSV
232
- // check for quote or escape separately
233
- if (options.quote.length() != 0 &&
234
- ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.quote.c_str()),
235
- options.quote.size()) != DConstants::INVALID_INDEX) {
236
- requires_escape = true;
237
- } else if (options.escape.length() != 0 &&
238
- ContainsFun::Find(const_uchar_ptr_cast(str), len, const_uchar_ptr_cast(options.escape.c_str()),
239
- options.escape.size()) != DConstants::INVALID_INDEX) {
227
+ // simple CSV
228
+ // do a single loop to check for a quote or escape value
229
+ for (idx_t i = 0; i < len; i++) {
230
+ if (str[i] == options.dialect_options.state_machine_options.quote ||
231
+ str[i] == options.dialect_options.state_machine_options.escape) {
240
232
  requires_escape = true;
233
+ break;
241
234
  }
242
235
  }
236
+
243
237
  if (!requires_escape) {
244
238
  // fast path: no need to escape anything
245
- serializer.WriteBufferData(options.quote);
239
+ WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
246
240
  serializer.WriteData(const_data_ptr_cast(str), len);
247
- serializer.WriteBufferData(options.quote);
241
+ WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
248
242
  return;
249
243
  }
250
244
 
251
245
  // slow path: need to add escapes
252
246
  string new_val(str, len);
253
- new_val = AddEscapes(options.escape, options.escape, new_val);
254
- if (options.escape != options.quote) {
247
+ new_val = AddEscapes(options.dialect_options.state_machine_options.escape,
248
+ options.dialect_options.state_machine_options.escape, new_val);
249
+ if (options.dialect_options.state_machine_options.escape !=
250
+ options.dialect_options.state_machine_options.quote) {
255
251
  // need to escape quotes separately
256
- new_val = AddEscapes(options.quote, options.escape, new_val);
252
+ new_val = AddEscapes(options.dialect_options.state_machine_options.quote,
253
+ options.dialect_options.state_machine_options.escape, new_val);
257
254
  }
258
- serializer.WriteBufferData(options.quote);
255
+ WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
259
256
  serializer.WriteBufferData(new_val);
260
- serializer.WriteBufferData(options.quote);
257
+ WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.quote);
261
258
  } else {
262
259
  serializer.WriteData(const_data_ptr_cast(str), len);
263
260
  }
@@ -335,12 +332,12 @@ static unique_ptr<GlobalFunctionData> WriteCSVInitializeGlobal(ClientContext &co
335
332
  global_data->WriteData(options.prefix.c_str(), options.prefix.size());
336
333
  }
337
334
 
338
- if (options.header) {
335
+ if (options.dialect_options.header) {
339
336
  BufferedSerializer serializer;
340
337
  // write the header line to the file
341
338
  for (idx_t i = 0; i < csv_data.options.name_list.size(); i++) {
342
339
  if (i != 0) {
343
- serializer.WriteBufferData(options.delimiter);
340
+ WriteQuoteOrEscape(serializer, options.dialect_options.state_machine_options.delimiter);
344
341
  }
345
342
  WriteQuotedString(serializer, csv_data, csv_data.options.name_list[i].c_str(),
346
343
  csv_data.options.name_list[i].size(), false);
@@ -365,11 +362,12 @@ static void WriteCSVChunkInternal(ClientContext &context, FunctionData &bind_dat
365
362
  if (csv_data.sql_types[col_idx].id() == LogicalTypeId::VARCHAR) {
366
363
  // VARCHAR, just reinterpret (cannot reference, because LogicalTypeId::VARCHAR is used by the JSON type too)
367
364
  cast_chunk.data[col_idx].Reinterpret(input.data[col_idx]);
368
- } else if (options.has_format[LogicalTypeId::DATE] && csv_data.sql_types[col_idx].id() == LogicalTypeId::DATE) {
365
+ } else if (options.dialect_options.has_format[LogicalTypeId::DATE] &&
366
+ csv_data.sql_types[col_idx].id() == LogicalTypeId::DATE) {
369
367
  // use the date format to cast the chunk
370
368
  csv_data.options.write_date_format[LogicalTypeId::DATE].ConvertDateVector(
371
369
  input.data[col_idx], cast_chunk.data[col_idx], input.size());
372
- } else if (options.has_format[LogicalTypeId::TIMESTAMP] &&
370
+ } else if (options.dialect_options.has_format[LogicalTypeId::TIMESTAMP] &&
373
371
  (csv_data.sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP ||
374
372
  csv_data.sql_types[col_idx].id() == LogicalTypeId::TIMESTAMP_TZ)) {
375
373
  // use the timestamp format to cast the chunk
@@ -392,7 +390,7 @@ static void WriteCSVChunkInternal(ClientContext &context, FunctionData &bind_dat
392
390
  // write values
393
391
  for (idx_t col_idx = 0; col_idx < cast_chunk.ColumnCount(); col_idx++) {
394
392
  if (col_idx != 0) {
395
- writer.WriteBufferData(options.delimiter);
393
+ WriteQuoteOrEscape(writer, options.dialect_options.state_machine_options.delimiter);
396
394
  }
397
395
  if (FlatVector::IsNull(cast_chunk.data[col_idx], row_idx)) {
398
396
  // write null value