duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,4 +1,4 @@
1
- #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
1
+ #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
2
2
  #include "duckdb/common/bind_helpers.hpp"
3
3
  #include "duckdb/common/vector_size.hpp"
4
4
  #include "duckdb/common/string_util.hpp"
@@ -60,60 +60,77 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
60
60
  return value.GetValue<int64_t>();
61
61
  }
62
62
 
63
- void BufferedCSVReaderOptions::SetHeader(bool input) {
64
- this->header = input;
63
+ void CSVReaderOptions::SetHeader(bool input) {
64
+ this->dialect_options.header = input;
65
65
  this->has_header = true;
66
66
  }
67
67
 
68
- void BufferedCSVReaderOptions::SetCompression(const string &compression_p) {
68
+ void CSVReaderOptions::SetCompression(const string &compression_p) {
69
69
  this->compression = FileCompressionTypeFromString(compression_p);
70
70
  }
71
71
 
72
- void BufferedCSVReaderOptions::SetEscape(const string &input) {
73
- this->escape = input;
72
+ void CSVReaderOptions::SetEscape(const string &input) {
73
+ auto escape_str = input;
74
+ if (escape_str.size() > 1) {
75
+ throw InvalidInputException("The escape option cannot exceed a size of 1 byte.");
76
+ }
77
+ if (escape_str.empty()) {
78
+ escape_str = string("\0", 1);
79
+ }
80
+ this->dialect_options.state_machine_options.escape = escape_str[0];
74
81
  this->has_escape = true;
75
82
  }
76
83
 
77
- void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
78
- this->delimiter = StringUtil::Replace(input, "\\t", "\t");
84
+ void CSVReaderOptions::SetDelimiter(const string &input) {
85
+ auto delim_str = StringUtil::Replace(input, "\\t", "\t");
86
+ if (delim_str.size() > 1) {
87
+ throw InvalidInputException("The delimiter option cannot exceed a size of 1 byte.");
88
+ }
79
89
  this->has_delimiter = true;
80
90
  if (input.empty()) {
81
- this->delimiter = string("\0", 1);
91
+ delim_str = string("\0", 1);
82
92
  }
93
+ this->dialect_options.state_machine_options.delimiter = delim_str[0];
83
94
  }
84
95
 
85
- void BufferedCSVReaderOptions::SetQuote(const string &quote_p) {
86
- this->quote = quote_p;
96
+ void CSVReaderOptions::SetQuote(const string &quote_p) {
97
+ auto quote_str = quote_p;
98
+ if (quote_str.size() > 1) {
99
+ throw InvalidInputException("The quote option cannot exceed a size of 1 byte.");
100
+ }
101
+ if (quote_str.empty()) {
102
+ quote_str = string("\0", 1);
103
+ }
104
+ this->dialect_options.state_machine_options.quote = quote_str[0];
87
105
  this->has_quote = true;
88
106
  }
89
107
 
90
- void BufferedCSVReaderOptions::SetNewline(const string &input) {
108
+ void CSVReaderOptions::SetNewline(const string &input) {
91
109
  if (input == "\\n" || input == "\\r") {
92
- new_line = NewLineIdentifier::SINGLE;
110
+ dialect_options.new_line = NewLineIdentifier::SINGLE;
93
111
  } else if (input == "\\r\\n") {
94
- new_line = NewLineIdentifier::CARRY_ON;
112
+ dialect_options.new_line = NewLineIdentifier::CARRY_ON;
95
113
  } else {
96
114
  throw InvalidInputException("This is not accepted as a newline: " + input);
97
115
  }
98
116
  has_newline = true;
99
117
  }
100
118
 
101
- void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
119
+ void CSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
102
120
  string error;
103
121
  if (read_format) {
104
- error = StrTimeFormat::ParseFormatSpecifier(format, date_format[type]);
105
- date_format[type].format_specifier = format;
122
+ error = StrTimeFormat::ParseFormatSpecifier(format, dialect_options.date_format[type]);
123
+ dialect_options.date_format[type].format_specifier = format;
106
124
  } else {
107
125
  error = StrTimeFormat::ParseFormatSpecifier(format, write_date_format[type]);
108
126
  }
109
127
  if (!error.empty()) {
110
128
  throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
111
129
  }
112
- has_format[type] = true;
130
+ dialect_options.has_format[type] = true;
113
131
  }
114
132
 
115
- void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value,
116
- vector<string> &expected_names) {
133
+ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, vector<string> &expected_names) {
117
134
  if (SetBaseOption(loption, value)) {
118
135
  return;
119
136
  }
@@ -135,7 +152,7 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
135
152
  sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
136
153
  }
137
154
  } else if (loption == "skip") {
138
- skip_rows = ParseInteger(value, loption);
155
+ dialect_options.skip_rows = ParseInteger(value, loption);
139
156
  skip_rows_set = true;
140
157
  } else if (loption == "max_line_size" || loption == "maximum_line_size") {
141
158
  maximum_line_size = ParseInteger(value, loption);
@@ -204,7 +221,7 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
204
221
  }
205
222
  }
206
223
 
207
- void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
224
+ void CSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
208
225
  if (loption == "new_line") {
209
226
  // Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY)
210
227
  write_newline = ParseString(value, loption);
@@ -236,7 +253,7 @@ void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value
236
253
  }
237
254
  }
238
255
 
239
- bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
256
+ bool CSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
240
257
  // Make sure this function was only called after the option was turned into lowercase
241
258
  D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
242
259
 
@@ -266,12 +283,14 @@ bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value
266
283
  return true;
267
284
  }
268
285
 
269
- std::string BufferedCSVReaderOptions::ToString() const {
270
- return " file=" + file_path + "\n delimiter='" + delimiter +
271
- (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n quote='" + quote +
272
- (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n escape='" + escape +
286
+ string CSVReaderOptions::ToString() const {
287
+ return " file=" + file_path + "\n delimiter='" + dialect_options.state_machine_options.delimiter +
288
+ (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n quote='" +
289
+ dialect_options.state_machine_options.quote +
290
+ (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n escape='" +
291
+ dialect_options.state_machine_options.escape +
273
292
  (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
274
- "\n header=" + std::to_string(header) +
293
+ "\n header=" + std::to_string(dialect_options.header) +
275
294
  (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
276
295
  "\n sample_size=" + std::to_string(sample_chunk_size * sample_chunks) +
277
296
  "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
@@ -0,0 +1,35 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
3
+ #include "utf8proc_wrapper.hpp"
4
+ #include "duckdb/main/error_manager.hpp"
5
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
6
+
7
+ namespace duckdb {
8
+
9
+ CSVStateMachine::CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
10
+ shared_ptr<CSVBufferManager> buffer_manager_p,
11
+ CSVStateMachineCache &csv_state_machine_cache_p)
12
+ : csv_state_machine_cache(csv_state_machine_cache_p), options(options_p),
13
+ csv_buffer_iterator(std::move(buffer_manager_p)),
14
+ transition_array(csv_state_machine_cache.Get(state_machine_options)) {
15
+ dialect_options.state_machine_options = state_machine_options;
16
+ dialect_options.has_format = options.dialect_options.has_format;
17
+ dialect_options.date_format = options.dialect_options.date_format;
18
+ dialect_options.skip_rows = options.dialect_options.skip_rows;
19
+ }
20
+
21
+ void CSVStateMachine::Reset() {
22
+ csv_buffer_iterator.Reset();
23
+ }
24
+
25
+ void CSVStateMachine::VerifyUTF8() {
26
+ auto utf_type = Utf8Proc::Analyze(value.c_str(), value.size());
27
+ if (utf_type == UnicodeType::INVALID) {
28
+ int64_t error_line = cur_rows;
29
+ throw InvalidInputException("Error in file \"%s\" at line %llu: "
30
+ "%s. Parser options:\n%s",
31
+ options.file_path, error_line, ErrorManager::InvalidUnicodeError(value, "CSV file"),
32
+ options.ToString());
33
+ }
34
+ }
35
+ } // namespace duckdb
@@ -0,0 +1,107 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
3
+
4
+ namespace duckdb {
5
+
6
+ void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) {
7
+ for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) {
8
+ transition_array[i] = state;
9
+ }
10
+ }
11
+
12
+ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_options) {
13
+ D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end());
14
+ // Initialize transition array with default values to the Standard option
15
+ auto &transition_array = state_machine_cache[state_machine_options];
16
+ const uint8_t standard_state = static_cast<uint8_t>(CSVState::STANDARD);
17
+ const uint8_t field_separator_state = static_cast<uint8_t>(CSVState::DELIMITER);
18
+ const uint8_t record_separator_state = static_cast<uint8_t>(CSVState::RECORD_SEPARATOR);
19
+ const uint8_t carriage_return_state = static_cast<uint8_t>(CSVState::CARRIAGE_RETURN);
20
+ const uint8_t quoted_state = static_cast<uint8_t>(CSVState::QUOTED);
21
+ const uint8_t unquoted_state = static_cast<uint8_t>(CSVState::UNQUOTED);
22
+ const uint8_t escape_state = static_cast<uint8_t>(CSVState::ESCAPE);
23
+ const uint8_t empty_line_state = static_cast<uint8_t>(CSVState::EMPTY_LINE);
24
+ const uint8_t invalid_state = static_cast<uint8_t>(CSVState::INVALID);
25
+
26
+ for (uint32_t i = 0; i < NUM_STATES; i++) {
27
+ switch (i) {
28
+ case quoted_state:
29
+ InitializeTransitionArray(transition_array[i], quoted_state);
30
+ break;
31
+ case unquoted_state:
32
+ InitializeTransitionArray(transition_array[i], invalid_state);
33
+ break;
34
+ case escape_state:
35
+ InitializeTransitionArray(transition_array[i], invalid_state);
36
+ break;
37
+ default:
38
+ InitializeTransitionArray(transition_array[i], standard_state);
39
+ break;
40
+ }
41
+ }
42
+
43
+ // Now set values depending on configuration
44
+ // 1) Standard State
45
+ transition_array[standard_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
46
+ transition_array[standard_state][static_cast<uint8_t>('\n')] = record_separator_state;
47
+ transition_array[standard_state][static_cast<uint8_t>('\r')] = carriage_return_state;
48
+ transition_array[standard_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
49
+ // 2) Field Separator State
50
+ transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
51
+ field_separator_state;
52
+ transition_array[field_separator_state][static_cast<uint8_t>('\n')] = record_separator_state;
53
+ transition_array[field_separator_state][static_cast<uint8_t>('\r')] = carriage_return_state;
54
+ transition_array[field_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
55
+ // 3) Record Separator State
56
+ transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.delimiter)] =
57
+ field_separator_state;
58
+ transition_array[record_separator_state][static_cast<uint8_t>('\n')] = empty_line_state;
59
+ transition_array[record_separator_state][static_cast<uint8_t>('\r')] = empty_line_state;
60
+ transition_array[record_separator_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
61
+ // 4) Carriage Return State
62
+ transition_array[carriage_return_state][static_cast<uint8_t>('\n')] = record_separator_state;
63
+ transition_array[carriage_return_state][static_cast<uint8_t>('\r')] = empty_line_state;
64
+ transition_array[carriage_return_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
65
+ // 5) Quoted State
66
+ transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.quote)] = unquoted_state;
67
+ if (state_machine_options.quote != state_machine_options.escape) {
68
+ transition_array[quoted_state][static_cast<uint8_t>(state_machine_options.escape)] = escape_state;
69
+ }
70
+ // 6) Unquoted State
71
+ transition_array[unquoted_state][static_cast<uint8_t>('\n')] = record_separator_state;
72
+ transition_array[unquoted_state][static_cast<uint8_t>('\r')] = carriage_return_state;
73
+ transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.delimiter)] = field_separator_state;
74
+ if (state_machine_options.quote == state_machine_options.escape) {
75
+ transition_array[unquoted_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
76
+ }
77
+ // 7) Escaped State
78
+ transition_array[escape_state][static_cast<uint8_t>(state_machine_options.quote)] = quoted_state;
79
+ transition_array[escape_state][static_cast<uint8_t>(state_machine_options.escape)] = quoted_state;
80
+ // 8) Empty Line State
81
+ transition_array[empty_line_state][static_cast<uint8_t>('\r')] = empty_line_state;
82
+ transition_array[empty_line_state][static_cast<uint8_t>('\n')] = empty_line_state;
83
+ }
84
+
85
+ CSVStateMachineCache::CSVStateMachineCache() {
86
+ for (auto quoterule : default_quote_rule) {
87
+ const auto &quote_candidates = default_quote[static_cast<uint8_t>(quoterule)];
88
+ for (const auto &quote : quote_candidates) {
89
+ for (const auto &delimiter : default_delimiter) {
90
+ const auto &escape_candidates = default_escape[static_cast<uint8_t>(quoterule)];
91
+ for (const auto &escape : escape_candidates) {
92
+ Insert({delimiter, quote, escape});
93
+ }
94
+ }
95
+ }
96
+ }
97
+ }
98
+
99
+ const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) {
100
+ //! Custom State Machine, we need to create it and cache it first
101
+ if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) {
102
+ Insert(state_machine_options);
103
+ }
104
+ const auto &transition_array = state_machine_cache[state_machine_options];
105
+ return transition_array;
106
+ }
107
+ } // namespace duckdb
@@ -1,4 +1,4 @@
1
- #include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
1
+ #include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
2
2
 
3
3
  #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
4
4
  #include "duckdb/common/file_system.hpp"
@@ -15,7 +15,7 @@
15
15
  #include "utf8proc.hpp"
16
16
  #include "duckdb/parser/keyword_helper.hpp"
17
17
  #include "duckdb/function/table/read_csv.hpp"
18
- #include "duckdb/execution/operator/persistent/csv_line_info.hpp"
18
+ #include "duckdb/execution/operator/scan/csv/csv_line_info.hpp"
19
19
 
20
20
  #include <algorithm>
21
21
  #include <cctype>
@@ -24,16 +24,13 @@
24
24
 
25
25
  namespace duckdb {
26
26
 
27
- ParallelCSVReader::ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
27
+ ParallelCSVReader::ParallelCSVReader(ClientContext &context, CSVReaderOptions options_p,
28
28
  unique_ptr<CSVBufferRead> buffer_p, idx_t first_pos_first_buffer_p,
29
29
  const vector<LogicalType> &requested_types, idx_t file_idx_p)
30
30
  : BaseCSVReader(context, std::move(options_p), requested_types), file_idx(file_idx_p),
31
31
  first_pos_first_buffer(first_pos_first_buffer_p) {
32
32
  Initialize(requested_types);
33
33
  SetBufferRead(std::move(buffer_p));
34
- if (options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1) {
35
- throw InternalException("Parallel CSV reader cannot handle CSVs with multi-byte delimiters/escapes/quotes");
36
- }
37
34
  }
38
35
 
39
36
  void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
@@ -44,8 +41,9 @@ void ParallelCSVReader::Initialize(const vector<LogicalType> &requested_types) {
44
41
  bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, bool first_char) {
45
42
  // Set the delimiter if not set yet.
46
43
  SetNewLineDelimiter(carry, carry_followed_by_nl);
47
- D_ASSERT(options.new_line == NewLineIdentifier::SINGLE || options.new_line == NewLineIdentifier::CARRY_ON);
48
- if (options.new_line == NewLineIdentifier::SINGLE) {
44
+ D_ASSERT(options.dialect_options.new_line == NewLineIdentifier::SINGLE ||
45
+ options.dialect_options.new_line == NewLineIdentifier::CARRY_ON);
46
+ if (options.dialect_options.new_line == NewLineIdentifier::SINGLE) {
49
47
  return (!carry) || (carry && !carry_followed_by_nl);
50
48
  }
51
49
  return (carry && carry_followed_by_nl) || (!carry && first_char);
@@ -75,15 +73,14 @@ void ParallelCSVReader::SkipEmptyLines() {
75
73
  }
76
74
 
77
75
  bool ParallelCSVReader::SetPosition() {
78
- if (buffer->buffer->IsCSVFileFirstBuffer() && start_buffer == position_buffer &&
79
- start_buffer == first_pos_first_buffer) {
80
- start_buffer = buffer->buffer->GetStart();
76
+ if (buffer->buffer->is_first_buffer && start_buffer == position_buffer && start_buffer == first_pos_first_buffer) {
77
+ start_buffer = buffer->buffer->start_position;
81
78
  position_buffer = start_buffer;
82
79
  verification_positions.beginning_of_first_line = position_buffer;
83
80
  verification_positions.end_of_last_line = position_buffer;
84
81
  // First buffer doesn't need any setting
85
82
 
86
- if (options.header) {
83
+ if (options.dialect_options.header) {
87
84
  for (; position_buffer < end_buffer; position_buffer++) {
88
85
  if (StringUtil::CharacterIsNewline((*buffer)[position_buffer])) {
89
86
  bool carrier_return = (*buffer)[position_buffer] == '\r';
@@ -150,7 +147,7 @@ bool ParallelCSVReader::SetPosition() {
150
147
  break;
151
148
  }
152
149
 
153
- if (position_buffer > end_buffer && options.new_line == NewLineIdentifier::CARRY_ON &&
150
+ if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
154
151
  (*buffer)[position_buffer - 1] == '\n') {
155
152
  break;
156
153
  }
@@ -199,9 +196,9 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
199
196
  start_buffer = buffer_read_p->buffer_start;
200
197
  end_buffer = buffer_read_p->buffer_end;
201
198
  if (buffer_read_p->next_buffer) {
202
- buffer_size = buffer_read_p->buffer->GetBufferSize() + buffer_read_p->next_buffer->GetBufferSize();
199
+ buffer_size = buffer_read_p->buffer->actual_size + buffer_read_p->next_buffer->actual_size;
203
200
  } else {
204
- buffer_size = buffer_read_p->buffer->GetBufferSize();
201
+ buffer_size = buffer_read_p->buffer->actual_size;
205
202
  }
206
203
  buffer = std::move(buffer_read_p);
207
204
 
@@ -213,8 +210,8 @@ void ParallelCSVReader::SetBufferRead(unique_ptr<CSVBufferRead> buffer_read_p) {
213
210
  }
214
211
 
215
212
  VerificationPositions ParallelCSVReader::GetVerificationPositions() {
216
- verification_positions.beginning_of_first_line += buffer->buffer->GetCSVGlobalStart();
217
- verification_positions.end_of_last_line += buffer->buffer->GetCSVGlobalStart();
213
+ verification_positions.beginning_of_first_line += buffer->buffer->csv_global_start;
214
+ verification_positions.end_of_last_line += buffer->buffer->csv_global_start;
218
215
  return verification_positions;
219
216
  }
220
217
 
@@ -235,15 +232,6 @@ bool ParallelCSVReader::BufferRemainder() {
235
232
  return true;
236
233
  }
237
234
 
238
- void ParallelCSVReader::VerifyLineLength(idx_t line_size) {
239
- if (line_size > options.maximum_line_size) {
240
- throw InvalidInputException("Error in file \"%s\" on line %s: Maximum line size of %llu bytes exceeded!",
241
- options.file_path,
242
- GetLineNumberStr(parse_chunk.size(), linenr_estimated, buffer->batch_index).c_str(),
243
- options.maximum_line_size);
244
- }
245
- }
246
-
247
235
  bool AllNewLine(string_t value, idx_t column_amount) {
248
236
  auto value_str = value.GetString();
249
237
  if (value_str.empty() && column_amount == 1) {
@@ -260,7 +248,7 @@ bool AllNewLine(string_t value, idx_t column_amount) {
260
248
 
261
249
  bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line) {
262
250
  // If line is not set, we have to figure it out, we assume whatever is in the first line
263
- if (options.new_line == NewLineIdentifier::NOT_SET) {
251
+ if (options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
264
252
  idx_t cur_pos = position_buffer;
265
253
  // we can start in the middle of a new line, so move a bit forward.
266
254
  while (cur_pos < end_buffer) {
@@ -324,7 +312,7 @@ value_start : {
324
312
  offset = 0;
325
313
 
326
314
  // this state parses the first character of a value
327
- if ((*buffer)[position_buffer] == options.quote[0]) {
315
+ if ((*buffer)[position_buffer] == options.dialect_options.state_machine_options.quote) {
328
316
  // quote: actual value starts in the next position
329
317
  // move to in_quotes state
330
318
  start_buffer = position_buffer + 1;
@@ -341,10 +329,10 @@ normal : {
341
329
  // this state parses the remainder of a non-quoted value until we reach a delimiter or newline
342
330
  for (; position_buffer < end_buffer; position_buffer++) {
343
331
  auto c = (*buffer)[position_buffer];
344
- if (c == options.delimiter[0]) {
332
+ if (c == options.dialect_options.state_machine_options.delimiter) {
345
333
  // delimiter: end the value and add it to the chunk
346
334
  goto add_value;
347
- } else if (c == options.quote[0] && try_add_line) {
335
+ } else if (c == options.dialect_options.state_machine_options.quote && try_add_line) {
348
336
  return false;
349
337
  } else if (StringUtil::CharacterIsNewline(c)) {
350
338
  // newline: add row
@@ -396,7 +384,7 @@ add_row : {
396
384
  parse_chunk.Reset();
397
385
  return success;
398
386
  } else {
399
- VerifyLineLength(position_buffer - line_start);
387
+ VerifyLineLength(position_buffer - line_start, buffer->batch_index);
400
388
  line_start = position_buffer;
401
389
  finished_chunk = AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
402
390
  }
@@ -413,7 +401,7 @@ add_row : {
413
401
  goto final_state;
414
402
  }
415
403
  if ((*buffer)[position_buffer] == '\n') {
416
- if (options.new_line == NewLineIdentifier::SINGLE) {
404
+ if (options.dialect_options.new_line == NewLineIdentifier::SINGLE) {
417
405
  error_message = "Wrong NewLine Identifier. Expecting \\r\\n";
418
406
  return false;
419
407
  }
@@ -428,7 +416,7 @@ add_row : {
428
416
  goto final_state;
429
417
  }
430
418
  } else {
431
- if (options.new_line == NewLineIdentifier::CARRY_ON) {
419
+ if (options.dialect_options.new_line == NewLineIdentifier::CARRY_ON) {
432
420
  error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
433
421
  return false;
434
422
  }
@@ -441,7 +429,7 @@ add_row : {
441
429
  }
442
430
  goto value_start;
443
431
  } else {
444
- if (options.new_line == NewLineIdentifier::CARRY_ON) {
432
+ if (options.dialect_options.new_line == NewLineIdentifier::CARRY_ON) {
445
433
  error_message = "Wrong NewLine Identifier. Expecting \\r or \\n";
446
434
  return false;
447
435
  }
@@ -452,6 +440,10 @@ add_row : {
452
440
  goto final_state;
453
441
  }
454
442
  SkipEmptyLines();
443
+ if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
444
+ error_message = "Line does not fit in one buffer. Increase the buffer size.";
445
+ return false;
446
+ }
455
447
  verification_positions.end_of_last_line = position_buffer;
456
448
  start_buffer = position_buffer;
457
449
  // \n newline, move to value start
@@ -467,17 +459,17 @@ in_quotes:
467
459
  position_buffer++;
468
460
  for (; position_buffer < end_buffer; position_buffer++) {
469
461
  auto c = (*buffer)[position_buffer];
470
- if (c == options.quote[0]) {
462
+ if (c == options.dialect_options.state_machine_options.quote) {
471
463
  // quote: move to unquoted state
472
464
  goto unquote;
473
- } else if (c == options.escape[0]) {
465
+ } else if (c == options.dialect_options.state_machine_options.escape) {
474
466
  // escape: store the escaped position and move to handle_escape state
475
467
  escape_positions.push_back(position_buffer - start_buffer);
476
468
  goto handle_escape;
477
469
  }
478
470
  }
479
471
  if (!BufferRemainder()) {
480
- if (buffer->buffer->IsCSVFileLastBuffer()) {
472
+ if (buffer->buffer->is_last_buffer) {
481
473
  if (try_add_line) {
482
474
  return false;
483
475
  }
@@ -504,11 +496,13 @@ unquote : {
504
496
  goto final_state;
505
497
  }
506
498
  auto c = (*buffer)[position_buffer];
507
- if (c == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) {
499
+ if (c == options.dialect_options.state_machine_options.quote &&
500
+ (options.dialect_options.state_machine_options.escape == '\0' ||
501
+ options.dialect_options.state_machine_options.escape == options.dialect_options.state_machine_options.quote)) {
508
502
  // escaped quote, return to quoted state and store escape position
509
503
  escape_positions.push_back(position_buffer - start_buffer);
510
504
  goto in_quotes;
511
- } else if (c == options.delimiter[0]) {
505
+ } else if (c == options.dialect_options.state_machine_options.delimiter) {
512
506
  // delimiter, add value
513
507
  offset = 1;
514
508
  goto add_value;
@@ -537,13 +531,14 @@ handle_escape : {
537
531
  if (!BufferRemainder()) {
538
532
  goto final_state;
539
533
  }
540
- if (position_buffer >= buffer_size && buffer->buffer->IsCSVFileLastBuffer()) {
534
+ if (position_buffer >= buffer_size && buffer->buffer->is_last_buffer) {
541
535
  error_message = StringUtil::Format(
542
536
  "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
543
537
  GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
544
538
  return false;
545
539
  }
546
- if ((*buffer)[position_buffer] != options.quote[0] && (*buffer)[position_buffer] != options.escape[0]) {
540
+ if ((*buffer)[position_buffer] != options.dialect_options.state_machine_options.quote &&
541
+ (*buffer)[position_buffer] != options.dialect_options.state_machine_options.escape) {
547
542
  error_message = StringUtil::Format(
548
543
  "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
549
544
  GetLineNumberStr(linenr, linenr_estimated, buffer->local_batch_index).c_str(), options.ToString());
@@ -573,7 +568,8 @@ final_state : {
573
568
  return true;
574
569
  }
575
570
  // If this is the last buffer, we have to read the last value
576
- if (buffer->buffer->IsCSVFileLastBuffer() || (buffer->next_buffer && buffer->next_buffer->IsCSVFileLastBuffer())) {
571
+ if (buffer->buffer->is_last_buffer || !buffer->next_buffer ||
572
+ (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) {
577
573
  if (column > 0 || start_buffer != position_buffer || try_add_line ||
578
574
  (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) {
579
575
  // remaining values to be added to the chunk
@@ -592,9 +588,13 @@ final_state : {
592
588
  reached_remainder_state = false;
593
589
  return success;
594
590
  } else {
595
- VerifyLineLength(position_buffer - line_start);
591
+ VerifyLineLength(position_buffer - line_start, buffer->batch_index);
596
592
  line_start = position_buffer;
597
593
  AddRow(insert_chunk, column, error_message, buffer->local_batch_index);
594
+ if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) {
595
+ error_message = "Line does not fit in one buffer. Increase the buffer size.";
596
+ return false;
597
+ }
598
598
  verification_positions.end_of_last_line = position_buffer;
599
599
  }
600
600
  }
@@ -638,7 +638,7 @@ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
638
638
  idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool stop_at_first) {
639
639
  while (true) {
640
640
  if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) {
641
- auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->GetCSVGlobalStart();
641
+ auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->csv_global_start;
642
642
  return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false, stop_at_first);
643
643
  }
644
644
  }
@@ -0,0 +1,52 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
2
+
3
+ namespace duckdb {
4
+
5
+ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
6
+ CSVStateMachineCache &state_machine_cache_p)
7
+ : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)) {
8
+
9
+ // Check if any type is BLOB
10
+ for (auto &type : options.sql_type_list) {
11
+ if (type.id() == LogicalTypeId::BLOB) {
12
+ throw InvalidInputException(
13
+ "CSV auto-detect for blobs not supported: there may be invalid UTF-8 in the file");
14
+ }
15
+ }
16
+
17
+ // Initialize Format Candidates
18
+ for (const auto &format_template : format_template_candidates) {
19
+ auto &logical_type = format_template.first;
20
+ best_format_candidates[logical_type].clear();
21
+ }
22
+ }
23
+
24
+ SnifferResult CSVSniffer::SniffCSV() {
25
+ // 1. Dialect Detection
26
+ DetectDialect();
27
+ // 2. Type Detection
28
+ DetectTypes();
29
+ // 3. Header Detection
30
+ DetectHeader();
31
+ D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size());
32
+ // 4. Type Replacement
33
+ ReplaceTypes();
34
+ // 5. Type Refinement
35
+ RefineTypes();
36
+ // We are done, construct and return the result.
37
+
38
+ // Set the CSV Options in the reference
39
+ options.dialect_options = best_candidate->dialect_options;
40
+ options.has_header = best_candidate->dialect_options.header;
41
+ options.skip_rows_set = options.dialect_options.skip_rows > 0;
42
+ if (options.has_header) {
43
+ options.dialect_options.true_start = best_start_with_header;
44
+ } else {
45
+ options.dialect_options.true_start = best_start_without_header;
46
+ }
47
+
48
+ // Return the types and names
49
+ return SnifferResult(detected_types, names);
50
+ }
51
+
52
+ } // namespace duckdb