duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -0,0 +1,336 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
2
+ #include "duckdb/main/client_data.hpp"
3
+
4
+ namespace duckdb {
5
+
6
+ struct SniffDialect {
7
+ inline static void Initialize(CSVStateMachine &machine) {
8
+ machine.state = CSVState::STANDARD;
9
+ machine.previous_state = CSVState::STANDARD;
10
+ machine.pre_previous_state = CSVState::STANDARD;
11
+ machine.cur_rows = 0;
12
+ machine.column_count = 1;
13
+ }
14
+
15
+ inline static bool Process(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts, char current_char,
16
+ idx_t current_pos) {
17
+
18
+ D_ASSERT(sniffed_column_counts.size() == machine.options.sample_chunk_size);
19
+
20
+ if (machine.state == CSVState::INVALID) {
21
+ sniffed_column_counts.clear();
22
+ return true;
23
+ }
24
+ machine.pre_previous_state = machine.previous_state;
25
+ machine.previous_state = machine.state;
26
+
27
+ machine.state = static_cast<CSVState>(
28
+ machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
29
+
30
+ bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
31
+ machine.column_count += machine.previous_state == CSVState::DELIMITER;
32
+ sniffed_column_counts[machine.cur_rows] = machine.column_count;
33
+ machine.cur_rows +=
34
+ machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
35
+ machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR);
36
+
37
+ // It means our carriage return is actually a record separator
38
+ machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
39
+ machine.column_count -=
40
+ (machine.column_count - 1) * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);
41
+
42
+ // Identify what is our line separator
43
+ machine.carry_on_separator =
44
+ (machine.state == CSVState::RECORD_SEPARATOR && carriage_return) || machine.carry_on_separator;
45
+ machine.single_record_separator = ((machine.state != CSVState::RECORD_SEPARATOR && carriage_return) ||
46
+ (machine.state == CSVState::RECORD_SEPARATOR && !carriage_return)) ||
47
+ machine.single_record_separator;
48
+ if (machine.cur_rows >= machine.options.sample_chunk_size) {
49
+ // We sniffed enough rows
50
+ return true;
51
+ }
52
+ return false;
53
+ }
54
+ inline static void Finalize(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts) {
55
+ if (machine.state == CSVState::INVALID) {
56
+ return;
57
+ }
58
+ if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
59
+ sniffed_column_counts[machine.cur_rows++] = machine.column_count;
60
+ }
61
+ NewLineIdentifier suggested_newline;
62
+ if (machine.carry_on_separator) {
63
+ if (machine.single_record_separator) {
64
+ suggested_newline = NewLineIdentifier::MIX;
65
+ } else {
66
+ suggested_newline = NewLineIdentifier::CARRY_ON;
67
+ }
68
+ } else {
69
+ suggested_newline = NewLineIdentifier::SINGLE;
70
+ }
71
+ if (machine.options.dialect_options.new_line == NewLineIdentifier::NOT_SET) {
72
+ machine.dialect_options.new_line = suggested_newline;
73
+ } else {
74
+ if (machine.options.dialect_options.new_line != suggested_newline) {
75
+ // Invalidate this whole detection
76
+ machine.cur_rows = 0;
77
+ }
78
+ }
79
+ sniffed_column_counts.erase(sniffed_column_counts.begin() + machine.cur_rows, sniffed_column_counts.end());
80
+ }
81
+ };
82
+
83
+ void CSVSniffer::GenerateCandidateDetectionSearchSpace(vector<char> &delim_candidates,
84
+ vector<QuoteRule> &quoterule_candidates,
85
+ unordered_map<uint8_t, vector<char>> &quote_candidates_map,
86
+ unordered_map<uint8_t, vector<char>> &escape_candidates_map) {
87
+ if (options.has_delimiter) {
88
+ // user provided a delimiter: use that delimiter
89
+ delim_candidates = {options.dialect_options.state_machine_options.delimiter};
90
+ } else {
91
+ // no delimiter provided: try standard/common delimiters
92
+ delim_candidates = {',', '|', ';', '\t'};
93
+ }
94
+ if (options.has_quote) {
95
+ // user provided quote: use that quote rule
96
+ quote_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {options.dialect_options.state_machine_options.quote};
97
+ quote_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {options.dialect_options.state_machine_options.quote};
98
+ quote_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {options.dialect_options.state_machine_options.quote};
99
+ } else {
100
+ // no quote rule provided: use standard/common quotes
101
+ quote_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\"'};
102
+ quote_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\"', '\''};
103
+ quote_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'};
104
+ }
105
+ if (options.has_escape) {
106
+ // user provided escape: use that escape rule
107
+ if (options.dialect_options.state_machine_options.escape == '\0') {
108
+ quoterule_candidates = {QuoteRule::QUOTES_RFC};
109
+ } else {
110
+ quoterule_candidates = {QuoteRule::QUOTES_OTHER};
111
+ }
112
+ escape_candidates_map[(uint8_t)quoterule_candidates[0]] = {
113
+ options.dialect_options.state_machine_options.escape};
114
+ } else {
115
+ // no escape provided: try standard/common escapes
116
+ quoterule_candidates = {QuoteRule::QUOTES_RFC, QuoteRule::QUOTES_OTHER, QuoteRule::NO_QUOTES};
117
+ }
118
+ }
119
+
120
+ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachine>> &csv_state_machines,
121
+ const vector<char> &delimiter_candidates,
122
+ const vector<QuoteRule> &quoterule_candidates,
123
+ const unordered_map<uint8_t, vector<char>> &quote_candidates_map,
124
+ const unordered_map<uint8_t, vector<char>> &escape_candidates_map) {
125
+ // Generate state machines for all option combinations
126
+ for (const auto quoterule : quoterule_candidates) {
127
+ const auto &quote_candidates = quote_candidates_map.at((uint8_t)quoterule);
128
+ for (const auto &quote : quote_candidates) {
129
+ for (const auto &delimiter : delimiter_candidates) {
130
+ const auto &escape_candidates = escape_candidates_map.at((uint8_t)quoterule);
131
+ for (const auto &escape : escape_candidates) {
132
+ D_ASSERT(buffer_manager);
133
+ CSVStateMachineOptions state_machine_options(delimiter, quote, escape);
134
+ csv_state_machines.emplace_back(make_uniq<CSVStateMachine>(options, state_machine_options,
135
+ buffer_manager, state_machine_cache));
136
+ }
137
+ }
138
+ }
139
+ }
140
+ }
141
+
142
+ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machine, idx_t &rows_read,
143
+ idx_t &best_consistent_rows, idx_t &prev_padding_count) {
144
+ // The sniffed_column_counts variable keeps track of the number of columns found for each row
145
+ vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
146
+
147
+ state_machine->csv_buffer_iterator.Process<SniffDialect>(*state_machine, sniffed_column_counts);
148
+ idx_t start_row = options.dialect_options.skip_rows;
149
+ idx_t consistent_rows = 0;
150
+ idx_t num_cols = sniffed_column_counts.empty() ? 0 : sniffed_column_counts[0];
151
+ idx_t padding_count = 0;
152
+ bool allow_padding = options.null_padding;
153
+ if (sniffed_column_counts.size() > rows_read) {
154
+ rows_read = sniffed_column_counts.size();
155
+ }
156
+ for (idx_t row = 0; row < sniffed_column_counts.size(); row++) {
157
+ if (sniffed_column_counts[row] == num_cols) {
158
+ consistent_rows++;
159
+ } else if (num_cols < sniffed_column_counts[row] && !options.skip_rows_set) {
160
+ // all rows up to this point will need padding
161
+ padding_count = 0;
162
+ // we use the maximum amount of num_cols that we find
163
+ num_cols = sniffed_column_counts[row];
164
+ start_row = row + options.dialect_options.skip_rows;
165
+ consistent_rows = 1;
166
+
167
+ } else if (num_cols >= sniffed_column_counts[row]) {
168
+ // we are missing some columns, we can parse this as long as we add padding
169
+ padding_count++;
170
+ }
171
+ }
172
+
173
+ // Calculate the total number of consistent rows after adding padding.
174
+ consistent_rows += padding_count;
175
+
176
+ // Whether there are more values (rows) available that are consistent, exceeding the current best.
177
+ bool more_values = (consistent_rows > best_consistent_rows && num_cols >= max_columns_found);
178
+
179
+ // If additional padding is required when compared to the previous padding count.
180
+ bool require_more_padding = padding_count > prev_padding_count;
181
+
182
+ // If less padding is now required when compared to the previous padding count.
183
+ bool require_less_padding = padding_count < prev_padding_count;
184
+
185
+ // If there was only a single column before, and the new number of columns exceeds that.
186
+ bool single_column_before = max_columns_found < 2 && num_cols > max_columns_found;
187
+
188
+ // If the number of rows is consistent with the calculated value after accounting for skipped rows and the
189
+ // start row.
190
+ bool rows_consistent =
191
+ start_row + consistent_rows - options.dialect_options.skip_rows == sniffed_column_counts.size();
192
+
193
+ // If there are more than one consistent row.
194
+ bool more_than_one_row = (consistent_rows > 1);
195
+
196
+ // If there are more than one column.
197
+ bool more_than_one_column = (num_cols > 1);
198
+
199
+ // If the start position is valid.
200
+ bool start_good = !candidates.empty() && (start_row <= candidates.front()->start_row);
201
+
202
+ // If padding happened but it is not allowed.
203
+ bool invalid_padding = !allow_padding && padding_count > 0;
204
+
205
+ // If rows are consistent and no invalid padding happens, this is the best suitable candidate if one of the
206
+ // following is valid:
207
+ // - There's a single column before.
208
+ // - There are more values and no additional padding is required.
209
+ // - There's more than one column and less padding is required.
210
+ if (rows_consistent &&
211
+ (single_column_before || (more_values && !require_more_padding) ||
212
+ (more_than_one_column && require_less_padding)) &&
213
+ !invalid_padding) {
214
+ best_consistent_rows = consistent_rows;
215
+ max_columns_found = num_cols;
216
+ prev_padding_count = padding_count;
217
+ state_machine->start_row = start_row;
218
+ candidates.clear();
219
+ state_machine->dialect_options.num_cols = num_cols;
220
+ candidates.emplace_back(std::move(state_machine));
221
+ return;
222
+ }
223
+ // If there's more than one row and column, the start is good, rows are consistent,
224
+ // no additional padding is required, and there is no invalid padding, and there is not yet a candidate
225
+ // with the same quote, we add this state_machine as a suitable candidate.
226
+ if (more_than_one_row && more_than_one_column && start_good && rows_consistent && !require_more_padding &&
227
+ !invalid_padding) {
228
+ bool same_quote_is_candidate = false;
229
+ for (auto &candidate : candidates) {
230
+ if (state_machine->dialect_options.state_machine_options.quote ==
231
+ candidate->dialect_options.state_machine_options.quote) {
232
+ same_quote_is_candidate = true;
233
+ }
234
+ }
235
+ if (!same_quote_is_candidate) {
236
+ state_machine->start_row = start_row;
237
+ state_machine->dialect_options.num_cols = num_cols;
238
+ candidates.emplace_back(std::move(state_machine));
239
+ }
240
+ }
241
+ }
242
+
243
+ bool CSVSniffer::RefineCandidateNextChunk(CSVStateMachine &candidate) {
244
+ vector<idx_t> sniffed_column_counts(options.sample_chunk_size);
245
+ candidate.csv_buffer_iterator.Process<SniffDialect>(candidate, sniffed_column_counts);
246
+ bool allow_padding = options.null_padding;
247
+
248
+ for (idx_t row = 0; row < sniffed_column_counts.size(); row++) {
249
+ if (max_columns_found != sniffed_column_counts[row] && !allow_padding) {
250
+ return false;
251
+ }
252
+ }
253
+ return true;
254
+ }
255
+
256
+ void CSVSniffer::RefineCandidates() {
257
+ // It's very frequent that more than one dialect can parse a csv file, hence here we run one state machine
258
+ // fully on the whole sample dataset, when/if it fails we go to the next one.
259
+ if (candidates.empty()) {
260
+ // No candidates to refine
261
+ return;
262
+ }
263
+ if (candidates.size() == 1 || candidates[0]->csv_buffer_iterator.Finished()) {
264
+ // Only one candidate nothing to refine or all candidates already checked
265
+ return;
266
+ }
267
+ for (auto &cur_candidate : candidates) {
268
+ for (idx_t i = 1; i <= options.sample_chunks; i++) {
269
+ bool finished_file = cur_candidate->csv_buffer_iterator.Finished();
270
+ if (finished_file || i == options.sample_chunks) {
271
+ // we finished the file or our chunk sample successfully: stop
272
+ auto successful_candidate = std::move(cur_candidate);
273
+ candidates.clear();
274
+ candidates.emplace_back(std::move(successful_candidate));
275
+ return;
276
+ }
277
+ cur_candidate->cur_rows = 0;
278
+ cur_candidate->column_count = 1;
279
+ if (!RefineCandidateNextChunk(*cur_candidate)) {
280
+ // This candidate failed, move to the next one
281
+ break;
282
+ }
283
+ }
284
+ }
285
+ candidates.clear();
286
+ return;
287
+ }
288
+
289
+ // Dialect Detection consists of five steps:
290
+ // 1. Generate a search space of all possible dialects
291
+ // 2. Generate a state machine for each dialect
292
+ // 3. Analyze the first chunk of the file and find the best dialect candidates
293
+ // 4. Analyze the remaining chunks of the file and find the best dialect candidate
294
+ void CSVSniffer::DetectDialect() {
295
+ // Variables for Dialect Detection
296
+ // Candidates for the delimiter
297
+ vector<char> delim_candidates;
298
+ // Quote-Rule Candidates
299
+ vector<QuoteRule> quoterule_candidates;
300
+ // Candidates for the quote option
301
+ unordered_map<uint8_t, vector<char>> quote_candidates_map;
302
+ // Candidates for the escape option
303
+ unordered_map<uint8_t, vector<char>> escape_candidates_map;
304
+ escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\0', '\"', '\''};
305
+ escape_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\\'};
306
+ escape_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'};
307
+ // Number of rows read
308
+ idx_t rows_read = 0;
309
+ // Best Number of consistent rows (i.e., presenting all columns)
310
+ idx_t best_consistent_rows = 0;
311
+ // If padding was necessary (i.e., rows are missing some columns, how many)
312
+ idx_t prev_padding_count = 0;
313
+ // Vector of CSV State Machines
314
+ vector<unique_ptr<CSVStateMachine>> csv_state_machines;
315
+
316
+ // Step 1: Generate search space
317
+ GenerateCandidateDetectionSearchSpace(delim_candidates, quoterule_candidates, quote_candidates_map,
318
+ escape_candidates_map);
319
+ // Step 2: Generate state machines
320
+ GenerateStateMachineSearchSpace(csv_state_machines, delim_candidates, quoterule_candidates, quote_candidates_map,
321
+ escape_candidates_map);
322
+ // Step 3: Analyze all candidates on the first chunk
323
+ for (auto &state_machine : csv_state_machines) {
324
+ state_machine->Reset();
325
+ AnalyzeDialectCandidate(std::move(state_machine), rows_read, best_consistent_rows, prev_padding_count);
326
+ }
327
+ // Step 4: Loop over candidates and find if they can still produce good results for the remaining chunks
328
+ RefineCandidates();
329
+ // if no dialect candidate was found, we throw an exception
330
+ if (candidates.empty()) {
331
+ throw InvalidInputException(
332
+ "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.",
333
+ options.file_path);
334
+ }
335
+ }
336
+ } // namespace duckdb
@@ -0,0 +1,165 @@
1
+ #include "duckdb/common/types/cast_helpers.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
3
+ #include "utf8proc.hpp"
4
+
5
+ namespace duckdb {
6
+
7
+ // Helper function to generate column names
8
+ static string GenerateColumnName(const idx_t total_cols, const idx_t col_number, const string &prefix = "column") {
9
+ int max_digits = NumericHelper::UnsignedLength(total_cols - 1);
10
+ int digits = NumericHelper::UnsignedLength(col_number);
11
+ string leading_zeros = string(max_digits - digits, '0');
12
+ string value = to_string(col_number);
13
+ return string(prefix + leading_zeros + value);
14
+ }
15
+
16
+ // Helper function for UTF-8 aware space trimming
17
+ static string TrimWhitespace(const string &col_name) {
18
+ utf8proc_int32_t codepoint;
19
+ auto str = reinterpret_cast<const utf8proc_uint8_t *>(col_name.c_str());
20
+ idx_t size = col_name.size();
21
+ // Find the first character that is not left trimmed
22
+ idx_t begin = 0;
23
+ while (begin < size) {
24
+ auto bytes = utf8proc_iterate(str + begin, size - begin, &codepoint);
25
+ D_ASSERT(bytes > 0);
26
+ if (utf8proc_category(codepoint) != UTF8PROC_CATEGORY_ZS) {
27
+ break;
28
+ }
29
+ begin += bytes;
30
+ }
31
+
32
+ // Find the last character that is not right trimmed
33
+ idx_t end;
34
+ end = begin;
35
+ for (auto next = begin; next < col_name.size();) {
36
+ auto bytes = utf8proc_iterate(str + next, size - next, &codepoint);
37
+ D_ASSERT(bytes > 0);
38
+ next += bytes;
39
+ if (utf8proc_category(codepoint) != UTF8PROC_CATEGORY_ZS) {
40
+ end = next;
41
+ }
42
+ }
43
+
44
+ // return the trimmed string
45
+ return col_name.substr(begin, end - begin);
46
+ }
47
+
48
+ static string NormalizeColumnName(const string &col_name) {
49
+ // normalize UTF8 characters to NFKD
50
+ auto nfkd = utf8proc_NFKD(reinterpret_cast<const utf8proc_uint8_t *>(col_name.c_str()), col_name.size());
51
+ const string col_name_nfkd = string(const_char_ptr_cast(nfkd), strlen(const_char_ptr_cast(nfkd)));
52
+ free(nfkd);
53
+
54
+ // only keep ASCII characters 0-9 a-z A-Z and replace spaces with regular whitespace
55
+ string col_name_ascii = "";
56
+ for (idx_t i = 0; i < col_name_nfkd.size(); i++) {
57
+ if (col_name_nfkd[i] == '_' || (col_name_nfkd[i] >= '0' && col_name_nfkd[i] <= '9') ||
58
+ (col_name_nfkd[i] >= 'A' && col_name_nfkd[i] <= 'Z') ||
59
+ (col_name_nfkd[i] >= 'a' && col_name_nfkd[i] <= 'z')) {
60
+ col_name_ascii += col_name_nfkd[i];
61
+ } else if (StringUtil::CharacterIsSpace(col_name_nfkd[i])) {
62
+ col_name_ascii += " ";
63
+ }
64
+ }
65
+
66
+ // trim whitespace and replace remaining whitespace by _
67
+ string col_name_trimmed = TrimWhitespace(col_name_ascii);
68
+ string col_name_cleaned = "";
69
+ bool in_whitespace = false;
70
+ for (idx_t i = 0; i < col_name_trimmed.size(); i++) {
71
+ if (col_name_trimmed[i] == ' ') {
72
+ if (!in_whitespace) {
73
+ col_name_cleaned += "_";
74
+ in_whitespace = true;
75
+ }
76
+ } else {
77
+ col_name_cleaned += col_name_trimmed[i];
78
+ in_whitespace = false;
79
+ }
80
+ }
81
+
82
+ // don't leave string empty; if not empty, make lowercase
83
+ if (col_name_cleaned.empty()) {
84
+ col_name_cleaned = "_";
85
+ } else {
86
+ col_name_cleaned = StringUtil::Lower(col_name_cleaned);
87
+ }
88
+
89
+ // prepend _ if name starts with a digit or is a reserved keyword
90
+ if (KeywordHelper::IsKeyword(col_name_cleaned) || (col_name_cleaned[0] >= '0' && col_name_cleaned[0] <= '9')) {
91
+ col_name_cleaned = "_" + col_name_cleaned;
92
+ }
93
+ return col_name_cleaned;
94
+ }
95
+ void CSVSniffer::DetectHeader() {
96
+ // information for header detection
97
+ bool first_row_consistent = true;
98
+ // check if header row is all null and/or consistent with detected column data types
99
+ bool first_row_nulls = true;
100
+ // This case will fail in dialect detection, so we assert here just for sanity
101
+ D_ASSERT(best_candidate->options.null_padding ||
102
+ best_sql_types_candidates_per_column_idx.size() == best_header_row.size());
103
+ for (idx_t col = 0; col < best_header_row.size(); col++) {
104
+ auto dummy_val = best_header_row[col];
105
+ if (!dummy_val.IsNull()) {
106
+ first_row_nulls = false;
107
+ }
108
+
109
+ // try cast to sql_type of column
110
+ const auto &sql_type = best_sql_types_candidates_per_column_idx[col].back();
111
+ if (!TryCastValue(*best_candidate, dummy_val, sql_type)) {
112
+ first_row_consistent = false;
113
+ }
114
+ }
115
+ bool has_header;
116
+ if (!best_candidate->options.has_header) {
117
+ has_header = !first_row_consistent || first_row_nulls;
118
+ } else {
119
+ has_header = best_candidate->options.dialect_options.header;
120
+ }
121
+ // update parser info, and read, generate & set col_names based on previous findings
122
+ if (has_header) {
123
+ best_candidate->dialect_options.header = true;
124
+ case_insensitive_map_t<idx_t> name_collision_count;
125
+
126
+ // get header names from CSV
127
+ for (idx_t col = 0; col < best_header_row.size(); col++) {
128
+ const auto &val = best_header_row[col];
129
+ string col_name = val.ToString();
130
+
131
+ // generate name if field is empty
132
+ if (col_name.empty() || val.IsNull()) {
133
+ col_name = GenerateColumnName(best_candidate->dialect_options.num_cols, col);
134
+ }
135
+
136
+ // normalize names or at least trim whitespace
137
+ if (best_candidate->options.normalize_names) {
138
+ col_name = NormalizeColumnName(col_name);
139
+ } else {
140
+ col_name = TrimWhitespace(col_name);
141
+ }
142
+
143
+ // avoid duplicate header names
144
+ while (name_collision_count.find(col_name) != name_collision_count.end()) {
145
+ name_collision_count[col_name] += 1;
146
+ col_name = col_name + "_" + to_string(name_collision_count[col_name]);
147
+ }
148
+ names.push_back(col_name);
149
+ name_collision_count[col_name] = 0;
150
+ }
151
+
152
+ } else {
153
+ best_candidate->dialect_options.header = false;
154
+ for (idx_t col = 0; col < best_candidate->dialect_options.num_cols; col++) {
155
+ string column_name = GenerateColumnName(best_candidate->dialect_options.num_cols, col);
156
+ names.push_back(column_name);
157
+ }
158
+ }
159
+
160
+ // If the user provided names, we must replace our header with the user provided names
161
+ for (idx_t i = 0; i < MinValue<idx_t>(names.size(), best_candidate->options.name_list.size()); i++) {
162
+ names[i] = best_candidate->options.name_list[i];
163
+ }
164
+ }
165
+ } // namespace duckdb