duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -0,0 +1,398 @@
1
+ #include "duckdb/common/operator/decimal_cast_operators.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
3
+ #include "duckdb/common/algorithm.hpp"
4
+ #include "duckdb/common/string.hpp"
5
+
6
+ namespace duckdb {
7
+ struct TryCastFloatingOperator {
8
+ template <class OP, class T>
9
+ static bool Operation(string_t input) {
10
+ T result;
11
+ string error_message;
12
+ return OP::Operation(input, result, &error_message);
13
+ }
14
+ };
15
+
16
+ struct TupleSniffing {
17
+ idx_t line_number;
18
+ idx_t position;
19
+ bool set = false;
20
+ vector<Value> values;
21
+ };
22
+
23
+ static bool StartsWithNumericDate(string &separator, const string &value) {
24
+ auto begin = value.c_str();
25
+ auto end = begin + value.size();
26
+
27
+ // StrpTimeFormat::Parse will skip whitespace, so we can too
28
+ auto field1 = std::find_if_not(begin, end, StringUtil::CharacterIsSpace);
29
+ if (field1 == end) {
30
+ return false;
31
+ }
32
+
33
+ // first numeric field must start immediately
34
+ if (!StringUtil::CharacterIsDigit(*field1)) {
35
+ return false;
36
+ }
37
+ auto literal1 = std::find_if_not(field1, end, StringUtil::CharacterIsDigit);
38
+ if (literal1 == end) {
39
+ return false;
40
+ }
41
+
42
+ // second numeric field must exist
43
+ auto field2 = std::find_if(literal1, end, StringUtil::CharacterIsDigit);
44
+ if (field2 == end) {
45
+ return false;
46
+ }
47
+ auto literal2 = std::find_if_not(field2, end, StringUtil::CharacterIsDigit);
48
+ if (literal2 == end) {
49
+ return false;
50
+ }
51
+
52
+ // third numeric field must exist
53
+ auto field3 = std::find_if(literal2, end, StringUtil::CharacterIsDigit);
54
+ if (field3 == end) {
55
+ return false;
56
+ }
57
+
58
+ // second literal must match first
59
+ if (((field3 - literal2) != (field2 - literal1)) || strncmp(literal1, literal2, (field2 - literal1)) != 0) {
60
+ return false;
61
+ }
62
+
63
+ // copy the literal as the separator, escaping percent signs
64
+ separator.clear();
65
+ while (literal1 < field2) {
66
+ const auto literal_char = *literal1++;
67
+ if (literal_char == '%') {
68
+ separator.push_back(literal_char);
69
+ }
70
+ separator.push_back(literal_char);
71
+ }
72
+
73
+ return true;
74
+ }
75
+
76
+ string GenerateDateFormat(const string &separator, const char *format_template) {
77
+ string format_specifier = format_template;
78
+ auto amount_of_dashes = std::count(format_specifier.begin(), format_specifier.end(), '-');
79
+ // All our date formats must have at least one -
80
+ D_ASSERT(amount_of_dashes);
81
+ string result;
82
+ result.reserve(format_specifier.size() - amount_of_dashes + (amount_of_dashes * separator.size()));
83
+ for (auto &character : format_specifier) {
84
+ if (character == '-') {
85
+ result += separator;
86
+ } else {
87
+ result += character;
88
+ }
89
+ }
90
+ return result;
91
+ }
92
+
93
+ bool CSVSniffer::TryCastValue(CSVStateMachine &candidate, const Value &value, const LogicalType &sql_type) {
94
+ if (value.IsNull()) {
95
+ return true;
96
+ }
97
+ if (candidate.dialect_options.has_format.find(LogicalTypeId::DATE)->second &&
98
+ sql_type.id() == LogicalTypeId::DATE) {
99
+ date_t result;
100
+ string error_message;
101
+ return candidate.dialect_options.date_format.find(LogicalTypeId::DATE)
102
+ ->second.TryParseDate(string_t(StringValue::Get(value)), result, error_message);
103
+ }
104
+ if (candidate.dialect_options.has_format.find(LogicalTypeId::TIMESTAMP)->second &&
105
+ sql_type.id() == LogicalTypeId::TIMESTAMP) {
106
+ timestamp_t result;
107
+ string error_message;
108
+ return candidate.dialect_options.date_format.find(LogicalTypeId::TIMESTAMP)
109
+ ->second.TryParseTimestamp(string_t(StringValue::Get(value)), result, error_message);
110
+ }
111
+ if (candidate.options.decimal_separator != "." && (sql_type.id() == LogicalTypeId::DOUBLE)) {
112
+ return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, double>(StringValue::Get(value));
113
+ }
114
+ Value new_value;
115
+ string error_message;
116
+ return value.TryCastAs(buffer_manager->context, sql_type, new_value, &error_message, true);
117
+ }
118
+
119
+ void CSVSniffer::SetDateFormat(CSVStateMachine &candidate, const string &format_specifier,
120
+ const LogicalTypeId &sql_type) {
121
+ candidate.dialect_options.has_format[sql_type] = true;
122
+ auto &date_format = candidate.dialect_options.date_format[sql_type];
123
+ date_format.format_specifier = format_specifier;
124
+ StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
125
+ }
126
+
127
+ struct SniffValue {
128
+ inline static void Initialize(CSVStateMachine &machine) {
129
+ machine.state = CSVState::STANDARD;
130
+ machine.previous_state = CSVState::STANDARD;
131
+ machine.pre_previous_state = CSVState::STANDARD;
132
+ machine.cur_rows = 0;
133
+ machine.value = "";
134
+ machine.rows_read = 0;
135
+ }
136
+
137
+ inline static bool Process(CSVStateMachine &machine, vector<TupleSniffing> &sniffed_values, char current_char,
138
+ idx_t current_pos) {
139
+
140
+ if ((machine.dialect_options.new_line == NewLineIdentifier::SINGLE &&
141
+ (current_char == '\r' || current_char == '\n')) ||
142
+ (machine.dialect_options.new_line == NewLineIdentifier::CARRY_ON && current_char == '\n')) {
143
+ machine.rows_read++;
144
+ sniffed_values[machine.cur_rows].position = machine.line_start_pos;
145
+ sniffed_values[machine.cur_rows].set = true;
146
+ machine.line_start_pos = current_pos;
147
+ }
148
+ machine.pre_previous_state = machine.previous_state;
149
+ machine.previous_state = machine.state;
150
+ machine.state = static_cast<CSVState>(
151
+ machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);
152
+
153
+ bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
154
+ if (machine.previous_state == CSVState::DELIMITER ||
155
+ (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
156
+ (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
157
+ // Started a new value
158
+ // Check if it's UTF-8
159
+ machine.VerifyUTF8();
160
+ if (machine.value.empty() || machine.value == machine.options.null_str) {
161
+ // We set empty == null value
162
+ sniffed_values[machine.cur_rows].values.push_back(Value(LogicalType::VARCHAR));
163
+ } else {
164
+ sniffed_values[machine.cur_rows].values.push_back(Value(machine.value));
165
+ }
166
+ sniffed_values[machine.cur_rows].line_number = machine.rows_read;
167
+
168
+ machine.value = "";
169
+ }
170
+ if (machine.state == CSVState::STANDARD ||
171
+ (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) {
172
+ machine.value += current_char;
173
+ }
174
+ machine.cur_rows +=
175
+ machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
176
+ // It means our carriage return is actually a record separator
177
+ machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
178
+ if (machine.cur_rows >= sniffed_values.size()) {
179
+ // We sniffed enough rows
180
+ return true;
181
+ }
182
+ return false;
183
+ }
184
+
185
+ inline static void Finalize(CSVStateMachine &machine, vector<TupleSniffing> &sniffed_values) {
186
+ if (machine.cur_rows < sniffed_values.size() && machine.state != CSVState::EMPTY_LINE) {
187
+ machine.VerifyUTF8();
188
+ sniffed_values[machine.cur_rows].line_number = machine.rows_read;
189
+ if (!sniffed_values[machine.cur_rows].set) {
190
+ sniffed_values[machine.cur_rows].position = machine.line_start_pos;
191
+ sniffed_values[machine.cur_rows].set = true;
192
+ }
193
+
194
+ sniffed_values[machine.cur_rows++].values.push_back(Value(machine.value));
195
+ }
196
+ sniffed_values.erase(sniffed_values.end() - (sniffed_values.size() - machine.cur_rows), sniffed_values.end());
197
+ }
198
+ };
199
+
200
+ void CSVSniffer::DetectDateAndTimeStampFormats(CSVStateMachine &candidate,
201
+ map<LogicalTypeId, bool> &has_format_candidates,
202
+ map<LogicalTypeId, vector<string>> &format_candidates,
203
+ const LogicalType &sql_type, const string &separator, Value &dummy_val) {
204
+ // generate date format candidates the first time through
205
+ auto &type_format_candidates = format_candidates[sql_type.id()];
206
+ const auto had_format_candidates = has_format_candidates[sql_type.id()];
207
+ if (!has_format_candidates[sql_type.id()]) {
208
+ has_format_candidates[sql_type.id()] = true;
209
+ // order by preference
210
+ auto entry = format_template_candidates.find(sql_type.id());
211
+ if (entry != format_template_candidates.end()) {
212
+ const auto &format_template_list = entry->second;
213
+ for (const auto &t : format_template_list) {
214
+ const auto format_string = GenerateDateFormat(separator, t);
215
+ // don't parse ISO 8601
216
+ if (format_string.find("%Y-%m-%d") == string::npos) {
217
+ type_format_candidates.emplace_back(format_string);
218
+ }
219
+ }
220
+ }
221
+ // initialise the first candidate
222
+ candidate.dialect_options.has_format[sql_type.id()] = true;
223
+ // all formats are constructed to be valid
224
+ SetDateFormat(candidate, type_format_candidates.back(), sql_type.id());
225
+ }
226
+ // check all formats and keep the first one that works
227
+ StrpTimeFormat::ParseResult result;
228
+ auto save_format_candidates = type_format_candidates;
229
+ while (!type_format_candidates.empty()) {
230
+ // avoid using exceptions for flow control...
231
+ auto &current_format = candidate.dialect_options.date_format[sql_type.id()];
232
+ if (current_format.Parse(StringValue::Get(dummy_val), result)) {
233
+ break;
234
+ }
235
+ // doesn't work - move to the next one
236
+ type_format_candidates.pop_back();
237
+ candidate.dialect_options.has_format[sql_type.id()] = (!type_format_candidates.empty());
238
+ if (!type_format_candidates.empty()) {
239
+ SetDateFormat(candidate, type_format_candidates.back(), sql_type.id());
240
+ }
241
+ }
242
+ // if none match, then this is not a value of type sql_type,
243
+ if (type_format_candidates.empty()) {
244
+ // so restore the candidates that did work.
245
+ // or throw them out if they were generated by this value.
246
+ if (had_format_candidates) {
247
+ type_format_candidates.swap(save_format_candidates);
248
+ if (!type_format_candidates.empty()) {
249
+ SetDateFormat(candidate, type_format_candidates.back(), sql_type.id());
250
+ }
251
+ } else {
252
+ has_format_candidates[sql_type.id()] = false;
253
+ }
254
+ }
255
+ }
256
+
257
+ void CSVSniffer::DetectTypes() {
258
+ idx_t min_varchar_cols = max_columns_found + 1;
259
+ vector<LogicalType> return_types;
260
+ // check which info candidate leads to minimum amount of non-varchar columns...
261
+ for (auto &candidate : candidates) {
262
+ unordered_map<idx_t, vector<LogicalType>> info_sql_types_candidates;
263
+ for (idx_t i = 0; i < candidate->dialect_options.num_cols; i++) {
264
+ info_sql_types_candidates[i] = candidate->options.auto_type_candidates;
265
+ }
266
+ map<LogicalTypeId, bool> has_format_candidates;
267
+ map<LogicalTypeId, vector<string>> format_candidates;
268
+ for (const auto &t : format_template_candidates) {
269
+ has_format_candidates[t.first] = false;
270
+ format_candidates[t.first].clear();
271
+ }
272
+ D_ASSERT(candidate->dialect_options.num_cols > 0);
273
+
274
+ // Set all return_types to VARCHAR so we can do datatype detection based on VARCHAR values
275
+ return_types.clear();
276
+ return_types.assign(candidate->dialect_options.num_cols, LogicalType::VARCHAR);
277
+
278
+ // Reset candidate for parsing
279
+ candidate->Reset();
280
+
281
+ // Parse chunk and read csv with info candidate
282
+ idx_t sample_size = options.sample_chunk_size;
283
+ if (options.sample_chunk_size == 1) {
284
+ sample_size++;
285
+ }
286
+ vector<TupleSniffing> tuples(sample_size);
287
+ candidate->csv_buffer_iterator.Process<SniffValue>(*candidate, tuples);
288
+ // Potentially Skip empty rows (I find this dirty, but it is what the original code does)
289
+ idx_t true_start = 0;
290
+ idx_t values_start = 0;
291
+ while (true_start < tuples.size()) {
292
+ if (tuples[true_start].values.empty() ||
293
+ (tuples[true_start].values.size() == 1 && tuples[true_start].values[0].IsNull())) {
294
+ true_start = tuples[true_start].line_number;
295
+ values_start++;
296
+ } else {
297
+ break;
298
+ }
299
+ }
300
+
301
+ // Potentially Skip Notes (I also find this dirty, but it is what the original code does)
302
+ while (true_start < tuples.size()) {
303
+ if (tuples[true_start].values.size() < max_columns_found) {
304
+ true_start = tuples[true_start].line_number;
305
+ values_start++;
306
+ } else {
307
+ break;
308
+ }
309
+ }
310
+
311
+ tuples.erase(tuples.begin(), tuples.begin() + values_start);
312
+ idx_t row_idx = 0;
313
+ if (tuples.size() > 1 && (!options.has_header || (options.has_header && options.dialect_options.header))) {
314
+ // This means we have more than one row, hence we can use the first row to detect if we have a header
315
+ row_idx = 1;
316
+ }
317
+ if (!tuples.empty()) {
318
+ best_start_without_header = tuples[0].position;
319
+ }
320
+
321
+ // First line where we start our type detection
322
+ const idx_t start_idx_detection = row_idx;
323
+ for (; row_idx < tuples.size(); row_idx++) {
324
+ for (idx_t col = 0; col < tuples[row_idx].values.size(); col++) {
325
+ auto &col_type_candidates = info_sql_types_candidates[col];
326
+ auto cur_top_candidate = col_type_candidates.back();
327
+ auto dummy_val = tuples[row_idx].values[col];
328
+ // try cast from string to sql_type
329
+ while (col_type_candidates.size() > 1) {
330
+ const auto &sql_type = col_type_candidates.back();
331
+ // try formatting for date types if the user did not specify one and it starts with numeric values.
332
+ string separator;
333
+ bool has_format_is_set = false;
334
+ auto format_iterator = candidate->dialect_options.has_format.find(sql_type.id());
335
+ if (format_iterator != candidate->dialect_options.has_format.end()) {
336
+ has_format_is_set = format_iterator->second;
337
+ }
338
+ if (has_format_candidates.count(sql_type.id()) &&
339
+ (!has_format_is_set || format_candidates[sql_type.id()].size() > 1) && !dummy_val.IsNull() &&
340
+ StartsWithNumericDate(separator, StringValue::Get(dummy_val))) {
341
+ DetectDateAndTimeStampFormats(*candidate, has_format_candidates, format_candidates, sql_type,
342
+ separator, dummy_val);
343
+ }
344
+ // try cast from string to sql_type
345
+ if (TryCastValue(*candidate, dummy_val, sql_type)) {
346
+ break;
347
+ } else {
348
+ if (row_idx != start_idx_detection && cur_top_candidate == LogicalType::BOOLEAN) {
349
+ // If we thought this was a boolean value (i.e., T,F, True, False) and it is not, we
350
+ // immediately pop to varchar.
351
+ while (col_type_candidates.back() != LogicalType::VARCHAR) {
352
+ col_type_candidates.pop_back();
353
+ }
354
+ break;
355
+ }
356
+ col_type_candidates.pop_back();
357
+ }
358
+ }
359
+ }
360
+ }
361
+
362
+ idx_t varchar_cols = 0;
363
+
364
+ for (idx_t col = 0; col < info_sql_types_candidates.size(); col++) {
365
+ auto &col_type_candidates = info_sql_types_candidates[col];
366
+ // check number of varchar columns
367
+ const auto &col_type = col_type_candidates.back();
368
+ if (col_type == LogicalType::VARCHAR) {
369
+ varchar_cols++;
370
+ }
371
+ }
372
+
373
+ // it's good if the dialect creates more non-varchar columns, but only if we sacrifice < 30% of best_num_cols.
374
+ if (varchar_cols < min_varchar_cols && info_sql_types_candidates.size() > (max_columns_found * 0.7)) {
375
+ // we have a new best_options candidate
376
+ if (true_start > 0) {
377
+ // Add empty rows to skip_rows
378
+ candidate->dialect_options.skip_rows += true_start;
379
+ }
380
+ best_candidate = std::move(candidate);
381
+ min_varchar_cols = varchar_cols;
382
+ best_sql_types_candidates_per_column_idx = info_sql_types_candidates;
383
+ best_format_candidates = format_candidates;
384
+ best_header_row = tuples[0].values;
385
+ best_start_with_header = tuples[0].position;
386
+ }
387
+ }
388
+ // Assert that it's all good at this point.
389
+ D_ASSERT(best_candidate && !best_format_candidates.empty() && !best_header_row.empty());
390
+
391
+ for (const auto &best : best_format_candidates) {
392
+ if (!best.second.empty()) {
393
+ SetDateFormat(*best_candidate, best.second.back(), best.first);
394
+ }
395
+ }
396
+ }
397
+
398
+ } // namespace duckdb
@@ -0,0 +1,175 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
3
+ namespace duckdb {
4
//! State-machine consumer used by the sniffer's buffer iterator to materialize a
//! sample of the CSV file into a DataChunk of VARCHAR vectors (one per column),
//! so candidate types can later be verified by vector-casting whole columns.
struct Parse {
	//! Reset the machine to a clean standard state before a parsing pass.
	inline static void Initialize(CSVStateMachine &machine) {
		machine.state = CSVState::STANDARD;
		machine.previous_state = CSVState::STANDARD;
		machine.pre_previous_state = CSVState::STANDARD;

		machine.cur_rows = 0;
		machine.column_count = 0;
		machine.value = "";
	}

	//! Process a single character. Returns true once sample_chunk_size rows have
	//! been materialized, signalling the caller to stop scanning.
	inline static bool Process(CSVStateMachine &machine, DataChunk &parse_chunk, char current_char, idx_t current_pos) {

		// Advance the state machine via the precomputed transition table.
		machine.pre_previous_state = machine.previous_state;
		machine.previous_state = machine.state;
		machine.state = static_cast<CSVState>(
		    machine.transition_array[static_cast<uint8_t>(machine.state)][static_cast<uint8_t>(current_char)]);

		bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN;
		// A value ends when we cross a delimiter, a record separator (that does not
		// open an empty line), or a carriage return not followed by a line feed.
		if (machine.previous_state == CSVState::DELIMITER ||
		    (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
		    (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) {
			// Started a new value
			// Check if it's UTF-8 (Or not?)
			machine.VerifyUTF8();
			auto &v = parse_chunk.data[machine.column_count++];
			auto parse_data = FlatVector::GetData<string_t>(v);
			auto &validity_mask = FlatVector::Validity(v);
			if (machine.value.empty()) {
				// Empty values are stored as NULL.
				validity_mask.SetInvalid(machine.cur_rows);
			} else {
				parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
			}
			machine.value = "";
		}
		if (((machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) ||
		     (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) &&
		    machine.options.null_padding && machine.column_count < parse_chunk.ColumnCount()) {
			// It's a new row, check if we need to pad stuff
			// (null_padding: fill the missing trailing columns of a short row with NULLs).
			while (machine.column_count < parse_chunk.ColumnCount()) {
				auto &v = parse_chunk.data[machine.column_count++];
				auto &validity_mask = FlatVector::Validity(v);
				validity_mask.SetInvalid(machine.cur_rows);
			}
		}
		if (machine.state == CSVState::STANDARD) {
			// Accumulate the character into the value currently being built.
			machine.value += current_char;
		}
		// Branch-free bookkeeping: the boolean expressions below evaluate to 0/1.
		// Bump the row count when a record separator closed a non-empty line, and
		// reset column_count to 0 at the start of each new record.
		machine.cur_rows +=
		    machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE;
		machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR);

		// It means our carriage return is actually a record separator
		machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return;
		machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return);

		if (machine.cur_rows >= machine.options.sample_chunk_size) {
			// We sniffed enough rows
			return true;
		}
		return false;
	}

	//! Flush the last (unterminated) value, if any, and fix the chunk cardinality.
	inline static void Finalize(CSVStateMachine &machine, DataChunk &parse_chunk) {
		if (machine.cur_rows < machine.options.sample_chunk_size && machine.state != CSVState::EMPTY_LINE) {
			machine.VerifyUTF8();
			auto &v = parse_chunk.data[machine.column_count++];
			auto parse_data = FlatVector::GetData<string_t>(v);
			parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value));
		}
		parse_chunk.SetCardinality(machine.cur_rows);
	}
};
77
+
78
+ bool CSVSniffer::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
79
+ // try vector-cast from string to sql_type
80
+ Vector dummy_result(sql_type);
81
+ if (best_candidate->dialect_options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
82
+ // use the date format to cast the chunk
83
+ string error_message;
84
+ idx_t line_error;
85
+ return BaseCSVReader::TryCastDateVector(best_candidate->dialect_options.date_format, parse_chunk_col,
86
+ dummy_result, size, error_message, line_error);
87
+ }
88
+ if (best_candidate->dialect_options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
89
+ // use the timestamp format to cast the chunk
90
+ string error_message;
91
+ return BaseCSVReader::TryCastTimestampVector(best_candidate->dialect_options.date_format, parse_chunk_col,
92
+ dummy_result, size, error_message);
93
+ }
94
+ // target type is not varchar: perform a cast
95
+ string error_message;
96
+ return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
97
+ }
98
+
99
+ void CSVSniffer::RefineTypes() {
100
+ // if data types were provided, exit here if number of columns does not match
101
+ detected_types.assign(best_candidate->dialect_options.num_cols, LogicalType::VARCHAR);
102
+ if (best_candidate->options.all_varchar) {
103
+ // return all types varchar
104
+ return;
105
+ }
106
+ DataChunk parse_chunk;
107
+ parse_chunk.Initialize(BufferAllocator::Get(buffer_manager->context), detected_types, options.sample_chunk_size);
108
+ for (idx_t i = 1; i < best_candidate->options.sample_chunks; i++) {
109
+ bool finished_file = best_candidate->csv_buffer_iterator.Finished();
110
+ if (finished_file) {
111
+ // we finished the file: stop
112
+ // set sql types
113
+ detected_types.clear();
114
+ for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
115
+ LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
116
+ if (best_sql_types_candidates_per_column_idx[column_idx].size() ==
117
+ best_candidate->options.auto_type_candidates.size()) {
118
+ d_type = LogicalType::VARCHAR;
119
+ }
120
+ detected_types.push_back(d_type);
121
+ }
122
+ return;
123
+ }
124
+ best_candidate->csv_buffer_iterator.Process<Parse>(*best_candidate, parse_chunk);
125
+ for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
126
+ vector<LogicalType> &col_type_candidates = best_sql_types_candidates_per_column_idx[col];
127
+ while (col_type_candidates.size() > 1) {
128
+ const auto &sql_type = col_type_candidates.back();
129
+ // narrow down the date formats
130
+ if (best_format_candidates.count(sql_type.id())) {
131
+ auto &best_type_format_candidates = best_format_candidates[sql_type.id()];
132
+ auto save_format_candidates = best_type_format_candidates;
133
+ while (!best_type_format_candidates.empty()) {
134
+ if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
135
+ break;
136
+ }
137
+ // doesn't work - move to the next one
138
+ best_type_format_candidates.pop_back();
139
+ best_candidate->dialect_options.has_format[sql_type.id()] =
140
+ (!best_type_format_candidates.empty());
141
+ if (!best_type_format_candidates.empty()) {
142
+ SetDateFormat(*best_candidate, best_type_format_candidates.back(), sql_type.id());
143
+ }
144
+ }
145
+ // if none match, then this is not a column of type sql_type,
146
+ if (best_type_format_candidates.empty()) {
147
+ // so restore the candidates that did work.
148
+ best_type_format_candidates.swap(save_format_candidates);
149
+ if (!best_type_format_candidates.empty()) {
150
+ SetDateFormat(*best_candidate, best_type_format_candidates.back(), sql_type.id());
151
+ }
152
+ }
153
+ }
154
+ if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
155
+ break;
156
+ } else {
157
+ col_type_candidates.pop_back();
158
+ }
159
+ }
160
+ }
161
+ // reset parse chunk for the next iteration
162
+ parse_chunk.Reset();
163
+ }
164
+ detected_types.clear();
165
+ // set sql types
166
+ for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
167
+ LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
168
+ if (best_sql_types_candidates_per_column_idx[column_idx].size() ==
169
+ best_candidate->options.auto_type_candidates.size()) {
170
+ d_type = LogicalType::VARCHAR;
171
+ }
172
+ detected_types.push_back(d_type);
173
+ }
174
+ }
175
+ } // namespace duckdb
@@ -0,0 +1,39 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
3
+
4
+ namespace duckdb {
5
+ void CSVSniffer::ReplaceTypes() {
6
+ if (best_candidate->options.sql_type_list.empty()) {
7
+ return;
8
+ }
9
+ // user-defined types were supplied for certain columns
10
+ // override the types
11
+ if (!best_candidate->options.sql_types_per_column.empty()) {
12
+ // types supplied as name -> value map
13
+ idx_t found = 0;
14
+ for (idx_t i = 0; i < names.size(); i++) {
15
+ auto it = best_candidate->options.sql_types_per_column.find(names[i]);
16
+ if (it != best_candidate->options.sql_types_per_column.end()) {
17
+ best_sql_types_candidates_per_column_idx[i] = {best_candidate->options.sql_type_list[it->second]};
18
+ found++;
19
+ }
20
+ }
21
+ if (!best_candidate->options.file_options.union_by_name &&
22
+ found < best_candidate->options.sql_types_per_column.size()) {
23
+ string error_msg = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
24
+ if (!error_msg.empty()) {
25
+ throw BinderException(error_msg);
26
+ }
27
+ }
28
+ return;
29
+ }
30
+ // types supplied as list
31
+ if (names.size() < best_candidate->options.sql_type_list.size()) {
32
+ throw BinderException("read_csv: %d types were provided, but CSV file only has %d columns",
33
+ best_candidate->options.sql_type_list.size(), names.size());
34
+ }
35
+ for (idx_t i = 0; i < best_candidate->options.sql_type_list.size(); i++) {
36
+ best_sql_types_candidates_per_column_idx[i] = {best_candidate->options.sql_type_list[i]};
37
+ }
38
+ }
39
+ } // namespace duckdb
@@ -722,7 +722,7 @@ void AsOfLocalSourceState::CombineLeftPartitions() {
722
722
 
723
723
  void AsOfLocalSourceState::MergeLeftPartitions() {
724
724
  PartitionGlobalMergeStates::Callback local_callback;
725
- PartitionLocalMergeState local_merge;
725
+ PartitionLocalMergeState local_merge(*gsource.gsink.lhs_sink);
726
726
  gsource.GetMergeStates().ExecuteTask(local_merge, local_callback);
727
727
  gsource.merged++;
728
728
  while (gsource.merged < gsource.mergers) {
@@ -43,7 +43,6 @@ public:
43
43
  bool initialized = false;
44
44
  bool finished_scan = false;
45
45
  SelectionVector new_groups;
46
- AggregateHTAppendState append_state;
47
46
  };
48
47
 
49
48
  unique_ptr<GlobalSinkState> PhysicalRecursiveCTE::GetGlobalSinkState(ClientContext &context) const {
@@ -54,7 +53,7 @@ idx_t PhysicalRecursiveCTE::ProbeHT(DataChunk &chunk, RecursiveCTEState &state)
54
53
  Vector dummy_addresses(LogicalType::POINTER);
55
54
 
56
55
  // Use the HT to eliminate duplicate rows
57
- idx_t new_group_count = state.ht->FindOrCreateGroups(state.append_state, chunk, dummy_addresses, state.new_groups);
56
+ idx_t new_group_count = state.ht->FindOrCreateGroups(chunk, dummy_addresses, state.new_groups);
58
57
 
59
58
  // we only return entries we have not seen before (i.e. new groups)
60
59
  chunk.Slice(state.new_groups, new_group_count);