duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -0,0 +1,127 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/execution/operator/scan/csv/csv_sniffer.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
12
+ #include "duckdb/common/vector.hpp"
13
+ #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
14
+
15
+ namespace duckdb {
16
+ //! Struct to store the result of the Sniffer
17
+ struct SnifferResult {
18
+ SnifferResult(vector<LogicalType> return_types_p, vector<string> names_p)
19
+ : return_types(std::move(return_types_p)), names(std::move(names_p)) {
20
+ }
21
+ //! Return Types that were detected
22
+ vector<LogicalType> return_types;
23
+ //! Column Names that were detected
24
+ vector<string> names;
25
+ };
26
+
27
+ //! Sniffer that detects Header, Dialect and Types of CSV Files
28
+ class CSVSniffer {
29
+ public:
30
+ explicit CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
31
+ CSVStateMachineCache &state_machine_cache);
32
+
33
+ //! Main method that sniffs the CSV file, returns the types, names and options as a result
34
+ //! CSV Sniffing consists of five steps:
35
+ //! 1. Dialect Detection: Generate the CSV Options (delimiter, quote, escape, etc.)
36
+ //! 2. Type Detection: Figures out the types of the columns (For one chunk)
37
+ //! 3. Header Detection: Figures out if the CSV file has a header and produces the names of the columns
38
+ //! 4. Type Replacement: Replaces the types of the columns if the user specified them
39
+ //! 5. Type Refinement: Refines the types of the columns for the remaining chunks
40
+ SnifferResult SniffCSV();
41
+
42
+ private:
43
+ //! CSV State Machine Cache
44
+ CSVStateMachineCache &state_machine_cache;
45
+ //! Highest number of columns found
46
+ idx_t max_columns_found = 0;
47
+ //! Current Candidates being considered
48
+ vector<unique_ptr<CSVStateMachine>> candidates;
49
+ //! Reference to original CSV Options, it will be modified as a result of the sniffer.
50
+ CSVReaderOptions &options;
51
+ //! Buffer being used on sniffer
52
+ shared_ptr<CSVBufferManager> buffer_manager;
53
+
54
+ //! ------------------------------------------------------//
55
+ //! ----------------- Dialect Detection ----------------- //
56
+ //! ------------------------------------------------------//
57
+ //! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
58
+ void DetectDialect();
59
+ //! Functions called in the main DetectDialect() function
60
+ //! 1. Generates the search space candidates for the dialect
61
+ void GenerateCandidateDetectionSearchSpace(vector<char> &delim_candidates, vector<QuoteRule> &quoterule_candidates,
62
+ unordered_map<uint8_t, vector<char>> &quote_candidates_map,
63
+ unordered_map<uint8_t, vector<char>> &escape_candidates_map);
64
+ //! 2. Generates the search space candidates for the state machines
65
+ void GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachine>> &csv_state_machines,
66
+ const vector<char> &delimiter_candidates,
67
+ const vector<QuoteRule> &quoterule_candidates,
68
+ const unordered_map<uint8_t, vector<char>> &quote_candidates_map,
69
+ const unordered_map<uint8_t, vector<char>> &escape_candidates_map);
70
+ //! 3. Analyzes if dialect candidate is a good candidate to be considered, if so, it adds it to the candidates
71
+ void AnalyzeDialectCandidate(unique_ptr<CSVStateMachine>, idx_t &rows_read, idx_t &best_consistent_rows,
72
+ idx_t &prev_padding_count);
73
+ //! 4. Refine Candidates over remaining chunks
74
+ void RefineCandidates();
75
+ //! Checks if candidate still produces good values for the next chunk
76
+ bool RefineCandidateNextChunk(CSVStateMachine &candidate);
77
+
78
+ //! ------------------------------------------------------//
79
+ //! ------------------- Type Detection ------------------ //
80
+ //! ------------------------------------------------------//
81
+ //! Second phase of auto detection: detect types, format template candidates
82
+ //! ordered by descending specificity (~ from high to low)
83
+ void DetectTypes();
84
+ //! TryCastValue: Try to cast a string value to the specified sql type
85
+ //! SetDateFormat: Change the date format of the given type to the format specifier string
86
+ bool TryCastValue(CSVStateMachine &candidate, const Value &value, const LogicalType &sql_type);
87
+ void SetDateFormat(CSVStateMachine &candidate, const string &format_specifier, const LogicalTypeId &sql_type);
88
+ //! Function that performs detection for date and timestamp formats
89
+ void DetectDateAndTimeStampFormats(CSVStateMachine &candidate, map<LogicalTypeId, bool> &has_format_candidates,
90
+ map<LogicalTypeId, vector<string>> &format_candidates,
91
+ const LogicalType &sql_type, const string &separator, Value &dummy_val);
92
+
93
+ //! Variables for Type Detection
94
+ //! Format Candidates for Date and Timestamp Types
95
+ const map<LogicalTypeId, vector<const char *>> format_template_candidates = {
96
+ {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
97
+ {LogicalTypeId::TIMESTAMP,
98
+ {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
99
+ "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
100
+ };
101
+ unordered_map<idx_t, vector<LogicalType>> best_sql_types_candidates_per_column_idx;
102
+ map<LogicalTypeId, vector<string>> best_format_candidates;
103
+ unique_ptr<CSVStateMachine> best_candidate;
104
+ idx_t best_start_with_header = 0;
105
+ idx_t best_start_without_header = 0;
106
+ vector<Value> best_header_row;
107
+
108
+ //! ------------------------------------------------------//
109
+ //! ------------------ Header Detection ----------------- //
110
+ //! ------------------------------------------------------//
111
+ void DetectHeader();
112
+ vector<string> names;
113
+
114
+ //! ------------------------------------------------------//
115
+ //! ------------------ Type Replacement ----------------- //
116
+ //! ------------------------------------------------------//
117
+ void ReplaceTypes();
118
+
119
+ //! ------------------------------------------------------//
120
+ //! ------------------ Type Refinement ------------------ //
121
+ //! ------------------------------------------------------//
122
+ void RefineTypes();
123
+ bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
124
+ vector<LogicalType> detected_types;
125
+ };
126
+
127
+ } // namespace duckdb
@@ -0,0 +1,75 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/execution/operator/scan/csv/csv_state_machine.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
12
+ #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
13
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
14
+
15
+ namespace duckdb {
16
+
17
+ //! All States of CSV Parsing
18
+ enum class CSVState : uint8_t {
19
+ STANDARD = 0, //! Regular unquoted field state
20
+ DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
21
+ RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
22
+ CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r)
23
+ QUOTED = 4, //! State when inside a quoted field
24
+ UNQUOTED = 5, //! State when leaving a quoted field
25
+ ESCAPE = 6, //! State when encountering an escape character (e.g., \)
26
+ EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r)
27
+ INVALID = 8 //! Got to an Invalid State, this should error.
28
+ };
29
+
30
+ //! The CSV State Machine comprises a state transition array (STA).
31
+ //! The STA indicates the current state of parsing based on both the current and preceding characters.
32
+ //! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth.
33
+ //! The STA's creation depends on the provided quote, escape, and delimiter options for that state machine.
34
+ //! The motivation behind implementing an STA is to remove branching in regular CSV Parsing by predicting and detecting
35
+ //! the states. Note: The State Machine is currently utilized solely in the CSV Sniffer.
36
+ class CSVStateMachine {
37
+ public:
38
+ explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
39
+ shared_ptr<CSVBufferManager> buffer_manager_p,
40
+ CSVStateMachineCache &csv_state_machine_cache_p);
41
+ //! Resets the state machine, so it can be used again
42
+ void Reset();
43
+
44
+ //! Aux Function for string UTF8 Verification
45
+ void VerifyUTF8();
46
+
47
+ CSVStateMachineCache &csv_state_machine_cache;
48
+
49
+ const CSVReaderOptions &options;
50
+ CSVBufferIterator csv_buffer_iterator;
51
+ //! Stores identified start row for this file (e.g., a file can start with garbage like notes, before the header)
52
+ idx_t start_row = 0;
53
+ //! The Transition Array is a Finite State Machine
54
+ //! It holds the transitions of all states, on all 256 possible different characters
55
+ const state_machine_t &transition_array;
56
+
57
+ //! Both these variables are used for new line identifier detection
58
+ bool single_record_separator = false;
59
+ bool carry_on_separator = false;
60
+
61
+ //! Variables Used for Sniffing
62
+ CSVState state;
63
+ CSVState previous_state;
64
+ CSVState pre_previous_state;
65
+ idx_t cur_rows;
66
+ idx_t column_count;
67
+ string value;
68
+ idx_t rows_read;
69
+ idx_t line_start_pos = 0;
70
+
71
+ //! Dialect options resulting from sniffing
72
+ DialectOptions dialect_options;
73
+ };
74
+
75
+ } // namespace duckdb
@@ -0,0 +1,51 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
12
+ #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
13
+ #include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
14
+
15
+ namespace duckdb {
16
+ static constexpr uint32_t NUM_STATES = 8;
17
+ static constexpr uint32_t NUM_TRANSITIONS = 256;
18
+ typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS];
19
+
20
+ //! Hash function used in our state machine cache, it hashes and combines all options used to generate a state machine
21
+ struct HashCSVStateMachineConfig {
22
+ size_t operator()(CSVStateMachineOptions const &config) const noexcept {
23
+ auto h_delimiter = Hash(config.delimiter);
24
+ auto h_quote = Hash(config.quote);
25
+ auto h_escape = Hash(config.escape);
26
+ return CombineHash(h_delimiter, CombineHash(h_quote, h_escape));
27
+ }
28
+ };
29
+
30
+ //! The CSVStateMachineCache caches state machines. Although they are small (~2kb), the actual creation of multiple
32
+ //! State Machines can become a bottleneck on sniffing, when reading very small csv files.
33
+ //! Hence the cache stores State Machines based on their different delimiter|quote|escape options.
33
+ class CSVStateMachineCache {
34
+ public:
35
+ CSVStateMachineCache();
36
+ ~CSVStateMachineCache() {};
37
+ //! Gets a state machine from the cache. If it is not one of the default options,
38
+ //! it is first cached, then returned.
39
+ const state_machine_t &Get(const CSVStateMachineOptions &state_machine_options);
40
+
41
+ private:
42
+ void Insert(const CSVStateMachineOptions &state_machine_options);
43
+ //! Cache on delimiter|quote|escape
44
+ unordered_map<CSVStateMachineOptions, state_machine_t, HashCSVStateMachineConfig> state_machine_cache;
45
+ //! Default value for options used to initialize CSV State Machine Cache
46
+ const vector<char> default_delimiter = {',', '|', ';', '\t'};
47
+ const vector<vector<char>> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}};
48
+ const vector<QuoteRule> default_quote_rule = {QuoteRule::QUOTES_RFC, QuoteRule::QUOTES_OTHER, QuoteRule::NO_QUOTES};
49
+ const vector<vector<char>> default_escape = {{'\0', '\"', '\''}, {'\\'}, {'\0'}};
50
+ };
51
+ } // namespace duckdb
@@ -1,18 +1,18 @@
1
1
  //===----------------------------------------------------------------------===//
2
2
  // DuckDB
3
3
  //
4
- // duckdb/execution/operator/persistent/parallel_csv_reader.hpp
4
+ // duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp
5
5
  //
6
6
  //
7
7
  //===----------------------------------------------------------------------===//
8
8
 
9
9
  #pragma once
10
10
 
11
- #include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
12
- #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
13
- #include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
14
- #include "duckdb/execution/operator/persistent/csv_buffer.hpp"
15
- #include "duckdb/execution/operator/persistent/csv_line_info.hpp"
11
+ #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
12
+ #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
13
+ #include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
14
+ #include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
15
+ #include "duckdb/execution/operator/scan/csv/csv_line_info.hpp"
16
16
 
17
17
  #include <sstream>
18
18
  #include <utility>
@@ -20,21 +20,17 @@
20
20
  namespace duckdb {
21
21
 
22
22
  struct CSVBufferRead {
23
- CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
23
+ CSVBufferRead(unique_ptr<CSVBufferHandle> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
24
24
  idx_t local_batch_index_p, optional_ptr<LineInfo> line_info_p)
25
25
  : buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
26
26
  batch_index(batch_index), local_batch_index(local_batch_index_p) {
27
- if (buffer) {
28
- if (buffer_end > buffer->GetBufferSize()) {
29
- buffer_end = buffer->GetBufferSize();
30
- }
31
- } else {
32
- buffer_start = 0;
33
- buffer_end = 0;
27
+ D_ASSERT(buffer);
28
+ if (buffer_end > buffer->actual_size) {
29
+ buffer_end = buffer->actual_size;
34
30
  }
35
31
  }
36
32
 
37
- CSVBufferRead(shared_ptr<CSVBuffer> buffer_p, shared_ptr<CSVBuffer> nxt_buffer_p, idx_t buffer_start_p,
33
+ CSVBufferRead(unique_ptr<CSVBufferHandle> buffer_p, unique_ptr<CSVBufferHandle> nxt_buffer_p, idx_t buffer_start_p,
38
34
  idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr<LineInfo> line_info_p)
39
35
  : CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index,
40
36
  line_info_p) {
@@ -44,33 +40,33 @@ struct CSVBufferRead {
44
40
  CSVBufferRead() : buffer_start(0), buffer_end(NumericLimits<idx_t>::Maximum()) {};
45
41
 
46
42
  const char &operator[](size_t i) const {
47
- if (i < buffer->GetBufferSize()) {
43
+ if (i < buffer->actual_size) {
48
44
  auto buffer_ptr = buffer->Ptr();
49
45
  return buffer_ptr[i];
50
46
  }
51
47
  auto next_ptr = next_buffer->Ptr();
52
- return next_ptr[i - buffer->GetBufferSize()];
48
+ return next_ptr[i - buffer->actual_size];
53
49
  }
54
50
 
55
51
  string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
56
52
  idx_t length = position_buffer - start_buffer - offset;
57
53
  // 1) It's all in the current buffer
58
- if (start_buffer + length <= buffer->GetBufferSize()) {
54
+ if (start_buffer + length <= buffer->actual_size) {
59
55
  auto buffer_ptr = buffer->Ptr();
60
56
  return string_t(buffer_ptr + start_buffer, length);
61
- } else if (start_buffer >= buffer->GetBufferSize()) {
57
+ } else if (start_buffer >= buffer->actual_size) {
62
58
  // 2) It's all in the next buffer
63
59
  D_ASSERT(next_buffer);
64
- D_ASSERT(next_buffer->GetBufferSize() >= length + (start_buffer - buffer->GetBufferSize()));
60
+ D_ASSERT(next_buffer->actual_size >= length + (start_buffer - buffer->actual_size));
65
61
  auto buffer_ptr = next_buffer->Ptr();
66
- return string_t(buffer_ptr + (start_buffer - buffer->GetBufferSize()), length);
62
+ return string_t(buffer_ptr + (start_buffer - buffer->actual_size), length);
67
63
  } else {
68
64
  // 3) It starts in the current buffer and ends in the next buffer
69
65
  D_ASSERT(next_buffer);
70
66
  auto intersection = make_unsafe_uniq_array<char>(length);
71
67
  idx_t cur_pos = 0;
72
68
  auto buffer_ptr = buffer->Ptr();
73
- for (idx_t i = start_buffer; i < buffer->GetBufferSize(); i++) {
69
+ for (idx_t i = start_buffer; i < buffer->actual_size; i++) {
74
70
  intersection[cur_pos++] = buffer_ptr[i];
75
71
  }
76
72
  idx_t nxt_buffer_pos = 0;
@@ -83,8 +79,8 @@ struct CSVBufferRead {
83
79
  }
84
80
  }
85
81
 
86
- shared_ptr<CSVBuffer> buffer;
87
- shared_ptr<CSVBuffer> next_buffer;
82
+ unique_ptr<CSVBufferHandle> buffer;
83
+ unique_ptr<CSVBufferHandle> next_buffer;
88
84
  vector<unsafe_unique_array<char>> intersections;
89
85
  optional_ptr<LineInfo> line_info;
90
86
 
@@ -103,7 +99,7 @@ struct VerificationPositions {
103
99
  //! CSV Reader for Parallel Reading
104
100
  class ParallelCSVReader : public BaseCSVReader {
105
101
  public:
106
- ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
102
+ ParallelCSVReader(ClientContext &context, CSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
107
103
  idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types, idx_t file_idx_p);
108
104
  virtual ~ParallelCSVReader() {
109
105
  }
@@ -162,8 +158,6 @@ private:
162
158
 
163
159
  //! Parses a CSV file with a one-byte delimiter, escape and quote character
164
160
  bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
165
- //! Verifies that the line length did not go over a pre-defined limit.
166
- void VerifyLineLength(idx_t line_size);
167
161
 
168
162
  //! First Position of First Buffer
169
163
  idx_t first_pos_first_buffer = 0;
@@ -0,0 +1,21 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/execution/operator/scan/csv/quote_rules.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ #include "duckdb/common/vector.hpp"
12
+
13
+ namespace duckdb {
14
+ //! Different Rules regarding possible combinations of Quote and Escape Values for CSV Dialects.
15
+ //! Each rule has a comment on the possible combinations.
16
+ enum class QuoteRule : uint8_t {
17
+ QUOTES_RFC = 0, //! quote = " escape = (\0 || " || ')
18
+ QUOTES_OTHER = 1, //! quote = ( " || ' ) escape = '\\'
19
+ NO_QUOTES = 2 //! quote = \0 escape = \0
20
+ };
21
+ } // namespace duckdb
@@ -8,31 +8,27 @@
8
8
 
9
9
  #pragma once
10
10
 
11
+ #include "duckdb/common/types/row/tuple_data_layout.hpp"
11
12
  #include "duckdb/execution/operator/aggregate/grouped_aggregate_data.hpp"
12
- #include "duckdb/execution/partitionable_hashtable.hpp"
13
- #include "duckdb/execution/physical_operator.hpp"
14
13
  #include "duckdb/parser/group_by_node.hpp"
15
14
 
16
15
  namespace duckdb {
17
- class BufferManager;
18
- class Executor;
19
- class PhysicalHashAggregate;
20
- class Pipeline;
21
- class Task;
16
+
17
+ class GroupedAggregateHashTable;
18
+ struct AggregatePartition;
22
19
 
23
20
  class RadixPartitionedHashTable {
24
21
  public:
25
22
  RadixPartitionedHashTable(GroupingSet &grouping_set, const GroupedAggregateData &op);
23
+ unique_ptr<GroupedAggregateHashTable> CreateHT(ClientContext &context, const idx_t capacity,
24
+ const idx_t radix_bits) const;
26
25
 
26
+ public:
27
27
  GroupingSet &grouping_set;
28
28
  //! The indices specified in the groups_count that do not appear in the grouping_set
29
29
  unsafe_vector<idx_t> null_groups;
30
30
  const GroupedAggregateData &op;
31
-
32
31
  vector<LogicalType> group_types;
33
- //! how many groups can we have in the operator before we switch to radix partitioning
34
- idx_t radix_limit;
35
-
36
32
  //! The GROUPING values that belong to this hash table
37
33
  vector<Value> grouping_values;
38
34
 
@@ -43,32 +39,27 @@ public:
43
39
 
44
40
  void Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input, DataChunk &aggregate_input_chunk,
45
41
  const unsafe_vector<idx_t> &filter) const;
46
- void Combine(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate) const;
47
- bool Finalize(ClientContext &context, GlobalSinkState &gstate_p) const;
48
-
49
- void ScheduleTasks(Executor &executor, const shared_ptr<Event> &event, GlobalSinkState &state,
50
- vector<shared_ptr<Task>> &tasks) const;
42
+ void Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const;
43
+ void Finalize(ClientContext &context, GlobalSinkState &gstate) const;
51
44
 
45
+ public:
52
46
  //! Source interface
53
- idx_t Size(GlobalSinkState &sink_state) const;
54
47
  unique_ptr<GlobalSourceState> GetGlobalSourceState(ClientContext &context) const;
55
48
  unique_ptr<LocalSourceState> GetLocalSourceState(ExecutionContext &context) const;
56
- SourceResultType GetData(ExecutionContext &context, DataChunk &chunk, GlobalSinkState &sink_state,
49
+
50
+ SourceResultType GetData(ExecutionContext &context, DataChunk &chunk, GlobalSinkState &sink,
57
51
  OperatorSourceInput &input) const;
58
52
 
59
- static void SetMultiScan(GlobalSinkState &state);
60
- static bool ForceSingleHT(GlobalSinkState &state);
61
- static bool AnyPartitioned(GlobalSinkState &state);
62
- static void GetRepartitionInfo(ClientContext &context, GlobalSinkState &state, idx_t &repartition_radix_bits,
63
- idx_t &concurrent_repartitions, idx_t &tasks_per_partition);
53
+ const TupleDataLayout &GetLayout() const;
54
+ idx_t Count(GlobalSinkState &sink) const;
55
+ static void SetMultiScan(GlobalSinkState &sink);
64
56
 
65
57
  private:
66
58
  void SetGroupingValues();
67
59
  void PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const;
68
- void InitializeFinalizedHTs(ClientContext &context, GlobalSinkState &state) const;
69
- void ScheduleRepartitionTasks(Executor &executor, const shared_ptr<Event> &event, GlobalSinkState &state,
70
- vector<shared_ptr<Task>> &tasks, const idx_t repartition_radix_bits,
71
- const idx_t concurrent_repartitions, const idx_t tasks_per_partition) const;
60
+ idx_t CountInternal(GlobalSinkState &sink) const;
61
+
62
+ TupleDataLayout layout;
72
63
  };
73
64
 
74
65
  } // namespace duckdb
@@ -110,9 +110,8 @@ public:
110
110
  bool has_serialize = function.format_serialize;
111
111
  serializer.WriteProperty(503, "has_serialize", has_serialize);
112
112
  if (has_serialize) {
113
- serializer.BeginObject(504, "function_data");
114
- function.format_serialize(serializer, bind_info, function);
115
- serializer.EndObject();
113
+ serializer.WriteObject(504, "function_data",
114
+ [&](FormatSerializer &obj) { function.format_serialize(obj, bind_info, function); });
116
115
  D_ASSERT(function.format_deserialize);
117
116
  }
118
117
  }
@@ -150,9 +149,9 @@ public:
150
149
  throw SerializationException("Function requires deserialization but no deserialization function for %s",
151
150
  function.name);
152
151
  }
153
- deserializer.BeginObject(504, "function_data");
154
- auto result = function.format_deserialize(deserializer, function);
155
- deserializer.EndObject();
152
+ unique_ptr<FunctionData> result;
153
+ deserializer.ReadObject(504, "function_data",
154
+ [&](FormatDeserializer &obj) { result = function.format_deserialize(obj, function); });
156
155
  return result;
157
156
  }
158
157
 
@@ -142,10 +142,10 @@ public:
142
142
  public:
143
143
  DUCKDB_API static ParseResult Parse(const string &format, const string &text);
144
144
 
145
- DUCKDB_API bool Parse(string_t str, ParseResult &result);
145
+ DUCKDB_API bool Parse(string_t str, ParseResult &result) const;
146
146
 
147
- DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message);
148
- DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message);
147
+ DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message) const;
148
+ DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message) const;
149
149
 
150
150
  date_t ParseDate(string_t str);
151
151
  timestamp_t ParseTimestamp(string_t str);
@@ -158,7 +158,7 @@ protected:
158
158
  DUCKDB_API void AddFormatSpecifier(string preceding_literal, StrTimeSpecifier specifier) override;
159
159
  int NumericSpecifierWidth(StrTimeSpecifier specifier);
160
160
  int32_t TryParseCollection(const char *data, idx_t &pos, idx_t size, const string_t collection[],
161
- idx_t collection_count);
161
+ idx_t collection_count) const;
162
162
 
163
163
  private:
164
164
  explicit StrpTimeFormat(const string &format_string);
@@ -8,14 +8,16 @@
8
8
 
9
9
  #pragma once
10
10
 
11
- #include "duckdb/function/table_function.hpp"
12
- #include "duckdb/function/scalar/strftime_format.hpp"
13
- #include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
14
- #include "duckdb/execution/operator/persistent/buffered_csv_reader.hpp"
15
- #include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
16
- #include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
17
- #include "duckdb/execution/operator/persistent/csv_buffer.hpp"
11
+ #include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
12
+ #include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
13
+ #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
14
+ #include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
15
+ #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
16
+ #include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
18
17
  #include "duckdb/function/built_in_functions.hpp"
18
+ #include "duckdb/function/scalar/strftime_format.hpp"
19
+ #include "duckdb/function/table_function.hpp"
20
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
19
21
 
20
22
  namespace duckdb {
21
23
 
@@ -31,7 +33,7 @@ struct BaseCSVData : public TableFunctionData {
31
33
  //! The file path of the CSV file to read or write
32
34
  vector<string> files;
33
35
  //! The CSV reader options
34
- BufferedCSVReaderOptions options;
36
+ CSVReaderOptions options;
35
37
  //! Offsets for generated columns
36
38
  idx_t filename_col_idx;
37
39
  idx_t hive_partition_col_idx;
@@ -50,8 +52,6 @@ struct WriteCSVData : public BaseCSVData {
50
52
  vector<LogicalType> sql_types;
51
53
  //! The newline string to write
52
54
  string newline = "\n";
53
- //! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
54
- bool is_simple;
55
55
  //! The size of the CSV file (in bytes) that we buffer before we flush it to disk
56
56
  idx_t flush_size = 4096 * 8;
57
57
  //! For each byte whether or not the CSV file requires quotes when containing the byte
@@ -93,8 +93,9 @@ struct ReadCSVData : public BaseCSVData {
93
93
  vector<LogicalType> return_types;
94
94
  //! The expected SQL names to be returned from the read - including added constants (e.g. filename, hive partitions)
95
95
  vector<string> return_names;
96
- //! The initial reader (if any): this is used when automatic detection is used during binding.
97
- //! In this case, the CSV reader is already created and might as well be re-used.
96
+ //! The buffer manager (if any): this is used when automatic detection is used during binding.
97
+ //! In this case, some CSV buffers have already been read and can be reused.
98
+ shared_ptr<CSVBufferManager> buffer_manager;
98
99
  unique_ptr<BufferedCSVReader> initial_reader;
99
100
  //! The union readers are created (when csv union_by_name option is on) during binding
100
101
  //! Those readers can be re-used during ReadCSVFunction
@@ -104,6 +105,10 @@ struct ReadCSVData : public BaseCSVData {
104
105
  //! Reader bind data
105
106
  MultiFileReaderBindData reader_bind;
106
107
  vector<ColumnInfo> column_info;
108
+ //! The CSVStateMachineCache caches state machines created for sniffing and parsing csv files
109
+ //! We cache them because when reading very small csv files, the cost of creating all the possible
110
+ //! State machines for sniffing becomes a major bottleneck.
111
+ CSVStateMachineCache state_machine_cache;
107
112
 
108
113
  void Initialize(unique_ptr<BufferedCSVReader> &reader) {
109
114
  this->initial_reader = std::move(reader);
@@ -21,6 +21,7 @@ public:
21
21
  explicit ClientContextFileOpener(ClientContext &context_p) : context(context_p) {
22
22
  }
23
23
 
24
+ bool TryGetCurrentSetting(const string &key, Value &result, FileOpenerInfo &info) override;
24
25
  bool TryGetCurrentSetting(const string &key, Value &result) override;
25
26
 
26
27
  ClientContext *TryGetClientContext() override {
@@ -13,6 +13,7 @@
13
13
  #include "duckdb/common/types/value.hpp"
14
14
  #include "duckdb/common/case_insensitive_map.hpp"
15
15
  #include "duckdb/common/atomic.hpp"
16
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
16
17
 
17
18
  namespace duckdb {
18
19
  class AttachedDatabase;
@@ -29,7 +30,7 @@ class SchemaCatalogEntry;
29
30
  struct RandomEngine;
30
31
 
31
32
  struct ClientData {
32
- ClientData(ClientContext &context);
33
+ explicit ClientData(ClientContext &context);
33
34
  ~ClientData();
34
35
 
35
36
  //! Query profiler
@@ -248,6 +248,7 @@ public:
248
248
  bool operator!=(const DBConfig &other);
249
249
 
250
250
  DUCKDB_API CastFunctionSet &GetCastFunctions();
251
+ static idx_t GetSystemMaxThreads(FileSystem &fs);
251
252
  void SetDefaultMaxThreads();
252
253
  void SetDefaultMaxMemory();
253
254
 
@@ -30,7 +30,7 @@ class DatabaseInstance;
30
30
  class DuckDB;
31
31
  class LogicalOperator;
32
32
  class SelectStatement;
33
- struct BufferedCSVReaderOptions;
33
+ struct CSVReaderOptions;
34
34
 
35
35
  typedef void (*warning_callback)(std::string);
36
36
 
@@ -131,7 +131,7 @@ public:
131
131
 
132
132
  //! Reads CSV file
133
133
  DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file);
134
- DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, BufferedCSVReaderOptions &options);
134
+ DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, CSVReaderOptions &options);
135
135
  DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, const vector<string> &columns);
136
136
 
137
137
  //! Reads Parquet file