duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -0,0 +1,127 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// duckdb/execution/operator/scan/csv/csv_sniffer.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
|
12
|
+
#include "duckdb/common/vector.hpp"
|
13
|
+
#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
|
14
|
+
|
15
|
+
namespace duckdb {
|
16
|
+
//! Struct to store the result of the Sniffer
|
17
|
+
struct SnifferResult {
|
18
|
+
SnifferResult(vector<LogicalType> return_types_p, vector<string> names_p)
|
19
|
+
: return_types(std::move(return_types_p)), names(std::move(names_p)) {
|
20
|
+
}
|
21
|
+
//! Return Types that were detected
|
22
|
+
vector<LogicalType> return_types;
|
23
|
+
//! Column Names that were detected
|
24
|
+
vector<string> names;
|
25
|
+
};
|
26
|
+
|
27
|
+
//! Sniffer that detects Header, Dialect and Types of CSV Files
|
28
|
+
class CSVSniffer {
|
29
|
+
public:
|
30
|
+
explicit CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
|
31
|
+
CSVStateMachineCache &state_machine_cache);
|
32
|
+
|
33
|
+
//! Main method that sniffs the CSV file, returns the types, names and options as a result
|
34
|
+
//! CSV Sniffing consists of five steps:
|
35
|
+
//! 1. Dialect Detection: Generate the CSV Options (delimiter, quote, escape, etc.)
|
36
|
+
//! 2. Type Detection: Figures out the types of the columns (For one chunk)
|
37
|
+
//! 3. Header Detection: Figures out if the CSV file has a header and produces the names of the columns
|
38
|
+
//! 4. Type Replacement: Replaces the types of the columns if the user specified them
|
39
|
+
//! 5. Type Refinement: Refines the types of the columns for the remaining chunks
|
40
|
+
SnifferResult SniffCSV();
|
41
|
+
|
42
|
+
private:
|
43
|
+
//! CSV State Machine Cache
|
44
|
+
CSVStateMachineCache &state_machine_cache;
|
45
|
+
//! Highest number of columns found
|
46
|
+
idx_t max_columns_found = 0;
|
47
|
+
//! Current Candidates being considered
|
48
|
+
vector<unique_ptr<CSVStateMachine>> candidates;
|
49
|
+
//! Reference to original CSV Options, it will be modified as a result of the sniffer.
|
50
|
+
CSVReaderOptions &options;
|
51
|
+
//! Buffer being used on sniffer
|
52
|
+
shared_ptr<CSVBufferManager> buffer_manager;
|
53
|
+
|
54
|
+
//! ------------------------------------------------------//
|
55
|
+
//! ----------------- Dialect Detection ----------------- //
|
56
|
+
//! ------------------------------------------------------//
|
57
|
+
//! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
|
58
|
+
void DetectDialect();
|
59
|
+
//! Functions called in the main DetectDialect(); function
|
60
|
+
//! 1. Generates the search space candidates for the dialect
|
61
|
+
void GenerateCandidateDetectionSearchSpace(vector<char> &delim_candidates, vector<QuoteRule> &quoterule_candidates,
|
62
|
+
unordered_map<uint8_t, vector<char>> &quote_candidates_map,
|
63
|
+
unordered_map<uint8_t, vector<char>> &escape_candidates_map);
|
64
|
+
//! 2. Generates the search space candidates for the state machines
|
65
|
+
void GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachine>> &csv_state_machines,
|
66
|
+
const vector<char> &delimiter_candidates,
|
67
|
+
const vector<QuoteRule> &quoterule_candidates,
|
68
|
+
const unordered_map<uint8_t, vector<char>> &quote_candidates_map,
|
69
|
+
const unordered_map<uint8_t, vector<char>> &escape_candidates_map);
|
70
|
+
//! 3. Analyzes if dialect candidate is a good candidate to be considered, if so, it adds it to the candidates
|
71
|
+
void AnalyzeDialectCandidate(unique_ptr<CSVStateMachine>, idx_t &rows_read, idx_t &best_consistent_rows,
|
72
|
+
idx_t &prev_padding_count);
|
73
|
+
//! 4. Refine Candidates over remaining chunks
|
74
|
+
void RefineCandidates();
|
75
|
+
//! Checks if candidate still produces good values for the next chunk
|
76
|
+
bool RefineCandidateNextChunk(CSVStateMachine &candidate);
|
77
|
+
|
78
|
+
//! ------------------------------------------------------//
|
79
|
+
//! ------------------- Type Detection ------------------ //
|
80
|
+
//! ------------------------------------------------------//
|
81
|
+
//! Second phase of auto detection: detect types, format template candidates
|
82
|
+
//! ordered by descending specificity (~ from high to low)
|
83
|
+
void DetectTypes();
|
84
|
+
//! Change the date format for the type to the string
|
85
|
+
//! Try to cast a string value to the specified sql type
|
86
|
+
bool TryCastValue(CSVStateMachine &candidate, const Value &value, const LogicalType &sql_type);
|
87
|
+
void SetDateFormat(CSVStateMachine &candidate, const string &format_specifier, const LogicalTypeId &sql_type);
|
88
|
+
//! Function that performs detection for date and timestamp formats
|
89
|
+
void DetectDateAndTimeStampFormats(CSVStateMachine &candidate, map<LogicalTypeId, bool> &has_format_candidates,
|
90
|
+
map<LogicalTypeId, vector<string>> &format_candidates,
|
91
|
+
const LogicalType &sql_type, const string &separator, Value &dummy_val);
|
92
|
+
|
93
|
+
//! Variables for Type Detection
|
94
|
+
//! Format Candidates for Date and Timestamp Types
|
95
|
+
const map<LogicalTypeId, vector<const char *>> format_template_candidates = {
|
96
|
+
{LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
|
97
|
+
{LogicalTypeId::TIMESTAMP,
|
98
|
+
{"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
|
99
|
+
"%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
|
100
|
+
};
|
101
|
+
unordered_map<idx_t, vector<LogicalType>> best_sql_types_candidates_per_column_idx;
|
102
|
+
map<LogicalTypeId, vector<string>> best_format_candidates;
|
103
|
+
unique_ptr<CSVStateMachine> best_candidate;
|
104
|
+
idx_t best_start_with_header = 0;
|
105
|
+
idx_t best_start_without_header = 0;
|
106
|
+
vector<Value> best_header_row;
|
107
|
+
|
108
|
+
//! ------------------------------------------------------//
|
109
|
+
//! ------------------ Header Detection ----------------- //
|
110
|
+
//! ------------------------------------------------------//
|
111
|
+
void DetectHeader();
|
112
|
+
vector<string> names;
|
113
|
+
|
114
|
+
//! ------------------------------------------------------//
|
115
|
+
//! ------------------ Type Replacement ----------------- //
|
116
|
+
//! ------------------------------------------------------//
|
117
|
+
void ReplaceTypes();
|
118
|
+
|
119
|
+
//! ------------------------------------------------------//
|
120
|
+
//! ------------------ Type Refinement ------------------ //
|
121
|
+
//! ------------------------------------------------------//
|
122
|
+
void RefineTypes();
|
123
|
+
bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
|
124
|
+
vector<LogicalType> detected_types;
|
125
|
+
};
|
126
|
+
|
127
|
+
} // namespace duckdb
|
@@ -0,0 +1,75 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// duckdb/execution/operator/scan/csv/csv_state_machine.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
12
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
13
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
|
14
|
+
|
15
|
+
namespace duckdb {
|
16
|
+
|
17
|
+
//! All States of CSV Parsing
|
18
|
+
enum class CSVState : uint8_t {
|
19
|
+
STANDARD = 0, //! Regular unquoted field state
|
20
|
+
DELIMITER = 1, //! State after encountering a field separator (e.g., ;)
|
21
|
+
RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n)
|
22
|
+
CARRIAGE_RETURN = 3, //! State after encountering a carriage return (i.e., \r)
|
23
|
+
QUOTED = 4, //! State when inside a quoted field
|
24
|
+
UNQUOTED = 5, //! State when leaving a quoted field
|
25
|
+
ESCAPE = 6, //! State when encountering an escape character (e.g., \)
|
26
|
+
EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r, \n\n, \n\r)
|
27
|
+
INVALID = 8 //! Got to an Invalid State, this should error.
|
28
|
+
};
|
29
|
+
|
30
|
+
//! The CSV State Machine comprises a state transition array (STA).
|
31
|
+
//! The STA indicates the current state of parsing based on both the current and preceding characters.
|
32
|
+
//! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth.
|
33
|
+
//! The STA's creation depends on the provided quote, escape, and delimiter options for that state machine.
|
34
|
+
//! The motivation behind implementing an STA is to remove branching in regular CSV Parsing by predicting and detecting
|
35
|
+
//! the states. Note: The State Machine is currently utilized solely in the CSV Sniffer.
|
36
|
+
class CSVStateMachine {
|
37
|
+
public:
|
38
|
+
explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options,
|
39
|
+
shared_ptr<CSVBufferManager> buffer_manager_p,
|
40
|
+
CSVStateMachineCache &csv_state_machine_cache_p);
|
41
|
+
//! Resets the state machine, so it can be used again
|
42
|
+
void Reset();
|
43
|
+
|
44
|
+
//! Aux Function for string UTF8 Verification
|
45
|
+
void VerifyUTF8();
|
46
|
+
|
47
|
+
CSVStateMachineCache &csv_state_machine_cache;
|
48
|
+
|
49
|
+
const CSVReaderOptions &options;
|
50
|
+
CSVBufferIterator csv_buffer_iterator;
|
51
|
+
//! Stores identified start row for this file (e.g., a file can start with garbage like notes, before the header)
|
52
|
+
idx_t start_row = 0;
|
53
|
+
//! The Transition Array is a Finite State Machine
|
54
|
+
//! It holds the transitions of all states, on all 256 possible different characters
|
55
|
+
const state_machine_t &transition_array;
|
56
|
+
|
57
|
+
//! Both these variables are used for new line identifier detection
|
58
|
+
bool single_record_separator = false;
|
59
|
+
bool carry_on_separator = false;
|
60
|
+
|
61
|
+
//! Variables Used for Sniffing
|
62
|
+
CSVState state;
|
63
|
+
CSVState previous_state;
|
64
|
+
CSVState pre_previous_state;
|
65
|
+
idx_t cur_rows;
|
66
|
+
idx_t column_count;
|
67
|
+
string value;
|
68
|
+
idx_t rows_read;
|
69
|
+
idx_t line_start_pos = 0;
|
70
|
+
|
71
|
+
//! Dialect options resulting from sniffing
|
72
|
+
DialectOptions dialect_options;
|
73
|
+
};
|
74
|
+
|
75
|
+
} // namespace duckdb
|
@@ -0,0 +1,51 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
12
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
13
|
+
#include "duckdb/execution/operator/scan/csv/quote_rules.hpp"
|
14
|
+
|
15
|
+
namespace duckdb {
|
16
|
+
static constexpr uint32_t NUM_STATES = 8;
|
17
|
+
static constexpr uint32_t NUM_TRANSITIONS = 256;
|
18
|
+
typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS];
|
19
|
+
|
20
|
+
//! Hash function used in our state machine cache; it hashes and combines all options used to generate a state machine
|
21
|
+
struct HashCSVStateMachineConfig {
|
22
|
+
size_t operator()(CSVStateMachineOptions const &config) const noexcept {
|
23
|
+
auto h_delimiter = Hash(config.delimiter);
|
24
|
+
auto h_quote = Hash(config.quote);
|
25
|
+
auto h_escape = Hash(config.escape);
|
26
|
+
return CombineHash(h_delimiter, CombineHash(h_quote, h_escape));
|
27
|
+
}
|
28
|
+
};
|
29
|
+
|
30
|
+
//! The CSVStateMachineCache caches state machines. Although each is small (~2kb), the actual creation of multiple State Machines
|
31
|
+
//! can become a bottleneck on sniffing, when reading very small csv files.
|
32
|
+
//! Hence the cache stores State Machines based on their different delimiter|quote|escape options.
|
33
|
+
class CSVStateMachineCache {
|
34
|
+
public:
|
35
|
+
CSVStateMachineCache();
|
36
|
+
~CSVStateMachineCache() {};
|
37
|
+
//! Gets a state machine from the cache, if it's not from one of the default options
|
38
|
+
//! It first caches it, then returns it.
|
39
|
+
const state_machine_t &Get(const CSVStateMachineOptions &state_machine_options);
|
40
|
+
|
41
|
+
private:
|
42
|
+
void Insert(const CSVStateMachineOptions &state_machine_options);
|
43
|
+
//! Cache on delimiter|quote|escape
|
44
|
+
unordered_map<CSVStateMachineOptions, state_machine_t, HashCSVStateMachineConfig> state_machine_cache;
|
45
|
+
//! Default value for options used to initialize CSV State Machine Cache
|
46
|
+
const vector<char> default_delimiter = {',', '|', ';', '\t'};
|
47
|
+
const vector<vector<char>> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}};
|
48
|
+
const vector<QuoteRule> default_quote_rule = {QuoteRule::QUOTES_RFC, QuoteRule::QUOTES_OTHER, QuoteRule::NO_QUOTES};
|
49
|
+
const vector<vector<char>> default_escape = {{'\0', '\"', '\''}, {'\\'}, {'\0'}};
|
50
|
+
};
|
51
|
+
} // namespace duckdb
|
@@ -1,18 +1,18 @@
|
|
1
1
|
//===----------------------------------------------------------------------===//
|
2
2
|
// DuckDB
|
3
3
|
//
|
4
|
-
// duckdb/execution/operator/
|
4
|
+
// duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp
|
5
5
|
//
|
6
6
|
//
|
7
7
|
//===----------------------------------------------------------------------===//
|
8
8
|
|
9
9
|
#pragma once
|
10
10
|
|
11
|
-
#include "duckdb/execution/operator/
|
12
|
-
#include "duckdb/execution/operator/
|
13
|
-
#include "duckdb/execution/operator/
|
14
|
-
#include "duckdb/execution/operator/
|
15
|
-
#include "duckdb/execution/operator/
|
11
|
+
#include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp"
|
12
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
13
|
+
#include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
|
14
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
|
15
|
+
#include "duckdb/execution/operator/scan/csv/csv_line_info.hpp"
|
16
16
|
|
17
17
|
#include <sstream>
|
18
18
|
#include <utility>
|
@@ -20,21 +20,17 @@
|
|
20
20
|
namespace duckdb {
|
21
21
|
|
22
22
|
struct CSVBufferRead {
|
23
|
-
CSVBufferRead(
|
23
|
+
CSVBufferRead(unique_ptr<CSVBufferHandle> buffer_p, idx_t buffer_start_p, idx_t buffer_end_p, idx_t batch_index,
|
24
24
|
idx_t local_batch_index_p, optional_ptr<LineInfo> line_info_p)
|
25
25
|
: buffer(std::move(buffer_p)), line_info(line_info_p), buffer_start(buffer_start_p), buffer_end(buffer_end_p),
|
26
26
|
batch_index(batch_index), local_batch_index(local_batch_index_p) {
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
}
|
31
|
-
} else {
|
32
|
-
buffer_start = 0;
|
33
|
-
buffer_end = 0;
|
27
|
+
D_ASSERT(buffer);
|
28
|
+
if (buffer_end > buffer->actual_size) {
|
29
|
+
buffer_end = buffer->actual_size;
|
34
30
|
}
|
35
31
|
}
|
36
32
|
|
37
|
-
CSVBufferRead(
|
33
|
+
CSVBufferRead(unique_ptr<CSVBufferHandle> buffer_p, unique_ptr<CSVBufferHandle> nxt_buffer_p, idx_t buffer_start_p,
|
38
34
|
idx_t buffer_end_p, idx_t batch_index, idx_t local_batch_index, optional_ptr<LineInfo> line_info_p)
|
39
35
|
: CSVBufferRead(std::move(buffer_p), buffer_start_p, buffer_end_p, batch_index, local_batch_index,
|
40
36
|
line_info_p) {
|
@@ -44,33 +40,33 @@ struct CSVBufferRead {
|
|
44
40
|
CSVBufferRead() : buffer_start(0), buffer_end(NumericLimits<idx_t>::Maximum()) {};
|
45
41
|
|
46
42
|
const char &operator[](size_t i) const {
|
47
|
-
if (i < buffer->
|
43
|
+
if (i < buffer->actual_size) {
|
48
44
|
auto buffer_ptr = buffer->Ptr();
|
49
45
|
return buffer_ptr[i];
|
50
46
|
}
|
51
47
|
auto next_ptr = next_buffer->Ptr();
|
52
|
-
return next_ptr[i - buffer->
|
48
|
+
return next_ptr[i - buffer->actual_size];
|
53
49
|
}
|
54
50
|
|
55
51
|
string_t GetValue(idx_t start_buffer, idx_t position_buffer, idx_t offset) {
|
56
52
|
idx_t length = position_buffer - start_buffer - offset;
|
57
53
|
// 1) It's all in the current buffer
|
58
|
-
if (start_buffer + length <= buffer->
|
54
|
+
if (start_buffer + length <= buffer->actual_size) {
|
59
55
|
auto buffer_ptr = buffer->Ptr();
|
60
56
|
return string_t(buffer_ptr + start_buffer, length);
|
61
|
-
} else if (start_buffer >= buffer->
|
57
|
+
} else if (start_buffer >= buffer->actual_size) {
|
62
58
|
// 2) It's all in the next buffer
|
63
59
|
D_ASSERT(next_buffer);
|
64
|
-
D_ASSERT(next_buffer->
|
60
|
+
D_ASSERT(next_buffer->actual_size >= length + (start_buffer - buffer->actual_size));
|
65
61
|
auto buffer_ptr = next_buffer->Ptr();
|
66
|
-
return string_t(buffer_ptr + (start_buffer - buffer->
|
62
|
+
return string_t(buffer_ptr + (start_buffer - buffer->actual_size), length);
|
67
63
|
} else {
|
68
64
|
// 3) It starts in the current buffer and ends in the next buffer
|
69
65
|
D_ASSERT(next_buffer);
|
70
66
|
auto intersection = make_unsafe_uniq_array<char>(length);
|
71
67
|
idx_t cur_pos = 0;
|
72
68
|
auto buffer_ptr = buffer->Ptr();
|
73
|
-
for (idx_t i = start_buffer; i < buffer->
|
69
|
+
for (idx_t i = start_buffer; i < buffer->actual_size; i++) {
|
74
70
|
intersection[cur_pos++] = buffer_ptr[i];
|
75
71
|
}
|
76
72
|
idx_t nxt_buffer_pos = 0;
|
@@ -83,8 +79,8 @@ struct CSVBufferRead {
|
|
83
79
|
}
|
84
80
|
}
|
85
81
|
|
86
|
-
|
87
|
-
|
82
|
+
unique_ptr<CSVBufferHandle> buffer;
|
83
|
+
unique_ptr<CSVBufferHandle> next_buffer;
|
88
84
|
vector<unsafe_unique_array<char>> intersections;
|
89
85
|
optional_ptr<LineInfo> line_info;
|
90
86
|
|
@@ -103,7 +99,7 @@ struct VerificationPositions {
|
|
103
99
|
//! CSV Reader for Parallel Reading
|
104
100
|
class ParallelCSVReader : public BaseCSVReader {
|
105
101
|
public:
|
106
|
-
ParallelCSVReader(ClientContext &context,
|
102
|
+
ParallelCSVReader(ClientContext &context, CSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
|
107
103
|
idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types, idx_t file_idx_p);
|
108
104
|
virtual ~ParallelCSVReader() {
|
109
105
|
}
|
@@ -162,8 +158,6 @@ private:
|
|
162
158
|
|
163
159
|
//! Parses a CSV file with a one-byte delimiter, escape and quote character
|
164
160
|
bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
|
165
|
-
//! Verifies that the line length did not go over a pre-defined limit.
|
166
|
-
void VerifyLineLength(idx_t line_size);
|
167
161
|
|
168
162
|
//! First Position of First Buffer
|
169
163
|
idx_t first_pos_first_buffer = 0;
|
@@ -0,0 +1,21 @@
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
2
|
+
// DuckDB
|
3
|
+
//
|
4
|
+
// duckdb/execution/operator/scan/csv/quote_rules.hpp
|
5
|
+
//
|
6
|
+
//
|
7
|
+
//===----------------------------------------------------------------------===//
|
8
|
+
|
9
|
+
#pragma once
|
10
|
+
|
11
|
+
#include "duckdb/common/vector.hpp"
|
12
|
+
|
13
|
+
namespace duckdb {
|
14
|
+
//! Different Rules regarding possible combinations of Quote and Escape Values for CSV Dialects.
|
15
|
+
//! Each rule has a comment on the possible combinations.
|
16
|
+
enum class QuoteRule : uint8_t {
|
17
|
+
QUOTES_RFC = 0, //! quote = " escape = (\0 || " || ')
|
18
|
+
QUOTES_OTHER = 1, //! quote = ( " || ' ) escape = '\\'
|
19
|
+
NO_QUOTES = 2 //! quote = \0 escape = \0
|
20
|
+
};
|
21
|
+
} // namespace duckdb
|
@@ -8,31 +8,27 @@
|
|
8
8
|
|
9
9
|
#pragma once
|
10
10
|
|
11
|
+
#include "duckdb/common/types/row/tuple_data_layout.hpp"
|
11
12
|
#include "duckdb/execution/operator/aggregate/grouped_aggregate_data.hpp"
|
12
|
-
#include "duckdb/execution/partitionable_hashtable.hpp"
|
13
|
-
#include "duckdb/execution/physical_operator.hpp"
|
14
13
|
#include "duckdb/parser/group_by_node.hpp"
|
15
14
|
|
16
15
|
namespace duckdb {
|
17
|
-
|
18
|
-
class
|
19
|
-
|
20
|
-
class Pipeline;
|
21
|
-
class Task;
|
16
|
+
|
17
|
+
class GroupedAggregateHashTable;
|
18
|
+
struct AggregatePartition;
|
22
19
|
|
23
20
|
class RadixPartitionedHashTable {
|
24
21
|
public:
|
25
22
|
RadixPartitionedHashTable(GroupingSet &grouping_set, const GroupedAggregateData &op);
|
23
|
+
unique_ptr<GroupedAggregateHashTable> CreateHT(ClientContext &context, const idx_t capacity,
|
24
|
+
const idx_t radix_bits) const;
|
26
25
|
|
26
|
+
public:
|
27
27
|
GroupingSet &grouping_set;
|
28
28
|
//! The indices specified in the groups_count that do not appear in the grouping_set
|
29
29
|
unsafe_vector<idx_t> null_groups;
|
30
30
|
const GroupedAggregateData &op;
|
31
|
-
|
32
31
|
vector<LogicalType> group_types;
|
33
|
-
//! how many groups can we have in the operator before we switch to radix partitioning
|
34
|
-
idx_t radix_limit;
|
35
|
-
|
36
32
|
//! The GROUPING values that belong to this hash table
|
37
33
|
vector<Value> grouping_values;
|
38
34
|
|
@@ -43,32 +39,27 @@ public:
|
|
43
39
|
|
44
40
|
void Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input, DataChunk &aggregate_input_chunk,
|
45
41
|
const unsafe_vector<idx_t> &filter) const;
|
46
|
-
void Combine(ExecutionContext &context, GlobalSinkState &
|
47
|
-
|
48
|
-
|
49
|
-
void ScheduleTasks(Executor &executor, const shared_ptr<Event> &event, GlobalSinkState &state,
|
50
|
-
vector<shared_ptr<Task>> &tasks) const;
|
42
|
+
void Combine(ExecutionContext &context, GlobalSinkState &gstate, LocalSinkState &lstate) const;
|
43
|
+
void Finalize(ClientContext &context, GlobalSinkState &gstate) const;
|
51
44
|
|
45
|
+
public:
|
52
46
|
//! Source interface
|
53
|
-
idx_t Size(GlobalSinkState &sink_state) const;
|
54
47
|
unique_ptr<GlobalSourceState> GetGlobalSourceState(ClientContext &context) const;
|
55
48
|
unique_ptr<LocalSourceState> GetLocalSourceState(ExecutionContext &context) const;
|
56
|
-
|
49
|
+
|
50
|
+
SourceResultType GetData(ExecutionContext &context, DataChunk &chunk, GlobalSinkState &sink,
|
57
51
|
OperatorSourceInput &input) const;
|
58
52
|
|
59
|
-
|
60
|
-
|
61
|
-
static
|
62
|
-
static void GetRepartitionInfo(ClientContext &context, GlobalSinkState &state, idx_t &repartition_radix_bits,
|
63
|
-
idx_t &concurrent_repartitions, idx_t &tasks_per_partition);
|
53
|
+
const TupleDataLayout &GetLayout() const;
|
54
|
+
idx_t Count(GlobalSinkState &sink) const;
|
55
|
+
static void SetMultiScan(GlobalSinkState &sink);
|
64
56
|
|
65
57
|
private:
|
66
58
|
void SetGroupingValues();
|
67
59
|
void PopulateGroupChunk(DataChunk &group_chunk, DataChunk &input_chunk) const;
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
const idx_t concurrent_repartitions, const idx_t tasks_per_partition) const;
|
60
|
+
idx_t CountInternal(GlobalSinkState &sink) const;
|
61
|
+
|
62
|
+
TupleDataLayout layout;
|
72
63
|
};
|
73
64
|
|
74
65
|
} // namespace duckdb
|
@@ -110,9 +110,8 @@ public:
|
|
110
110
|
bool has_serialize = function.format_serialize;
|
111
111
|
serializer.WriteProperty(503, "has_serialize", has_serialize);
|
112
112
|
if (has_serialize) {
|
113
|
-
serializer.
|
114
|
-
|
115
|
-
serializer.EndObject();
|
113
|
+
serializer.WriteObject(504, "function_data",
|
114
|
+
[&](FormatSerializer &obj) { function.format_serialize(obj, bind_info, function); });
|
116
115
|
D_ASSERT(function.format_deserialize);
|
117
116
|
}
|
118
117
|
}
|
@@ -150,9 +149,9 @@ public:
|
|
150
149
|
throw SerializationException("Function requires deserialization but no deserialization function for %s",
|
151
150
|
function.name);
|
152
151
|
}
|
153
|
-
|
154
|
-
|
155
|
-
|
152
|
+
unique_ptr<FunctionData> result;
|
153
|
+
deserializer.ReadObject(504, "function_data",
|
154
|
+
[&](FormatDeserializer &obj) { result = function.format_deserialize(obj, function); });
|
156
155
|
return result;
|
157
156
|
}
|
158
157
|
|
@@ -142,10 +142,10 @@ public:
|
|
142
142
|
public:
|
143
143
|
DUCKDB_API static ParseResult Parse(const string &format, const string &text);
|
144
144
|
|
145
|
-
DUCKDB_API bool Parse(string_t str, ParseResult &result);
|
145
|
+
DUCKDB_API bool Parse(string_t str, ParseResult &result) const;
|
146
146
|
|
147
|
-
DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message);
|
148
|
-
DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message);
|
147
|
+
DUCKDB_API bool TryParseDate(string_t str, date_t &result, string &error_message) const;
|
148
|
+
DUCKDB_API bool TryParseTimestamp(string_t str, timestamp_t &result, string &error_message) const;
|
149
149
|
|
150
150
|
date_t ParseDate(string_t str);
|
151
151
|
timestamp_t ParseTimestamp(string_t str);
|
@@ -158,7 +158,7 @@ protected:
|
|
158
158
|
DUCKDB_API void AddFormatSpecifier(string preceding_literal, StrTimeSpecifier specifier) override;
|
159
159
|
int NumericSpecifierWidth(StrTimeSpecifier specifier);
|
160
160
|
int32_t TryParseCollection(const char *data, idx_t &pos, idx_t size, const string_t collection[],
|
161
|
-
idx_t collection_count);
|
161
|
+
idx_t collection_count) const;
|
162
162
|
|
163
163
|
private:
|
164
164
|
explicit StrpTimeFormat(const string &format_string);
|
@@ -8,14 +8,16 @@
|
|
8
8
|
|
9
9
|
#pragma once
|
10
10
|
|
11
|
-
#include "duckdb/
|
12
|
-
#include "duckdb/
|
13
|
-
#include "duckdb/execution/operator/
|
14
|
-
#include "duckdb/execution/operator/
|
15
|
-
#include "duckdb/execution/operator/
|
16
|
-
#include "duckdb/execution/operator/
|
17
|
-
#include "duckdb/execution/operator/persistent/csv_buffer.hpp"
|
11
|
+
#include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
|
12
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
|
13
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
14
|
+
#include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
|
15
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
16
|
+
#include "duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp"
|
18
17
|
#include "duckdb/function/built_in_functions.hpp"
|
18
|
+
#include "duckdb/function/scalar/strftime_format.hpp"
|
19
|
+
#include "duckdb/function/table_function.hpp"
|
20
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
|
19
21
|
|
20
22
|
namespace duckdb {
|
21
23
|
|
@@ -31,7 +33,7 @@ struct BaseCSVData : public TableFunctionData {
|
|
31
33
|
//! The file path of the CSV file to read or write
|
32
34
|
vector<string> files;
|
33
35
|
//! The CSV reader options
|
34
|
-
|
36
|
+
CSVReaderOptions options;
|
35
37
|
//! Offsets for generated columns
|
36
38
|
idx_t filename_col_idx;
|
37
39
|
idx_t hive_partition_col_idx;
|
@@ -50,8 +52,6 @@ struct WriteCSVData : public BaseCSVData {
|
|
50
52
|
vector<LogicalType> sql_types;
|
51
53
|
//! The newline string to write
|
52
54
|
string newline = "\n";
|
53
|
-
//! Whether or not we are writing a simple CSV (delimiter, quote and escape are all 1 byte in length)
|
54
|
-
bool is_simple;
|
55
55
|
//! The size of the CSV file (in bytes) that we buffer before we flush it to disk
|
56
56
|
idx_t flush_size = 4096 * 8;
|
57
57
|
//! For each byte whether or not the CSV file requires quotes when containing the byte
|
@@ -93,8 +93,9 @@ struct ReadCSVData : public BaseCSVData {
|
|
93
93
|
vector<LogicalType> return_types;
|
94
94
|
//! The expected SQL names to be returned from the read - including added constants (e.g. filename, hive partitions)
|
95
95
|
vector<string> return_names;
|
96
|
-
//! The
|
97
|
-
//! In this case,
|
96
|
+
//! The buffer manager (if any): this is used when automatic detection is used during binding.
|
97
|
+
//! In this case, some CSV buffers have already been read and can be reused.
|
98
|
+
shared_ptr<CSVBufferManager> buffer_manager;
|
98
99
|
unique_ptr<BufferedCSVReader> initial_reader;
|
99
100
|
//! The union readers are created (when csv union_by_name option is on) during binding
|
100
101
|
//! Those readers can be re-used during ReadCSVFunction
|
@@ -104,6 +105,10 @@ struct ReadCSVData : public BaseCSVData {
|
|
104
105
|
//! Reader bind data
|
105
106
|
MultiFileReaderBindData reader_bind;
|
106
107
|
vector<ColumnInfo> column_info;
|
108
|
+
//! The CSVStateMachineCache caches state machines created for sniffing and parsing csv files
|
109
|
+
//! We cache them because when reading very small csv files, the cost of creating all the possible
|
110
|
+
//! State machines for sniffing becomes a major bottleneck.
|
111
|
+
CSVStateMachineCache state_machine_cache;
|
107
112
|
|
108
113
|
void Initialize(unique_ptr<BufferedCSVReader> &reader) {
|
109
114
|
this->initial_reader = std::move(reader);
|
@@ -21,6 +21,7 @@ public:
|
|
21
21
|
explicit ClientContextFileOpener(ClientContext &context_p) : context(context_p) {
|
22
22
|
}
|
23
23
|
|
24
|
+
bool TryGetCurrentSetting(const string &key, Value &result, FileOpenerInfo &info) override;
|
24
25
|
bool TryGetCurrentSetting(const string &key, Value &result) override;
|
25
26
|
|
26
27
|
ClientContext *TryGetClientContext() override {
|
@@ -13,6 +13,7 @@
|
|
13
13
|
#include "duckdb/common/types/value.hpp"
|
14
14
|
#include "duckdb/common/case_insensitive_map.hpp"
|
15
15
|
#include "duckdb/common/atomic.hpp"
|
16
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp"
|
16
17
|
|
17
18
|
namespace duckdb {
|
18
19
|
class AttachedDatabase;
|
@@ -29,7 +30,7 @@ class SchemaCatalogEntry;
|
|
29
30
|
struct RandomEngine;
|
30
31
|
|
31
32
|
struct ClientData {
|
32
|
-
ClientData(ClientContext &context);
|
33
|
+
explicit ClientData(ClientContext &context);
|
33
34
|
~ClientData();
|
34
35
|
|
35
36
|
//! Query profiler
|
@@ -30,7 +30,7 @@ class DatabaseInstance;
|
|
30
30
|
class DuckDB;
|
31
31
|
class LogicalOperator;
|
32
32
|
class SelectStatement;
|
33
|
-
struct
|
33
|
+
struct CSVReaderOptions;
|
34
34
|
|
35
35
|
typedef void (*warning_callback)(std::string);
|
36
36
|
|
@@ -131,7 +131,7 @@ public:
|
|
131
131
|
|
132
132
|
//! Reads CSV file
|
133
133
|
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file);
|
134
|
-
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file,
|
134
|
+
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, CSVReaderOptions &options);
|
135
135
|
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, const vector<string> &columns);
|
136
136
|
|
137
137
|
//! Reads Parquet file
|