duckdb 0.7.2-dev2144.0 → 0.7.2-dev2233.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
- package/src/duckdb/src/common/types/column/column_data_collection.cpp +7 -2
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
- package/src/duckdb/src/function/table/read_csv.cpp +124 -58
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
- package/src/duckdb/src/main/settings/settings.cpp +3 -4
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0

package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp
@@ -131,8 +131,8 @@ public:
 	//! Compare two column data collections to another. If they are equal according to result equality rules,
 	//! return true. That means null values are equal, and approx equality is used for floating point values.
 	//! If they are not equal, return false and fill in the error message.
-	static bool ResultEquals(const ColumnDataCollection &left, const ColumnDataCollection &right,
-	                         string &error_message);
+	static bool ResultEquals(const ColumnDataCollection &left, const ColumnDataCollection &right, string &error_message,
+	                         bool ordered = false);
 
 	//! Obtains the next scan index to scan from
 	bool NextScanIndex(ColumnDataScanState &state, idx_t &chunk_index, idx_t &segment_index, idx_t &row_index) const;

package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp
@@ -87,7 +87,7 @@ private:
 	//! Resets the steam
 	void ResetStream();
 	//! Reads a new buffer from the CSV file if the current one has been exhausted
-	bool ReadBuffer(idx_t &start);
+	bool ReadBuffer(idx_t &start, idx_t &line_start);
 	//! Jumps back to the beginning of input stream and resets necessary internal states
 	bool JumpToNextSample();
 	//! Initializes the TextSearchShiftArrays for complex parser
@@ -124,6 +124,9 @@ private:
 	                          const vector<LogicalType> &requested_types,
 	                          vector<vector<LogicalType>> &best_sql_types_candidates,
 	                          map<LogicalTypeId, vector<string>> &best_format_candidates);
+
+	//! Skip Empty lines for tables with over one column
+	void SkipEmptyLines();
 };
 
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp
@@ -21,14 +21,15 @@ public:
 
 	//! Constructor for Initial Buffer
 	CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
-	          idx_t &global_csv_current_position);
+	          idx_t &global_csv_current_position, idx_t file_number);
 
 	//! Constructor for `Next()` Buffers
 	CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
-	          idx_t global_csv_current_position);
+	          idx_t global_csv_current_position, idx_t file_number);
 
 	//! Creates a new buffer with the next part of the CSV File
-	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position);
+	unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
+	                           idx_t file_number);
 
 	//! Gets the buffer actual size
 	idx_t GetBufferSize();
@@ -44,6 +45,8 @@ public:
 
 	idx_t GetCSVGlobalStart();
 
+	idx_t GetFileNumber();
+
 	BufferHandle AllocateBuffer(idx_t buffer_size);
 
 	char *Ptr() {
@@ -65,5 +68,7 @@ private:
 	bool first_buffer = false;
 	//! Global position from the CSV File where this buffer starts
 	idx_t global_csv_start = 0;
+	//! Number of the file that is in this buffer
+	idx_t file_number = 0;
 };
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp
@@ -39,11 +39,6 @@ struct BufferedCSVReaderOptions {
 	bool has_newline = false;
 	//! New Line separator
 	NewLineIdentifier new_line = NewLineIdentifier::NOT_SET;
-
-	//! Whether or not an option was provided for parallel
-	bool has_parallel = false;
-	//! Whether or not the read will use the ParallelCSVReader
-	bool use_parallel = false;
 	//! Whether or not a quote was defined by the user
 	bool has_quote = false;
 	//! Quote used for columns that contain reserved characters, e.g., delimiter
@@ -114,8 +109,12 @@ struct BufferedCSVReaderOptions {
 	//! Decimal separator when reading as numeric
 	string decimal_separator = ".";
 	//! Whether or not to pad rows that do not have enough columns with NULL values
-	bool null_padding =
+	bool null_padding = false;
 
+	//! If we are running the parallel version of the CSV Reader. In general, the system should always auto-detect
+	//! When it can't execute a parallel run before execution. However, there are (rather specific) situations where
+	//! setting up this manually might be important
+	bool run_parallel = true;
 	//===--------------------------------------------------------------------===//
 	// WriteCSVOptions
 	//===--------------------------------------------------------------------===//
@@ -139,7 +138,6 @@ struct BufferedCSVReaderOptions {
 	void SetEscape(const string &escape);
 	void SetQuote(const string &quote);
 	void SetDelimiter(const string &delimiter);
-	void SetParallel(bool use_parallel);
 
 	void SetNewline(const string &input);
 	//! Set an option that is supported by both reading and writing functions, called by

package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp
@@ -99,7 +99,7 @@ struct VerificationPositions {
 class ParallelCSVReader : public BaseCSVReader {
 public:
 	ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
-	                  const vector<LogicalType> &requested_types);
+	                  idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types);
 	~ParallelCSVReader();
 
 	//! Current Position (Relative to the Buffer)
@@ -136,6 +136,8 @@ private:
 	bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
 	//! Sets Position depending on the byte_start of this thread
 	bool SetPosition(DataChunk &insert_chunk);
+	//! Called when scanning the 1st buffer, skips empty lines
+	void SkipEmptyLines();
 	//! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
 	//! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
 	//! when changing the buffer end the first time.
@@ -148,6 +150,8 @@ private:
 	bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
 	//! Position of the first read line and last read line for verification purposes
 	VerificationPositions verification_positions;
+	//! First Position of First Buffer
+	idx_t first_pos_first_buffer = 0;
 };
 
 } // namespace duckdb

package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp
@@ -56,6 +56,28 @@ struct WriteCSVData : public BaseCSVData {
 	idx_t flush_size = 4096 * 8;
 };
 
+struct ColumnInfo {
+	ColumnInfo() {
+	}
+	ColumnInfo(vector<std::string> names_p, vector<LogicalType> types_p) {
+		names = std::move(names_p);
+		types = std::move(types_p);
+	}
+	void Serialize(FieldWriter &writer) {
+		writer.WriteList<string>(names);
+		writer.WriteRegularSerializableList<LogicalType>(types);
+	}
+
+	static ColumnInfo Deserialize(FieldReader &reader) {
+		ColumnInfo info;
+		info.names = reader.ReadRequiredList<string>();
+		info.types = reader.ReadRequiredSerializableList<LogicalType, LogicalType>();
+		return info;
+	}
+	vector<std::string> names;
+	vector<LogicalType> types;
+};
+
 struct ReadCSVData : public BaseCSVData {
 	//! The expected SQL types to read from the file
 	vector<LogicalType> csv_types;
@@ -75,6 +97,9 @@ struct ReadCSVData : public BaseCSVData {
 	bool single_threaded = false;
 	//! Reader bind data
 	MultiFileReaderBindData reader_bind;
+	//! If all files are On-Disk file (e.g., not a pipe)
+	bool file_exists = true;
+	vector<ColumnInfo> column_info;
 
 	void Initialize(unique_ptr<BufferedCSVReader> &reader) {
 		this->initial_reader = std::move(reader);
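
Note on the new ColumnInfo/column_info members above: they let the bind data keep one set of column names and types per input file and serialize it alongside the rest of ReadCSVData. A rough usage sketch against the DuckDB C++ API; the file names and the union_by_name flag are placeholders for illustration, not taken from this diff:

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// Assumption: when several CSV files are scanned in one call and their schemas differ,
	// the reader can track each file's names/types via ReadCSVData::column_info.
	auto result = con.Query(
	    "SELECT * FROM read_csv_auto(['part1.csv', 'part2.csv'], union_by_name=true);");
	result->Print();
	return 0;
}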

package/src/duckdb/src/include/duckdb/main/client_data.hpp
@@ -58,6 +58,9 @@ struct ClientData {
 	//! The file search path
 	string file_search_path;
 
+	//! The Max Line Length Size of Last Query Executed on a CSV File. (Only used for testing)
+	idx_t max_line_length = 0;
+
 public:
 	DUCKDB_API static ClientData &Get(ClientContext &context);
 };

package/src/duckdb/src/include/duckdb/main/config.hpp
@@ -143,8 +143,6 @@ struct DBConfigOptions {
 	bool allow_unsigned_extensions = false;
 	//! Enable emitting FSST Vectors
 	bool enable_fsst_vectors = false;
-	//! Experimental parallel CSV reader
-	bool experimental_parallel_csv_reader = false;
 	//! Start transactions immediately in all attached databases - instead of lazily when a database is referenced
 	bool immediate_transaction_mode = false;
 	//! The set of unrecognized (other) options

package/src/duckdb/src/main/settings/settings.cpp
@@ -512,16 +512,15 @@ Value EnableProgressBarPrintSetting::GetSetting(ClientContext &context) {
 // Experimental Parallel CSV
 //===--------------------------------------------------------------------===//
 void ExperimentalParallelCSVSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
-	config.options.experimental_parallel_csv_reader = input.GetValue<bool>();
+	Printer::Print("experimental_parallel_csv is deprecated and will be removed with the next release - the parallel "
+	               "CSV reader is now standard and does not need to be manually enabled anymore 1");
 }
 
 void ExperimentalParallelCSVSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) {
-	config.options.experimental_parallel_csv_reader = DBConfig().options.experimental_parallel_csv_reader;
 }
 
 Value ExperimentalParallelCSVSetting::GetSetting(ClientContext &context) {
-	auto &config = DBConfig::GetConfig(context);
-	return Value::BIGINT(config.options.experimental_parallel_csv_reader);
+	return Value();
 }
 
 //===--------------------------------------------------------------------===//
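
With experimental_parallel_csv_reader removed from DBConfigOptions, the setting is now a stub that only prints the deprecation notice. A minimal sketch of what that means for callers, using the public C++ API (the CSV file name is a placeholder):

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// No longer toggles anything; it only prints the deprecation message shown above.
	con.Query("SET experimental_parallel_csv=true;");
	// The parallel CSV reader is chosen automatically during binding.
	con.Query("SELECT * FROM read_csv_auto('data.csv');");
	return 0;
}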

package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp
@@ -21,8 +21,21 @@ static LogicalType ResolveInType(OperatorExpression &op, vector<BoundExpression
 	}
 	// get the maximum type from the children
 	LogicalType max_type = children[0]->expr->return_type;
+	bool any_varchar = children[0]->expr->return_type == LogicalType::VARCHAR;
+	bool any_enum = children[0]->expr->return_type.id() == LogicalTypeId::ENUM;
 	for (idx_t i = 1; i < children.size(); i++) {
 		max_type = LogicalType::MaxLogicalType(max_type, children[i]->expr->return_type);
+		if (children[i]->expr->return_type == LogicalType::VARCHAR) {
+			any_varchar = true;
+		}
+		if (children[i]->expr->return_type.id() == LogicalTypeId::ENUM) {
+			any_enum = true;
+		}
+	}
+	if (any_varchar && any_enum) {
+		// For the coalesce function, we must be sure we always upcast the parameters to VARCHAR, if there are at least
+		// one enum and one varchar
+		max_type = LogicalType::VARCHAR;
 	}
 
 	// cast all children to the same type
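
The two flags added above force the common type to VARCHAR whenever the children mix ENUM and VARCHAR, instead of relying on whatever MaxLogicalType would otherwise pick. A standalone sketch of the same rule with made-up type ids (not DuckDB code):

#include <cstddef>
#include <vector>

enum class TypeId { INTEGER, VARCHAR, ENUM };

// Assumes children is non-empty, as the original code does after its own check.
TypeId ResolveCommonType(const std::vector<TypeId> &children) {
	TypeId max_type = children[0]; // placeholder "max" rule stands in for LogicalType::MaxLogicalType
	bool any_varchar = children[0] == TypeId::VARCHAR;
	bool any_enum = children[0] == TypeId::ENUM;
	for (std::size_t i = 1; i < children.size(); i++) {
		if (children[i] > max_type) {
			max_type = children[i];
		}
		any_varchar = any_varchar || children[i] == TypeId::VARCHAR;
		any_enum = any_enum || children[i] == TypeId::ENUM;
	}
	if (any_varchar && any_enum) {
		// Mixing an enum with a plain string: upcast everything to VARCHAR.
		max_type = TypeId::VARCHAR;
	}
	return max_type;
}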

package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp
@@ -19,6 +19,7 @@
 #include "duckdb/function/function_binder.hpp"
 #include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
+#include "duckdb/function/table/read_csv.hpp"
 
 namespace duckdb {
 
@@ -143,6 +144,14 @@ Binder::BindTableFunctionInternal(TableFunction &table_function, const string &f
 			auto arrow_bind = (PyTableFunctionData *)bind_data.get();
 			arrow_bind->external_dependency = std::move(external_dependency);
 		}
+		if (table_function.name == "read_csv" || table_function.name == "read_csv_auto") {
+			auto &csv_bind = bind_data->Cast<ReadCSVData>();
+			if (csv_bind.single_threaded) {
+				table_function.extra_info = "(Single-Threaded)";
+			} else {
+				table_function.extra_info = "(Multi-Threaded)";
+			}
+		}
 	}
 	if (return_types.size() != return_names.size()) {
 		throw InternalException(
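
The extra_info strings set above are what label the CSV scan in query plans. A quick way to observe this from the C++ API, sketched here with a placeholder file name:

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// The read_csv_auto scan in the plan output should carry "(Single-Threaded)" or
	// "(Multi-Threaded)", depending on how the scan was bound.
	con.Query("EXPLAIN SELECT * FROM read_csv_auto('data.csv');")->Print();
	return 0;
}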
|