duckdb 0.7.2-dev2144.0 → 0.7.2-dev2233.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
  3. package/src/duckdb/src/common/types/column/column_data_collection.cpp +7 -2
  4. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
  5. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
  6. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
  7. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
  8. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
  9. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  10. package/src/duckdb/src/function/table/read_csv.cpp +124 -58
  11. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  12. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +2 -2
  13. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
  14. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
  15. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
  16. package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
  17. package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
  18. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
  19. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
  20. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
  21. package/src/duckdb/src/main/settings/settings.cpp +3 -4
  22. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
  23. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
@@ -131,8 +131,8 @@ public:
131
131
  //! Compare two column data collections to one another. If they are equal according to result equality rules,
132
132
  //! return true. That means null values are equal, and approx equality is used for floating point values.
133
133
  //! If they are not equal, return false and fill in the error message.
134
- static bool ResultEquals(const ColumnDataCollection &left, const ColumnDataCollection &right,
135
- string &error_message);
134
+ static bool ResultEquals(const ColumnDataCollection &left, const ColumnDataCollection &right, string &error_message,
135
+ bool ordered = false);
136
136
 
137
137
  //! Obtains the next scan index to scan from
138
138
  bool NextScanIndex(ColumnDataScanState &state, idx_t &chunk_index, idx_t &segment_index, idx_t &row_index) const;
@@ -87,7 +87,7 @@ private:
87
87
  //! Resets the stream
88
88
  void ResetStream();
89
89
  //! Reads a new buffer from the CSV file if the current one has been exhausted
90
- bool ReadBuffer(idx_t &start);
90
+ bool ReadBuffer(idx_t &start, idx_t &line_start);
91
91
  //! Jumps back to the beginning of input stream and resets necessary internal states
92
92
  bool JumpToNextSample();
93
93
  //! Initializes the TextSearchShiftArrays for complex parser
@@ -124,6 +124,9 @@ private:
124
124
  const vector<LogicalType> &requested_types,
125
125
  vector<vector<LogicalType>> &best_sql_types_candidates,
126
126
  map<LogicalTypeId, vector<string>> &best_format_candidates);
127
+
128
+ //! Skips empty lines for tables with more than one column
129
+ void SkipEmptyLines();
127
130
  };
128
131
 
129
132
  } // namespace duckdb
@@ -21,14 +21,15 @@ public:
21
21
 
22
22
  //! Constructor for Initial Buffer
23
23
  CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
24
- idx_t &global_csv_current_position);
24
+ idx_t &global_csv_current_position, idx_t file_number);
25
25
 
26
26
  //! Constructor for `Next()` Buffers
27
27
  CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
28
- idx_t global_csv_current_position);
28
+ idx_t global_csv_current_position, idx_t file_number);
29
29
 
30
30
  //! Creates a new buffer with the next part of the CSV File
31
- unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position);
31
+ unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
32
+ idx_t file_number);
32
33
 
33
34
  //! Gets the buffer actual size
34
35
  idx_t GetBufferSize();
@@ -44,6 +45,8 @@ public:
44
45
 
45
46
  idx_t GetCSVGlobalStart();
46
47
 
48
+ idx_t GetFileNumber();
49
+
47
50
  BufferHandle AllocateBuffer(idx_t buffer_size);
48
51
 
49
52
  char *Ptr() {
@@ -65,5 +68,7 @@ private:
65
68
  bool first_buffer = false;
66
69
  //! Global position from the CSV File where this buffer starts
67
70
  idx_t global_csv_start = 0;
71
+ //! Number of the file that is in this buffer
72
+ idx_t file_number = 0;
68
73
  };
69
74
  } // namespace duckdb
@@ -39,11 +39,6 @@ struct BufferedCSVReaderOptions {
39
39
  bool has_newline = false;
40
40
  //! New Line separator
41
41
  NewLineIdentifier new_line = NewLineIdentifier::NOT_SET;
42
-
43
- //! Whether or not an option was provided for parallel
44
- bool has_parallel = false;
45
- //! Whether or not the read will use the ParallelCSVReader
46
- bool use_parallel = false;
47
42
  //! Whether or not a quote was defined by the user
48
43
  bool has_quote = false;
49
44
  //! Quote used for columns that contain reserved characters, e.g., delimiter
@@ -114,8 +109,12 @@ struct BufferedCSVReaderOptions {
114
109
  //! Decimal separator when reading as numeric
115
110
  string decimal_separator = ".";
116
111
  //! Whether or not to pad rows that do not have enough columns with NULL values
117
- bool null_padding = true;
112
+ bool null_padding = false;
118
113
 
114
+ //! Whether to run the parallel version of the CSV reader. In general, the system should always auto-detect
115
+ //! when it cannot execute a parallel run before execution. However, there are (rather specific) situations where
116
+ //! setting this manually might be important
117
+ bool run_parallel = true;
119
118
  //===--------------------------------------------------------------------===//
120
119
  // WriteCSVOptions
121
120
  //===--------------------------------------------------------------------===//
@@ -139,7 +138,6 @@ struct BufferedCSVReaderOptions {
139
138
  void SetEscape(const string &escape);
140
139
  void SetQuote(const string &quote);
141
140
  void SetDelimiter(const string &delimiter);
142
- void SetParallel(bool use_parallel);
143
141
 
144
142
  void SetNewline(const string &input);
145
143
  //! Set an option that is supported by both reading and writing functions, called by
@@ -99,7 +99,7 @@ struct VerificationPositions {
99
99
  class ParallelCSVReader : public BaseCSVReader {
100
100
  public:
101
101
  ParallelCSVReader(ClientContext &context, BufferedCSVReaderOptions options, unique_ptr<CSVBufferRead> buffer,
102
- const vector<LogicalType> &requested_types);
102
+ idx_t first_pos_first_buffer, const vector<LogicalType> &requested_types);
103
103
  ~ParallelCSVReader();
104
104
 
105
105
  //! Current Position (Relative to the Buffer)
@@ -136,6 +136,8 @@ private:
136
136
  bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
137
137
  //! Sets Position depending on the byte_start of this thread
138
138
  bool SetPosition(DataChunk &insert_chunk);
139
+ //! Called when scanning the 1st buffer, skips empty lines
140
+ void SkipEmptyLines();
139
141
  //! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer
140
142
  //! Up to finding a new line. This function sets the buffer_end and marks a boolean variable
141
143
  //! when changing the buffer end the first time.
@@ -148,6 +150,8 @@ private:
148
150
  bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message, bool try_add_line = false);
149
151
  //! Position of the first read line and last read line for verification purposes
150
152
  VerificationPositions verification_positions;
153
+ //! First Position of First Buffer
154
+ idx_t first_pos_first_buffer = 0;
151
155
  };
152
156
 
153
157
  } // namespace duckdb
@@ -91,6 +91,8 @@ public:
91
91
 
92
92
  //! The name of the function
93
93
  string name;
94
+ //! Additional information to distinguish the function beyond its name
95
+ string extra_info;
94
96
 
95
97
  public:
96
98
  //! Returns the formatted string name(arg1, arg2, ...)
@@ -56,6 +56,28 @@ struct WriteCSVData : public BaseCSVData {
56
56
  idx_t flush_size = 4096 * 8;
57
57
  };
58
58
 
59
+ struct ColumnInfo {
60
+ ColumnInfo() {
61
+ }
62
+ ColumnInfo(vector<std::string> names_p, vector<LogicalType> types_p) {
63
+ names = std::move(names_p);
64
+ types = std::move(types_p);
65
+ }
66
+ void Serialize(FieldWriter &writer) {
67
+ writer.WriteList<string>(names);
68
+ writer.WriteRegularSerializableList<LogicalType>(types);
69
+ }
70
+
71
+ static ColumnInfo Deserialize(FieldReader &reader) {
72
+ ColumnInfo info;
73
+ info.names = reader.ReadRequiredList<string>();
74
+ info.types = reader.ReadRequiredSerializableList<LogicalType, LogicalType>();
75
+ return info;
76
+ }
77
+ vector<std::string> names;
78
+ vector<LogicalType> types;
79
+ };
80
+
59
81
  struct ReadCSVData : public BaseCSVData {
60
82
  //! The expected SQL types to read from the file
61
83
  vector<LogicalType> csv_types;
@@ -75,6 +97,9 @@ struct ReadCSVData : public BaseCSVData {
75
97
  bool single_threaded = false;
76
98
  //! Reader bind data
77
99
  MultiFileReaderBindData reader_bind;
100
+ //! Whether all files are on-disk files (e.g., not a pipe)
101
+ bool file_exists = true;
102
+ vector<ColumnInfo> column_info;
78
103
 
79
104
  void Initialize(unique_ptr<BufferedCSVReader> &reader) {
80
105
  this->initial_reader = std::move(reader);
@@ -58,6 +58,9 @@ struct ClientData {
58
58
  //! The file search path
59
59
  string file_search_path;
60
60
 
61
+ //! The maximum line length of the last query executed on a CSV file (only used for testing)
62
+ idx_t max_line_length = 0;
63
+
61
64
  public:
62
65
  DUCKDB_API static ClientData &Get(ClientContext &context);
63
66
  };
@@ -143,8 +143,6 @@ struct DBConfigOptions {
143
143
  bool allow_unsigned_extensions = false;
144
144
  //! Enable emitting FSST Vectors
145
145
  bool enable_fsst_vectors = false;
146
- //! Experimental parallel CSV reader
147
- bool experimental_parallel_csv_reader = false;
148
146
  //! Start transactions immediately in all attached databases - instead of lazily when a database is referenced
149
147
  bool immediate_transaction_mode = false;
150
148
  //! The set of unrecognized (other) options
@@ -512,16 +512,15 @@ Value EnableProgressBarPrintSetting::GetSetting(ClientContext &context) {
512
512
  // Experimental Parallel CSV
513
513
  //===--------------------------------------------------------------------===//
514
514
  void ExperimentalParallelCSVSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) {
515
- config.options.experimental_parallel_csv_reader = input.GetValue<bool>();
515
+ Printer::Print("experimental_parallel_csv is deprecated and will be removed with the next release - the parallel "
516
+ "CSV reader is now standard and does not need to be manually enabled anymore 1");
516
517
  }
517
518
 
518
519
  void ExperimentalParallelCSVSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) {
519
- config.options.experimental_parallel_csv_reader = DBConfig().options.experimental_parallel_csv_reader;
520
520
  }
521
521
 
522
522
  Value ExperimentalParallelCSVSetting::GetSetting(ClientContext &context) {
523
- auto &config = DBConfig::GetConfig(context);
524
- return Value::BIGINT(config.options.experimental_parallel_csv_reader);
523
+ return Value();
525
524
  }
526
525
 
527
526
  //===--------------------------------------------------------------------===//
@@ -21,8 +21,21 @@ static LogicalType ResolveInType(OperatorExpression &op, vector<BoundExpression
21
21
  }
22
22
  // get the maximum type from the children
23
23
  LogicalType max_type = children[0]->expr->return_type;
24
+ bool any_varchar = children[0]->expr->return_type == LogicalType::VARCHAR;
25
+ bool any_enum = children[0]->expr->return_type.id() == LogicalTypeId::ENUM;
24
26
  for (idx_t i = 1; i < children.size(); i++) {
25
27
  max_type = LogicalType::MaxLogicalType(max_type, children[i]->expr->return_type);
28
+ if (children[i]->expr->return_type == LogicalType::VARCHAR) {
29
+ any_varchar = true;
30
+ }
31
+ if (children[i]->expr->return_type.id() == LogicalTypeId::ENUM) {
32
+ any_enum = true;
33
+ }
34
+ }
35
+ if (any_varchar && any_enum) {
36
+ // For the coalesce function, we must be sure we always upcast the parameters to VARCHAR if there is at least
37
+ // one enum and one varchar
38
+ max_type = LogicalType::VARCHAR;
26
39
  }
27
40
 
28
41
  // cast all children to the same type
@@ -19,6 +19,7 @@
19
19
  #include "duckdb/function/function_binder.hpp"
20
20
  #include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
21
21
  #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
22
+ #include "duckdb/function/table/read_csv.hpp"
22
23
 
23
24
  namespace duckdb {
24
25
 
@@ -143,6 +144,14 @@ Binder::BindTableFunctionInternal(TableFunction &table_function, const string &f
143
144
  auto arrow_bind = (PyTableFunctionData *)bind_data.get();
144
145
  arrow_bind->external_dependency = std::move(external_dependency);
145
146
  }
147
+ if (table_function.name == "read_csv" || table_function.name == "read_csv_auto") {
148
+ auto &csv_bind = bind_data->Cast<ReadCSVData>();
149
+ if (csv_bind.single_threaded) {
150
+ table_function.extra_info = "(Single-Threaded)";
151
+ } else {
152
+ table_function.extra_info = "(Multi-Threaded)";
153
+ }
154
+ }
146
155
  }
147
156
  if (return_types.size() != return_names.size()) {
148
157
  throw InternalException(