duckdb 0.5.2-dev2006.0 → 0.5.2-dev2076.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
11
11
  #pragma once
12
12
  #define DUCKDB_AMALGAMATION 1
13
13
  #define DUCKDB_AMALGAMATION_EXTENDED 1
14
- #define DUCKDB_SOURCE_ID "2fc0fa3788"
15
- #define DUCKDB_VERSION "v0.5.2-dev2006"
14
+ #define DUCKDB_SOURCE_ID "80861d6b6f"
15
+ #define DUCKDB_VERSION "v0.5.2-dev2076"
16
16
  //===----------------------------------------------------------------------===//
17
17
  // DuckDB
18
18
  //
@@ -6293,6 +6293,9 @@ public:
6293
6293
  DUCKDB_API static bool CharacterIsNewline(char c) {
6294
6294
  return c == '\n' || c == '\r';
6295
6295
  }
6296
+ DUCKDB_API static bool CharacterIsNullTerminator(char c) {
6297
+ return c == '\0';
6298
+ }
6296
6299
  DUCKDB_API static bool CharacterIsDigit(char c) {
6297
6300
  return c >= '0' && c <= '9';
6298
6301
  }
@@ -16607,6 +16610,8 @@ struct DBConfigOptions {
16607
16610
  bool allow_unsigned_extensions = false;
16608
16611
  //! Enable emitting FSST Vectors
16609
16612
  bool enable_fsst_vectors = false;
16613
+ //! Experimental parallel CSV reader
16614
+ bool experimental_parallel_csv_reader = false;
16610
16615
 
16611
16616
  bool operator==(const DBConfigOptions &other) const;
16612
16617
  };
@@ -27400,7 +27405,17 @@ public:
27400
27405
  //===----------------------------------------------------------------------===//
27401
27406
  // DuckDB
27402
27407
  //
27403
- // duckdb/execution/operator/persistent/buffered_csv_reader.hpp
27408
+ // duckdb/execution/operator/persistent/base_csv_reader.hpp
27409
+ //
27410
+ //
27411
+ //===----------------------------------------------------------------------===//
27412
+
27413
+
27414
+
27415
+ //===----------------------------------------------------------------------===//
27416
+ // DuckDB
27417
+ //
27418
+ // duckdb/execution/operator/persistent/base_csv_reader.hpp
27404
27419
  //
27405
27420
  //
27406
27421
  //===----------------------------------------------------------------------===//
@@ -27728,44 +27743,242 @@ namespace duckdb {
27728
27743
  using std::queue;
27729
27744
  }
27730
27745
 
27746
+ //===----------------------------------------------------------------------===//
27747
+ // DuckDB
27748
+ //
27749
+ // duckdb/execution/operator/persistent/csv_reader_options.hpp
27750
+ //
27751
+ //
27752
+ //===----------------------------------------------------------------------===//
27753
+
27754
+
27755
+
27756
+ //===----------------------------------------------------------------------===//
27757
+ // DuckDB
27758
+ //
27759
+ // duckdb/execution/operator/persistent/csv_buffer.hpp
27760
+ //
27761
+ //
27762
+ //===----------------------------------------------------------------------===//
27763
+
27764
+
27765
+
27766
+
27767
+ //===----------------------------------------------------------------------===//
27768
+ // DuckDB
27769
+ //
27770
+ // duckdb/execution/operator/persistent/csv_file_handle.hpp
27771
+ //
27772
+ //
27773
+ //===----------------------------------------------------------------------===//
27774
+
27775
+
27776
+
27777
+
27731
27778
 
27732
- #include <sstream>
27733
27779
 
27734
27780
  namespace duckdb {
27735
- struct CopyInfo;
27736
- struct CSVFileHandle;
27737
- struct FileHandle;
27738
- struct StrpTimeFormat;
27739
27781
 
27740
- class FileOpener;
27741
- class FileSystem;
27782
+ struct CSVFileHandle {
27783
+ public:
27784
+ explicit CSVFileHandle(unique_ptr<FileHandle> file_handle_p) : file_handle(move(file_handle_p)) {
27785
+ can_seek = file_handle->CanSeek();
27786
+ plain_file_source = file_handle->OnDiskFile() && can_seek;
27787
+ file_size = file_handle->GetFileSize();
27788
+ }
27742
27789
 
27743
- //! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
27744
- //! position given that we encounter a byte with the given value.
27745
- /*! For example, if we have a string "ABAC", the shifts array will have the following values:
27746
- * [0] --> ['A'] = 1, all others = 0
27747
- * [1] --> ['B'] = 2, ['A'] = 1, all others = 0
27748
- * [2] --> ['A'] = 3, all others = 0
27749
- * [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0
27750
- * Suppose we then search in the following string "ABABAC", our progression will be as follows:
27751
- * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
27752
- */
27753
- struct TextSearchShiftArray {
27754
- TextSearchShiftArray();
27755
- explicit TextSearchShiftArray(string search_term);
27790
+ bool CanSeek() {
27791
+ return can_seek;
27792
+ }
27793
+ void Seek(idx_t position) {
27794
+ if (!can_seek) {
27795
+ throw InternalException("Cannot seek in this file");
27796
+ }
27797
+ file_handle->Seek(position);
27798
+ }
27799
+ idx_t SeekPosition() {
27800
+ if (!can_seek) {
27801
+ throw InternalException("Cannot seek in this file");
27802
+ }
27803
+ return file_handle->SeekPosition();
27804
+ }
27805
+ void Reset() {
27806
+ if (plain_file_source) {
27807
+ file_handle->Reset();
27808
+ } else {
27809
+ if (!reset_enabled) {
27810
+ throw InternalException("Reset called but reset is not enabled for this CSV Handle");
27811
+ }
27812
+ read_position = 0;
27813
+ }
27814
+ }
27815
+ bool PlainFileSource() {
27816
+ return plain_file_source;
27817
+ }
27756
27818
 
27757
- inline bool Match(uint8_t &position, uint8_t byte_value) {
27758
- if (position >= length) {
27759
- return false;
27819
+ bool OnDiskFile() {
27820
+ return file_handle->OnDiskFile();
27821
+ }
27822
+
27823
+ idx_t FileSize() {
27824
+ return file_size;
27825
+ }
27826
+
27827
+ bool FinishedReading() {
27828
+ return requested_bytes >= file_size;
27829
+ }
27830
+
27831
+ idx_t Read(void *buffer, idx_t nr_bytes) {
27832
+ requested_bytes += nr_bytes;
27833
+ if (!plain_file_source) {
27834
+ // not a plain file source: we need to do some bookkeeping around the reset functionality
27835
+ idx_t result_offset = 0;
27836
+ if (read_position < buffer_size) {
27837
+ // we need to read from our cached buffer
27838
+ auto buffer_read_count = MinValue<idx_t>(nr_bytes, buffer_size - read_position);
27839
+ memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count);
27840
+ result_offset += buffer_read_count;
27841
+ read_position += buffer_read_count;
27842
+ if (result_offset == nr_bytes) {
27843
+ return nr_bytes;
27844
+ }
27845
+ } else if (!reset_enabled && cached_buffer) {
27846
+ // reset is disabled, but we still have cached data
27847
+ // we can remove any cached data
27848
+ cached_buffer.reset();
27849
+ buffer_size = 0;
27850
+ buffer_capacity = 0;
27851
+ read_position = 0;
27852
+ }
27853
+ // we have data left to read from the file
27854
+ // read directly into the buffer
27855
+ auto bytes_read = file_handle->Read((char *)buffer + result_offset, nr_bytes - result_offset);
27856
+ read_position += bytes_read;
27857
+ if (reset_enabled) {
27858
+ // if reset caching is enabled, we need to cache the bytes that we have read
27859
+ if (buffer_size + bytes_read >= buffer_capacity) {
27860
+ // no space; first enlarge the buffer
27861
+ buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2);
27862
+
27863
+ auto new_buffer = unique_ptr<data_t[]>(new data_t[buffer_capacity]);
27864
+ if (buffer_size > 0) {
27865
+ memcpy(new_buffer.get(), cached_buffer.get(), buffer_size);
27866
+ }
27867
+ cached_buffer = move(new_buffer);
27868
+ }
27869
+ memcpy(cached_buffer.get() + buffer_size, (char *)buffer + result_offset, bytes_read);
27870
+ buffer_size += bytes_read;
27871
+ }
27872
+
27873
+ return result_offset + bytes_read;
27874
+ } else {
27875
+ return file_handle->Read(buffer, nr_bytes);
27760
27876
  }
27761
- position = shifts[position * 255 + byte_value];
27762
- return position == length;
27763
27877
  }
27764
27878
 
27765
- idx_t length;
27766
- unique_ptr<uint8_t[]> shifts;
27879
+ string ReadLine() {
27880
+ bool carriage_return = false;
27881
+ string result;
27882
+ char buffer[1];
27883
+ while (true) {
27884
+ idx_t bytes_read = Read(buffer, 1);
27885
+ if (bytes_read == 0) {
27886
+ return result;
27887
+ }
27888
+ if (carriage_return) {
27889
+ if (buffer[0] != '\n') {
27890
+ if (!file_handle->CanSeek()) {
27891
+ throw BinderException(
27892
+ "Carriage return newlines not supported when reading CSV files in which we cannot seek");
27893
+ }
27894
+ file_handle->Seek(file_handle->SeekPosition() - 1);
27895
+ return result;
27896
+ }
27897
+ }
27898
+ if (buffer[0] == '\n') {
27899
+ return result;
27900
+ }
27901
+ if (buffer[0] != '\r') {
27902
+ result += buffer[0];
27903
+ } else {
27904
+ carriage_return = true;
27905
+ }
27906
+ }
27907
+ }
27908
+
27909
+ void DisableReset() {
27910
+ this->reset_enabled = false;
27911
+ }
27912
+ mutex main_mutex;
27913
+ idx_t count = 0;
27914
+
27915
+ private:
27916
+ unique_ptr<FileHandle> file_handle;
27917
+ bool reset_enabled = true;
27918
+ bool can_seek = false;
27919
+ bool plain_file_source = false;
27920
+ idx_t file_size = 0;
27921
+ // reset support
27922
+ unique_ptr<data_t[]> cached_buffer;
27923
+ idx_t read_position = 0;
27924
+ idx_t buffer_size = 0;
27925
+ idx_t buffer_capacity = 0;
27926
+ idx_t requested_bytes = 0;
27767
27927
  };
27768
27928
 
27929
+ } // namespace duckdb
27930
+
27931
+
27932
+ namespace duckdb {
27933
+
27934
+ class CSVBuffer {
27935
+ public:
27936
+ //! Colossal buffer size for multi-threading
27937
+ static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
27938
+
27939
+ //! Constructor for Initial Buffer
27940
+ CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle);
27941
+
27942
+ //! Constructor for `Next()` Buffers
27943
+ CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
27944
+
27945
+ //! Creates a new buffer with the next part of the CSV File
27946
+ unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
27947
+
27948
+ //! Gets the buffer actual size
27949
+ idx_t GetBufferSize();
27950
+
27951
+ //! Gets the start position of the buffer, only relevant for the first time it's scanned
27952
+ idx_t GetStart();
27953
+
27954
+ //! If this buffer is the last buffer of the CSV File
27955
+ bool IsCSVFileLastBuffer();
27956
+
27957
+ //! If this buffer is the first buffer of the CSV File
27958
+ bool IsCSVFileFirstBuffer();
27959
+ //! The actual buffer
27960
+ unique_ptr<char[]> buffer;
27961
+
27962
+ private:
27963
+ //! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
27964
+ idx_t actual_size;
27965
+ //! We need to check for Byte Order Mark, to define the start position of this buffer
27966
+ //! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
27967
+ idx_t start_position = 0;
27968
+ //! If this is the last buffer of the CSV File
27969
+ bool last_buffer = false;
27970
+ //! If this is the first buffer of the CSV File
27971
+ bool first_buffer = false;
27972
+ };
27973
+ } // namespace duckdb
27974
+
27975
+
27976
+
27977
+
27978
+
27979
+
27980
+ namespace duckdb {
27981
+
27769
27982
  struct BufferedCSVReaderOptions {
27770
27983
  //===--------------------------------------------------------------------===//
27771
27984
  // CommonCSVOptions
@@ -27792,7 +28005,7 @@ struct BufferedCSVReaderOptions {
27792
28005
  //! Expected number of columns
27793
28006
  idx_t num_cols = 0;
27794
28007
  //! Number of samples to buffer
27795
- idx_t buffer_size = STANDARD_VECTOR_SIZE * 50;
28008
+ idx_t buffer_sample_size = STANDARD_VECTOR_SIZE * 50;
27796
28009
  //! Specifies the string that represents a null value
27797
28010
  string null_str;
27798
28011
  //! Whether file is compressed or not, and if so which compression type
@@ -27830,6 +28043,8 @@ struct BufferedCSVReaderOptions {
27830
28043
  bool include_parsed_hive_partitions = false;
27831
28044
  //! Whether or not to union files with different (but compatible) columns
27832
28045
  bool union_by_name = false;
28046
+ //! Buffer Size (Parallel Scan)
28047
+ idx_t buffer_size = CSVBuffer::INITIAL_BUFFER_SIZE_COLOSSAL;
27833
28048
 
27834
28049
  //===--------------------------------------------------------------------===//
27835
28050
  // WriteCSVOptions
@@ -27864,24 +28079,31 @@ struct BufferedCSVReaderOptions {
27864
28079
 
27865
28080
  std::string ToString() const;
27866
28081
  };
28082
+ } // namespace duckdb
28083
+
28084
+
28085
+ #include <sstream>
28086
+
28087
+ namespace duckdb {
28088
+ struct CopyInfo;
28089
+ struct CSVFileHandle;
28090
+ struct FileHandle;
28091
+ struct StrpTimeFormat;
28092
+
28093
+ class FileOpener;
28094
+ class FileSystem;
27867
28095
 
27868
28096
  enum class ParserMode : uint8_t { PARSING = 0, SNIFFING_DIALECT = 1, SNIFFING_DATATYPES = 2, PARSING_HEADER = 3 };
27869
28097
 
27870
28098
  //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
27871
- class BufferedCSVReader {
27872
- //! Initial buffer read size; can be extended for long lines
27873
- static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
27874
- //! Larger buffer size for non disk files
27875
- static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
27876
- ParserMode mode;
27877
-
28099
+ class BaseCSVReader {
27878
28100
  public:
27879
- BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
27880
- const vector<LogicalType> &requested_types = vector<LogicalType>());
28101
+ BaseCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
28102
+ const vector<LogicalType> &requested_types = vector<LogicalType>());
27881
28103
 
27882
- BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
27883
- const vector<LogicalType> &requested_types = vector<LogicalType>());
27884
- ~BufferedCSVReader();
28104
+ BaseCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
28105
+ const vector<LogicalType> &requested_types = vector<LogicalType>());
28106
+ ~BaseCSVReader();
27885
28107
 
27886
28108
  FileSystem &fs;
27887
28109
  Allocator &allocator;
@@ -27895,17 +28117,9 @@ public:
27895
28117
  vector<idx_t> insert_cols_idx;
27896
28118
  vector<idx_t> insert_nulls_idx;
27897
28119
 
27898
- unique_ptr<CSVFileHandle> file_handle;
27899
-
27900
- unique_ptr<char[]> buffer;
27901
- idx_t buffer_size;
27902
- idx_t position;
27903
- idx_t start = 0;
27904
-
27905
28120
  idx_t linenr = 0;
27906
28121
  bool linenr_estimated = false;
27907
28122
 
27908
- vector<idx_t> sniffed_column_counts;
27909
28123
  bool row_empty = false;
27910
28124
  idx_t sample_chunk_idx = 0;
27911
28125
  bool jumping_samples = false;
@@ -27915,72 +28129,145 @@ public:
27915
28129
  idx_t bytes_in_chunk = 0;
27916
28130
  double bytes_per_line_avg = 0;
27917
28131
 
27918
- vector<unique_ptr<char[]>> cached_buffers;
27919
-
27920
- TextSearchShiftArray delimiter_search, escape_search, quote_search;
27921
-
27922
28132
  DataChunk parse_chunk;
27923
28133
 
27924
28134
  std::queue<unique_ptr<DataChunk>> cached_chunks;
27925
28135
 
27926
- public:
27927
- //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
27928
- void ParseCSV(DataChunk &insert_chunk);
27929
-
27930
- idx_t GetFileSize();
28136
+ ParserMode mode;
27931
28137
 
28138
+ public:
27932
28139
  //! Fill nulls into the cols that mismatch union names
27933
28140
  void SetNullUnionCols(DataChunk &insert_chunk);
27934
28141
 
27935
- private:
27936
- //! Initialize Parser
27937
- void Initialize(const vector<LogicalType> &requested_types);
28142
+ protected:
27938
28143
  //! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
27939
28144
  void InitParseChunk(idx_t num_cols);
27940
28145
  //! Initializes the insert_chunk idx for mapping parse_chunk cols to insert_chunk cols
27941
28146
  void InitInsertChunkIdx(idx_t num_cols);
27942
- //! Initializes the TextSearchShiftArrays for complex parser
27943
- void PrepareComplexParser();
27944
- //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
27945
- void ParseCSV(ParserMode mode);
27946
- //! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
27947
- bool TryParseCSV(ParserMode mode);
27948
- //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
27949
- bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
27950
- //! Sniffs CSV dialect and determines skip rows, header row, column types and column names
27951
- vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
27952
28147
  //! Change the date format for the type to the string
27953
28148
  void SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type);
27954
28149
  //! Try to cast a string value to the specified sql type
27955
28150
  bool TryCastValue(const Value &value, const LogicalType &sql_type);
27956
28151
  //! Try to cast a vector of values to the specified sql type
27957
28152
  bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
28153
+
28154
+ //! Adds a value to the current row
28155
+ void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
28156
+ //! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
28157
+ bool AddRow(DataChunk &insert_chunk, idx_t &column);
28158
+ //! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
28159
+ bool Flush(DataChunk &insert_chunk, bool try_add_line = false);
28160
+
28161
+ unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
28162
+
28163
+ void VerifyUTF8(idx_t col_idx);
28164
+ void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
28165
+ static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
28166
+
28167
+ protected:
28168
+ //! Whether or not the current row's columns have overflown sql_types.size()
28169
+ bool error_column_overflow = false;
28170
+ //! Number of sniffed columns - only used when auto-detecting
28171
+ vector<idx_t> sniffed_column_counts;
28172
+ };
28173
+
28174
+ } // namespace duckdb
28175
+
28176
+
28177
+ namespace duckdb {
28178
+ struct CopyInfo;
28179
+ struct CSVFileHandle;
28180
+ struct FileHandle;
28181
+ struct StrpTimeFormat;
28182
+
28183
+ class FileOpener;
28184
+ class FileSystem;
28185
+
28186
+ //! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
28187
+ //! position given that we encounter a byte with the given value.
28188
+ /*! For example, if we have a string "ABAC", the shifts array will have the following values:
28189
+ * [0] --> ['A'] = 1, all others = 0
28190
+ * [1] --> ['B'] = 2, ['A'] = 1, all others = 0
28191
+ * [2] --> ['A'] = 3, all others = 0
28192
+ * [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0
28193
+ * Suppose we then search in the following string "ABABAC", our progression will be as follows:
28194
+ * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
28195
+ */
28196
+ struct TextSearchShiftArray {
28197
+ TextSearchShiftArray();
28198
+ explicit TextSearchShiftArray(string search_term);
28199
+
28200
+ inline bool Match(uint8_t &position, uint8_t byte_value) {
28201
+ if (position >= length) {
28202
+ return false;
28203
+ }
28204
+ position = shifts[position * 255 + byte_value];
28205
+ return position == length;
28206
+ }
28207
+
28208
+ idx_t length;
28209
+ unique_ptr<uint8_t[]> shifts;
28210
+ };
28211
+
28212
+ //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
28213
+ class BufferedCSVReader : public BaseCSVReader {
28214
+ //! Initial buffer read size; can be extended for long lines
28215
+ static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
28216
+ //! Larger buffer size for non disk files
28217
+ static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
28218
+
28219
+ public:
28220
+ BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
28221
+ const vector<LogicalType> &requested_types = vector<LogicalType>());
28222
+ BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
28223
+ const vector<LogicalType> &requested_types = vector<LogicalType>());
28224
+ ~BufferedCSVReader();
28225
+
28226
+ unique_ptr<char[]> buffer;
28227
+ idx_t buffer_size;
28228
+ idx_t position;
28229
+ idx_t start = 0;
28230
+
28231
+ vector<unique_ptr<char[]>> cached_buffers;
28232
+
28233
+ unique_ptr<CSVFileHandle> file_handle;
28234
+
28235
+ TextSearchShiftArray delimiter_search, escape_search, quote_search;
28236
+
28237
+ public:
28238
+ //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
28239
+ void ParseCSV(DataChunk &insert_chunk);
28240
+
28241
+ private:
28242
+ //! Initialize Parser
28243
+ void Initialize(const vector<LogicalType> &requested_types);
27958
28244
  //! Skips skip_rows, reads header row from input stream
27959
28245
  void SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header);
27960
28246
  //! Jumps back to the beginning of input stream and resets necessary internal states
27961
28247
  void JumpToBeginning(idx_t skip_rows, bool skip_header);
27962
- //! Jumps back to the beginning of input stream and resets necessary internal states
27963
- bool JumpToNextSample();
27964
28248
  //! Resets the buffer
27965
28249
  void ResetBuffer();
27966
28250
  //! Resets the stream
27967
28251
  void ResetStream();
28252
+ //! Reads a new buffer from the CSV file if the current one has been exhausted
28253
+ bool ReadBuffer(idx_t &start);
28254
+ //! Jumps back to the beginning of input stream and resets necessary internal states
28255
+ bool JumpToNextSample();
28256
+ //! Initializes the TextSearchShiftArrays for complex parser
28257
+ void PrepareComplexParser();
28258
+ //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
28259
+ void ParseCSV(ParserMode mode);
28260
+ //! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
28261
+ bool TryParseCSV(ParserMode mode);
28262
+ //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
28263
+ bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
27968
28264
 
27969
28265
  //! Parses a CSV file with a one-byte delimiter, escape and quote character
27970
28266
  bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message);
27971
28267
  //! Parses more complex CSV files with multi-byte delimiters, escapes or quotes
27972
28268
  bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message);
27973
-
27974
- //! Adds a value to the current row
27975
- void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
27976
- //! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
27977
- bool AddRow(DataChunk &insert_chunk, idx_t &column);
27978
- //! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
27979
- void Flush(DataChunk &insert_chunk);
27980
- //! Reads a new buffer from the CSV file if the current one has been exhausted
27981
- bool ReadBuffer(idx_t &start);
27982
-
27983
- unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
28269
+ //! Sniffs CSV dialect and determines skip rows, header row, column types and column names
28270
+ vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
27984
28271
 
27985
28272
  //! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
27986
28273
  void DetectDialect(const vector<LogicalType> &requested_types, BufferedCSVReaderOptions &original_options,
@@ -28000,13 +28287,6 @@ private:
28000
28287
  const vector<LogicalType> &requested_types,
28001
28288
  vector<vector<LogicalType>> &best_sql_types_candidates,
28002
28289
  map<LogicalTypeId, vector<string>> &best_format_candidates);
28003
-
28004
- void VerifyUTF8(idx_t col_idx);
28005
- void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
28006
-
28007
- private:
28008
- //! Whether or not the current row's columns have overflown sql_types.size()
28009
- bool error_column_overflow = false;
28010
28290
  };
28011
28291
 
28012
28292
  } // namespace duckdb