duckdb 0.5.2-dev2006.0 → 0.5.2-dev2076.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +1649 -786
- package/src/duckdb.hpp +373 -93
- package/src/parquet-amalgamation.cpp +37721 -37721
package/src/duckdb.hpp
CHANGED
|
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
|
|
|
11
11
|
#pragma once
|
|
12
12
|
#define DUCKDB_AMALGAMATION 1
|
|
13
13
|
#define DUCKDB_AMALGAMATION_EXTENDED 1
|
|
14
|
-
#define DUCKDB_SOURCE_ID "
|
|
15
|
-
#define DUCKDB_VERSION "v0.5.2-
|
|
14
|
+
#define DUCKDB_SOURCE_ID "80861d6b6f"
|
|
15
|
+
#define DUCKDB_VERSION "v0.5.2-dev2076"
|
|
16
16
|
//===----------------------------------------------------------------------===//
|
|
17
17
|
// DuckDB
|
|
18
18
|
//
|
|
@@ -6293,6 +6293,9 @@ public:
|
|
|
6293
6293
|
DUCKDB_API static bool CharacterIsNewline(char c) {
|
|
6294
6294
|
return c == '\n' || c == '\r';
|
|
6295
6295
|
}
|
|
6296
|
+
DUCKDB_API static bool CharacterIsNullTerminator(char c) {
|
|
6297
|
+
return c == '\0';
|
|
6298
|
+
}
|
|
6296
6299
|
DUCKDB_API static bool CharacterIsDigit(char c) {
|
|
6297
6300
|
return c >= '0' && c <= '9';
|
|
6298
6301
|
}
|
|
@@ -16607,6 +16610,8 @@ struct DBConfigOptions {
|
|
|
16607
16610
|
bool allow_unsigned_extensions = false;
|
|
16608
16611
|
//! Enable emitting FSST Vectors
|
|
16609
16612
|
bool enable_fsst_vectors = false;
|
|
16613
|
+
//! Experimental parallel CSV reader
|
|
16614
|
+
bool experimental_parallel_csv_reader = false;
|
|
16610
16615
|
|
|
16611
16616
|
bool operator==(const DBConfigOptions &other) const;
|
|
16612
16617
|
};
|
|
@@ -27400,7 +27405,17 @@ public:
|
|
|
27400
27405
|
//===----------------------------------------------------------------------===//
|
|
27401
27406
|
// DuckDB
|
|
27402
27407
|
//
|
|
27403
|
-
// duckdb/execution/operator/persistent/
|
|
27408
|
+
// duckdb/execution/operator/persistent/base_csv_reader.hpp
|
|
27409
|
+
//
|
|
27410
|
+
//
|
|
27411
|
+
//===----------------------------------------------------------------------===//
|
|
27412
|
+
|
|
27413
|
+
|
|
27414
|
+
|
|
27415
|
+
//===----------------------------------------------------------------------===//
|
|
27416
|
+
// DuckDB
|
|
27417
|
+
//
|
|
27418
|
+
// duckdb/execution/operator/persistent/base_csv_reader.hpp
|
|
27404
27419
|
//
|
|
27405
27420
|
//
|
|
27406
27421
|
//===----------------------------------------------------------------------===//
|
|
@@ -27728,44 +27743,242 @@ namespace duckdb {
|
|
|
27728
27743
|
using std::queue;
|
|
27729
27744
|
}
|
|
27730
27745
|
|
|
27746
|
+
//===----------------------------------------------------------------------===//
|
|
27747
|
+
// DuckDB
|
|
27748
|
+
//
|
|
27749
|
+
// duckdb/execution/operator/persistent/csv_reader_options.hpp
|
|
27750
|
+
//
|
|
27751
|
+
//
|
|
27752
|
+
//===----------------------------------------------------------------------===//
|
|
27753
|
+
|
|
27754
|
+
|
|
27755
|
+
|
|
27756
|
+
//===----------------------------------------------------------------------===//
|
|
27757
|
+
// DuckDB
|
|
27758
|
+
//
|
|
27759
|
+
// duckdb/execution/operator/persistent/csv_buffer.hpp
|
|
27760
|
+
//
|
|
27761
|
+
//
|
|
27762
|
+
//===----------------------------------------------------------------------===//
|
|
27763
|
+
|
|
27764
|
+
|
|
27765
|
+
|
|
27766
|
+
|
|
27767
|
+
//===----------------------------------------------------------------------===//
|
|
27768
|
+
// DuckDB
|
|
27769
|
+
//
|
|
27770
|
+
// duckdb/execution/operator/persistent/csv_file_handle.hpp
|
|
27771
|
+
//
|
|
27772
|
+
//
|
|
27773
|
+
//===----------------------------------------------------------------------===//
|
|
27774
|
+
|
|
27775
|
+
|
|
27776
|
+
|
|
27777
|
+
|
|
27731
27778
|
|
|
27732
|
-
#include <sstream>
|
|
27733
27779
|
|
|
27734
27780
|
namespace duckdb {
|
|
27735
|
-
struct CopyInfo;
|
|
27736
|
-
struct CSVFileHandle;
|
|
27737
|
-
struct FileHandle;
|
|
27738
|
-
struct StrpTimeFormat;
|
|
27739
27781
|
|
|
27740
|
-
|
|
27741
|
-
|
|
27782
|
+
struct CSVFileHandle {
|
|
27783
|
+
public:
|
|
27784
|
+
explicit CSVFileHandle(unique_ptr<FileHandle> file_handle_p) : file_handle(move(file_handle_p)) {
|
|
27785
|
+
can_seek = file_handle->CanSeek();
|
|
27786
|
+
plain_file_source = file_handle->OnDiskFile() && can_seek;
|
|
27787
|
+
file_size = file_handle->GetFileSize();
|
|
27788
|
+
}
|
|
27742
27789
|
|
|
27743
|
-
|
|
27744
|
-
|
|
27745
|
-
|
|
27746
|
-
|
|
27747
|
-
|
|
27748
|
-
|
|
27749
|
-
|
|
27750
|
-
|
|
27751
|
-
|
|
27752
|
-
|
|
27753
|
-
|
|
27754
|
-
|
|
27755
|
-
|
|
27790
|
+
bool CanSeek() {
|
|
27791
|
+
return can_seek;
|
|
27792
|
+
}
|
|
27793
|
+
void Seek(idx_t position) {
|
|
27794
|
+
if (!can_seek) {
|
|
27795
|
+
throw InternalException("Cannot seek in this file");
|
|
27796
|
+
}
|
|
27797
|
+
file_handle->Seek(position);
|
|
27798
|
+
}
|
|
27799
|
+
idx_t SeekPosition() {
|
|
27800
|
+
if (!can_seek) {
|
|
27801
|
+
throw InternalException("Cannot seek in this file");
|
|
27802
|
+
}
|
|
27803
|
+
return file_handle->SeekPosition();
|
|
27804
|
+
}
|
|
27805
|
+
void Reset() {
|
|
27806
|
+
if (plain_file_source) {
|
|
27807
|
+
file_handle->Reset();
|
|
27808
|
+
} else {
|
|
27809
|
+
if (!reset_enabled) {
|
|
27810
|
+
throw InternalException("Reset called but reset is not enabled for this CSV Handle");
|
|
27811
|
+
}
|
|
27812
|
+
read_position = 0;
|
|
27813
|
+
}
|
|
27814
|
+
}
|
|
27815
|
+
bool PlainFileSource() {
|
|
27816
|
+
return plain_file_source;
|
|
27817
|
+
}
|
|
27756
27818
|
|
|
27757
|
-
|
|
27758
|
-
|
|
27759
|
-
|
|
27819
|
+
bool OnDiskFile() {
|
|
27820
|
+
return file_handle->OnDiskFile();
|
|
27821
|
+
}
|
|
27822
|
+
|
|
27823
|
+
idx_t FileSize() {
|
|
27824
|
+
return file_size;
|
|
27825
|
+
}
|
|
27826
|
+
|
|
27827
|
+
bool FinishedReading() {
|
|
27828
|
+
return requested_bytes >= file_size;
|
|
27829
|
+
}
|
|
27830
|
+
|
|
27831
|
+
idx_t Read(void *buffer, idx_t nr_bytes) {
|
|
27832
|
+
requested_bytes += nr_bytes;
|
|
27833
|
+
if (!plain_file_source) {
|
|
27834
|
+
// not a plain file source: we need to do some bookkeeping around the reset functionality
|
|
27835
|
+
idx_t result_offset = 0;
|
|
27836
|
+
if (read_position < buffer_size) {
|
|
27837
|
+
// we need to read from our cached buffer
|
|
27838
|
+
auto buffer_read_count = MinValue<idx_t>(nr_bytes, buffer_size - read_position);
|
|
27839
|
+
memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count);
|
|
27840
|
+
result_offset += buffer_read_count;
|
|
27841
|
+
read_position += buffer_read_count;
|
|
27842
|
+
if (result_offset == nr_bytes) {
|
|
27843
|
+
return nr_bytes;
|
|
27844
|
+
}
|
|
27845
|
+
} else if (!reset_enabled && cached_buffer) {
|
|
27846
|
+
// reset is disabled, but we still have cached data
|
|
27847
|
+
// we can remove any cached data
|
|
27848
|
+
cached_buffer.reset();
|
|
27849
|
+
buffer_size = 0;
|
|
27850
|
+
buffer_capacity = 0;
|
|
27851
|
+
read_position = 0;
|
|
27852
|
+
}
|
|
27853
|
+
// we have data left to read from the file
|
|
27854
|
+
// read directly into the buffer
|
|
27855
|
+
auto bytes_read = file_handle->Read((char *)buffer + result_offset, nr_bytes - result_offset);
|
|
27856
|
+
read_position += bytes_read;
|
|
27857
|
+
if (reset_enabled) {
|
|
27858
|
+
// if reset caching is enabled, we need to cache the bytes that we have read
|
|
27859
|
+
if (buffer_size + bytes_read >= buffer_capacity) {
|
|
27860
|
+
// no space; first enlarge the buffer
|
|
27861
|
+
buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2);
|
|
27862
|
+
|
|
27863
|
+
auto new_buffer = unique_ptr<data_t[]>(new data_t[buffer_capacity]);
|
|
27864
|
+
if (buffer_size > 0) {
|
|
27865
|
+
memcpy(new_buffer.get(), cached_buffer.get(), buffer_size);
|
|
27866
|
+
}
|
|
27867
|
+
cached_buffer = move(new_buffer);
|
|
27868
|
+
}
|
|
27869
|
+
memcpy(cached_buffer.get() + buffer_size, (char *)buffer + result_offset, bytes_read);
|
|
27870
|
+
buffer_size += bytes_read;
|
|
27871
|
+
}
|
|
27872
|
+
|
|
27873
|
+
return result_offset + bytes_read;
|
|
27874
|
+
} else {
|
|
27875
|
+
return file_handle->Read(buffer, nr_bytes);
|
|
27760
27876
|
}
|
|
27761
|
-
position = shifts[position * 255 + byte_value];
|
|
27762
|
-
return position == length;
|
|
27763
27877
|
}
|
|
27764
27878
|
|
|
27765
|
-
|
|
27766
|
-
|
|
27879
|
+
string ReadLine() {
|
|
27880
|
+
bool carriage_return = false;
|
|
27881
|
+
string result;
|
|
27882
|
+
char buffer[1];
|
|
27883
|
+
while (true) {
|
|
27884
|
+
idx_t bytes_read = Read(buffer, 1);
|
|
27885
|
+
if (bytes_read == 0) {
|
|
27886
|
+
return result;
|
|
27887
|
+
}
|
|
27888
|
+
if (carriage_return) {
|
|
27889
|
+
if (buffer[0] != '\n') {
|
|
27890
|
+
if (!file_handle->CanSeek()) {
|
|
27891
|
+
throw BinderException(
|
|
27892
|
+
"Carriage return newlines not supported when reading CSV files in which we cannot seek");
|
|
27893
|
+
}
|
|
27894
|
+
file_handle->Seek(file_handle->SeekPosition() - 1);
|
|
27895
|
+
return result;
|
|
27896
|
+
}
|
|
27897
|
+
}
|
|
27898
|
+
if (buffer[0] == '\n') {
|
|
27899
|
+
return result;
|
|
27900
|
+
}
|
|
27901
|
+
if (buffer[0] != '\r') {
|
|
27902
|
+
result += buffer[0];
|
|
27903
|
+
} else {
|
|
27904
|
+
carriage_return = true;
|
|
27905
|
+
}
|
|
27906
|
+
}
|
|
27907
|
+
}
|
|
27908
|
+
|
|
27909
|
+
void DisableReset() {
|
|
27910
|
+
this->reset_enabled = false;
|
|
27911
|
+
}
|
|
27912
|
+
mutex main_mutex;
|
|
27913
|
+
idx_t count = 0;
|
|
27914
|
+
|
|
27915
|
+
private:
|
|
27916
|
+
unique_ptr<FileHandle> file_handle;
|
|
27917
|
+
bool reset_enabled = true;
|
|
27918
|
+
bool can_seek = false;
|
|
27919
|
+
bool plain_file_source = false;
|
|
27920
|
+
idx_t file_size = 0;
|
|
27921
|
+
// reset support
|
|
27922
|
+
unique_ptr<data_t[]> cached_buffer;
|
|
27923
|
+
idx_t read_position = 0;
|
|
27924
|
+
idx_t buffer_size = 0;
|
|
27925
|
+
idx_t buffer_capacity = 0;
|
|
27926
|
+
idx_t requested_bytes = 0;
|
|
27767
27927
|
};
|
|
27768
27928
|
|
|
27929
|
+
} // namespace duckdb
|
|
27930
|
+
|
|
27931
|
+
|
|
27932
|
+
namespace duckdb {
|
|
27933
|
+
|
|
27934
|
+
class CSVBuffer {
|
|
27935
|
+
public:
|
|
27936
|
+
//! Colossal buffer size for multi-threading
|
|
27937
|
+
static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
|
|
27938
|
+
|
|
27939
|
+
//! Constructor for Initial Buffer
|
|
27940
|
+
CSVBuffer(idx_t buffer_size_p, CSVFileHandle &file_handle);
|
|
27941
|
+
|
|
27942
|
+
//! Constructor for `Next()` Buffers
|
|
27943
|
+
CSVBuffer(unique_ptr<char[]> buffer_p, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer);
|
|
27944
|
+
|
|
27945
|
+
//! Creates a new buffer with the next part of the CSV File
|
|
27946
|
+
unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t set_buffer_size);
|
|
27947
|
+
|
|
27948
|
+
//! Gets the buffer actual size
|
|
27949
|
+
idx_t GetBufferSize();
|
|
27950
|
+
|
|
27951
|
+
//! Gets the start position of the buffer, only relevant for the first time it's scanned
|
|
27952
|
+
idx_t GetStart();
|
|
27953
|
+
|
|
27954
|
+
//! If this buffer is the last buffer of the CSV File
|
|
27955
|
+
bool IsCSVFileLastBuffer();
|
|
27956
|
+
|
|
27957
|
+
//! If this buffer is the first buffer of the CSV File
|
|
27958
|
+
bool IsCSVFileFirstBuffer();
|
|
27959
|
+
//! The actual buffer
|
|
27960
|
+
unique_ptr<char[]> buffer;
|
|
27961
|
+
|
|
27962
|
+
private:
|
|
27963
|
+
//! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
|
|
27964
|
+
idx_t actual_size;
|
|
27965
|
+
//! We need to check for Byte Order Mark, to define the start position of this buffer
|
|
27966
|
+
//! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
|
|
27967
|
+
idx_t start_position = 0;
|
|
27968
|
+
//! If this is the last buffer of the CSV File
|
|
27969
|
+
bool last_buffer = false;
|
|
27970
|
+
//! If this is the first buffer of the CSV File
|
|
27971
|
+
bool first_buffer = false;
|
|
27972
|
+
};
|
|
27973
|
+
} // namespace duckdb
|
|
27974
|
+
|
|
27975
|
+
|
|
27976
|
+
|
|
27977
|
+
|
|
27978
|
+
|
|
27979
|
+
|
|
27980
|
+
namespace duckdb {
|
|
27981
|
+
|
|
27769
27982
|
struct BufferedCSVReaderOptions {
|
|
27770
27983
|
//===--------------------------------------------------------------------===//
|
|
27771
27984
|
// CommonCSVOptions
|
|
@@ -27792,7 +28005,7 @@ struct BufferedCSVReaderOptions {
|
|
|
27792
28005
|
//! Expected number of columns
|
|
27793
28006
|
idx_t num_cols = 0;
|
|
27794
28007
|
//! Number of samples to buffer
|
|
27795
|
-
idx_t
|
|
28008
|
+
idx_t buffer_sample_size = STANDARD_VECTOR_SIZE * 50;
|
|
27796
28009
|
//! Specifies the string that represents a null value
|
|
27797
28010
|
string null_str;
|
|
27798
28011
|
//! Whether file is compressed or not, and if so which compression type
|
|
@@ -27830,6 +28043,8 @@ struct BufferedCSVReaderOptions {
|
|
|
27830
28043
|
bool include_parsed_hive_partitions = false;
|
|
27831
28044
|
//! Whether or not to union files with different (but compatible) columns
|
|
27832
28045
|
bool union_by_name = false;
|
|
28046
|
+
//! Buffer Size (Parallel Scan)
|
|
28047
|
+
idx_t buffer_size = CSVBuffer::INITIAL_BUFFER_SIZE_COLOSSAL;
|
|
27833
28048
|
|
|
27834
28049
|
//===--------------------------------------------------------------------===//
|
|
27835
28050
|
// WriteCSVOptions
|
|
@@ -27864,24 +28079,31 @@ struct BufferedCSVReaderOptions {
|
|
|
27864
28079
|
|
|
27865
28080
|
std::string ToString() const;
|
|
27866
28081
|
};
|
|
28082
|
+
} // namespace duckdb
|
|
28083
|
+
|
|
28084
|
+
|
|
28085
|
+
#include <sstream>
|
|
28086
|
+
|
|
28087
|
+
namespace duckdb {
|
|
28088
|
+
struct CopyInfo;
|
|
28089
|
+
struct CSVFileHandle;
|
|
28090
|
+
struct FileHandle;
|
|
28091
|
+
struct StrpTimeFormat;
|
|
28092
|
+
|
|
28093
|
+
class FileOpener;
|
|
28094
|
+
class FileSystem;
|
|
27867
28095
|
|
|
27868
28096
|
enum class ParserMode : uint8_t { PARSING = 0, SNIFFING_DIALECT = 1, SNIFFING_DATATYPES = 2, PARSING_HEADER = 3 };
|
|
27869
28097
|
|
|
27870
28098
|
//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
|
|
27871
|
-
class
|
|
27872
|
-
//! Initial buffer read size; can be extended for long lines
|
|
27873
|
-
static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
|
|
27874
|
-
//! Larger buffer size for non disk files
|
|
27875
|
-
static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
|
|
27876
|
-
ParserMode mode;
|
|
27877
|
-
|
|
28099
|
+
class BaseCSVReader {
|
|
27878
28100
|
public:
|
|
27879
|
-
|
|
27880
|
-
|
|
28101
|
+
BaseCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
|
|
28102
|
+
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
27881
28103
|
|
|
27882
|
-
|
|
27883
|
-
|
|
27884
|
-
~
|
|
28104
|
+
BaseCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
|
|
28105
|
+
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
28106
|
+
~BaseCSVReader();
|
|
27885
28107
|
|
|
27886
28108
|
FileSystem &fs;
|
|
27887
28109
|
Allocator &allocator;
|
|
@@ -27895,17 +28117,9 @@ public:
|
|
|
27895
28117
|
vector<idx_t> insert_cols_idx;
|
|
27896
28118
|
vector<idx_t> insert_nulls_idx;
|
|
27897
28119
|
|
|
27898
|
-
unique_ptr<CSVFileHandle> file_handle;
|
|
27899
|
-
|
|
27900
|
-
unique_ptr<char[]> buffer;
|
|
27901
|
-
idx_t buffer_size;
|
|
27902
|
-
idx_t position;
|
|
27903
|
-
idx_t start = 0;
|
|
27904
|
-
|
|
27905
28120
|
idx_t linenr = 0;
|
|
27906
28121
|
bool linenr_estimated = false;
|
|
27907
28122
|
|
|
27908
|
-
vector<idx_t> sniffed_column_counts;
|
|
27909
28123
|
bool row_empty = false;
|
|
27910
28124
|
idx_t sample_chunk_idx = 0;
|
|
27911
28125
|
bool jumping_samples = false;
|
|
@@ -27915,72 +28129,145 @@ public:
|
|
|
27915
28129
|
idx_t bytes_in_chunk = 0;
|
|
27916
28130
|
double bytes_per_line_avg = 0;
|
|
27917
28131
|
|
|
27918
|
-
vector<unique_ptr<char[]>> cached_buffers;
|
|
27919
|
-
|
|
27920
|
-
TextSearchShiftArray delimiter_search, escape_search, quote_search;
|
|
27921
|
-
|
|
27922
28132
|
DataChunk parse_chunk;
|
|
27923
28133
|
|
|
27924
28134
|
std::queue<unique_ptr<DataChunk>> cached_chunks;
|
|
27925
28135
|
|
|
27926
|
-
|
|
27927
|
-
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
|
27928
|
-
void ParseCSV(DataChunk &insert_chunk);
|
|
27929
|
-
|
|
27930
|
-
idx_t GetFileSize();
|
|
28136
|
+
ParserMode mode;
|
|
27931
28137
|
|
|
28138
|
+
public:
|
|
27932
28139
|
//! Fill nulls into the cols that mismtach union names
|
|
27933
28140
|
void SetNullUnionCols(DataChunk &insert_chunk);
|
|
27934
28141
|
|
|
27935
|
-
|
|
27936
|
-
//! Initialize Parser
|
|
27937
|
-
void Initialize(const vector<LogicalType> &requested_types);
|
|
28142
|
+
protected:
|
|
27938
28143
|
//! Initializes the parse_chunk with varchar columns and aligns info with new number of cols
|
|
27939
28144
|
void InitParseChunk(idx_t num_cols);
|
|
27940
28145
|
//! Initializes the insert_chunk idx for mapping parse_chunk cols to insert_chunk cols
|
|
27941
28146
|
void InitInsertChunkIdx(idx_t num_cols);
|
|
27942
|
-
//! Initializes the TextSearchShiftArrays for complex parser
|
|
27943
|
-
void PrepareComplexParser();
|
|
27944
|
-
//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
|
|
27945
|
-
void ParseCSV(ParserMode mode);
|
|
27946
|
-
//! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
|
|
27947
|
-
bool TryParseCSV(ParserMode mode);
|
|
27948
|
-
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
|
27949
|
-
bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
|
|
27950
|
-
//! Sniffs CSV dialect and determines skip rows, header row, column types and column names
|
|
27951
|
-
vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
|
|
27952
28147
|
//! Change the date format for the type to the string
|
|
27953
28148
|
void SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type);
|
|
27954
28149
|
//! Try to cast a string value to the specified sql type
|
|
27955
28150
|
bool TryCastValue(const Value &value, const LogicalType &sql_type);
|
|
27956
28151
|
//! Try to cast a vector of values to the specified sql type
|
|
27957
28152
|
bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type);
|
|
28153
|
+
|
|
28154
|
+
//! Adds a value to the current row
|
|
28155
|
+
void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
|
|
28156
|
+
//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
|
|
28157
|
+
bool AddRow(DataChunk &insert_chunk, idx_t &column);
|
|
28158
|
+
//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
|
|
28159
|
+
bool Flush(DataChunk &insert_chunk, bool try_add_line = false);
|
|
28160
|
+
|
|
28161
|
+
unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
|
|
28162
|
+
|
|
28163
|
+
void VerifyUTF8(idx_t col_idx);
|
|
28164
|
+
void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
|
|
28165
|
+
static string GetLineNumberStr(idx_t linenr, bool linenr_estimated);
|
|
28166
|
+
|
|
28167
|
+
protected:
|
|
28168
|
+
//! Whether or not the current row's columns have overflown sql_types.size()
|
|
28169
|
+
bool error_column_overflow = false;
|
|
28170
|
+
//! Number of sniffed columns - only used when auto-detecting
|
|
28171
|
+
vector<idx_t> sniffed_column_counts;
|
|
28172
|
+
};
|
|
28173
|
+
|
|
28174
|
+
} // namespace duckdb
|
|
28175
|
+
|
|
28176
|
+
|
|
28177
|
+
namespace duckdb {
|
|
28178
|
+
struct CopyInfo;
|
|
28179
|
+
struct CSVFileHandle;
|
|
28180
|
+
struct FileHandle;
|
|
28181
|
+
struct StrpTimeFormat;
|
|
28182
|
+
|
|
28183
|
+
class FileOpener;
|
|
28184
|
+
class FileSystem;
|
|
28185
|
+
|
|
28186
|
+
//! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
|
|
28187
|
+
//! position given that we encounter a byte with the given value.
|
|
28188
|
+
/*! For example, if we have a string "ABAC", the shifts array will have the following values:
|
|
28189
|
+
* [0] --> ['A'] = 1, all others = 0
|
|
28190
|
+
* [1] --> ['B'] = 2, ['A'] = 1, all others = 0
|
|
28191
|
+
* [2] --> ['A'] = 3, all others = 0
|
|
28192
|
+
* [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0
|
|
28193
|
+
* Suppose we then search in the following string "ABABAC", our progression will be as follows:
|
|
28194
|
+
* 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
|
|
28195
|
+
*/
|
|
28196
|
+
struct TextSearchShiftArray {
|
|
28197
|
+
TextSearchShiftArray();
|
|
28198
|
+
explicit TextSearchShiftArray(string search_term);
|
|
28199
|
+
|
|
28200
|
+
inline bool Match(uint8_t &position, uint8_t byte_value) {
|
|
28201
|
+
if (position >= length) {
|
|
28202
|
+
return false;
|
|
28203
|
+
}
|
|
28204
|
+
position = shifts[position * 255 + byte_value];
|
|
28205
|
+
return position == length;
|
|
28206
|
+
}
|
|
28207
|
+
|
|
28208
|
+
idx_t length;
|
|
28209
|
+
unique_ptr<uint8_t[]> shifts;
|
|
28210
|
+
};
|
|
28211
|
+
|
|
28212
|
+
//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
|
|
28213
|
+
class BufferedCSVReader : public BaseCSVReader {
|
|
28214
|
+
//! Initial buffer read size; can be extended for long lines
|
|
28215
|
+
static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
|
|
28216
|
+
//! Larger buffer size for non disk files
|
|
28217
|
+
static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
|
|
28218
|
+
|
|
28219
|
+
public:
|
|
28220
|
+
BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
|
|
28221
|
+
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
28222
|
+
BufferedCSVReader(FileSystem &fs, Allocator &allocator, FileOpener *opener, BufferedCSVReaderOptions options,
|
|
28223
|
+
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
|
28224
|
+
~BufferedCSVReader();
|
|
28225
|
+
|
|
28226
|
+
unique_ptr<char[]> buffer;
|
|
28227
|
+
idx_t buffer_size;
|
|
28228
|
+
idx_t position;
|
|
28229
|
+
idx_t start = 0;
|
|
28230
|
+
|
|
28231
|
+
vector<unique_ptr<char[]>> cached_buffers;
|
|
28232
|
+
|
|
28233
|
+
unique_ptr<CSVFileHandle> file_handle;
|
|
28234
|
+
|
|
28235
|
+
TextSearchShiftArray delimiter_search, escape_search, quote_search;
|
|
28236
|
+
|
|
28237
|
+
public:
|
|
28238
|
+
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
|
28239
|
+
void ParseCSV(DataChunk &insert_chunk);
|
|
28240
|
+
|
|
28241
|
+
private:
|
|
28242
|
+
//! Initialize Parser
|
|
28243
|
+
void Initialize(const vector<LogicalType> &requested_types);
|
|
27958
28244
|
//! Skips skip_rows, reads header row from input stream
|
|
27959
28245
|
void SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header);
|
|
27960
28246
|
//! Jumps back to the beginning of input stream and resets necessary internal states
|
|
27961
28247
|
void JumpToBeginning(idx_t skip_rows, bool skip_header);
|
|
27962
|
-
//! Jumps back to the beginning of input stream and resets necessary internal states
|
|
27963
|
-
bool JumpToNextSample();
|
|
27964
28248
|
//! Resets the buffer
|
|
27965
28249
|
void ResetBuffer();
|
|
27966
28250
|
//! Resets the steam
|
|
27967
28251
|
void ResetStream();
|
|
28252
|
+
//! Reads a new buffer from the CSV file if the current one has been exhausted
|
|
28253
|
+
bool ReadBuffer(idx_t &start);
|
|
28254
|
+
//! Jumps back to the beginning of input stream and resets necessary internal states
|
|
28255
|
+
bool JumpToNextSample();
|
|
28256
|
+
//! Initializes the TextSearchShiftArrays for complex parser
|
|
28257
|
+
void PrepareComplexParser();
|
|
28258
|
+
//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
|
|
28259
|
+
void ParseCSV(ParserMode mode);
|
|
28260
|
+
//! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
|
|
28261
|
+
bool TryParseCSV(ParserMode mode);
|
|
28262
|
+
//! Extract a single DataChunk from the CSV file and stores it in insert_chunk
|
|
28263
|
+
bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
|
|
27968
28264
|
|
|
27969
28265
|
//! Parses a CSV file with a one-byte delimiter, escape and quote character
|
|
27970
28266
|
bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message);
|
|
27971
28267
|
//! Parses more complex CSV files with multi-byte delimiters, escapes or quotes
|
|
27972
28268
|
bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message);
|
|
27973
|
-
|
|
27974
|
-
|
|
27975
|
-
void AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes);
|
|
27976
|
-
//! Adds a row to the insert_chunk, returns true if the chunk is filled as a result of this row being added
|
|
27977
|
-
bool AddRow(DataChunk &insert_chunk, idx_t &column);
|
|
27978
|
-
//! Finalizes a chunk, parsing all values that have been added so far and adding them to the insert_chunk
|
|
27979
|
-
void Flush(DataChunk &insert_chunk);
|
|
27980
|
-
//! Reads a new buffer from the CSV file if the current one has been exhausted
|
|
27981
|
-
bool ReadBuffer(idx_t &start);
|
|
27982
|
-
|
|
27983
|
-
unique_ptr<CSVFileHandle> OpenCSV(const BufferedCSVReaderOptions &options);
|
|
28269
|
+
//! Sniffs CSV dialect and determines skip rows, header row, column types and column names
|
|
28270
|
+
vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
|
|
27984
28271
|
|
|
27985
28272
|
//! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
|
|
27986
28273
|
void DetectDialect(const vector<LogicalType> &requested_types, BufferedCSVReaderOptions &original_options,
|
|
@@ -28000,13 +28287,6 @@ private:
|
|
|
28000
28287
|
const vector<LogicalType> &requested_types,
|
|
28001
28288
|
vector<vector<LogicalType>> &best_sql_types_candidates,
|
|
28002
28289
|
map<LogicalTypeId, vector<string>> &best_format_candidates);
|
|
28003
|
-
|
|
28004
|
-
void VerifyUTF8(idx_t col_idx);
|
|
28005
|
-
void VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset = 0);
|
|
28006
|
-
|
|
28007
|
-
private:
|
|
28008
|
-
//! Whether or not the current row's columns have overflown sql_types.size()
|
|
28009
|
-
bool error_column_overflow = false;
|
|
28010
28290
|
};
|
|
28011
28291
|
|
|
28012
28292
|
} // namespace duckdb
|