duckdb 0.7.2-dev3353.0 → 0.7.2-dev3402.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +2 -3
- package/src/duckdb/extension/json/include/json_functions.hpp +5 -1
- package/src/duckdb/extension/json/include/json_scan.hpp +1 -0
- package/src/duckdb/extension/json/include/json_transform.hpp +2 -2
- package/src/duckdb/extension/json/json-extension.cpp +7 -3
- package/src/duckdb/extension/json/json_functions/copy_json.cpp +16 -5
- package/src/duckdb/extension/json/json_functions/json_create.cpp +220 -93
- package/src/duckdb/extension/json/json_functions/json_merge_patch.cpp +2 -2
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +283 -117
- package/src/duckdb/extension/json/json_functions/read_json.cpp +8 -6
- package/src/duckdb/extension/json/json_functions.cpp +17 -15
- package/src/duckdb/extension/json/json_scan.cpp +8 -4
- package/src/duckdb/extension/parquet/column_reader.cpp +6 -2
- package/src/duckdb/extension/parquet/include/parquet_reader.hpp +1 -2
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +2 -2
- package/src/duckdb/extension/parquet/include/string_column_reader.hpp +1 -0
- package/src/duckdb/extension/parquet/include/thrift_tools.hpp +3 -5
- package/src/duckdb/extension/parquet/parquet-extension.cpp +2 -4
- package/src/duckdb/extension/parquet/parquet_reader.cpp +11 -22
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +5 -0
- package/src/duckdb/extension/parquet/parquet_writer.cpp +4 -4
- package/src/duckdb/src/common/file_system.cpp +13 -20
- package/src/duckdb/src/common/serializer/buffered_file_writer.cpp +2 -2
- package/src/duckdb/src/execution/index/art/art.cpp +3 -1
- package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +0 -1
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +2 -2
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +1 -1
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +1 -2
- package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +4 -5
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +1 -1
- package/src/duckdb/src/function/cast/cast_function_set.cpp +89 -25
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +20 -15
- package/src/duckdb/src/function/table/copy_csv.cpp +4 -5
- package/src/duckdb/src/function/table/read_csv.cpp +6 -5
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +0 -1
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +7 -6
- package/src/duckdb/src/include/duckdb/common/opener_file_system.hpp +118 -0
- package/src/duckdb/src/include/duckdb/common/serializer/buffered_file_writer.hpp +1 -2
- package/src/duckdb/src/include/duckdb/common/types/type_map.hpp +19 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_line_info.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/extension_helper.hpp +5 -5
- package/src/duckdb/src/main/client_context.cpp +0 -4
- package/src/duckdb/src/main/client_data.cpp +19 -0
- package/src/duckdb/src/main/database.cpp +4 -1
- package/src/duckdb/src/main/extension/extension_install.cpp +5 -6
- package/src/duckdb/src/main/extension/extension_load.cpp +11 -16
- package/src/duckdb/src/main/settings/settings.cpp +2 -3
package/src/duckdb/extension/json/json_functions.cpp

@@ -6,8 +6,8 @@
 #include "duckdb/function/replacement_scan.hpp"
 #include "duckdb/parser/expression/constant_expression.hpp"
 #include "duckdb/parser/expression/function_expression.hpp"
-#include "duckdb/parser/tableref/table_function_ref.hpp"
 #include "duckdb/parser/parsed_data/create_pragma_function_info.hpp"
+#include "duckdb/parser/tableref/table_function_ref.hpp"
 
 namespace duckdb {
 
@@ -115,6 +115,14 @@ unique_ptr<FunctionLocalState> JSONFunctionLocalState::Init(ExpressionState &sta
 	return make_uniq<JSONFunctionLocalState>(state.GetContext());
 }
 
+unique_ptr<FunctionLocalState> JSONFunctionLocalState::InitCastLocalState(CastLocalStateParameters &parameters) {
+	if (parameters.context) {
+		return make_uniq<JSONFunctionLocalState>(*parameters.context);
+	} else {
+		return make_uniq<JSONFunctionLocalState>(Allocator::DefaultAllocator());
+	}
+}
+
 JSONFunctionLocalState &JSONFunctionLocalState::ResetAndGet(ExpressionState &state) {
 	auto &lstate = ExecuteFunctionState::GetFunctionState(state)->Cast<JSONFunctionLocalState>();
 	lstate.json_allocator.Reset();
@@ -197,14 +205,6 @@ unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context,
 	return std::move(table_function);
 }
 
-static duckdb::unique_ptr<FunctionLocalState> InitJSONCastLocalState(CastLocalStateParameters &parameters) {
-	if (parameters.context) {
-		return make_uniq<JSONFunctionLocalState>(*parameters.context);
-	} else {
-		return make_uniq<JSONFunctionLocalState>(Allocator::DefaultAllocator());
-	}
-}
-
 static bool CastVarcharToJSON(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
 	auto &lstate = parameters.local_state->Cast<JSONFunctionLocalState>();
 	lstate.json_allocator.Reset();
@@ -215,15 +215,17 @@ static bool CastVarcharToJSON(Vector &source, Vector &result, idx_t count, CastP
 	    source, result, count, [&](string_t input, ValidityMask &mask, idx_t idx) {
 		    auto data = (char *)(input.GetData());
 		    auto length = input.GetSize();
-		    yyjson_read_err error;
 
+		    yyjson_read_err error;
 		    auto doc = JSONCommon::ReadDocumentUnsafe(data, length, JSONCommon::READ_FLAG, alc, &error);
 
 		    if (!doc) {
-			    HandleCastError::AssignError(JSONCommon::FormatParseError(data, length, error),
-			                                 parameters.error_message);
 			    mask.SetInvalid(idx);
-			    success = false;
+			    if (success) {
+				    HandleCastError::AssignError(JSONCommon::FormatParseError(data, length, error),
+				                                 parameters.error_message);
+				    success = false;
+			    }
 		    }
 		    return input;
 	    });
@@ -231,13 +233,13 @@ static bool CastVarcharToJSON(Vector &source, Vector &result, idx_t count, CastP
 	return success;
 }
 
-void JSONFunctions::RegisterCastFunctions(CastFunctionSet &casts) {
+void JSONFunctions::RegisterSimpleCastFunctions(CastFunctionSet &casts) {
 	// JSON to VARCHAR is basically free
 	casts.RegisterCastFunction(JSONCommon::JSONType(), LogicalType::VARCHAR, DefaultCasts::ReinterpretCast, 1);
 
 	// VARCHAR to JSON requires a parse so it's not free. Let's make it 1 more than a cast to STRUCT
 	auto varchar_to_json_cost = casts.ImplicitCastCost(LogicalType::SQLNULL, LogicalTypeId::STRUCT) + 1;
-	BoundCastInfo info(CastVarcharToJSON, nullptr, InitJSONCastLocalState);
+	BoundCastInfo info(CastVarcharToJSON, nullptr, JSONFunctionLocalState::InitCastLocalState);
 	casts.RegisterCastFunction(LogicalType::VARCHAR, JSONCommon::JSONType(), std::move(info), varchar_to_json_cost);
 
 	// Register NULL to JSON with a different cost than NULL to VARCHAR so the binder can disambiguate functions
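The hunks above move the cast local-state initializer from a file-local static (`InitJSONCastLocalState`) onto `JSONFunctionLocalState` itself, so other translation units (such as the reworked `json_transform.cpp` in this release) can reuse it, and `CastVarcharToJSON` now invalidates the failing row first and records only the first parse error. A minimal self-contained sketch of that only-first-error pattern (all names below are hypothetical stand-ins, not DuckDB API):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Stand-in for a vectorized cast: invalidate every failing row, but keep
// only the first error message, mirroring the CastVarcharToJSON change above.
static bool CastAll(const std::vector<std::string> &inputs, std::vector<bool> &valid,
                    std::string &error_message) {
	bool success = true;
	for (size_t i = 0; i < inputs.size(); i++) {
		const bool parse_ok = !inputs[i].empty() && inputs[i].front() == '{'; // fake "JSON parse"
		if (!parse_ok) {
			valid[i] = false; // the row is always invalidated
			if (success) {    // but the message is assigned only once
				error_message = "malformed JSON at row " + std::to_string(i);
				success = false;
			}
		}
	}
	return success;
}

int main() {
	std::vector<std::string> in {"{\"a\":1}", "oops", "also bad"};
	std::vector<bool> valid(in.size(), true);
	std::string err;
	std::cout << (CastAll(in, valid, err) ? "ok" : err) << '\n'; // prints the first error only
}
```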
package/src/duckdb/extension/json/json_scan.cpp

@@ -1,10 +1,10 @@
 #include "json_scan.hpp"
 
+#include "duckdb/common/multi_file_reader.hpp"
 #include "duckdb/main/database.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
-#include "duckdb/common/multi_file_reader.hpp"
 
 namespace duckdb {
 
@@ -59,11 +59,15 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
 }
 
 void JSONScanData::InitializeFormats() {
+	InitializeFormats(auto_detect);
+}
+
+void JSONScanData::InitializeFormats(bool auto_detect_p) {
 	// Set defaults for date/timestamp formats if we need to
-	if (!auto_detect && date_format.empty()) {
+	if (!auto_detect_p && date_format.empty()) {
 		date_format = "%Y-%m-%d";
 	}
-	if (!auto_detect && timestamp_format.empty()) {
+	if (!auto_detect_p && timestamp_format.empty()) {
 		timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ";
 	}
 
@@ -75,7 +79,7 @@ void JSONScanData::InitializeFormats() {
 		date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
 	}
 
-	if (auto_detect) {
+	if (auto_detect_p) {
 		static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
 		    {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
 		    {LogicalTypeId::TIMESTAMP,
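`InitializeFormats` gains a `bool` overload so callers can force or skip format auto-detection independently of the `auto_detect` member; the zero-argument version now just delegates. A compilable sketch of that delegating-overload shape (types simplified, names hypothetical):

```cpp
#include <string>

struct ScanData {
	bool auto_detect = false;
	std::string date_format;

	// The old entry point keeps its behavior by delegating to the new overload.
	void InitializeFormats() {
		InitializeFormats(auto_detect);
	}
	// Fixed defaults are only applied when auto-detection is off.
	void InitializeFormats(bool auto_detect_p) {
		if (!auto_detect_p && date_format.empty()) {
			date_format = "%Y-%m-%d";
		}
	}
};

int main() {
	ScanData data;
	data.InitializeFormats(true); // callers can now override the member flag
	return data.date_format.empty() ? 0 : 1;
}
```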
package/src/duckdb/extension/parquet/column_reader.cpp

@@ -589,8 +589,8 @@ StringColumnReader::StringColumnReader(ParquetReader &reader, LogicalType type_p
 	}
 }
 
-uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
-	if (Type() != LogicalTypeId::VARCHAR) {
+uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
+	if (!is_varchar) {
 		return str_len;
 	}
 	// verify if a string is actually UTF8, and if there are no null bytes in the middle of the string
@@ -605,6 +605,10 @@ uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len
 	return str_len;
 }
 
+uint32_t StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
+	return VerifyString(str_data, str_len, Type() == LogicalTypeId::VARCHAR);
+}
+
 void StringColumnReader::Dictionary(shared_ptr<ResizeableBuffer> data, idx_t num_entries) {
 	dict = std::move(data);
 	dict_strings = duckdb::unique_ptr<string_t[]>(new string_t[num_entries]);
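`VerifyString` is split into a static overload taking an explicit `is_varchar` flag, so code without a reader instance (the Parquet statistics handling later in this diff) can validate strings too. The check rejects strings that are not valid UTF-8; a rough self-contained stand-in for that kind of validation (DuckDB's real check lives behind its own UTF-8 utilities and also rejects overlong encodings, which this sketch does not):

```cpp
#include <cstddef>
#include <cstdio>

// Structural UTF-8 validation only: checks lead bytes and continuation bytes.
static bool IsStructurallyValidUtf8(const unsigned char *s, size_t len) {
	size_t i = 0;
	while (i < len) {
		const unsigned char c = s[i];
		size_t extra;
		if (c < 0x80) {
			extra = 0;
		} else if ((c & 0xE0) == 0xC0) {
			extra = 1;
		} else if ((c & 0xF0) == 0xE0) {
			extra = 2;
		} else if ((c & 0xF8) == 0xF0) {
			extra = 3;
		} else {
			return false; // invalid lead byte
		}
		if (extra > 0 && i + extra >= len) {
			return false; // truncated sequence
		}
		for (size_t k = 1; k <= extra; k++) {
			if ((s[i + k] & 0xC0) != 0x80) {
				return false; // bad continuation byte
			}
		}
		i += extra + 1;
	}
	return true;
}

int main() {
	const unsigned char good[] = {0xE2, 0x82, 0xAC}; // the euro sign, a valid 3-byte sequence
	const unsigned char bad[] = {0xC3};              // truncated 2-byte sequence
	std::printf("%d %d\n", IsStructurallyValidUtf8(good, 3), IsStructurallyValidUtf8(bad, 1));
}
```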
package/src/duckdb/extension/parquet/include/parquet_reader.hpp

@@ -80,15 +80,14 @@ public:
 
 class ParquetReader {
 public:
-	ParquetReader(Allocator &allocator, unique_ptr<FileHandle> file_handle_p);
 	ParquetReader(ClientContext &context, string file_name, ParquetOptions parquet_options);
 	ParquetReader(ClientContext &context, ParquetOptions parquet_options,
 	              shared_ptr<ParquetFileMetadataCache> metadata);
 	~ParquetReader();
 
+	FileSystem &fs;
 	Allocator &allocator;
 	string file_name;
-	FileOpener *file_opener;
 	vector<LogicalType> return_types;
 	vector<string> names;
 	shared_ptr<ParquetFileMetadataCache> metadata;
package/src/duckdb/extension/parquet/include/parquet_writer.hpp

@@ -32,8 +32,8 @@ struct PreparedRowGroup {
 
 class ParquetWriter {
 public:
-	ParquetWriter(FileSystem &fs, string file_name, FileOpener *file_opener, vector<LogicalType> types,
-	              vector<string> names, duckdb_parquet::format::CompressionCodec::type codec);
+	ParquetWriter(FileSystem &fs, string file_name, vector<LogicalType> types, vector<string> names,
+	              duckdb_parquet::format::CompressionCodec::type codec);
 
 public:
 	void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result);
package/src/duckdb/extension/parquet/include/string_column_reader.hpp

@@ -39,6 +39,7 @@ public:
 	void PrepareDeltaByteArray(ResizeableBuffer &buffer) override;
 	void DeltaByteArray(uint8_t *defines, idx_t num_values, parquet_filter_t &filter, idx_t result_offset,
 	                    Vector &result) override;
+	static uint32_t VerifyString(const char *str_data, uint32_t str_len, const bool isVarchar);
 	uint32_t VerifyString(const char *str_data, uint32_t str_len);
 
 protected:
|
@@ -51,8 +51,7 @@ struct ReadHeadComparator {
|
|
51
51
|
// 1: register all ranges that will be read, merging ranges that are consecutive
|
52
52
|
// 2: prefetch all registered ranges
|
53
53
|
struct ReadAheadBuffer {
|
54
|
-
ReadAheadBuffer(Allocator &allocator, FileHandle &handle,
|
55
|
-
: allocator(allocator), handle(handle), file_opener(opener) {
|
54
|
+
ReadAheadBuffer(Allocator &allocator, FileHandle &handle) : allocator(allocator), handle(handle) {
|
56
55
|
}
|
57
56
|
|
58
57
|
// The list of read heads
|
@@ -62,7 +61,6 @@ struct ReadAheadBuffer {
 
 	Allocator &allocator;
 	FileHandle &handle;
-	FileOpener &file_opener;
 
 	idx_t total_size = 0;
 
@@ -124,8 +122,8 @@ class ThriftFileTransport : public duckdb_apache::thrift::transport::TVirtualTra
 public:
 	static constexpr uint64_t PREFETCH_FALLBACK_BUFFERSIZE = 1000000;
 
-	ThriftFileTransport(Allocator &allocator, FileHandle &handle_p, FileOpener &opener_p, bool prefetch_mode_p)
-	    : handle(handle_p), location(0), allocator(allocator), ra_buffer(ReadAheadBuffer(allocator, handle_p, opener_p)),
+	ThriftFileTransport(Allocator &allocator, FileHandle &handle_p, bool prefetch_mode_p)
+	    : handle(handle_p), location(0), allocator(allocator), ra_buffer(ReadAheadBuffer(allocator, handle_p)),
 	      prefetch_mode(prefetch_mode_p) {
 	}
 
package/src/duckdb/extension/parquet/parquet-extension.cpp

@@ -239,8 +239,7 @@ public:
 			// missing metadata entry in cache, no usable stats
 			return nullptr;
 		}
-		auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
-		                          FileSystem::DEFAULT_COMPRESSION, FileSystem::GetFileOpener(context));
+		auto handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ);
 		// we need to check if the metadata cache entries are current
 		if (fs.GetLastModifiedTime(*handle) >= metadata->read_time) {
 			// missing or invalid metadata entry in cache, no usable stats overall
@@ -627,8 +626,7 @@ unique_ptr<GlobalFunctionData> ParquetWriteInitializeGlobal(ClientContext &conte
 
 	auto &fs = FileSystem::GetFileSystem(context);
 	global_state->writer =
-	    make_uniq<ParquetWriter>(fs, file_path, FileSystem::GetFileOpener(context), parquet_bind.sql_types,
-	                             parquet_bind.column_names, parquet_bind.codec);
+	    make_uniq<ParquetWriter>(fs, file_path, parquet_bind.sql_types, parquet_bind.column_names, parquet_bind.codec);
 	return std::move(global_state);
 }
 
package/src/duckdb/extension/parquet/parquet_reader.cpp

@@ -49,16 +49,15 @@ using duckdb_parquet::format::Statistics;
 using duckdb_parquet::format::Type;
 
 static duckdb::unique_ptr<duckdb_apache::thrift::protocol::TProtocol>
-CreateThriftProtocol(Allocator &allocator, FileHandle &file_handle, FileOpener &opener, bool prefetch_mode) {
-	auto transport = make_shared<ThriftFileTransport>(allocator, file_handle, opener, prefetch_mode);
+CreateThriftProtocol(Allocator &allocator, FileHandle &file_handle, bool prefetch_mode) {
+	auto transport = make_shared<ThriftFileTransport>(allocator, file_handle, prefetch_mode);
 	return make_uniq<duckdb_apache::thrift::protocol::TCompactProtocolT<ThriftFileTransport>>(std::move(transport));
 }
 
-static shared_ptr<ParquetFileMetadataCache> LoadMetadata(Allocator &allocator, FileHandle &file_handle,
-                                                         FileOpener &opener) {
+static shared_ptr<ParquetFileMetadataCache> LoadMetadata(Allocator &allocator, FileHandle &file_handle) {
 	auto current_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
 
-	auto proto = CreateThriftProtocol(allocator, file_handle, opener, false);
+	auto proto = CreateThriftProtocol(allocator, file_handle, false);
 	auto &transport = ((ThriftFileTransport &)*proto->getTransport());
 	auto file_size = transport.GetSize();
 	if (file_size < 12) {
@@ -428,20 +427,11 @@ ParquetOptions::ParquetOptions(ClientContext &context) {
 	}
 }
 
-ParquetReader::ParquetReader(Allocator &allocator_p, unique_ptr<FileHandle> file_handle_p) : allocator(allocator_p) {
-	file_name = file_handle_p->path;
-	file_handle = std::move(file_handle_p);
-	metadata = LoadMetadata(allocator, *file_handle, *file_opener);
-	InitializeSchema();
-}
-
 ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, ParquetOptions parquet_options_p)
-    : allocator(BufferAllocator::Get(context_p)), file_opener(FileSystem::GetFileOpener(context_p)),
+    : fs(FileSystem::GetFileSystem(context_p)), allocator(BufferAllocator::Get(context_p)),
       parquet_options(parquet_options_p) {
-	auto &fs = FileSystem::GetFileSystem(context_p);
 	file_name = std::move(file_name_p);
-	file_handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
-	                          FileSystem::DEFAULT_COMPRESSION, file_opener);
+	file_handle = fs.OpenFile(file_name, FileFlags::FILE_FLAGS_READ);
 	if (!file_handle->CanSeek()) {
 		throw NotImplementedException(
 		    "Reading parquet files from a FIFO stream is not supported and cannot be efficiently supported since "
@@ -451,12 +441,12 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, Parqu
 	// or if this file has cached metadata
 	// or if the cached version already expired
 	if (!ObjectCache::ObjectCacheEnabled(context_p)) {
-		metadata = LoadMetadata(allocator, *file_handle, *file_opener);
+		metadata = LoadMetadata(allocator, *file_handle);
 	} else {
 		auto last_modify_time = fs.GetLastModifiedTime(*file_handle);
 		metadata = ObjectCache::GetObjectCache(context_p).Get<ParquetFileMetadataCache>(file_name);
 		if (!metadata || (last_modify_time + 10 >= metadata->read_time)) {
-			metadata = LoadMetadata(allocator, *file_handle, *file_opener);
+			metadata = LoadMetadata(allocator, *file_handle);
 			ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
 		}
 	}
@@ -466,7 +456,7 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, Parqu
 
 ParquetReader::ParquetReader(ClientContext &context_p, ParquetOptions parquet_options_p,
                              shared_ptr<ParquetFileMetadataCache> metadata_p)
-    : allocator(BufferAllocator::Get(context_p)), file_opener(FileSystem::GetFileOpener(context_p)),
+    : fs(FileSystem::GetFileSystem(context_p)), allocator(BufferAllocator::Get(context_p)),
       metadata(std::move(metadata_p)), parquet_options(parquet_options_p) {
 	InitializeSchema();
 }
@@ -634,11 +624,10 @@ void ParquetReader::InitializeScan(ParquetReaderScanState &state, vector<idx_t>
 		state.prefetch_mode = false;
 	}
 
-		state.file_handle = fs.OpenFile(file_handle->path, flags, FileLockType::NO_LOCK,
-		                                FileSystem::DEFAULT_COMPRESSION, file_opener);
+		state.file_handle = fs.OpenFile(file_handle->path, flags);
 	}
 
-	state.thrift_file_proto = CreateThriftProtocol(allocator, *state.file_handle, *file_opener, state.prefetch_mode);
+	state.thrift_file_proto = CreateThriftProtocol(allocator, *state.file_handle, state.prefetch_mode);
 	state.root_reader = CreateReader();
 	state.define_buf.resize(allocator, STANDARD_VECTOR_SIZE);
 	state.repeat_buf.resize(allocator, STANDARD_VECTOR_SIZE);
package/src/duckdb/extension/parquet/parquet_statistics.cpp

@@ -1,6 +1,7 @@
 #include "parquet_statistics.hpp"
 #include "parquet_decimal_utils.hpp"
 #include "parquet_timestamp.hpp"
+#include "string_column_reader.hpp"
 #include "duckdb.hpp"
 #ifndef DUCKDB_AMALGAMATION
 #include "duckdb/common/types/blob.hpp"
@@ -253,15 +254,19 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
 	case LogicalTypeId::VARCHAR: {
 		auto string_stats = StringStats::CreateEmpty(type);
 		if (parquet_stats.__isset.min) {
+			StringColumnReader::VerifyString(parquet_stats.min.c_str(), parquet_stats.min.size(), true);
 			StringStats::Update(string_stats, parquet_stats.min);
 		} else if (parquet_stats.__isset.min_value) {
+			StringColumnReader::VerifyString(parquet_stats.min_value.c_str(), parquet_stats.min_value.size(), true);
 			StringStats::Update(string_stats, parquet_stats.min_value);
 		} else {
 			return nullptr;
 		}
 		if (parquet_stats.__isset.max) {
+			StringColumnReader::VerifyString(parquet_stats.max.c_str(), parquet_stats.max.size(), true);
 			StringStats::Update(string_stats, parquet_stats.max);
 		} else if (parquet_stats.__isset.max_value) {
+			StringColumnReader::VerifyString(parquet_stats.max_value.c_str(), parquet_stats.max_value.size(), true);
 			StringStats::Update(string_stats, parquet_stats.max_value);
 		} else {
 			return nullptr;
package/src/duckdb/extension/parquet/parquet_writer.cpp

@@ -225,12 +225,12 @@ void VerifyUniqueNames(const vector<string> &names) {
 #endif
 }
 
-ParquetWriter::ParquetWriter(FileSystem &fs, string file_name_p, FileOpener *file_opener_p,
-                             vector<LogicalType> types_p, vector<string> names_p, CompressionCodec::type codec)
+ParquetWriter::ParquetWriter(FileSystem &fs, string file_name_p, vector<LogicalType> types_p, vector<string> names_p,
+                             CompressionCodec::type codec)
     : file_name(std::move(file_name_p)), sql_types(std::move(types_p)), column_names(std::move(names_p)), codec(codec) {
 	// initialize the file writer
-	writer = make_uniq<BufferedFileWriter>(
-	    fs, file_name.c_str(), FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_FILE_CREATE_NEW, file_opener_p);
+	writer = make_uniq<BufferedFileWriter>(fs, file_name.c_str(),
+	                                       FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_FILE_CREATE_NEW);
 	// parquet files start with the string "PAR1"
 	writer->WriteData((const_data_ptr_t) "PAR1", 4);
 	TCompactProtocolFactoryT<MyTransport> tproto_factory;
|
@@ -40,11 +40,8 @@ FileSystem::~FileSystem() {
|
|
40
40
|
}
|
41
41
|
|
42
42
|
FileSystem &FileSystem::GetFileSystem(ClientContext &context) {
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
FileOpener *FileSystem::GetFileOpener(ClientContext &context) {
|
47
|
-
return ClientData::Get(context).file_opener.get();
|
43
|
+
auto &client_data = ClientData::Get(context);
|
44
|
+
return *client_data.client_file_system;
|
48
45
|
}
|
49
46
|
|
50
47
|
bool PathMatched(const string &path, const string &sub_path) {
|
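`FileSystem::GetFileSystem(ClientContext &)` now returns `client_data.client_file_system`, and the `GetFileOpener` accessor is gone. Together with the new `opener_file_system.hpp` in the file list, the apparent design is a decorator that injects the client's `FileOpener` into every file-system call, which is why the remaining hunks in this diff can stop threading an opener through explicitly. A hypothetical sketch of that decorator shape (simplified types, not the actual DuckDB class):

```cpp
#include <memory>
#include <stdexcept>
#include <string>

struct FileOpener {};
struct FileHandle {};

struct FileSystem {
	virtual ~FileSystem() = default;
	virtual std::unique_ptr<FileHandle> OpenFile(const std::string &path, int flags,
	                                             FileOpener *opener) = 0;
};

// Decorator: forwards to the wrapped file system, always supplying the
// client's opener, so call sites no longer pass one themselves.
struct OpenerFileSystem final : FileSystem {
	OpenerFileSystem(FileSystem &inner, FileOpener &opener) : inner(inner), opener(opener) {
	}
	std::unique_ptr<FileHandle> OpenFile(const std::string &path, int flags,
	                                     FileOpener *explicit_opener) override {
		if (explicit_opener) {
			throw std::runtime_error("opener is injected; do not pass one explicitly");
		}
		return inner.OpenFile(path, flags, &opener);
	}
	FileSystem &inner;
	FileOpener &opener;
};

// Minimal concrete file system so the sketch is runnable.
struct NullFileSystem final : FileSystem {
	std::unique_ptr<FileHandle> OpenFile(const std::string &, int, FileOpener *) override {
		return std::make_unique<FileHandle>();
	}
};

int main() {
	NullFileSystem base;
	FileOpener opener;
	OpenerFileSystem fs(base, opener);
	return fs.OpenFile("example.csv", 0, nullptr) ? 0 : 1;
}
```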
@@ -193,7 +190,7 @@ string FileSystem::ExtractBaseName(const string &path) {
 	return vec[0];
 }
 
-string FileSystem::GetHomeDirectory(FileOpener *opener) {
+string FileSystem::GetHomeDirectory(optional_ptr<FileOpener> opener) {
 	// read the home_directory setting first, if it is set
 	if (opener) {
 		Value result;
@@ -215,7 +212,11 @@ string FileSystem::GetHomeDirectory(FileOpener *opener) {
 	return string();
 }
 
-string FileSystem::ExpandPath(const string &path, FileOpener *opener) {
+string FileSystem::GetHomeDirectory() {
+	return GetHomeDirectory(nullptr);
+}
+
+string FileSystem::ExpandPath(const string &path, optional_ptr<FileOpener> opener) {
 	if (path.empty()) {
 		return path;
 	}
@@ -225,6 +226,10 @@ string FileSystem::ExpandPath(const string &path, FileOpener *opener) {
 	return path;
 }
 
+string FileSystem::ExpandPath(const string &path) {
+	return FileSystem::ExpandPath(path, nullptr);
+}
+
 // LCOV_EXCL_START
 unique_ptr<FileHandle> FileSystem::OpenFile(const string &path, uint8_t flags, FileLockType lock,
                                             FileCompressionType compression, FileOpener *opener) {
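`GetHomeDirectory` and `ExpandPath` switch from raw `FileOpener *` parameters to `optional_ptr<FileOpener>` and gain zero-argument convenience overloads that delegate with `nullptr`. `optional_ptr` is DuckDB's non-owning "may be null" pointer wrapper; a minimal sketch of the idea (the real class also carries validity checks):

```cpp
#include <string>

// Minimal stand-in for a non-owning, possibly-null pointer wrapper.
template <class T>
class optional_ptr {
public:
	optional_ptr() : ptr(nullptr) {
	}
	optional_ptr(T *ptr_p) : ptr(ptr_p) { // implicit: raw pointers keep working at call sites
	}
	explicit operator bool() const {
		return ptr != nullptr;
	}
	T &operator*() {
		return *ptr;
	}
	T *operator->() {
		return ptr;
	}

private:
	T *ptr;
};

struct FileOpener {};

std::string GetHomeDirectory(optional_ptr<FileOpener> opener) {
	if (opener) {
		// ...consult the opener's settings first...
	}
	return "~";
}

// Convenience overload, as added in the hunk above.
std::string GetHomeDirectory() {
	return GetHomeDirectory(nullptr);
}

int main() {
	return GetHomeDirectory().empty() ? 1 : 0;
}
```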
@@ -247,14 +252,6 @@ int64_t FileSystem::Write(FileHandle &handle, void *buffer, int64_t nr_bytes) {
 	throw NotImplementedException("%s: Write is not implemented!", GetName());
 }
 
-string FileSystem::GetFileExtension(FileHandle &handle) {
-	auto dot_location = handle.path.rfind('.');
-	if (dot_location != std::string::npos) {
-		return handle.path.substr(dot_location + 1, std::string::npos);
-	}
-	return string();
-}
-
 int64_t FileSystem::GetFileSize(FileHandle &handle) {
 	throw NotImplementedException("%s: GetFileSize is not implemented!", GetName());
 }
@@ -312,10 +309,6 @@ vector<string> FileSystem::Glob(const string &path, FileOpener *opener) {
 	throw NotImplementedException("%s: Glob is not implemented!", GetName());
 }
 
-vector<string> FileSystem::Glob(const string &path, ClientContext &context) {
-	return Glob(path, GetFileOpener(context));
-}
-
 void FileSystem::RegisterSubSystem(unique_ptr<FileSystem> sub_fs) {
 	throw NotImplementedException("%s: Can't register a sub system on a non-virtual file system", GetName());
 }
@@ -337,7 +330,7 @@ bool FileSystem::CanHandleFile(const string &fpath) {
 }
 
 vector<string> FileSystem::GlobFiles(const string &pattern, ClientContext &context, FileGlobOptions options) {
-	auto result = Glob(pattern, context);
+	auto result = Glob(pattern);
 	if (result.empty()) {
 		string required_extension;
 		const string prefixes[] = {"http://", "https://", "s3://"};
package/src/duckdb/src/common/serializer/buffered_file_writer.cpp

@@ -8,9 +8,9 @@ namespace duckdb {
 // Remove this when we switch C++17: https://stackoverflow.com/a/53350948
 constexpr uint8_t BufferedFileWriter::DEFAULT_OPEN_FLAGS;
 
-BufferedFileWriter::BufferedFileWriter(FileSystem &fs, const string &path_p, uint8_t open_flags, FileOpener *opener)
+BufferedFileWriter::BufferedFileWriter(FileSystem &fs, const string &path_p, uint8_t open_flags)
     : fs(fs), path(path_p), data(unique_ptr<data_t[]>(new data_t[FILE_BUFFER_SIZE])), offset(0), total_written(0) {
-	handle = fs.OpenFile(path, open_flags, FileLockType::WRITE_LOCK, FileCompressionType::UNCOMPRESSED, opener);
+	handle = fs.OpenFile(path, open_flags, FileLockType::WRITE_LOCK);
 }
 
 int64_t BufferedFileWriter::GetFileSize() {
package/src/duckdb/src/execution/index/art/art.cpp

@@ -130,6 +130,9 @@ static void TemplatedGenerateKeys(ArenaAllocator &allocator, Vector &input, idx_
 		auto idx = idata.sel->get_index(i);
 		if (idata.validity.RowIsValid(idx)) {
 			ARTKey::CreateARTKey<T>(allocator, input.GetType(), keys[i], input_data[idx]);
+		} else {
+			// we need to possibly reset the former key value in the keys vector
+			keys[i] = ARTKey();
 		}
 	}
 }
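The fix above matters because ART key generation reuses the `keys` scratch vector across chunks: without the reset, a NULL row would silently keep the key left over from the previous chunk. A self-contained illustration of that bug class (hypothetical names, `std::string` standing in for `ARTKey`):

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
	// Scratch vector reused across batches, like the keys vector in TemplatedGenerateKeys.
	std::vector<std::string> keys(3);
	auto process = [&](const std::vector<const char *> &batch) {
		for (size_t i = 0; i < batch.size(); i++) {
			if (batch[i]) {
				keys[i] = batch[i];
			} else {
				keys[i] = {}; // the fix: reset, otherwise keys[i] holds the previous batch's value
			}
		}
		for (const auto &k : keys) {
			std::cout << (k.empty() ? "<null>" : k) << " ";
		}
		std::cout << "\n";
	};
	process({"a", "b", "c"});
	process({"x", nullptr, "z"}); // without the reset, row 1 would wrongly print "b"
}
```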
@@ -680,7 +683,6 @@ Node ART::Lookup(Node node, const ARTKey &key, idx_t depth) {
 			}
 			return node;
 		}
-
 		auto &node_prefix = node.GetPrefix(*this);
 		if (node_prefix.count) {
 			for (idx_t pos = 0; pos < node_prefix.count; pos++) {
package/src/duckdb/src/execution/operator/join/physical_index_join.cpp

@@ -167,7 +167,6 @@ void PhysicalIndexJoin::GetRHSMatches(ExecutionContext &context, DataChunk &inpu
 
 	auto &state = state_p.Cast<IndexJoinOperatorState>();
 	auto &art = index.Cast<ART>();
-	;
 
 	// generate the keys for this chunk
 	state.arena_allocator.Reset();
package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp

@@ -35,7 +35,7 @@ string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated,
 BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p,
                              const vector<LogicalType> &requested_types)
     : context(context_p), fs(FileSystem::GetFileSystem(context)), allocator(Allocator::Get(context)),
-      opener(FileSystem::GetFileOpener(context)), options(std::move(options_p)) {
+      options(std::move(options_p)) {
 }
 
 BaseCSVReader::~BaseCSVReader() {
@@ -43,7 +43,7 @@ BaseCSVReader::~BaseCSVReader() {
 
 unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) {
 	auto file_handle = fs.OpenFile(options_p.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
-	                               options_p.compression, opener);
+	                               options_p.compression);
 	if (file_handle->CanSeek()) {
 		file_handle->Reset();
 	}
package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp

@@ -636,10 +636,10 @@ void ParallelCSVReader::ParseCSV(DataChunk &insert_chunk) {
 }
 
 idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx) {
-
 	while (true) {
 		if (buffer->line_info->CanItGetLine(file_idx, buffer_idx)) {
 			auto cur_start = verification_positions.beginning_of_first_line + buffer->buffer->GetCSVGlobalStart();
+			// line errors are 1-indexed
 			return buffer->line_info->GetLine(buffer_idx, line_error, file_idx, cur_start, false);
 		}
 	}
package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp

@@ -207,8 +207,7 @@ unique_ptr<GlobalSinkState> PhysicalCopyToFile::GetGlobalSinkState(ClientContext
 		fs.CreateDirectory(file_path);
 	} else if (!overwrite_or_ignore) {
 		idx_t n_files = 0;
-		fs.ListFiles(
-		    file_path, [&n_files](const string &path, bool) { n_files++; }, FileOpener::Get(context));
+		fs.ListFiles(file_path, [&n_files](const string &path, bool) { n_files++; });
 		if (n_files > 0) {
 			throw IOException("Directory %s is not empty! Enable OVERWRITE_OR_IGNORE option to force writing",
 			                  file_path);
package/src/duckdb/src/execution/operator/persistent/physical_export.cpp

@@ -27,10 +27,10 @@ static void WriteCatalogEntries(stringstream &ss, vector<reference<CatalogEntry>
 	ss << std::endl;
 }
 
-static void WriteStringStreamToFile(FileSystem &fs, FileOpener *opener, stringstream &ss, const string &path) {
+static void WriteStringStreamToFile(FileSystem &fs, stringstream &ss, const string &path) {
 	auto ss_string = ss.str();
 	auto handle = fs.OpenFile(path, FileFlags::FILE_FLAGS_WRITE | FileFlags::FILE_FLAGS_FILE_CREATE_NEW,
-	                          FileLockType::WRITE_LOCK, FileCompressionType::UNCOMPRESSED, opener);
+	                          FileLockType::WRITE_LOCK);
 	fs.Write(*handle, (void *)ss_string.c_str(), ss_string.size());
 	handle.reset();
 }
@@ -108,7 +108,6 @@ SourceResultType PhysicalExport::GetData(ExecutionContext &context, DataChunk &c
 
 	auto &ccontext = context.client;
 	auto &fs = FileSystem::GetFileSystem(ccontext);
-	auto *opener = FileSystem::GetFileOpener(ccontext);
 
 	// gather all catalog types to export
 	vector<reference<CatalogEntry>> schemas;
@@ -172,7 +171,7 @@ SourceResultType PhysicalExport::GetData(ExecutionContext &context, DataChunk &c
 	WriteCatalogEntries(ss, indexes);
 	WriteCatalogEntries(ss, macros);
 
-	WriteStringStreamToFile(fs, opener, ss, fs.JoinPath(info->file_path, "schema.sql"));
+	WriteStringStreamToFile(fs, ss, fs.JoinPath(info->file_path, "schema.sql"));
 
 	// write the load.sql file
 	// for every table, we write COPY INTO statement with the specified options
@@ -181,7 +180,7 @@ SourceResultType PhysicalExport::GetData(ExecutionContext &context, DataChunk &c
 		auto exported_table_info = exported_tables.data[i].table_data;
 		WriteCopyStatement(fs, load_ss, *info, exported_table_info, function);
 	}
-	WriteStringStreamToFile(fs, opener, load_ss, fs.JoinPath(info->file_path, "load.sql"));
+	WriteStringStreamToFile(fs, load_ss, fs.JoinPath(info->file_path, "load.sql"));
 	state.finished = true;
 
 	return SourceResultType::FINISHED;
package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp

@@ -11,7 +11,7 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalCopyToFile
 	bool preserve_insertion_order = PhysicalPlanGenerator::PreserveInsertionOrder(context, *plan);
 	bool supports_batch_index = PhysicalPlanGenerator::UseBatchIndex(context, *plan);
 	auto &fs = FileSystem::GetFileSystem(context);
-	op.file_path = fs.ExpandPath(op.file_path, FileSystem::GetFileOpener(context));
+	op.file_path = fs.ExpandPath(op.file_path);
 	if (op.use_tmp_file) {
 		op.file_path += ".tmp";
 	}