duckdb 0.8.2-dev4572.0 → 0.8.2-dev4653.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +8 -8
- package/src/duckdb/extension/json/json_functions/read_json.cpp +6 -2
- package/src/duckdb/extension/json/json_scan.cpp +4 -6
- package/src/duckdb/src/common/enum_util.cpp +24 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +213 -2
- package/src/duckdb/src/function/table/read_csv.cpp +3 -130
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/box_renderer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +24 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +1 -1
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +10 -4
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +3 -3
- package/src/duckdb/src/include/duckdb/main/relation/table_function_relation.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +3 -3
- package/src/duckdb/src/main/connection.cpp +4 -6
- package/src/duckdb/src/main/extension/extension_install.cpp +2 -1
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +28 -9
- package/src/duckdb/src/main/relation/table_function_relation.cpp +8 -2
- package/src/duckdb/src/storage/checkpoint_manager.cpp +3 -3
- package/src/duckdb/src/storage/table/table_statistics.cpp +1 -3
- package/src/duckdb/src/storage/wal_replay.cpp +8 -2
package/package.json
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
#include "buffered_json_reader.hpp"
|
2
2
|
|
3
3
|
#include "duckdb/common/file_opener.hpp"
|
4
|
-
#include "duckdb/common/printer.hpp"
|
5
|
-
#include "duckdb/common/serializer/serializer.hpp"
|
6
4
|
#include "duckdb/common/serializer/deserializer.hpp"
|
5
|
+
#include "duckdb/common/serializer/serializer.hpp"
|
7
6
|
|
8
7
|
#include <utility>
|
9
8
|
|
@@ -24,7 +23,7 @@ bool JSONFileHandle::IsOpen() const {
|
|
24
23
|
}
|
25
24
|
|
26
25
|
void JSONFileHandle::Close() {
|
27
|
-
if (IsOpen()) {
|
26
|
+
if (IsOpen() && plain_file_source) {
|
28
27
|
file_handle->Close();
|
29
28
|
file_handle = nullptr;
|
30
29
|
}
|
@@ -174,12 +173,13 @@ BufferedJSONReader::BufferedJSONReader(ClientContext &context, BufferedJSONReade
|
|
174
173
|
}
|
175
174
|
|
176
175
|
void BufferedJSONReader::OpenJSONFile() {
|
177
|
-
D_ASSERT(!IsOpen());
|
178
176
|
lock_guard<mutex> guard(lock);
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
177
|
+
if (!IsOpen()) {
|
178
|
+
auto &file_system = FileSystem::GetFileSystem(context);
|
179
|
+
auto regular_file_handle = file_system.OpenFile(file_name.c_str(), FileFlags::FILE_FLAGS_READ,
|
180
|
+
FileLockType::NO_LOCK, options.compression);
|
181
|
+
file_handle = make_uniq<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
|
182
|
+
}
|
183
183
|
Reset();
|
184
184
|
}
|
185
185
|
|
@@ -17,6 +17,7 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
|
|
17
17
|
Vector string_vector(LogicalType::VARCHAR);
|
18
18
|
|
19
19
|
// Loop through the files (if union_by_name, else just sample the first file)
|
20
|
+
idx_t remaining = bind_data.sample_size;
|
20
21
|
for (idx_t file_idx = 0; file_idx < bind_data.files.size(); file_idx++) {
|
21
22
|
// Create global/local state and place the reader in the right field
|
22
23
|
JSONScanGlobalState gstate(context, bind_data);
|
@@ -28,7 +29,6 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
|
|
28
29
|
}
|
29
30
|
|
30
31
|
// Read and detect schema
|
31
|
-
idx_t remaining = bind_data.sample_size;
|
32
32
|
while (remaining != 0) {
|
33
33
|
allocator.Reset();
|
34
34
|
auto read_count = lstate.ReadNext(gstate);
|
@@ -56,7 +56,11 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
|
|
56
56
|
}
|
57
57
|
|
58
58
|
// Close the file and stop detection if not union_by_name
|
59
|
-
if (
|
59
|
+
if (bind_data.options.file_options.union_by_name) {
|
60
|
+
// When union_by_name=true we sample sample_size per file
|
61
|
+
remaining = bind_data.sample_size;
|
62
|
+
} else if (remaining == 0) {
|
63
|
+
// When union_by_name=false, we sample sample_size in total (across the first files)
|
60
64
|
break;
|
61
65
|
}
|
62
66
|
}
|
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
#include "duckdb/common/enum_util.hpp"
|
4
4
|
#include "duckdb/common/multi_file_reader.hpp"
|
5
|
+
#include "duckdb/common/serializer/deserializer.hpp"
|
6
|
+
#include "duckdb/common/serializer/serializer.hpp"
|
5
7
|
#include "duckdb/main/extension_helper.hpp"
|
6
8
|
#include "duckdb/parallel/task_scheduler.hpp"
|
7
9
|
#include "duckdb/storage/buffer_manager.hpp"
|
8
|
-
#include "duckdb/common/serializer/serializer.hpp"
|
9
|
-
#include "duckdb/common/serializer/deserializer.hpp"
|
10
10
|
|
11
11
|
namespace duckdb {
|
12
12
|
|
@@ -558,10 +558,8 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
|
|
558
558
|
if (current_reader) {
|
559
559
|
// If we performed the final read of this reader in the previous iteration, close it now
|
560
560
|
if (is_last) {
|
561
|
-
|
562
|
-
|
563
|
-
current_reader->CloseJSONFile();
|
564
|
-
}
|
561
|
+
TryIncrementFileIndex(gstate);
|
562
|
+
current_reader->CloseJSONFile();
|
565
563
|
current_reader = nullptr;
|
566
564
|
continue;
|
567
565
|
}
|
@@ -11,6 +11,7 @@
|
|
11
11
|
|
12
12
|
#include "duckdb/common/enum_util.hpp"
|
13
13
|
#include "duckdb/catalog/catalog_entry/table_column_type.hpp"
|
14
|
+
#include "duckdb/common/box_renderer.hpp"
|
14
15
|
#include "duckdb/common/enums/access_mode.hpp"
|
15
16
|
#include "duckdb/common/enums/aggregate_handling.hpp"
|
16
17
|
#include "duckdb/common/enums/catalog_type.hpp"
|
@@ -4797,6 +4798,29 @@ RelationType EnumUtil::FromString<RelationType>(const char *value) {
|
|
4797
4798
|
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
|
4798
4799
|
}
|
4799
4800
|
|
4801
|
+
template<>
|
4802
|
+
const char* EnumUtil::ToChars<RenderMode>(RenderMode value) {
|
4803
|
+
switch(value) {
|
4804
|
+
case RenderMode::ROWS:
|
4805
|
+
return "ROWS";
|
4806
|
+
case RenderMode::COLUMNS:
|
4807
|
+
return "COLUMNS";
|
4808
|
+
default:
|
4809
|
+
throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
|
4810
|
+
}
|
4811
|
+
}
|
4812
|
+
|
4813
|
+
template<>
|
4814
|
+
RenderMode EnumUtil::FromString<RenderMode>(const char *value) {
|
4815
|
+
if (StringUtil::Equals(value, "ROWS")) {
|
4816
|
+
return RenderMode::ROWS;
|
4817
|
+
}
|
4818
|
+
if (StringUtil::Equals(value, "COLUMNS")) {
|
4819
|
+
return RenderMode::COLUMNS;
|
4820
|
+
}
|
4821
|
+
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
|
4822
|
+
}
|
4823
|
+
|
4800
4824
|
template<>
|
4801
4825
|
const char* EnumUtil::ToChars<ResultModifierType>(ResultModifierType value) {
|
4802
4826
|
switch(value) {
|
@@ -2,6 +2,8 @@
|
|
2
2
|
#include "duckdb/common/bind_helpers.hpp"
|
3
3
|
#include "duckdb/common/vector_size.hpp"
|
4
4
|
#include "duckdb/common/string_util.hpp"
|
5
|
+
#include "duckdb/common/enum_util.hpp"
|
6
|
+
#include "duckdb/common/multi_file_reader.hpp"
|
5
7
|
|
6
8
|
namespace duckdb {
|
7
9
|
|
@@ -60,6 +62,10 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
|
|
60
62
|
return value.GetValue<int64_t>();
|
61
63
|
}
|
62
64
|
|
65
|
+
bool CSVReaderOptions::GetHeader() const {
|
66
|
+
return this->dialect_options.header;
|
67
|
+
}
|
68
|
+
|
63
69
|
void CSVReaderOptions::SetHeader(bool input) {
|
64
70
|
this->dialect_options.header = input;
|
65
71
|
this->has_header = true;
|
@@ -69,6 +75,10 @@ void CSVReaderOptions::SetCompression(const string &compression_p) {
|
|
69
75
|
this->compression = FileCompressionTypeFromString(compression_p);
|
70
76
|
}
|
71
77
|
|
78
|
+
string CSVReaderOptions::GetEscape() const {
|
79
|
+
return std::string(1, this->dialect_options.state_machine_options.escape);
|
80
|
+
}
|
81
|
+
|
72
82
|
void CSVReaderOptions::SetEscape(const string &input) {
|
73
83
|
auto escape_str = input;
|
74
84
|
if (escape_str.size() > 1) {
|
@@ -81,6 +91,19 @@ void CSVReaderOptions::SetEscape(const string &input) {
|
|
81
91
|
this->has_escape = true;
|
82
92
|
}
|
83
93
|
|
94
|
+
int64_t CSVReaderOptions::GetSkipRows() const {
|
95
|
+
return this->dialect_options.skip_rows;
|
96
|
+
}
|
97
|
+
|
98
|
+
void CSVReaderOptions::SetSkipRows(int64_t skip_rows) {
|
99
|
+
dialect_options.skip_rows = skip_rows;
|
100
|
+
skip_rows_set = true;
|
101
|
+
}
|
102
|
+
|
103
|
+
string CSVReaderOptions::GetDelimiter() const {
|
104
|
+
return std::string(1, this->dialect_options.state_machine_options.delimiter);
|
105
|
+
}
|
106
|
+
|
84
107
|
void CSVReaderOptions::SetDelimiter(const string &input) {
|
85
108
|
auto delim_str = StringUtil::Replace(input, "\\t", "\t");
|
86
109
|
if (delim_str.size() > 1) {
|
@@ -93,6 +116,10 @@ void CSVReaderOptions::SetDelimiter(const string &input) {
|
|
93
116
|
this->dialect_options.state_machine_options.delimiter = delim_str[0];
|
94
117
|
}
|
95
118
|
|
119
|
+
string CSVReaderOptions::GetQuote() const {
|
120
|
+
return std::string(1, this->dialect_options.state_machine_options.quote);
|
121
|
+
}
|
122
|
+
|
96
123
|
void CSVReaderOptions::SetQuote(const string &quote_p) {
|
97
124
|
auto quote_str = quote_p;
|
98
125
|
if (quote_str.size() > 1) {
|
@@ -105,6 +132,10 @@ void CSVReaderOptions::SetQuote(const string &quote_p) {
|
|
105
132
|
this->has_quote = true;
|
106
133
|
}
|
107
134
|
|
135
|
+
NewLineIdentifier CSVReaderOptions::GetNewline() const {
|
136
|
+
return dialect_options.new_line;
|
137
|
+
}
|
138
|
+
|
108
139
|
void CSVReaderOptions::SetNewline(const string &input) {
|
109
140
|
if (input == "\\n" || input == "\\r") {
|
110
141
|
dialect_options.new_line = NewLineIdentifier::SINGLE;
|
@@ -152,8 +183,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
|
|
152
183
|
sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
|
153
184
|
}
|
154
185
|
} else if (loption == "skip") {
|
155
|
-
|
156
|
-
skip_rows_set = true;
|
186
|
+
SetSkipRows(ParseInteger(value, loption));
|
157
187
|
} else if (loption == "max_line_size" || loption == "maximum_line_size") {
|
158
188
|
maximum_line_size = ParseInteger(value, loption);
|
159
189
|
} else if (loption == "sample_chunk_size") {
|
@@ -296,4 +326,185 @@ string CSVReaderOptions::ToString() const {
|
|
296
326
|
"\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
|
297
327
|
}
|
298
328
|
|
329
|
+
static Value StringVectorToValue(const vector<string> &vec) {
|
330
|
+
vector<Value> content;
|
331
|
+
content.reserve(vec.size());
|
332
|
+
for (auto &item : vec) {
|
333
|
+
content.push_back(Value(item));
|
334
|
+
}
|
335
|
+
return Value::LIST(std::move(content));
|
336
|
+
}
|
337
|
+
|
338
|
+
static uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
|
339
|
+
//! Const ht with accepted auto_types and their weights in specificity
|
340
|
+
const duckdb::unordered_map<uint8_t, uint8_t> auto_type_candidates_specificity {
|
341
|
+
{(uint8_t)LogicalTypeId::VARCHAR, 0}, {(uint8_t)LogicalTypeId::TIMESTAMP, 1},
|
342
|
+
{(uint8_t)LogicalTypeId::DATE, 2}, {(uint8_t)LogicalTypeId::TIME, 3},
|
343
|
+
{(uint8_t)LogicalTypeId::DOUBLE, 4}, {(uint8_t)LogicalTypeId::FLOAT, 5},
|
344
|
+
{(uint8_t)LogicalTypeId::BIGINT, 6}, {(uint8_t)LogicalTypeId::INTEGER, 7},
|
345
|
+
{(uint8_t)LogicalTypeId::SMALLINT, 8}, {(uint8_t)LogicalTypeId::TINYINT, 9},
|
346
|
+
{(uint8_t)LogicalTypeId::BOOLEAN, 10}, {(uint8_t)LogicalTypeId::SQLNULL, 11}};
|
347
|
+
|
348
|
+
auto id = (uint8_t)candidate_type.id();
|
349
|
+
auto it = auto_type_candidates_specificity.find(id);
|
350
|
+
if (it == auto_type_candidates_specificity.end()) {
|
351
|
+
throw BinderException("Auto Type Candidate of type %s is not accepted as a valid input",
|
352
|
+
EnumUtil::ToString(candidate_type.id()));
|
353
|
+
}
|
354
|
+
return it->second;
|
355
|
+
}
|
356
|
+
|
357
|
+
void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
|
358
|
+
vector<LogicalType> &return_types, vector<string> &names) {
|
359
|
+
for (auto &kv : in) {
|
360
|
+
if (MultiFileReader::ParseOption(kv.first, kv.second, file_options, context)) {
|
361
|
+
continue;
|
362
|
+
}
|
363
|
+
auto loption = StringUtil::Lower(kv.first);
|
364
|
+
if (loption == "columns") {
|
365
|
+
explicitly_set_columns = true;
|
366
|
+
auto &child_type = kv.second.type();
|
367
|
+
if (child_type.id() != LogicalTypeId::STRUCT) {
|
368
|
+
throw BinderException("read_csv columns requires a struct as input");
|
369
|
+
}
|
370
|
+
auto &struct_children = StructValue::GetChildren(kv.second);
|
371
|
+
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
372
|
+
for (idx_t i = 0; i < struct_children.size(); i++) {
|
373
|
+
auto &name = StructType::GetChildName(child_type, i);
|
374
|
+
auto &val = struct_children[i];
|
375
|
+
names.push_back(name);
|
376
|
+
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
377
|
+
throw BinderException("read_csv requires a type specification as string");
|
378
|
+
}
|
379
|
+
return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
|
380
|
+
}
|
381
|
+
if (names.empty()) {
|
382
|
+
throw BinderException("read_csv requires at least a single column as input!");
|
383
|
+
}
|
384
|
+
} else if (loption == "auto_type_candidates") {
|
385
|
+
auto_type_candidates.clear();
|
386
|
+
map<uint8_t, LogicalType> candidate_types;
|
387
|
+
// We always have the extremes of Null and Varchar, so we can default to varchar if the
|
388
|
+
// sniffer is not able to confidently detect that column type
|
389
|
+
candidate_types[GetCandidateSpecificity(LogicalType::VARCHAR)] = LogicalType::VARCHAR;
|
390
|
+
candidate_types[GetCandidateSpecificity(LogicalType::SQLNULL)] = LogicalType::SQLNULL;
|
391
|
+
|
392
|
+
auto &child_type = kv.second.type();
|
393
|
+
if (child_type.id() != LogicalTypeId::LIST) {
|
394
|
+
throw BinderException("read_csv auto_types requires a list as input");
|
395
|
+
}
|
396
|
+
auto &list_children = ListValue::GetChildren(kv.second);
|
397
|
+
if (list_children.empty()) {
|
398
|
+
throw BinderException("auto_type_candidates requires at least one type");
|
399
|
+
}
|
400
|
+
for (auto &child : list_children) {
|
401
|
+
if (child.type().id() != LogicalTypeId::VARCHAR) {
|
402
|
+
throw BinderException("auto_type_candidates requires a type specification as string");
|
403
|
+
}
|
404
|
+
auto candidate_type = TransformStringToLogicalType(StringValue::Get(child), context);
|
405
|
+
candidate_types[GetCandidateSpecificity(candidate_type)] = candidate_type;
|
406
|
+
}
|
407
|
+
for (auto &candidate_type : candidate_types) {
|
408
|
+
auto_type_candidates.emplace_back(candidate_type.second);
|
409
|
+
}
|
410
|
+
} else if (loption == "column_names" || loption == "names") {
|
411
|
+
if (!name_list.empty()) {
|
412
|
+
throw BinderException("read_csv_auto column_names/names can only be supplied once");
|
413
|
+
}
|
414
|
+
if (kv.second.IsNull()) {
|
415
|
+
throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
|
416
|
+
}
|
417
|
+
auto &children = ListValue::GetChildren(kv.second);
|
418
|
+
for (auto &child : children) {
|
419
|
+
name_list.push_back(StringValue::Get(child));
|
420
|
+
}
|
421
|
+
} else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
|
422
|
+
auto &child_type = kv.second.type();
|
423
|
+
if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
|
424
|
+
throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
|
425
|
+
}
|
426
|
+
if (!sql_type_list.empty()) {
|
427
|
+
throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
|
428
|
+
}
|
429
|
+
vector<string> sql_type_names;
|
430
|
+
if (child_type.id() == LogicalTypeId::STRUCT) {
|
431
|
+
auto &struct_children = StructValue::GetChildren(kv.second);
|
432
|
+
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
433
|
+
for (idx_t i = 0; i < struct_children.size(); i++) {
|
434
|
+
auto &name = StructType::GetChildName(child_type, i);
|
435
|
+
auto &val = struct_children[i];
|
436
|
+
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
437
|
+
throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
|
438
|
+
}
|
439
|
+
sql_type_names.push_back(StringValue::Get(val));
|
440
|
+
sql_types_per_column[name] = i;
|
441
|
+
}
|
442
|
+
} else {
|
443
|
+
auto &list_child = ListType::GetChildType(child_type);
|
444
|
+
if (list_child.id() != LogicalTypeId::VARCHAR) {
|
445
|
+
throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
|
446
|
+
}
|
447
|
+
auto &children = ListValue::GetChildren(kv.second);
|
448
|
+
for (auto &child : children) {
|
449
|
+
sql_type_names.push_back(StringValue::Get(child));
|
450
|
+
}
|
451
|
+
}
|
452
|
+
sql_type_list.reserve(sql_type_names.size());
|
453
|
+
for (auto &sql_type : sql_type_names) {
|
454
|
+
auto def_type = TransformStringToLogicalType(sql_type);
|
455
|
+
if (def_type.id() == LogicalTypeId::USER) {
|
456
|
+
throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
|
457
|
+
kv.first);
|
458
|
+
}
|
459
|
+
sql_type_list.push_back(std::move(def_type));
|
460
|
+
}
|
461
|
+
} else if (loption == "all_varchar") {
|
462
|
+
all_varchar = BooleanValue::Get(kv.second);
|
463
|
+
} else if (loption == "normalize_names") {
|
464
|
+
normalize_names = BooleanValue::Get(kv.second);
|
465
|
+
} else {
|
466
|
+
SetReadOption(loption, kv.second, names);
|
467
|
+
}
|
468
|
+
}
|
469
|
+
}
|
470
|
+
|
471
|
+
//! This function is used to remember options set by the sniffer, for use in ReadCSVRelation
|
472
|
+
void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
|
473
|
+
if (has_delimiter) {
|
474
|
+
named_params["delim"] = Value(GetDelimiter());
|
475
|
+
}
|
476
|
+
if (has_newline) {
|
477
|
+
named_params["newline"] = Value(EnumUtil::ToString(GetNewline()));
|
478
|
+
}
|
479
|
+
if (has_quote) {
|
480
|
+
named_params["quote"] = Value(GetQuote());
|
481
|
+
}
|
482
|
+
if (has_escape) {
|
483
|
+
named_params["escape"] = Value(GetEscape());
|
484
|
+
}
|
485
|
+
if (has_header) {
|
486
|
+
named_params["header"] = Value(GetHeader());
|
487
|
+
}
|
488
|
+
named_params["max_line_size"] = Value::BIGINT(maximum_line_size);
|
489
|
+
if (skip_rows_set) {
|
490
|
+
named_params["skip"] = Value::BIGINT(GetSkipRows());
|
491
|
+
}
|
492
|
+
named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
|
493
|
+
named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
|
494
|
+
named_params["null_padding"] = Value::BOOLEAN(null_padding);
|
495
|
+
if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
|
496
|
+
named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
|
497
|
+
}
|
498
|
+
if (!date_format.at(LogicalType::TIMESTAMP).format_specifier.empty()) {
|
499
|
+
named_params["timestampformat"] = Value(date_format.at(LogicalType::TIMESTAMP).format_specifier);
|
500
|
+
}
|
501
|
+
|
502
|
+
named_params["normalize_names"] = Value::BOOLEAN(normalize_names);
|
503
|
+
if (!name_list.empty() && !named_params.count("column_names") && !named_params.count("names")) {
|
504
|
+
named_params["column_names"] = StringVectorToValue(name_list);
|
505
|
+
}
|
506
|
+
named_params["all_varchar"] = Value::BOOLEAN(all_varchar);
|
507
|
+
named_params["maximum_line_size"] = Value::BIGINT(maximum_line_size);
|
508
|
+
}
|
509
|
+
|
299
510
|
} // namespace duckdb
|
@@ -85,25 +85,6 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
|
|
85
85
|
}
|
86
86
|
}
|
87
87
|
|
88
|
-
uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
|
89
|
-
//! Const ht with accepted auto_types and their weights in specificity
|
90
|
-
const duckdb::unordered_map<uint8_t, uint8_t> auto_type_candidates_specificity {
|
91
|
-
{(uint8_t)LogicalTypeId::VARCHAR, 0}, {(uint8_t)LogicalTypeId::TIMESTAMP, 1},
|
92
|
-
{(uint8_t)LogicalTypeId::DATE, 2}, {(uint8_t)LogicalTypeId::TIME, 3},
|
93
|
-
{(uint8_t)LogicalTypeId::DOUBLE, 4}, {(uint8_t)LogicalTypeId::FLOAT, 5},
|
94
|
-
{(uint8_t)LogicalTypeId::BIGINT, 6}, {(uint8_t)LogicalTypeId::INTEGER, 7},
|
95
|
-
{(uint8_t)LogicalTypeId::SMALLINT, 8}, {(uint8_t)LogicalTypeId::TINYINT, 9},
|
96
|
-
{(uint8_t)LogicalTypeId::BOOLEAN, 10}, {(uint8_t)LogicalTypeId::SQLNULL, 11}};
|
97
|
-
|
98
|
-
auto id = (uint8_t)candidate_type.id();
|
99
|
-
auto it = auto_type_candidates_specificity.find(id);
|
100
|
-
if (it == auto_type_candidates_specificity.end()) {
|
101
|
-
throw BinderException("Auto Type Candidate of type %s is not accepted as a valid input",
|
102
|
-
EnumUtil::ToString(candidate_type.id()));
|
103
|
-
}
|
104
|
-
return it->second;
|
105
|
-
}
|
106
|
-
|
107
88
|
static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctionBindInput &input,
|
108
89
|
vector<LogicalType> &return_types, vector<string> &names) {
|
109
90
|
|
@@ -111,117 +92,9 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
111
92
|
auto &options = result->options;
|
112
93
|
result->files = MultiFileReader::GetFileList(context, input.inputs[0], "CSV");
|
113
94
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
continue;
|
118
|
-
}
|
119
|
-
auto loption = StringUtil::Lower(kv.first);
|
120
|
-
if (loption == "columns") {
|
121
|
-
explicitly_set_columns = true;
|
122
|
-
auto &child_type = kv.second.type();
|
123
|
-
if (child_type.id() != LogicalTypeId::STRUCT) {
|
124
|
-
throw BinderException("read_csv columns requires a struct as input");
|
125
|
-
}
|
126
|
-
auto &struct_children = StructValue::GetChildren(kv.second);
|
127
|
-
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
128
|
-
for (idx_t i = 0; i < struct_children.size(); i++) {
|
129
|
-
auto &name = StructType::GetChildName(child_type, i);
|
130
|
-
auto &val = struct_children[i];
|
131
|
-
names.push_back(name);
|
132
|
-
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
133
|
-
throw BinderException("read_csv requires a type specification as string");
|
134
|
-
}
|
135
|
-
return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
|
136
|
-
}
|
137
|
-
if (names.empty()) {
|
138
|
-
throw BinderException("read_csv requires at least a single column as input!");
|
139
|
-
}
|
140
|
-
} else if (loption == "auto_type_candidates") {
|
141
|
-
options.auto_type_candidates.clear();
|
142
|
-
map<uint8_t, LogicalType> candidate_types;
|
143
|
-
// We always have the extremes of Null and Varchar, so we can default to varchar if the
|
144
|
-
// sniffer is not able to confidently detect that column type
|
145
|
-
candidate_types[GetCandidateSpecificity(LogicalType::VARCHAR)] = LogicalType::VARCHAR;
|
146
|
-
candidate_types[GetCandidateSpecificity(LogicalType::SQLNULL)] = LogicalType::SQLNULL;
|
147
|
-
|
148
|
-
auto &child_type = kv.second.type();
|
149
|
-
if (child_type.id() != LogicalTypeId::LIST) {
|
150
|
-
throw BinderException("read_csv auto_types requires a list as input");
|
151
|
-
}
|
152
|
-
auto &list_children = ListValue::GetChildren(kv.second);
|
153
|
-
if (list_children.empty()) {
|
154
|
-
throw BinderException("auto_type_candidates requires at least one type");
|
155
|
-
}
|
156
|
-
for (auto &child : list_children) {
|
157
|
-
if (child.type().id() != LogicalTypeId::VARCHAR) {
|
158
|
-
throw BinderException("auto_type_candidates requires a type specification as string");
|
159
|
-
}
|
160
|
-
auto candidate_type = TransformStringToLogicalType(StringValue::Get(child), context);
|
161
|
-
candidate_types[GetCandidateSpecificity(candidate_type)] = candidate_type;
|
162
|
-
}
|
163
|
-
for (auto &candidate_type : candidate_types) {
|
164
|
-
options.auto_type_candidates.emplace_back(candidate_type.second);
|
165
|
-
}
|
166
|
-
} else if (loption == "column_names" || loption == "names") {
|
167
|
-
if (!options.name_list.empty()) {
|
168
|
-
throw BinderException("read_csv_auto column_names/names can only be supplied once");
|
169
|
-
}
|
170
|
-
if (kv.second.IsNull()) {
|
171
|
-
throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
|
172
|
-
}
|
173
|
-
auto &children = ListValue::GetChildren(kv.second);
|
174
|
-
for (auto &child : children) {
|
175
|
-
options.name_list.push_back(StringValue::Get(child));
|
176
|
-
}
|
177
|
-
} else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
|
178
|
-
auto &child_type = kv.second.type();
|
179
|
-
if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
|
180
|
-
throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
|
181
|
-
}
|
182
|
-
if (!options.sql_type_list.empty()) {
|
183
|
-
throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
|
184
|
-
}
|
185
|
-
vector<string> sql_type_names;
|
186
|
-
if (child_type.id() == LogicalTypeId::STRUCT) {
|
187
|
-
auto &struct_children = StructValue::GetChildren(kv.second);
|
188
|
-
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
189
|
-
for (idx_t i = 0; i < struct_children.size(); i++) {
|
190
|
-
auto &name = StructType::GetChildName(child_type, i);
|
191
|
-
auto &val = struct_children[i];
|
192
|
-
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
193
|
-
throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
|
194
|
-
}
|
195
|
-
sql_type_names.push_back(StringValue::Get(val));
|
196
|
-
options.sql_types_per_column[name] = i;
|
197
|
-
}
|
198
|
-
} else {
|
199
|
-
auto &list_child = ListType::GetChildType(child_type);
|
200
|
-
if (list_child.id() != LogicalTypeId::VARCHAR) {
|
201
|
-
throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
|
202
|
-
}
|
203
|
-
auto &children = ListValue::GetChildren(kv.second);
|
204
|
-
for (auto &child : children) {
|
205
|
-
sql_type_names.push_back(StringValue::Get(child));
|
206
|
-
}
|
207
|
-
}
|
208
|
-
options.sql_type_list.reserve(sql_type_names.size());
|
209
|
-
for (auto &sql_type : sql_type_names) {
|
210
|
-
auto def_type = TransformStringToLogicalType(sql_type);
|
211
|
-
if (def_type.id() == LogicalTypeId::USER) {
|
212
|
-
throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
|
213
|
-
kv.first);
|
214
|
-
}
|
215
|
-
options.sql_type_list.push_back(std::move(def_type));
|
216
|
-
}
|
217
|
-
} else if (loption == "all_varchar") {
|
218
|
-
options.all_varchar = BooleanValue::Get(kv.second);
|
219
|
-
} else if (loption == "normalize_names") {
|
220
|
-
options.normalize_names = BooleanValue::Get(kv.second);
|
221
|
-
} else {
|
222
|
-
options.SetReadOption(loption, kv.second, names);
|
223
|
-
}
|
224
|
-
}
|
95
|
+
options.FromNamedParameters(input.named_parameters, context, return_types, names);
|
96
|
+
bool explicitly_set_columns = options.explicitly_set_columns;
|
97
|
+
|
225
98
|
options.file_options.AutoDetectHivePartitioning(result->files, context);
|
226
99
|
|
227
100
|
if (!options.auto_detect && return_types.empty()) {
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.8.2-dev4572"
|
2
|
+
#define DUCKDB_VERSION "0.8.2-dev4653"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "bb287d4b22"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -18,7 +18,7 @@ class ColumnDataCollection;
|
|
18
18
|
class ColumnDataRowCollection;
|
19
19
|
|
20
20
|
enum class ValueRenderAlignment { LEFT, MIDDLE, RIGHT };
|
21
|
-
enum class RenderMode { ROWS, COLUMNS };
|
21
|
+
enum class RenderMode : uint8_t { ROWS, COLUMNS };
|
22
22
|
|
23
23
|
struct BoxRendererConfig {
|
24
24
|
// a max_width of 0 means we default to the terminal width
|
@@ -216,6 +216,8 @@ enum class QuoteRule : uint8_t;
|
|
216
216
|
|
217
217
|
enum class RelationType : uint8_t;
|
218
218
|
|
219
|
+
enum class RenderMode : uint8_t;
|
220
|
+
|
219
221
|
enum class ResultModifierType : uint8_t;
|
220
222
|
|
221
223
|
enum class SampleMethod : uint8_t;
|
@@ -565,6 +567,9 @@ const char* EnumUtil::ToChars<QuoteRule>(QuoteRule value);
|
|
565
567
|
template<>
|
566
568
|
const char* EnumUtil::ToChars<RelationType>(RelationType value);
|
567
569
|
|
570
|
+
template<>
|
571
|
+
const char* EnumUtil::ToChars<RenderMode>(RenderMode value);
|
572
|
+
|
568
573
|
template<>
|
569
574
|
const char* EnumUtil::ToChars<ResultModifierType>(ResultModifierType value);
|
570
575
|
|
@@ -950,6 +955,9 @@ QuoteRule EnumUtil::FromString<QuoteRule>(const char *value);
|
|
950
955
|
template<>
|
951
956
|
RelationType EnumUtil::FromString<RelationType>(const char *value);
|
952
957
|
|
958
|
+
template<>
|
959
|
+
RenderMode EnumUtil::FromString<RenderMode>(const char *value);
|
960
|
+
|
953
961
|
template<>
|
954
962
|
ResultModifierType EnumUtil::FromString<ResultModifierType>(const char *value);
|
955
963
|
|
@@ -159,18 +159,33 @@ struct CSVReaderOptions {
|
|
159
159
|
string suffix;
|
160
160
|
string write_newline;
|
161
161
|
|
162
|
+
//! The date format to use (if any is specified)
|
163
|
+
map<LogicalTypeId, StrpTimeFormat> date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}};
|
162
164
|
//! The date format to use for writing (if any is specified)
|
163
165
|
map<LogicalTypeId, StrfTimeFormat> write_date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}};
|
166
|
+
//! Whether or not a type format is specified
|
167
|
+
map<LogicalTypeId, bool> has_format = {{LogicalTypeId::DATE, false}, {LogicalTypeId::TIMESTAMP, false}};
|
164
168
|
|
165
169
|
void Serialize(Serializer &serializer) const;
|
166
170
|
static CSVReaderOptions Deserialize(Deserializer &deserializer);
|
167
171
|
|
168
172
|
void SetCompression(const string &compression);
|
173
|
+
|
174
|
+
bool GetHeader() const;
|
169
175
|
void SetHeader(bool has_header);
|
176
|
+
|
177
|
+
string GetEscape() const;
|
170
178
|
void SetEscape(const string &escape);
|
179
|
+
|
180
|
+
int64_t GetSkipRows() const;
|
181
|
+
void SetSkipRows(int64_t rows);
|
182
|
+
|
183
|
+
string GetQuote() const;
|
171
184
|
void SetQuote(const string &quote);
|
172
185
|
void SetDelimiter(const string &delimiter);
|
186
|
+
string GetDelimiter() const;
|
173
187
|
|
188
|
+
NewLineIdentifier GetNewline() const;
|
174
189
|
void SetNewline(const string &input);
|
175
190
|
//! Set an option that is supported by both reading and writing functions, called by
|
176
191
|
//! the SetReadOption and SetWriteOption methods
|
@@ -182,7 +197,16 @@ struct CSVReaderOptions {
|
|
182
197
|
void SetReadOption(const string &loption, const Value &value, vector<string> &expected_names);
|
183
198
|
void SetWriteOption(const string &loption, const Value &value);
|
184
199
|
void SetDateFormat(LogicalTypeId type, const string &format, bool read_format);
|
200
|
+
void ToNamedParameters(named_parameter_map_t &out);
|
201
|
+
void FromNamedParameters(named_parameter_map_t &in, ClientContext &context, vector<LogicalType> &return_types,
|
202
|
+
vector<string> &names);
|
185
203
|
|
186
204
|
string ToString() const;
|
205
|
+
|
206
|
+
named_parameter_map_t OutputReadSettings();
|
207
|
+
|
208
|
+
public:
|
209
|
+
//! Whether columns were explicitly provided through named parameters
|
210
|
+
bool explicitly_set_columns = false;
|
187
211
|
};
|
188
212
|
} // namespace duckdb
|
@@ -131,7 +131,7 @@ public:
|
|
131
131
|
|
132
132
|
//! Reads CSV file
|
133
133
|
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file);
|
134
|
-
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file,
|
134
|
+
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, named_parameter_map_t &&options);
|
135
135
|
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, const vector<string> &columns);
|
136
136
|
|
137
137
|
//! Reads Parquet file
|
@@ -118,6 +118,7 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = {
|
|
118
118
|
{"st_dwithin_spheroid", "spatial"},
|
119
119
|
{"st_envelope", "spatial"},
|
120
120
|
{"st_equals", "spatial"},
|
121
|
+
{"st_extent", "spatial"},
|
121
122
|
{"st_flipcoordinates", "spatial"},
|
122
123
|
{"st_geometrytype", "spatial"},
|
123
124
|
{"st_geomfromgeojson", "spatial"},
|
@@ -126,6 +127,7 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = {
|
|
126
127
|
{"st_geomfromtext", "spatial"},
|
127
128
|
{"st_geomfromwkb", "spatial"},
|
128
129
|
{"st_intersection", "spatial"},
|
130
|
+
{"st_intersection_agg", "spatial"},
|
129
131
|
{"st_intersects", "spatial"},
|
130
132
|
{"st_isclosed", "spatial"},
|
131
133
|
{"st_isempty", "spatial"},
|
@@ -159,9 +161,14 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = {
|
|
159
161
|
{"st_touches", "spatial"},
|
160
162
|
{"st_transform", "spatial"},
|
161
163
|
{"st_union", "spatial"},
|
164
|
+
{"st_union_agg", "spatial"},
|
162
165
|
{"st_within", "spatial"},
|
163
166
|
{"st_x", "spatial"},
|
167
|
+
{"st_xmax", "spatial"},
|
168
|
+
{"st_xmin", "spatial"},
|
164
169
|
{"st_y", "spatial"},
|
170
|
+
{"st_ymax", "spatial"},
|
171
|
+
{"st_ymin", "spatial"},
|
165
172
|
{"stem", "fts"},
|
166
173
|
{"text", "excel"},
|
167
174
|
{"to_arrow_ipc", "arrow"},
|
@@ -220,10 +227,9 @@ static constexpr ExtensionEntry EXTENSION_FILE_PREFIXES[] = {
|
|
220
227
|
|
221
228
|
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
|
222
229
|
// TODO: automate by passing though to script via duckdb
|
223
|
-
static constexpr ExtensionEntry EXTENSION_FILE_POSTFIXES[] = {
|
224
|
-
|
225
|
-
|
226
|
-
{".ndjson", "json"}}; // END_OF_EXTENSION_FILE_POSTFIXES
|
230
|
+
static constexpr ExtensionEntry EXTENSION_FILE_POSTFIXES[] = {
|
231
|
+
{".parquet", "parquet"}, {".json", "json"}, {".jsonl", "json"}, {".ndjson", "json"},
|
232
|
+
{".shp", "spatial"}, {".gpkg", "spatial"}, {".fgb", "spatial"}}; // END_OF_EXTENSION_FILE_POSTFIXES
|
227
233
|
|
228
234
|
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
|
229
235
|
// TODO: automate by passing though to script via duckdb
|
@@ -10,16 +10,16 @@
|
|
10
10
|
|
11
11
|
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
12
12
|
#include "duckdb/main/relation/table_function_relation.hpp"
|
13
|
+
#include "duckdb/common/shared_ptr.hpp"
|
14
|
+
#include "duckdb/common/case_insensitive_map.hpp"
|
13
15
|
|
14
16
|
namespace duckdb {
|
15
17
|
|
16
|
-
struct CSVReaderOptions;
|
17
|
-
|
18
18
|
class ReadCSVRelation : public TableFunctionRelation {
|
19
19
|
public:
|
20
20
|
ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file, vector<ColumnDefinition> columns,
|
21
21
|
string alias = string());
|
22
|
-
ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file,
|
22
|
+
ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file, named_parameter_map_t &&options,
|
23
23
|
string alias = string());
|
24
24
|
|
25
25
|
string alias;
|
@@ -1740,7 +1740,7 @@ DUCKDB_API duckdb_vector duckdb_struct_vector_get_child(duckdb_vector vector, id
|
|
1740
1740
|
/*!
|
1741
1741
|
Returns whether or not a row is valid (i.e. not NULL) in the given validity mask.
|
1742
1742
|
|
1743
|
-
* validity: The validity mask, as obtained through `
|
1743
|
+
* validity: The validity mask, as obtained through `duckdb_vector_get_validity`
|
1744
1744
|
* row: The row index
|
1745
1745
|
* returns: true if the row is valid, false otherwise
|
1746
1746
|
*/
|
@@ -1749,10 +1749,10 @@ DUCKDB_API bool duckdb_validity_row_is_valid(uint64_t *validity, idx_t row);
|
|
1749
1749
|
/*!
|
1750
1750
|
In a validity mask, sets a specific row to either valid or invalid.
|
1751
1751
|
|
1752
|
-
Note that `
|
1752
|
+
Note that `duckdb_vector_ensure_validity_writable` should be called before calling `duckdb_vector_get_validity`,
|
1753
1753
|
to ensure that there is a validity mask to write to.
|
1754
1754
|
|
1755
|
-
* validity: The validity mask, as obtained through `
|
1755
|
+
* validity: The validity mask, as obtained through `duckdb_vector_get_validity`.
|
1756
1756
|
* row: The row index
|
1757
1757
|
* valid: Whether or not to set the row to valid, or invalid
|
1758
1758
|
*/
|
@@ -219,14 +219,12 @@ shared_ptr<Relation> Connection::Values(const string &values, const vector<strin
|
|
219
219
|
}
|
220
220
|
|
221
221
|
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file) {
|
222
|
-
|
223
|
-
return ReadCSV(csv_file, options);
|
222
|
+
named_parameter_map_t options;
|
223
|
+
return ReadCSV(csv_file, std::move(options));
|
224
224
|
}
|
225
225
|
|
226
|
-
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file,
|
227
|
-
|
228
|
-
options.auto_detect = true;
|
229
|
-
return make_shared<ReadCSVRelation>(context, csv_file, options);
|
226
|
+
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file, named_parameter_map_t &&options) {
|
227
|
+
return make_shared<ReadCSVRelation>(context, csv_file, std::move(options));
|
230
228
|
}
|
231
229
|
|
232
230
|
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file, const vector<string> &columns) {
|
@@ -158,11 +158,12 @@ void WriteExtensionFileToDisk(FileSystem &fs, const string &path, void *data, id
|
|
158
158
|
}
|
159
159
|
|
160
160
|
string ExtensionHelper::ExtensionUrlTemplate(optional_ptr<const ClientConfig> client_config, const string &repository) {
|
161
|
-
string default_endpoint = "http://extensions.duckdb.org";
|
162
161
|
string versioned_path = "/${REVISION}/${PLATFORM}/${NAME}.duckdb_extension";
|
163
162
|
#ifdef WASM_LOADABLE_EXTENSIONS
|
163
|
+
string default_endpoint = "https://extensions.duckdb.org";
|
164
164
|
versioned_path = "/duckdb-wasm" + versioned_path + ".wasm";
|
165
165
|
#else
|
166
|
+
string default_endpoint = "http://extensions.duckdb.org";
|
166
167
|
versioned_path = versioned_path + ".gz";
|
167
168
|
#endif
|
168
169
|
string custom_endpoint = client_config ? client_config->custom_extension_repo : string();
|
@@ -1,6 +1,5 @@
|
|
1
1
|
#include "duckdb/main/relation/read_csv_relation.hpp"
|
2
2
|
|
3
|
-
#include "duckdb/common/string_util.hpp"
|
4
3
|
#include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
|
5
4
|
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
6
5
|
#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
|
@@ -8,6 +7,9 @@
|
|
8
7
|
#include "duckdb/parser/expression/comparison_expression.hpp"
|
9
8
|
#include "duckdb/parser/expression/constant_expression.hpp"
|
10
9
|
#include "duckdb/parser/expression/function_expression.hpp"
|
10
|
+
#include "duckdb/common/string_util.hpp"
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
12
|
+
#include "duckdb/common/multi_file_reader.hpp"
|
11
13
|
#include "duckdb/parser/expression/star_expression.hpp"
|
12
14
|
#include "duckdb/parser/query_node/select_node.hpp"
|
13
15
|
#include "duckdb/parser/tableref/basetableref.hpp"
|
@@ -34,8 +36,8 @@ ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const
|
|
34
36
|
AddNamedParameter("columns", Value::STRUCT(std::move(column_names)));
|
35
37
|
}
|
36
38
|
|
37
|
-
ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file,
|
38
|
-
|
39
|
+
ReadCSVRelation::ReadCSVRelation(const std::shared_ptr<ClientContext> &context, const string &csv_file,
|
40
|
+
named_parameter_map_t &&options, string alias_p)
|
39
41
|
: TableFunctionRelation(context, "read_csv_auto", {Value(csv_file)}, nullptr, false), alias(std::move(alias_p)),
|
40
42
|
auto_detect(true) {
|
41
43
|
|
@@ -43,12 +45,24 @@ ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const
|
|
43
45
|
alias = StringUtil::Split(csv_file, ".")[0];
|
44
46
|
}
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
auto
|
48
|
+
auto files = MultiFileReader::GetFileList(*context, csv_file, "CSV");
|
49
|
+
D_ASSERT(!files.empty());
|
50
|
+
|
51
|
+
auto &file_name = files[0];
|
52
|
+
options["auto_detect"] = Value::BOOLEAN(true);
|
53
|
+
CSVReaderOptions csv_options;
|
54
|
+
csv_options.file_path = file_name;
|
55
|
+
vector<string> empty;
|
56
|
+
|
57
|
+
vector<LogicalType> unused_types;
|
58
|
+
vector<string> unused_names;
|
59
|
+
csv_options.FromNamedParameters(options, *context, unused_types, unused_names);
|
60
|
+
// Run the auto-detect, populating the options with the detected settings
|
61
|
+
|
62
|
+
auto bm_file_handle = BaseCSVReader::OpenCSV(*context, csv_options);
|
63
|
+
auto buffer_manager = make_shared<CSVBufferManager>(*context, std::move(bm_file_handle), csv_options);
|
50
64
|
CSVStateMachineCache state_machine_cache;
|
51
|
-
CSVSniffer sniffer(
|
65
|
+
CSVSniffer sniffer(csv_options, buffer_manager, state_machine_cache);
|
52
66
|
auto sniffer_result = sniffer.SniffCSV();
|
53
67
|
auto &types = sniffer_result.return_types;
|
54
68
|
auto &names = sniffer_result.names;
|
@@ -56,7 +70,12 @@ ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const
|
|
56
70
|
columns.emplace_back(names[i], types[i]);
|
57
71
|
}
|
58
72
|
|
59
|
-
|
73
|
+
//! Capture the options potentially set/altered by the auto detection phase
|
74
|
+
csv_options.ToNamedParameters(options);
|
75
|
+
|
76
|
+
// No need to auto-detect again
|
77
|
+
options["auto_detect"] = Value::BOOLEAN(false);
|
78
|
+
SetNamedParameters(std::move(options));
|
60
79
|
}
|
61
80
|
|
62
81
|
string ReadCSVRelation::GetAlias() {
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#include "duckdb/main/client_context.hpp"
|
10
10
|
#include "duckdb/parser/expression/comparison_expression.hpp"
|
11
11
|
#include "duckdb/parser/expression/columnref_expression.hpp"
|
12
|
+
#include "duckdb/common/shared_ptr.hpp"
|
12
13
|
|
13
14
|
namespace duckdb {
|
14
15
|
|
@@ -16,7 +17,12 @@ void TableFunctionRelation::AddNamedParameter(const string &name, Value argument
|
|
16
17
|
named_parameters[name] = std::move(argument);
|
17
18
|
}
|
18
19
|
|
19
|
-
TableFunctionRelation::TableFunctionRelation(const std::shared_ptr<ClientContext> &context, string name_p,
|
20
|
+
void TableFunctionRelation::SetNamedParameters(named_parameter_map_t &&options) {
|
21
|
+
D_ASSERT(named_parameters.empty());
|
22
|
+
named_parameters = std::move(options);
|
23
|
+
}
|
24
|
+
|
25
|
+
TableFunctionRelation::TableFunctionRelation(const shared_ptr<ClientContext> &context, string name_p,
|
20
26
|
vector<Value> parameters_p, named_parameter_map_t named_parameters,
|
21
27
|
shared_ptr<Relation> input_relation_p, bool auto_init)
|
22
28
|
: Relation(context, RelationType::TABLE_FUNCTION_RELATION), name(std::move(name_p)),
|
@@ -25,7 +31,7 @@ TableFunctionRelation::TableFunctionRelation(const std::shared_ptr<ClientContext
|
|
25
31
|
InitializeColumns();
|
26
32
|
}
|
27
33
|
|
28
|
-
TableFunctionRelation::TableFunctionRelation(const std::shared_ptr<ClientContext> &context, string name_p,
|
34
|
+
TableFunctionRelation::TableFunctionRelation(const shared_ptr<ClientContext> &context, string name_p,
|
29
35
|
vector<Value> parameters_p, shared_ptr<Relation> input_relation_p,
|
30
36
|
bool auto_init)
|
31
37
|
: Relation(context, RelationType::TABLE_FUNCTION_RELATION), name(std::move(name_p)),
|
@@ -131,11 +131,11 @@ void SingleFileCheckpointWriter::CreateCheckpoint() {
|
|
131
131
|
throw FatalException("Checkpoint aborted before truncate because of PRAGMA checkpoint_abort flag");
|
132
132
|
}
|
133
133
|
|
134
|
-
// truncate the WAL
|
135
|
-
wal->Truncate(0);
|
136
|
-
|
137
134
|
// truncate the file
|
138
135
|
block_manager.Truncate();
|
136
|
+
|
137
|
+
// truncate the WAL
|
138
|
+
wal->Truncate(0);
|
139
139
|
}
|
140
140
|
|
141
141
|
void CheckpointReader::LoadCheckpoint(ClientContext &context, MetadataReader &reader) {
|
@@ -102,9 +102,7 @@ void TableStatistics::CopyStats(TableStatistics &other) {
|
|
102
102
|
}
|
103
103
|
|
104
104
|
void TableStatistics::Serialize(Serializer &serializer) const {
|
105
|
-
|
106
|
-
serializer.WriteList(100, "column_stats", column_count,
|
107
|
-
[&](Serializer::List &list, idx_t i) { list.WriteElement(column_stats[i]); });
|
105
|
+
serializer.WriteProperty(100, "column_stats", column_stats);
|
108
106
|
}
|
109
107
|
|
110
108
|
void TableStatistics::Deserialize(Deserializer &deserializer, ColumnList &columns) {
|
@@ -57,7 +57,10 @@ bool WriteAheadLog::Replay(AttachedDatabase &database, string &path) {
|
|
57
57
|
deserializer.End();
|
58
58
|
}
|
59
59
|
}
|
60
|
-
} catch (
|
60
|
+
} catch (SerializationException &ex) { // LCOV_EXCL_START
|
61
|
+
// serialization exception - torn WAL
|
62
|
+
// continue reading
|
63
|
+
} catch (std::exception &ex) {
|
61
64
|
Printer::PrintF("Exception in WAL playback during initial read: %s\n", ex.what());
|
62
65
|
return false;
|
63
66
|
} catch (...) {
|
@@ -104,7 +107,10 @@ bool WriteAheadLog::Replay(AttachedDatabase &database, string &path) {
|
|
104
107
|
deserializer.End();
|
105
108
|
}
|
106
109
|
}
|
107
|
-
} catch (
|
110
|
+
} catch (SerializationException &ex) { // LCOV_EXCL_START
|
111
|
+
// serialization error during WAL replay: rollback
|
112
|
+
con.Rollback();
|
113
|
+
} catch (std::exception &ex) {
|
108
114
|
// FIXME: this should report a proper warning in the connection
|
109
115
|
Printer::PrintF("Exception in WAL playback: %s\n", ex.what());
|
110
116
|
// exception thrown in WAL replay: rollback
|