duckdb 0.8.2-dev4572.0 → 0.8.2-dev4623.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/common/enum_util.cpp +24 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +213 -2
- package/src/duckdb/src/function/table/read_csv.cpp +3 -130
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/box_renderer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +24 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +1 -1
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +10 -4
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +3 -3
- package/src/duckdb/src/include/duckdb/main/relation/table_function_relation.hpp +1 -0
- package/src/duckdb/src/main/connection.cpp +4 -6
- package/src/duckdb/src/main/extension/extension_install.cpp +2 -1
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +28 -9
- package/src/duckdb/src/main/relation/table_function_relation.cpp +8 -2
package/package.json
CHANGED
@@ -11,6 +11,7 @@
|
|
11
11
|
|
12
12
|
#include "duckdb/common/enum_util.hpp"
|
13
13
|
#include "duckdb/catalog/catalog_entry/table_column_type.hpp"
|
14
|
+
#include "duckdb/common/box_renderer.hpp"
|
14
15
|
#include "duckdb/common/enums/access_mode.hpp"
|
15
16
|
#include "duckdb/common/enums/aggregate_handling.hpp"
|
16
17
|
#include "duckdb/common/enums/catalog_type.hpp"
|
@@ -4797,6 +4798,29 @@ RelationType EnumUtil::FromString<RelationType>(const char *value) {
|
|
4797
4798
|
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
|
4798
4799
|
}
|
4799
4800
|
|
4801
|
+
template<>
|
4802
|
+
const char* EnumUtil::ToChars<RenderMode>(RenderMode value) {
|
4803
|
+
switch(value) {
|
4804
|
+
case RenderMode::ROWS:
|
4805
|
+
return "ROWS";
|
4806
|
+
case RenderMode::COLUMNS:
|
4807
|
+
return "COLUMNS";
|
4808
|
+
default:
|
4809
|
+
throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
|
4810
|
+
}
|
4811
|
+
}
|
4812
|
+
|
4813
|
+
template<>
|
4814
|
+
RenderMode EnumUtil::FromString<RenderMode>(const char *value) {
|
4815
|
+
if (StringUtil::Equals(value, "ROWS")) {
|
4816
|
+
return RenderMode::ROWS;
|
4817
|
+
}
|
4818
|
+
if (StringUtil::Equals(value, "COLUMNS")) {
|
4819
|
+
return RenderMode::COLUMNS;
|
4820
|
+
}
|
4821
|
+
throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
|
4822
|
+
}
|
4823
|
+
|
4800
4824
|
template<>
|
4801
4825
|
const char* EnumUtil::ToChars<ResultModifierType>(ResultModifierType value) {
|
4802
4826
|
switch(value) {
|
@@ -2,6 +2,8 @@
|
|
2
2
|
#include "duckdb/common/bind_helpers.hpp"
|
3
3
|
#include "duckdb/common/vector_size.hpp"
|
4
4
|
#include "duckdb/common/string_util.hpp"
|
5
|
+
#include "duckdb/common/enum_util.hpp"
|
6
|
+
#include "duckdb/common/multi_file_reader.hpp"
|
5
7
|
|
6
8
|
namespace duckdb {
|
7
9
|
|
@@ -60,6 +62,10 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
|
|
60
62
|
return value.GetValue<int64_t>();
|
61
63
|
}
|
62
64
|
|
65
|
+
bool CSVReaderOptions::GetHeader() const {
|
66
|
+
return this->dialect_options.header;
|
67
|
+
}
|
68
|
+
|
63
69
|
void CSVReaderOptions::SetHeader(bool input) {
|
64
70
|
this->dialect_options.header = input;
|
65
71
|
this->has_header = true;
|
@@ -69,6 +75,10 @@ void CSVReaderOptions::SetCompression(const string &compression_p) {
|
|
69
75
|
this->compression = FileCompressionTypeFromString(compression_p);
|
70
76
|
}
|
71
77
|
|
78
|
+
string CSVReaderOptions::GetEscape() const {
|
79
|
+
return std::string(1, this->dialect_options.state_machine_options.escape);
|
80
|
+
}
|
81
|
+
|
72
82
|
void CSVReaderOptions::SetEscape(const string &input) {
|
73
83
|
auto escape_str = input;
|
74
84
|
if (escape_str.size() > 1) {
|
@@ -81,6 +91,19 @@ void CSVReaderOptions::SetEscape(const string &input) {
|
|
81
91
|
this->has_escape = true;
|
82
92
|
}
|
83
93
|
|
94
|
+
int64_t CSVReaderOptions::GetSkipRows() const {
|
95
|
+
return this->dialect_options.skip_rows;
|
96
|
+
}
|
97
|
+
|
98
|
+
void CSVReaderOptions::SetSkipRows(int64_t skip_rows) {
|
99
|
+
dialect_options.skip_rows = skip_rows;
|
100
|
+
skip_rows_set = true;
|
101
|
+
}
|
102
|
+
|
103
|
+
string CSVReaderOptions::GetDelimiter() const {
|
104
|
+
return std::string(1, this->dialect_options.state_machine_options.delimiter);
|
105
|
+
}
|
106
|
+
|
84
107
|
void CSVReaderOptions::SetDelimiter(const string &input) {
|
85
108
|
auto delim_str = StringUtil::Replace(input, "\\t", "\t");
|
86
109
|
if (delim_str.size() > 1) {
|
@@ -93,6 +116,10 @@ void CSVReaderOptions::SetDelimiter(const string &input) {
|
|
93
116
|
this->dialect_options.state_machine_options.delimiter = delim_str[0];
|
94
117
|
}
|
95
118
|
|
119
|
+
string CSVReaderOptions::GetQuote() const {
|
120
|
+
return std::string(1, this->dialect_options.state_machine_options.quote);
|
121
|
+
}
|
122
|
+
|
96
123
|
void CSVReaderOptions::SetQuote(const string &quote_p) {
|
97
124
|
auto quote_str = quote_p;
|
98
125
|
if (quote_str.size() > 1) {
|
@@ -105,6 +132,10 @@ void CSVReaderOptions::SetQuote(const string &quote_p) {
|
|
105
132
|
this->has_quote = true;
|
106
133
|
}
|
107
134
|
|
135
|
+
NewLineIdentifier CSVReaderOptions::GetNewline() const {
|
136
|
+
return dialect_options.new_line;
|
137
|
+
}
|
138
|
+
|
108
139
|
void CSVReaderOptions::SetNewline(const string &input) {
|
109
140
|
if (input == "\\n" || input == "\\r") {
|
110
141
|
dialect_options.new_line = NewLineIdentifier::SINGLE;
|
@@ -152,8 +183,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
|
|
152
183
|
sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
|
153
184
|
}
|
154
185
|
} else if (loption == "skip") {
|
155
|
-
|
156
|
-
skip_rows_set = true;
|
186
|
+
SetSkipRows(ParseInteger(value, loption));
|
157
187
|
} else if (loption == "max_line_size" || loption == "maximum_line_size") {
|
158
188
|
maximum_line_size = ParseInteger(value, loption);
|
159
189
|
} else if (loption == "sample_chunk_size") {
|
@@ -296,4 +326,185 @@ string CSVReaderOptions::ToString() const {
|
|
296
326
|
"\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
|
297
327
|
}
|
298
328
|
|
329
|
+
static Value StringVectorToValue(const vector<string> &vec) {
|
330
|
+
vector<Value> content;
|
331
|
+
content.reserve(vec.size());
|
332
|
+
for (auto &item : vec) {
|
333
|
+
content.push_back(Value(item));
|
334
|
+
}
|
335
|
+
return Value::LIST(std::move(content));
|
336
|
+
}
|
337
|
+
|
338
|
+
static uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
|
339
|
+
//! Const ht with accepted auto_types and their weights in specificity
|
340
|
+
const duckdb::unordered_map<uint8_t, uint8_t> auto_type_candidates_specificity {
|
341
|
+
{(uint8_t)LogicalTypeId::VARCHAR, 0}, {(uint8_t)LogicalTypeId::TIMESTAMP, 1},
|
342
|
+
{(uint8_t)LogicalTypeId::DATE, 2}, {(uint8_t)LogicalTypeId::TIME, 3},
|
343
|
+
{(uint8_t)LogicalTypeId::DOUBLE, 4}, {(uint8_t)LogicalTypeId::FLOAT, 5},
|
344
|
+
{(uint8_t)LogicalTypeId::BIGINT, 6}, {(uint8_t)LogicalTypeId::INTEGER, 7},
|
345
|
+
{(uint8_t)LogicalTypeId::SMALLINT, 8}, {(uint8_t)LogicalTypeId::TINYINT, 9},
|
346
|
+
{(uint8_t)LogicalTypeId::BOOLEAN, 10}, {(uint8_t)LogicalTypeId::SQLNULL, 11}};
|
347
|
+
|
348
|
+
auto id = (uint8_t)candidate_type.id();
|
349
|
+
auto it = auto_type_candidates_specificity.find(id);
|
350
|
+
if (it == auto_type_candidates_specificity.end()) {
|
351
|
+
throw BinderException("Auto Type Candidate of type %s is not accepted as a valid input",
|
352
|
+
EnumUtil::ToString(candidate_type.id()));
|
353
|
+
}
|
354
|
+
return it->second;
|
355
|
+
}
|
356
|
+
|
357
|
+
void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
|
358
|
+
vector<LogicalType> &return_types, vector<string> &names) {
|
359
|
+
for (auto &kv : in) {
|
360
|
+
if (MultiFileReader::ParseOption(kv.first, kv.second, file_options, context)) {
|
361
|
+
continue;
|
362
|
+
}
|
363
|
+
auto loption = StringUtil::Lower(kv.first);
|
364
|
+
if (loption == "columns") {
|
365
|
+
explicitly_set_columns = true;
|
366
|
+
auto &child_type = kv.second.type();
|
367
|
+
if (child_type.id() != LogicalTypeId::STRUCT) {
|
368
|
+
throw BinderException("read_csv columns requires a struct as input");
|
369
|
+
}
|
370
|
+
auto &struct_children = StructValue::GetChildren(kv.second);
|
371
|
+
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
372
|
+
for (idx_t i = 0; i < struct_children.size(); i++) {
|
373
|
+
auto &name = StructType::GetChildName(child_type, i);
|
374
|
+
auto &val = struct_children[i];
|
375
|
+
names.push_back(name);
|
376
|
+
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
377
|
+
throw BinderException("read_csv requires a type specification as string");
|
378
|
+
}
|
379
|
+
return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
|
380
|
+
}
|
381
|
+
if (names.empty()) {
|
382
|
+
throw BinderException("read_csv requires at least a single column as input!");
|
383
|
+
}
|
384
|
+
} else if (loption == "auto_type_candidates") {
|
385
|
+
auto_type_candidates.clear();
|
386
|
+
map<uint8_t, LogicalType> candidate_types;
|
387
|
+
// We always have the extremes of Null and Varchar, so we can default to varchar if the
|
388
|
+
// sniffer is not able to confidently detect that column type
|
389
|
+
candidate_types[GetCandidateSpecificity(LogicalType::VARCHAR)] = LogicalType::VARCHAR;
|
390
|
+
candidate_types[GetCandidateSpecificity(LogicalType::SQLNULL)] = LogicalType::SQLNULL;
|
391
|
+
|
392
|
+
auto &child_type = kv.second.type();
|
393
|
+
if (child_type.id() != LogicalTypeId::LIST) {
|
394
|
+
throw BinderException("read_csv auto_types requires a list as input");
|
395
|
+
}
|
396
|
+
auto &list_children = ListValue::GetChildren(kv.second);
|
397
|
+
if (list_children.empty()) {
|
398
|
+
throw BinderException("auto_type_candidates requires at least one type");
|
399
|
+
}
|
400
|
+
for (auto &child : list_children) {
|
401
|
+
if (child.type().id() != LogicalTypeId::VARCHAR) {
|
402
|
+
throw BinderException("auto_type_candidates requires a type specification as string");
|
403
|
+
}
|
404
|
+
auto candidate_type = TransformStringToLogicalType(StringValue::Get(child), context);
|
405
|
+
candidate_types[GetCandidateSpecificity(candidate_type)] = candidate_type;
|
406
|
+
}
|
407
|
+
for (auto &candidate_type : candidate_types) {
|
408
|
+
auto_type_candidates.emplace_back(candidate_type.second);
|
409
|
+
}
|
410
|
+
} else if (loption == "column_names" || loption == "names") {
|
411
|
+
if (!name_list.empty()) {
|
412
|
+
throw BinderException("read_csv_auto column_names/names can only be supplied once");
|
413
|
+
}
|
414
|
+
if (kv.second.IsNull()) {
|
415
|
+
throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
|
416
|
+
}
|
417
|
+
auto &children = ListValue::GetChildren(kv.second);
|
418
|
+
for (auto &child : children) {
|
419
|
+
name_list.push_back(StringValue::Get(child));
|
420
|
+
}
|
421
|
+
} else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
|
422
|
+
auto &child_type = kv.second.type();
|
423
|
+
if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
|
424
|
+
throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
|
425
|
+
}
|
426
|
+
if (!sql_type_list.empty()) {
|
427
|
+
throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
|
428
|
+
}
|
429
|
+
vector<string> sql_type_names;
|
430
|
+
if (child_type.id() == LogicalTypeId::STRUCT) {
|
431
|
+
auto &struct_children = StructValue::GetChildren(kv.second);
|
432
|
+
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
433
|
+
for (idx_t i = 0; i < struct_children.size(); i++) {
|
434
|
+
auto &name = StructType::GetChildName(child_type, i);
|
435
|
+
auto &val = struct_children[i];
|
436
|
+
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
437
|
+
throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
|
438
|
+
}
|
439
|
+
sql_type_names.push_back(StringValue::Get(val));
|
440
|
+
sql_types_per_column[name] = i;
|
441
|
+
}
|
442
|
+
} else {
|
443
|
+
auto &list_child = ListType::GetChildType(child_type);
|
444
|
+
if (list_child.id() != LogicalTypeId::VARCHAR) {
|
445
|
+
throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
|
446
|
+
}
|
447
|
+
auto &children = ListValue::GetChildren(kv.second);
|
448
|
+
for (auto &child : children) {
|
449
|
+
sql_type_names.push_back(StringValue::Get(child));
|
450
|
+
}
|
451
|
+
}
|
452
|
+
sql_type_list.reserve(sql_type_names.size());
|
453
|
+
for (auto &sql_type : sql_type_names) {
|
454
|
+
auto def_type = TransformStringToLogicalType(sql_type);
|
455
|
+
if (def_type.id() == LogicalTypeId::USER) {
|
456
|
+
throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
|
457
|
+
kv.first);
|
458
|
+
}
|
459
|
+
sql_type_list.push_back(std::move(def_type));
|
460
|
+
}
|
461
|
+
} else if (loption == "all_varchar") {
|
462
|
+
all_varchar = BooleanValue::Get(kv.second);
|
463
|
+
} else if (loption == "normalize_names") {
|
464
|
+
normalize_names = BooleanValue::Get(kv.second);
|
465
|
+
} else {
|
466
|
+
SetReadOption(loption, kv.second, names);
|
467
|
+
}
|
468
|
+
}
|
469
|
+
}
|
470
|
+
|
471
|
+
//! This function is used to remember options set by the sniffer, for use in ReadCSVRelation
|
472
|
+
void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
|
473
|
+
if (has_delimiter) {
|
474
|
+
named_params["delim"] = Value(GetDelimiter());
|
475
|
+
}
|
476
|
+
if (has_newline) {
|
477
|
+
named_params["newline"] = Value(EnumUtil::ToString(GetNewline()));
|
478
|
+
}
|
479
|
+
if (has_quote) {
|
480
|
+
named_params["quote"] = Value(GetQuote());
|
481
|
+
}
|
482
|
+
if (has_escape) {
|
483
|
+
named_params["escape"] = Value(GetEscape());
|
484
|
+
}
|
485
|
+
if (has_header) {
|
486
|
+
named_params["header"] = Value(GetHeader());
|
487
|
+
}
|
488
|
+
named_params["max_line_size"] = Value::BIGINT(maximum_line_size);
|
489
|
+
if (skip_rows_set) {
|
490
|
+
named_params["skip"] = Value::BIGINT(GetSkipRows());
|
491
|
+
}
|
492
|
+
named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
|
493
|
+
named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
|
494
|
+
named_params["null_padding"] = Value::BOOLEAN(null_padding);
|
495
|
+
if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
|
496
|
+
named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
|
497
|
+
}
|
498
|
+
if (!date_format.at(LogicalType::TIMESTAMP).format_specifier.empty()) {
|
499
|
+
named_params["timestampformat"] = Value(date_format.at(LogicalType::TIMESTAMP).format_specifier);
|
500
|
+
}
|
501
|
+
|
502
|
+
named_params["normalize_names"] = Value::BOOLEAN(normalize_names);
|
503
|
+
if (!name_list.empty()) {
|
504
|
+
named_params["column_names"] = StringVectorToValue(name_list);
|
505
|
+
}
|
506
|
+
named_params["all_varchar"] = Value::BOOLEAN(all_varchar);
|
507
|
+
named_params["maximum_line_size"] = Value::BIGINT(maximum_line_size);
|
508
|
+
}
|
509
|
+
|
299
510
|
} // namespace duckdb
|
@@ -85,25 +85,6 @@ void ReadCSVData::FinalizeRead(ClientContext &context) {
|
|
85
85
|
}
|
86
86
|
}
|
87
87
|
|
88
|
-
uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
|
89
|
-
//! Const ht with accepted auto_types and their weights in specificity
|
90
|
-
const duckdb::unordered_map<uint8_t, uint8_t> auto_type_candidates_specificity {
|
91
|
-
{(uint8_t)LogicalTypeId::VARCHAR, 0}, {(uint8_t)LogicalTypeId::TIMESTAMP, 1},
|
92
|
-
{(uint8_t)LogicalTypeId::DATE, 2}, {(uint8_t)LogicalTypeId::TIME, 3},
|
93
|
-
{(uint8_t)LogicalTypeId::DOUBLE, 4}, {(uint8_t)LogicalTypeId::FLOAT, 5},
|
94
|
-
{(uint8_t)LogicalTypeId::BIGINT, 6}, {(uint8_t)LogicalTypeId::INTEGER, 7},
|
95
|
-
{(uint8_t)LogicalTypeId::SMALLINT, 8}, {(uint8_t)LogicalTypeId::TINYINT, 9},
|
96
|
-
{(uint8_t)LogicalTypeId::BOOLEAN, 10}, {(uint8_t)LogicalTypeId::SQLNULL, 11}};
|
97
|
-
|
98
|
-
auto id = (uint8_t)candidate_type.id();
|
99
|
-
auto it = auto_type_candidates_specificity.find(id);
|
100
|
-
if (it == auto_type_candidates_specificity.end()) {
|
101
|
-
throw BinderException("Auto Type Candidate of type %s is not accepted as a valid input",
|
102
|
-
EnumUtil::ToString(candidate_type.id()));
|
103
|
-
}
|
104
|
-
return it->second;
|
105
|
-
}
|
106
|
-
|
107
88
|
static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctionBindInput &input,
|
108
89
|
vector<LogicalType> &return_types, vector<string> &names) {
|
109
90
|
|
@@ -111,117 +92,9 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
111
92
|
auto &options = result->options;
|
112
93
|
result->files = MultiFileReader::GetFileList(context, input.inputs[0], "CSV");
|
113
94
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
continue;
|
118
|
-
}
|
119
|
-
auto loption = StringUtil::Lower(kv.first);
|
120
|
-
if (loption == "columns") {
|
121
|
-
explicitly_set_columns = true;
|
122
|
-
auto &child_type = kv.second.type();
|
123
|
-
if (child_type.id() != LogicalTypeId::STRUCT) {
|
124
|
-
throw BinderException("read_csv columns requires a struct as input");
|
125
|
-
}
|
126
|
-
auto &struct_children = StructValue::GetChildren(kv.second);
|
127
|
-
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
128
|
-
for (idx_t i = 0; i < struct_children.size(); i++) {
|
129
|
-
auto &name = StructType::GetChildName(child_type, i);
|
130
|
-
auto &val = struct_children[i];
|
131
|
-
names.push_back(name);
|
132
|
-
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
133
|
-
throw BinderException("read_csv requires a type specification as string");
|
134
|
-
}
|
135
|
-
return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
|
136
|
-
}
|
137
|
-
if (names.empty()) {
|
138
|
-
throw BinderException("read_csv requires at least a single column as input!");
|
139
|
-
}
|
140
|
-
} else if (loption == "auto_type_candidates") {
|
141
|
-
options.auto_type_candidates.clear();
|
142
|
-
map<uint8_t, LogicalType> candidate_types;
|
143
|
-
// We always have the extremes of Null and Varchar, so we can default to varchar if the
|
144
|
-
// sniffer is not able to confidently detect that column type
|
145
|
-
candidate_types[GetCandidateSpecificity(LogicalType::VARCHAR)] = LogicalType::VARCHAR;
|
146
|
-
candidate_types[GetCandidateSpecificity(LogicalType::SQLNULL)] = LogicalType::SQLNULL;
|
147
|
-
|
148
|
-
auto &child_type = kv.second.type();
|
149
|
-
if (child_type.id() != LogicalTypeId::LIST) {
|
150
|
-
throw BinderException("read_csv auto_types requires a list as input");
|
151
|
-
}
|
152
|
-
auto &list_children = ListValue::GetChildren(kv.second);
|
153
|
-
if (list_children.empty()) {
|
154
|
-
throw BinderException("auto_type_candidates requires at least one type");
|
155
|
-
}
|
156
|
-
for (auto &child : list_children) {
|
157
|
-
if (child.type().id() != LogicalTypeId::VARCHAR) {
|
158
|
-
throw BinderException("auto_type_candidates requires a type specification as string");
|
159
|
-
}
|
160
|
-
auto candidate_type = TransformStringToLogicalType(StringValue::Get(child), context);
|
161
|
-
candidate_types[GetCandidateSpecificity(candidate_type)] = candidate_type;
|
162
|
-
}
|
163
|
-
for (auto &candidate_type : candidate_types) {
|
164
|
-
options.auto_type_candidates.emplace_back(candidate_type.second);
|
165
|
-
}
|
166
|
-
} else if (loption == "column_names" || loption == "names") {
|
167
|
-
if (!options.name_list.empty()) {
|
168
|
-
throw BinderException("read_csv_auto column_names/names can only be supplied once");
|
169
|
-
}
|
170
|
-
if (kv.second.IsNull()) {
|
171
|
-
throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
|
172
|
-
}
|
173
|
-
auto &children = ListValue::GetChildren(kv.second);
|
174
|
-
for (auto &child : children) {
|
175
|
-
options.name_list.push_back(StringValue::Get(child));
|
176
|
-
}
|
177
|
-
} else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
|
178
|
-
auto &child_type = kv.second.type();
|
179
|
-
if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
|
180
|
-
throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
|
181
|
-
}
|
182
|
-
if (!options.sql_type_list.empty()) {
|
183
|
-
throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
|
184
|
-
}
|
185
|
-
vector<string> sql_type_names;
|
186
|
-
if (child_type.id() == LogicalTypeId::STRUCT) {
|
187
|
-
auto &struct_children = StructValue::GetChildren(kv.second);
|
188
|
-
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
189
|
-
for (idx_t i = 0; i < struct_children.size(); i++) {
|
190
|
-
auto &name = StructType::GetChildName(child_type, i);
|
191
|
-
auto &val = struct_children[i];
|
192
|
-
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
193
|
-
throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
|
194
|
-
}
|
195
|
-
sql_type_names.push_back(StringValue::Get(val));
|
196
|
-
options.sql_types_per_column[name] = i;
|
197
|
-
}
|
198
|
-
} else {
|
199
|
-
auto &list_child = ListType::GetChildType(child_type);
|
200
|
-
if (list_child.id() != LogicalTypeId::VARCHAR) {
|
201
|
-
throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
|
202
|
-
}
|
203
|
-
auto &children = ListValue::GetChildren(kv.second);
|
204
|
-
for (auto &child : children) {
|
205
|
-
sql_type_names.push_back(StringValue::Get(child));
|
206
|
-
}
|
207
|
-
}
|
208
|
-
options.sql_type_list.reserve(sql_type_names.size());
|
209
|
-
for (auto &sql_type : sql_type_names) {
|
210
|
-
auto def_type = TransformStringToLogicalType(sql_type);
|
211
|
-
if (def_type.id() == LogicalTypeId::USER) {
|
212
|
-
throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
|
213
|
-
kv.first);
|
214
|
-
}
|
215
|
-
options.sql_type_list.push_back(std::move(def_type));
|
216
|
-
}
|
217
|
-
} else if (loption == "all_varchar") {
|
218
|
-
options.all_varchar = BooleanValue::Get(kv.second);
|
219
|
-
} else if (loption == "normalize_names") {
|
220
|
-
options.normalize_names = BooleanValue::Get(kv.second);
|
221
|
-
} else {
|
222
|
-
options.SetReadOption(loption, kv.second, names);
|
223
|
-
}
|
224
|
-
}
|
95
|
+
options.FromNamedParameters(input.named_parameters, context, return_types, names);
|
96
|
+
bool explicitly_set_columns = options.explicitly_set_columns;
|
97
|
+
|
225
98
|
options.file_options.AutoDetectHivePartitioning(result->files, context);
|
226
99
|
|
227
100
|
if (!options.auto_detect && return_types.empty()) {
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.8.2-dev4572"
|
2
|
+
#define DUCKDB_VERSION "0.8.2-dev4623"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "52a47a6b31"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -18,7 +18,7 @@ class ColumnDataCollection;
|
|
18
18
|
class ColumnDataRowCollection;
|
19
19
|
|
20
20
|
enum class ValueRenderAlignment { LEFT, MIDDLE, RIGHT };
|
21
|
-
enum class RenderMode { ROWS, COLUMNS };
|
21
|
+
enum class RenderMode : uint8_t { ROWS, COLUMNS };
|
22
22
|
|
23
23
|
struct BoxRendererConfig {
|
24
24
|
// a max_width of 0 means we default to the terminal width
|
@@ -216,6 +216,8 @@ enum class QuoteRule : uint8_t;
|
|
216
216
|
|
217
217
|
enum class RelationType : uint8_t;
|
218
218
|
|
219
|
+
enum class RenderMode : uint8_t;
|
220
|
+
|
219
221
|
enum class ResultModifierType : uint8_t;
|
220
222
|
|
221
223
|
enum class SampleMethod : uint8_t;
|
@@ -565,6 +567,9 @@ const char* EnumUtil::ToChars<QuoteRule>(QuoteRule value);
|
|
565
567
|
template<>
|
566
568
|
const char* EnumUtil::ToChars<RelationType>(RelationType value);
|
567
569
|
|
570
|
+
template<>
|
571
|
+
const char* EnumUtil::ToChars<RenderMode>(RenderMode value);
|
572
|
+
|
568
573
|
template<>
|
569
574
|
const char* EnumUtil::ToChars<ResultModifierType>(ResultModifierType value);
|
570
575
|
|
@@ -950,6 +955,9 @@ QuoteRule EnumUtil::FromString<QuoteRule>(const char *value);
|
|
950
955
|
template<>
|
951
956
|
RelationType EnumUtil::FromString<RelationType>(const char *value);
|
952
957
|
|
958
|
+
template<>
|
959
|
+
RenderMode EnumUtil::FromString<RenderMode>(const char *value);
|
960
|
+
|
953
961
|
template<>
|
954
962
|
ResultModifierType EnumUtil::FromString<ResultModifierType>(const char *value);
|
955
963
|
|
@@ -159,18 +159,33 @@ struct CSVReaderOptions {
|
|
159
159
|
string suffix;
|
160
160
|
string write_newline;
|
161
161
|
|
162
|
+
//! The date format to use (if any is specified)
|
163
|
+
map<LogicalTypeId, StrpTimeFormat> date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}};
|
162
164
|
//! The date format to use for writing (if any is specified)
|
163
165
|
map<LogicalTypeId, StrfTimeFormat> write_date_format = {{LogicalTypeId::DATE, {}}, {LogicalTypeId::TIMESTAMP, {}}};
|
166
|
+
//! Whether or not a type format is specified
|
167
|
+
map<LogicalTypeId, bool> has_format = {{LogicalTypeId::DATE, false}, {LogicalTypeId::TIMESTAMP, false}};
|
164
168
|
|
165
169
|
void Serialize(Serializer &serializer) const;
|
166
170
|
static CSVReaderOptions Deserialize(Deserializer &deserializer);
|
167
171
|
|
168
172
|
void SetCompression(const string &compression);
|
173
|
+
|
174
|
+
bool GetHeader() const;
|
169
175
|
void SetHeader(bool has_header);
|
176
|
+
|
177
|
+
string GetEscape() const;
|
170
178
|
void SetEscape(const string &escape);
|
179
|
+
|
180
|
+
int64_t GetSkipRows() const;
|
181
|
+
void SetSkipRows(int64_t rows);
|
182
|
+
|
183
|
+
string GetQuote() const;
|
171
184
|
void SetQuote(const string &quote);
|
172
185
|
void SetDelimiter(const string &delimiter);
|
186
|
+
string GetDelimiter() const;
|
173
187
|
|
188
|
+
NewLineIdentifier GetNewline() const;
|
174
189
|
void SetNewline(const string &input);
|
175
190
|
//! Set an option that is supported by both reading and writing functions, called by
|
176
191
|
//! the SetReadOption and SetWriteOption methods
|
@@ -182,7 +197,16 @@ struct CSVReaderOptions {
|
|
182
197
|
void SetReadOption(const string &loption, const Value &value, vector<string> &expected_names);
|
183
198
|
void SetWriteOption(const string &loption, const Value &value);
|
184
199
|
void SetDateFormat(LogicalTypeId type, const string &format, bool read_format);
|
200
|
+
void ToNamedParameters(named_parameter_map_t &out);
|
201
|
+
void FromNamedParameters(named_parameter_map_t &in, ClientContext &context, vector<LogicalType> &return_types,
|
202
|
+
vector<string> &names);
|
185
203
|
|
186
204
|
string ToString() const;
|
205
|
+
|
206
|
+
named_parameter_map_t OutputReadSettings();
|
207
|
+
|
208
|
+
public:
|
209
|
+
//! Whether columns were explicitly provided through named parameters
|
210
|
+
bool explicitly_set_columns = false;
|
187
211
|
};
|
188
212
|
} // namespace duckdb
|
@@ -131,7 +131,7 @@ public:
|
|
131
131
|
|
132
132
|
//! Reads CSV file
|
133
133
|
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file);
|
134
|
-
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file,
|
134
|
+
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, named_parameter_map_t &&options);
|
135
135
|
DUCKDB_API shared_ptr<Relation> ReadCSV(const string &csv_file, const vector<string> &columns);
|
136
136
|
|
137
137
|
//! Reads Parquet file
|
@@ -118,6 +118,7 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = {
|
|
118
118
|
{"st_dwithin_spheroid", "spatial"},
|
119
119
|
{"st_envelope", "spatial"},
|
120
120
|
{"st_equals", "spatial"},
|
121
|
+
{"st_extent", "spatial"},
|
121
122
|
{"st_flipcoordinates", "spatial"},
|
122
123
|
{"st_geometrytype", "spatial"},
|
123
124
|
{"st_geomfromgeojson", "spatial"},
|
@@ -126,6 +127,7 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = {
|
|
126
127
|
{"st_geomfromtext", "spatial"},
|
127
128
|
{"st_geomfromwkb", "spatial"},
|
128
129
|
{"st_intersection", "spatial"},
|
130
|
+
{"st_intersection_agg", "spatial"},
|
129
131
|
{"st_intersects", "spatial"},
|
130
132
|
{"st_isclosed", "spatial"},
|
131
133
|
{"st_isempty", "spatial"},
|
@@ -159,9 +161,14 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = {
|
|
159
161
|
{"st_touches", "spatial"},
|
160
162
|
{"st_transform", "spatial"},
|
161
163
|
{"st_union", "spatial"},
|
164
|
+
{"st_union_agg", "spatial"},
|
162
165
|
{"st_within", "spatial"},
|
163
166
|
{"st_x", "spatial"},
|
167
|
+
{"st_xmax", "spatial"},
|
168
|
+
{"st_xmin", "spatial"},
|
164
169
|
{"st_y", "spatial"},
|
170
|
+
{"st_ymax", "spatial"},
|
171
|
+
{"st_ymin", "spatial"},
|
165
172
|
{"stem", "fts"},
|
166
173
|
{"text", "excel"},
|
167
174
|
{"to_arrow_ipc", "arrow"},
|
@@ -220,10 +227,9 @@ static constexpr ExtensionEntry EXTENSION_FILE_PREFIXES[] = {
|
|
220
227
|
|
221
228
|
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
|
222
229
|
// TODO: automate by passing though to script via duckdb
|
223
|
-
static constexpr ExtensionEntry EXTENSION_FILE_POSTFIXES[] = {
|
224
|
-
|
225
|
-
|
226
|
-
{".ndjson", "json"}}; // END_OF_EXTENSION_FILE_POSTFIXES
|
230
|
+
static constexpr ExtensionEntry EXTENSION_FILE_POSTFIXES[] = {
|
231
|
+
{".parquet", "parquet"}, {".json", "json"}, {".jsonl", "json"}, {".ndjson", "json"},
|
232
|
+
{".shp", "spatial"}, {".gpkg", "spatial"}, {".fgb", "spatial"}}; // END_OF_EXTENSION_FILE_POSTFIXES
|
227
233
|
|
228
234
|
// Note: these are currently hardcoded in scripts/generate_extensions_function.py
|
229
235
|
// TODO: automate by passing though to script via duckdb
|
@@ -10,16 +10,16 @@
|
|
10
10
|
|
11
11
|
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
12
12
|
#include "duckdb/main/relation/table_function_relation.hpp"
|
13
|
+
#include "duckdb/common/shared_ptr.hpp"
|
14
|
+
#include "duckdb/common/case_insensitive_map.hpp"
|
13
15
|
|
14
16
|
namespace duckdb {
|
15
17
|
|
16
|
-
struct CSVReaderOptions;
|
17
|
-
|
18
18
|
class ReadCSVRelation : public TableFunctionRelation {
|
19
19
|
public:
|
20
20
|
ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file, vector<ColumnDefinition> columns,
|
21
21
|
string alias = string());
|
22
|
-
ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file,
|
22
|
+
ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file, named_parameter_map_t &&options,
|
23
23
|
string alias = string());
|
24
24
|
|
25
25
|
string alias;
|
@@ -219,14 +219,12 @@ shared_ptr<Relation> Connection::Values(const string &values, const vector<strin
|
|
219
219
|
}
|
220
220
|
|
221
221
|
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file) {
|
222
|
-
|
223
|
-
return ReadCSV(csv_file, options);
|
222
|
+
named_parameter_map_t options;
|
223
|
+
return ReadCSV(csv_file, std::move(options));
|
224
224
|
}
|
225
225
|
|
226
|
-
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file,
|
227
|
-
|
228
|
-
options.auto_detect = true;
|
229
|
-
return make_shared<ReadCSVRelation>(context, csv_file, options);
|
226
|
+
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file, named_parameter_map_t &&options) {
|
227
|
+
return make_shared<ReadCSVRelation>(context, csv_file, std::move(options));
|
230
228
|
}
|
231
229
|
|
232
230
|
shared_ptr<Relation> Connection::ReadCSV(const string &csv_file, const vector<string> &columns) {
|
@@ -158,11 +158,12 @@ void WriteExtensionFileToDisk(FileSystem &fs, const string &path, void *data, id
|
|
158
158
|
}
|
159
159
|
|
160
160
|
string ExtensionHelper::ExtensionUrlTemplate(optional_ptr<const ClientConfig> client_config, const string &repository) {
|
161
|
-
string default_endpoint = "http://extensions.duckdb.org";
|
162
161
|
string versioned_path = "/${REVISION}/${PLATFORM}/${NAME}.duckdb_extension";
|
163
162
|
#ifdef WASM_LOADABLE_EXTENSIONS
|
163
|
+
string default_endpoint = "https://extensions.duckdb.org";
|
164
164
|
versioned_path = "/duckdb-wasm" + versioned_path + ".wasm";
|
165
165
|
#else
|
166
|
+
string default_endpoint = "http://extensions.duckdb.org";
|
166
167
|
versioned_path = versioned_path + ".gz";
|
167
168
|
#endif
|
168
169
|
string custom_endpoint = client_config ? client_config->custom_extension_repo : string();
|
@@ -1,6 +1,5 @@
|
|
1
1
|
#include "duckdb/main/relation/read_csv_relation.hpp"
|
2
2
|
|
3
|
-
#include "duckdb/common/string_util.hpp"
|
4
3
|
#include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
|
5
4
|
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
6
5
|
#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
|
@@ -8,6 +7,9 @@
|
|
8
7
|
#include "duckdb/parser/expression/comparison_expression.hpp"
|
9
8
|
#include "duckdb/parser/expression/constant_expression.hpp"
|
10
9
|
#include "duckdb/parser/expression/function_expression.hpp"
|
10
|
+
#include "duckdb/common/string_util.hpp"
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp"
|
12
|
+
#include "duckdb/common/multi_file_reader.hpp"
|
11
13
|
#include "duckdb/parser/expression/star_expression.hpp"
|
12
14
|
#include "duckdb/parser/query_node/select_node.hpp"
|
13
15
|
#include "duckdb/parser/tableref/basetableref.hpp"
|
@@ -34,8 +36,8 @@ ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const
|
|
34
36
|
AddNamedParameter("columns", Value::STRUCT(std::move(column_names)));
|
35
37
|
}
|
36
38
|
|
37
|
-
ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const string &csv_file,
|
38
|
-
|
39
|
+
ReadCSVRelation::ReadCSVRelation(const std::shared_ptr<ClientContext> &context, const string &csv_file,
|
40
|
+
named_parameter_map_t &&options, string alias_p)
|
39
41
|
: TableFunctionRelation(context, "read_csv_auto", {Value(csv_file)}, nullptr, false), alias(std::move(alias_p)),
|
40
42
|
auto_detect(true) {
|
41
43
|
|
@@ -43,12 +45,24 @@ ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const
|
|
43
45
|
alias = StringUtil::Split(csv_file, ".")[0];
|
44
46
|
}
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
auto
|
48
|
+
auto files = MultiFileReader::GetFileList(*context, csv_file, "CSV");
|
49
|
+
D_ASSERT(!files.empty());
|
50
|
+
|
51
|
+
auto &file_name = files[0];
|
52
|
+
options["auto_detect"] = Value::BOOLEAN(true);
|
53
|
+
CSVReaderOptions csv_options;
|
54
|
+
csv_options.file_path = file_name;
|
55
|
+
vector<string> empty;
|
56
|
+
|
57
|
+
vector<LogicalType> unused_types;
|
58
|
+
vector<string> unused_names;
|
59
|
+
csv_options.FromNamedParameters(options, *context, unused_types, unused_names);
|
60
|
+
// Run the auto-detect, populating the options with the detected settings
|
61
|
+
|
62
|
+
auto bm_file_handle = BaseCSVReader::OpenCSV(*context, csv_options);
|
63
|
+
auto buffer_manager = make_shared<CSVBufferManager>(*context, std::move(bm_file_handle), csv_options);
|
50
64
|
CSVStateMachineCache state_machine_cache;
|
51
|
-
CSVSniffer sniffer(
|
65
|
+
CSVSniffer sniffer(csv_options, buffer_manager, state_machine_cache);
|
52
66
|
auto sniffer_result = sniffer.SniffCSV();
|
53
67
|
auto &types = sniffer_result.return_types;
|
54
68
|
auto &names = sniffer_result.names;
|
@@ -56,7 +70,12 @@ ReadCSVRelation::ReadCSVRelation(const shared_ptr<ClientContext> &context, const
|
|
56
70
|
columns.emplace_back(names[i], types[i]);
|
57
71
|
}
|
58
72
|
|
59
|
-
|
73
|
+
//! Capture the options potentially set/altered by the auto detection phase
|
74
|
+
csv_options.ToNamedParameters(options);
|
75
|
+
|
76
|
+
// No need to auto-detect again
|
77
|
+
options["auto_detect"] = Value::BOOLEAN(false);
|
78
|
+
SetNamedParameters(std::move(options));
|
60
79
|
}
|
61
80
|
|
62
81
|
string ReadCSVRelation::GetAlias() {
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#include "duckdb/main/client_context.hpp"
|
10
10
|
#include "duckdb/parser/expression/comparison_expression.hpp"
|
11
11
|
#include "duckdb/parser/expression/columnref_expression.hpp"
|
12
|
+
#include "duckdb/common/shared_ptr.hpp"
|
12
13
|
|
13
14
|
namespace duckdb {
|
14
15
|
|
@@ -16,7 +17,12 @@ void TableFunctionRelation::AddNamedParameter(const string &name, Value argument
|
|
16
17
|
named_parameters[name] = std::move(argument);
|
17
18
|
}
|
18
19
|
|
19
|
-
TableFunctionRelation::
|
20
|
+
void TableFunctionRelation::SetNamedParameters(named_parameter_map_t &&options) {
|
21
|
+
D_ASSERT(named_parameters.empty());
|
22
|
+
named_parameters = std::move(options);
|
23
|
+
}
|
24
|
+
|
25
|
+
TableFunctionRelation::TableFunctionRelation(const shared_ptr<ClientContext> &context, string name_p,
|
20
26
|
vector<Value> parameters_p, named_parameter_map_t named_parameters,
|
21
27
|
shared_ptr<Relation> input_relation_p, bool auto_init)
|
22
28
|
: Relation(context, RelationType::TABLE_FUNCTION_RELATION), name(std::move(name_p)),
|
@@ -25,7 +31,7 @@ TableFunctionRelation::TableFunctionRelation(const std::shared_ptr<ClientContext
|
|
25
31
|
InitializeColumns();
|
26
32
|
}
|
27
33
|
|
28
|
-
TableFunctionRelation::TableFunctionRelation(const
|
34
|
+
TableFunctionRelation::TableFunctionRelation(const shared_ptr<ClientContext> &context, string name_p,
|
29
35
|
vector<Value> parameters_p, shared_ptr<Relation> input_relation_p,
|
30
36
|
bool auto_init)
|
31
37
|
: Relation(context, RelationType::TABLE_FUNCTION_RELATION), name(std::move(name_p)),
|