duckdb 0.8.2-dev4514.0 → 0.8.2-dev4623.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/duckdb.js +11 -1
- package/package.json +3 -1
- package/src/connection.cpp +48 -7
- package/src/duckdb/src/catalog/catalog.cpp +5 -0
- package/src/duckdb/src/catalog/duck_catalog.cpp +4 -0
- package/src/duckdb/src/common/enum_util.cpp +24 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +213 -2
- package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +59 -38
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +5 -0
- package/src/duckdb/src/function/table/arrow.cpp +18 -13
- package/src/duckdb/src/function/table/read_csv.cpp +3 -130
- package/src/duckdb/src/function/table/system/pragma_metadata_info.cpp +83 -0
- package/src/duckdb/src/function/table/system/pragma_storage_info.cpp +5 -0
- package/src/duckdb/src/function/table/system_functions.cpp +1 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +2 -0
- package/src/duckdb/src/include/duckdb/catalog/duck_catalog.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/box_renderer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +36 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +24 -0
- package/src/duckdb/src/include/duckdb/function/compression_function.hpp +36 -4
- package/src/duckdb/src/include/duckdb/function/table/arrow.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +1 -1
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +10 -4
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +3 -3
- package/src/duckdb/src/include/duckdb/main/relation/table_function_relation.hpp +1 -0
- package/src/duckdb/src/include/duckdb/storage/checkpoint/string_checkpoint_state.hpp +27 -4
- package/src/duckdb/src/include/duckdb/storage/checkpoint/write_overflow_strings_to_disk.hpp +4 -2
- package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +22 -1
- package/src/duckdb/src/include/duckdb/storage/database_size.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +2 -0
- package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +6 -1
- package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/table_storage_info.hpp +1 -0
- package/src/duckdb/src/main/connection.cpp +4 -6
- package/src/duckdb/src/main/extension/extension_install.cpp +2 -1
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +28 -9
- package/src/duckdb/src/main/relation/table_function_relation.cpp +8 -2
- package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +1 -4
- package/src/duckdb/src/storage/checkpoint/row_group_writer.cpp +1 -4
- package/src/duckdb/src/storage/checkpoint/write_overflow_strings_to_disk.cpp +47 -10
- package/src/duckdb/src/storage/checkpoint_manager.cpp +0 -2
- package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +6 -1
- package/src/duckdb/src/storage/compression/string_uncompressed.cpp +62 -12
- package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -1
- package/src/duckdb/src/storage/data_pointer.cpp +20 -0
- package/src/duckdb/src/storage/local_storage.cpp +3 -7
- package/src/duckdb/src/storage/metadata/metadata_manager.cpp +29 -15
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +4 -0
- package/src/duckdb/src/storage/single_file_block_manager.cpp +15 -9
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/storage_manager.cpp +5 -0
- package/src/duckdb/src/storage/table/column_checkpoint_state.cpp +3 -0
- package/src/duckdb/src/storage/table/column_data.cpp +17 -14
- package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +4 -8
- package/src/duckdb/src/storage/table/column_segment.cpp +21 -12
- package/src/duckdb/ub_src_function_table_system.cpp +2 -0
- package/src/duckdb/ub_src_storage.cpp +2 -0
- package/src/duckdb_node.hpp +1 -0
- package/test/close_hang.test.ts +39 -0
package/lib/duckdb.js
CHANGED
@@ -412,6 +412,13 @@ Connection.prototype.register_buffer;
  */
 Connection.prototype.unregister_buffer;
 
+/**
+ * Closes connection
+ * @method
+ * @param callback
+ * @return {void}
+ */
+Connection.prototype.close;
 
 /**
  * Closes database instance
@@ -420,7 +427,10 @@ Connection.prototype.unregister_buffer;
  * @return {void}
  */
 Database.prototype.close = function() {
-    this.default_connection = null;
+    if (this.default_connection) {
+        this.default_connection.close(); // this queues up a job in the internals, which blocks the below close call
+        this.default_connection = null;
+    }
     this.close_internal.apply(this, arguments);
 };
 
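The new Connection.prototype.close follows the same callback convention as the rest of the API, and Database.prototype.close now closes the default connection before tearing down the database. A minimal usage sketch (the query is illustrative):

    const duckdb = require('duckdb');

    const db = new duckdb.Database(':memory:');
    const con = db.connect();

    con.all('SELECT 42 AS answer', (err, rows) => {
        if (err) throw err;
        console.log(rows); // [ { answer: 42 } ]
        // Close the connection explicitly; on success the callback
        // receives (null, connection), per CloseConnectionTask below.
        con.close((err) => {
            if (err) throw err;
            db.close(); // the default connection is closed automatically
        });
    });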
package/package.json
CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.8.2-dev4514.0",
+  "version": "0.8.2-dev4623.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
@@ -29,12 +29,14 @@
   "devDependencies": {
     "@types/chai": "^4.3.4",
     "@types/chai-as-promised": "^7.1.5",
+    "@types/fs-extra": "^11.0.1",
     "@types/mocha": "^10.0.0",
     "@types/node": "^18.11.0",
     "apache-arrow": "^9.0.0",
     "aws-sdk": "^2.790.0",
     "chai": "^4.3.6",
     "chai-as-promised": "^7.1.1",
+    "fs-extra": "^11.1.1",
    "jsdoc3-parser": "^2.0.0",
     "mocha": "^8.3.0",
     "ts-node": "^10.9.1",
package/src/connection.cpp
CHANGED
@@ -12,13 +12,13 @@ namespace node_duckdb {
 Napi::FunctionReference Connection::Init(Napi::Env env, Napi::Object exports) {
     Napi::HandleScope scope(env);
 
-    Napi::Function t =
-
-
-
-
-
-
+    Napi::Function t = DefineClass(
+        env, "Connection",
+        {InstanceMethod("prepare", &Connection::Prepare), InstanceMethod("exec", &Connection::Exec),
+         InstanceMethod("register_udf_bulk", &Connection::RegisterUdf),
+         InstanceMethod("register_buffer", &Connection::RegisterBuffer),
+         InstanceMethod("unregister_udf", &Connection::UnregisterUdf), InstanceMethod("close", &Connection::Close),
+         InstanceMethod("unregister_buffer", &Connection::UnRegisterBuffer)});
 
     exports.Set("Connection", t);
 
@@ -407,6 +407,36 @@ struct ExecTaskWithCallback : public ExecTask {
     std::function<void(void)> cpp_callback;
 };
 
+struct CloseConnectionTask : public Task {
+    CloseConnectionTask(Connection &connection, Napi::Function callback) : Task(connection, callback) {
+    }
+
+    void DoWork() override {
+        auto &connection = Get<Connection>();
+        if (connection.connection) {
+            connection.connection.reset();
+            success = true;
+        } else {
+            success = false;
+        }
+    }
+
+    void Callback() override {
+        auto &connection = Get<Connection>();
+        auto env = connection.Env();
+        Napi::HandleScope scope(env);
+
+        auto cb = callback.Value();
+        if (!success) {
+            cb.MakeCallback(connection.Value(), {Utils::CreateError(env, "Connection was already closed")});
+            return;
+        }
+        cb.MakeCallback(connection.Value(), {env.Null(), connection.Value()});
+    }
+
+    bool success = false;
+};
+
 Napi::Value Connection::Exec(const Napi::CallbackInfo &info) {
     auto env = info.Env();
 
@@ -512,6 +542,17 @@ Napi::Value Connection::UnRegisterBuffer(const Napi::CallbackInfo &info) {
     return Value();
 }
 
+Napi::Value Connection::Close(const Napi::CallbackInfo &info) {
+    Napi::Function callback;
+    if (info.Length() > 0 && info[0].IsFunction()) {
+        callback = info[0].As<Napi::Function>();
+    }
+
+    database_ref->Schedule(info.Env(), duckdb::make_uniq<CloseConnectionTask>(*this, callback));
+
+    return info.Env().Undefined();
+}
+
 Napi::Object Connection::NewInstance(const Napi::Value &db) {
     return NodeDuckDB::GetData(db.Env())->connection_constructor.New({db});
 }
package/src/duckdb/src/catalog/catalog.cpp
CHANGED
@@ -35,6 +35,7 @@
 #include "duckdb/main/database_manager.hpp"
 #include "duckdb/function/built_in_functions.hpp"
 #include "duckdb/catalog/similar_catalog_entry.hpp"
+#include "duckdb/storage/database_size.hpp"
 #include <algorithm>
 
 namespace duckdb {
@@ -831,6 +832,10 @@ void Catalog::Alter(ClientContext &context, AlterInfo &info) {
     return lookup.schema->Alter(context, info);
 }
 
+vector<MetadataBlockInfo> Catalog::GetMetadataInfo(ClientContext &context) {
+    return vector<MetadataBlockInfo>();
+}
+
 void Catalog::Verify() {
 }
 
package/src/duckdb/src/catalog/duck_catalog.cpp
CHANGED
@@ -132,6 +132,10 @@ DatabaseSize DuckCatalog::GetDatabaseSize(ClientContext &context) {
     return db.GetStorageManager().GetDatabaseSize();
 }
 
+vector<MetadataBlockInfo> DuckCatalog::GetMetadataInfo(ClientContext &context) {
+    return db.GetStorageManager().GetMetadataInfo();
+}
+
 bool DuckCatalog::InMemory() {
     return db.GetStorageManager().InMemory();
 }
package/src/duckdb/src/common/enum_util.cpp
CHANGED
@@ -11,6 +11,7 @@
 
 #include "duckdb/common/enum_util.hpp"
 #include "duckdb/catalog/catalog_entry/table_column_type.hpp"
+#include "duckdb/common/box_renderer.hpp"
 #include "duckdb/common/enums/access_mode.hpp"
 #include "duckdb/common/enums/aggregate_handling.hpp"
 #include "duckdb/common/enums/catalog_type.hpp"
@@ -4797,6 +4798,29 @@ RelationType EnumUtil::FromString<RelationType>(const char *value) {
     throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
 }
 
+template<>
+const char* EnumUtil::ToChars<RenderMode>(RenderMode value) {
+    switch(value) {
+    case RenderMode::ROWS:
+        return "ROWS";
+    case RenderMode::COLUMNS:
+        return "COLUMNS";
+    default:
+        throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
+    }
+}
+
+template<>
+RenderMode EnumUtil::FromString<RenderMode>(const char *value) {
+    if (StringUtil::Equals(value, "ROWS")) {
+        return RenderMode::ROWS;
+    }
+    if (StringUtil::Equals(value, "COLUMNS")) {
+        return RenderMode::COLUMNS;
+    }
+    throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
+}
+
 template<>
 const char* EnumUtil::ToChars<ResultModifierType>(ResultModifierType value) {
     switch(value) {
package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp
CHANGED
@@ -2,6 +2,8 @@
 #include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/vector_size.hpp"
 #include "duckdb/common/string_util.hpp"
+#include "duckdb/common/enum_util.hpp"
+#include "duckdb/common/multi_file_reader.hpp"
 
 namespace duckdb {
 
@@ -60,6 +62,10 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
     return value.GetValue<int64_t>();
 }
 
+bool CSVReaderOptions::GetHeader() const {
+    return this->dialect_options.header;
+}
+
 void CSVReaderOptions::SetHeader(bool input) {
     this->dialect_options.header = input;
     this->has_header = true;
@@ -69,6 +75,10 @@ void CSVReaderOptions::SetCompression(const string &compression_p) {
     this->compression = FileCompressionTypeFromString(compression_p);
 }
 
+string CSVReaderOptions::GetEscape() const {
+    return std::string(1, this->dialect_options.state_machine_options.escape);
+}
+
 void CSVReaderOptions::SetEscape(const string &input) {
     auto escape_str = input;
     if (escape_str.size() > 1) {
@@ -81,6 +91,19 @@ void CSVReaderOptions::SetEscape(const string &input) {
     this->has_escape = true;
 }
 
+int64_t CSVReaderOptions::GetSkipRows() const {
+    return this->dialect_options.skip_rows;
+}
+
+void CSVReaderOptions::SetSkipRows(int64_t skip_rows) {
+    dialect_options.skip_rows = skip_rows;
+    skip_rows_set = true;
+}
+
+string CSVReaderOptions::GetDelimiter() const {
+    return std::string(1, this->dialect_options.state_machine_options.delimiter);
+}
+
 void CSVReaderOptions::SetDelimiter(const string &input) {
     auto delim_str = StringUtil::Replace(input, "\\t", "\t");
     if (delim_str.size() > 1) {
@@ -93,6 +116,10 @@ void CSVReaderOptions::SetDelimiter(const string &input) {
     this->dialect_options.state_machine_options.delimiter = delim_str[0];
 }
 
+string CSVReaderOptions::GetQuote() const {
+    return std::string(1, this->dialect_options.state_machine_options.quote);
+}
+
 void CSVReaderOptions::SetQuote(const string &quote_p) {
     auto quote_str = quote_p;
     if (quote_str.size() > 1) {
@@ -105,6 +132,10 @@ void CSVReaderOptions::SetQuote(const string &quote_p) {
     this->has_quote = true;
 }
 
+NewLineIdentifier CSVReaderOptions::GetNewline() const {
+    return dialect_options.new_line;
+}
+
 void CSVReaderOptions::SetNewline(const string &input) {
     if (input == "\\n" || input == "\\r") {
         dialect_options.new_line = NewLineIdentifier::SINGLE;
@@ -152,8 +183,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
             sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
         }
     } else if (loption == "skip") {
-        dialect_options.skip_rows = ParseInteger(value, loption);
-        skip_rows_set = true;
+        SetSkipRows(ParseInteger(value, loption));
     } else if (loption == "max_line_size" || loption == "maximum_line_size") {
         maximum_line_size = ParseInteger(value, loption);
     } else if (loption == "sample_chunk_size") {
@@ -296,4 +326,185 @@ string CSVReaderOptions::ToString() const {
            "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
 }
 
+static Value StringVectorToValue(const vector<string> &vec) {
+    vector<Value> content;
+    content.reserve(vec.size());
+    for (auto &item : vec) {
+        content.push_back(Value(item));
+    }
+    return Value::LIST(std::move(content));
+}
+
+static uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
+    //! Const ht with accepted auto_types and their weights in specificity
+    const duckdb::unordered_map<uint8_t, uint8_t> auto_type_candidates_specificity {
+        {(uint8_t)LogicalTypeId::VARCHAR, 0},  {(uint8_t)LogicalTypeId::TIMESTAMP, 1},
+        {(uint8_t)LogicalTypeId::DATE, 2},     {(uint8_t)LogicalTypeId::TIME, 3},
+        {(uint8_t)LogicalTypeId::DOUBLE, 4},   {(uint8_t)LogicalTypeId::FLOAT, 5},
+        {(uint8_t)LogicalTypeId::BIGINT, 6},   {(uint8_t)LogicalTypeId::INTEGER, 7},
+        {(uint8_t)LogicalTypeId::SMALLINT, 8}, {(uint8_t)LogicalTypeId::TINYINT, 9},
+        {(uint8_t)LogicalTypeId::BOOLEAN, 10}, {(uint8_t)LogicalTypeId::SQLNULL, 11}};
+
+    auto id = (uint8_t)candidate_type.id();
+    auto it = auto_type_candidates_specificity.find(id);
+    if (it == auto_type_candidates_specificity.end()) {
+        throw BinderException("Auto Type Candidate of type %s is not accepted as a valid input",
+                              EnumUtil::ToString(candidate_type.id()));
+    }
+    return it->second;
+}
+
+void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
+                                           vector<LogicalType> &return_types, vector<string> &names) {
+    for (auto &kv : in) {
+        if (MultiFileReader::ParseOption(kv.first, kv.second, file_options, context)) {
+            continue;
+        }
+        auto loption = StringUtil::Lower(kv.first);
+        if (loption == "columns") {
+            explicitly_set_columns = true;
+            auto &child_type = kv.second.type();
+            if (child_type.id() != LogicalTypeId::STRUCT) {
+                throw BinderException("read_csv columns requires a struct as input");
+            }
+            auto &struct_children = StructValue::GetChildren(kv.second);
+            D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
+            for (idx_t i = 0; i < struct_children.size(); i++) {
+                auto &name = StructType::GetChildName(child_type, i);
+                auto &val = struct_children[i];
+                names.push_back(name);
+                if (val.type().id() != LogicalTypeId::VARCHAR) {
+                    throw BinderException("read_csv requires a type specification as string");
+                }
+                return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
+            }
+            if (names.empty()) {
+                throw BinderException("read_csv requires at least a single column as input!");
+            }
+        } else if (loption == "auto_type_candidates") {
+            auto_type_candidates.clear();
+            map<uint8_t, LogicalType> candidate_types;
+            // We always have the extremes of Null and Varchar, so we can default to varchar if the
+            // sniffer is not able to confidently detect that column type
+            candidate_types[GetCandidateSpecificity(LogicalType::VARCHAR)] = LogicalType::VARCHAR;
+            candidate_types[GetCandidateSpecificity(LogicalType::SQLNULL)] = LogicalType::SQLNULL;
+
+            auto &child_type = kv.second.type();
+            if (child_type.id() != LogicalTypeId::LIST) {
+                throw BinderException("read_csv auto_types requires a list as input");
+            }
+            auto &list_children = ListValue::GetChildren(kv.second);
+            if (list_children.empty()) {
+                throw BinderException("auto_type_candidates requires at least one type");
+            }
+            for (auto &child : list_children) {
+                if (child.type().id() != LogicalTypeId::VARCHAR) {
+                    throw BinderException("auto_type_candidates requires a type specification as string");
+                }
+                auto candidate_type = TransformStringToLogicalType(StringValue::Get(child), context);
+                candidate_types[GetCandidateSpecificity(candidate_type)] = candidate_type;
+            }
+            for (auto &candidate_type : candidate_types) {
+                auto_type_candidates.emplace_back(candidate_type.second);
+            }
+        } else if (loption == "column_names" || loption == "names") {
+            if (!name_list.empty()) {
+                throw BinderException("read_csv_auto column_names/names can only be supplied once");
+            }
+            if (kv.second.IsNull()) {
+                throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
+            }
+            auto &children = ListValue::GetChildren(kv.second);
+            for (auto &child : children) {
+                name_list.push_back(StringValue::Get(child));
+            }
+        } else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
+            auto &child_type = kv.second.type();
+            if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
+                throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
+            }
+            if (!sql_type_list.empty()) {
+                throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
+            }
+            vector<string> sql_type_names;
+            if (child_type.id() == LogicalTypeId::STRUCT) {
+                auto &struct_children = StructValue::GetChildren(kv.second);
+                D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
+                for (idx_t i = 0; i < struct_children.size(); i++) {
+                    auto &name = StructType::GetChildName(child_type, i);
+                    auto &val = struct_children[i];
+                    if (val.type().id() != LogicalTypeId::VARCHAR) {
+                        throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
+                    }
+                    sql_type_names.push_back(StringValue::Get(val));
+                    sql_types_per_column[name] = i;
+                }
+            } else {
+                auto &list_child = ListType::GetChildType(child_type);
+                if (list_child.id() != LogicalTypeId::VARCHAR) {
+                    throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
+                }
+                auto &children = ListValue::GetChildren(kv.second);
+                for (auto &child : children) {
+                    sql_type_names.push_back(StringValue::Get(child));
+                }
+            }
+            sql_type_list.reserve(sql_type_names.size());
+            for (auto &sql_type : sql_type_names) {
+                auto def_type = TransformStringToLogicalType(sql_type);
+                if (def_type.id() == LogicalTypeId::USER) {
+                    throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
+                                          kv.first);
+                }
+                sql_type_list.push_back(std::move(def_type));
+            }
+        } else if (loption == "all_varchar") {
+            all_varchar = BooleanValue::Get(kv.second);
+        } else if (loption == "normalize_names") {
+            normalize_names = BooleanValue::Get(kv.second);
+        } else {
+            SetReadOption(loption, kv.second, names);
+        }
+    }
+}
+
+//! This function is used to remember options set by the sniffer, for use in ReadCSVRelation
+void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
+    if (has_delimiter) {
+        named_params["delim"] = Value(GetDelimiter());
+    }
+    if (has_newline) {
+        named_params["newline"] = Value(EnumUtil::ToString(GetNewline()));
+    }
+    if (has_quote) {
+        named_params["quote"] = Value(GetQuote());
+    }
+    if (has_escape) {
+        named_params["escape"] = Value(GetEscape());
+    }
+    if (has_header) {
+        named_params["header"] = Value(GetHeader());
+    }
+    named_params["max_line_size"] = Value::BIGINT(maximum_line_size);
+    if (skip_rows_set) {
+        named_params["skip"] = Value::BIGINT(GetSkipRows());
+    }
+    named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
+    named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
+    named_params["null_padding"] = Value::BOOLEAN(null_padding);
+    if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
+        named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
+    }
+    if (!date_format.at(LogicalType::TIMESTAMP).format_specifier.empty()) {
+        named_params["timestampformat"] = Value(date_format.at(LogicalType::TIMESTAMP).format_specifier);
+    }
+
+    named_params["normalize_names"] = Value::BOOLEAN(normalize_names);
+    if (!name_list.empty()) {
+        named_params["column_names"] = StringVectorToValue(name_list);
+    }
+    named_params["all_varchar"] = Value::BOOLEAN(all_varchar);
+    named_params["maximum_line_size"] = Value::BIGINT(maximum_line_size);
+}
+
 } // namespace duckdb
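FromNamedParameters consolidates the named-parameter handling that previously lived in read_csv.cpp (hence the -130 there), and ToNamedParameters lets the sniffed dialect be replayed as named parameters by ReadCSVRelation. The same options are reachable from SQL through the node API; a sketch (file and column names are illustrative):

    const duckdb = require('duckdb');

    const db = new duckdb.Database(':memory:');

    // 'columns' must be a struct of name -> type string, and
    // 'auto_type_candidates' a list of type strings, per the
    // BinderExceptions in FromNamedParameters above.
    db.all(`
        SELECT *
        FROM read_csv('people.csv',
                      header=true,
                      delim=',',
                      columns={'name': 'VARCHAR', 'age': 'INTEGER'})
    `, (err, rows) => {
        if (err) throw err;
        console.log(rows);
    });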
package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp
CHANGED
@@ -6,6 +6,7 @@
 #include "duckdb/storage/table_io_manager.hpp"
 #include "duckdb/transaction/local_storage.hpp"
 #include "duckdb/catalog/catalog_entry/duck_table_entry.hpp"
+#include "duckdb/transaction/duck_transaction.hpp"
 #include "duckdb/storage/table/append_state.hpp"
 #include "duckdb/storage/table/scan_state.hpp"
 
@@ -119,6 +120,7 @@ public:
     idx_t insert_count;
     vector<RowGroupBatchEntry> collections;
     idx_t next_start = 0;
+    bool optimistically_written = false;
 
     void FindMergeCollections(idx_t min_batch_index, optional_idx &merged_batch_index,
                               vector<unique_ptr<RowGroupCollection>> &result) {
@@ -176,10 +178,12 @@ public:
     unique_ptr<RowGroupCollection> MergeCollections(ClientContext &context,
                                                     vector<unique_ptr<RowGroupCollection>> merge_collections,
                                                     OptimisticDataWriter &writer) {
+        D_ASSERT(!merge_collections.empty());
         CollectionMerger merger(context);
         for (auto &collection : merge_collections) {
             merger.AddCollection(std::move(collection));
         }
+        optimistically_written = true;
         return merger.Flush(writer);
     }
 
@@ -373,48 +377,65 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event,
                                                OperatorSinkFinalizeInput &input) const {
     auto &gstate = input.global_state.Cast<BatchInsertGlobalState>();
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if (gstate.optimistically_written || gstate.insert_count >= LocalStorage::MERGE_THRESHOLD) {
+        // we have written data to disk optimistically or are inserting a large amount of data
+        // perform a final pass over all of the row groups and merge them together
+        vector<unique_ptr<CollectionMerger>> mergers;
+        unique_ptr<CollectionMerger> current_merger;
+
+        auto &storage = gstate.table.GetStorage();
+        for (auto &entry : gstate.collections) {
+            if (entry.type == RowGroupBatchType::NOT_FLUSHED) {
+                // this collection has not been flushed: add it to the merge set
+                if (!current_merger) {
+                    current_merger = make_uniq<CollectionMerger>(context);
+                }
+                current_merger->AddCollection(std::move(entry.collection));
+            } else {
+                // this collection has been flushed: it does not need to be merged
+                // create a separate collection merger only for this entry
+                if (current_merger) {
+                    // we have small collections remaining: flush them
+                    mergers.push_back(std::move(current_merger));
+                    current_merger.reset();
+                }
+                auto larger_merger = make_uniq<CollectionMerger>(context);
+                larger_merger->AddCollection(std::move(entry.collection));
+                mergers.push_back(std::move(larger_merger));
             }
-            auto larger_merger = make_uniq<CollectionMerger>(context);
-            larger_merger->AddCollection(std::move(entry.collection));
-            mergers.push_back(std::move(larger_merger));
         }
-
-
-
-    }
+        if (current_merger) {
+            mergers.push_back(std::move(current_merger));
+        }
 
-
-
-
-
-
-
-
-
+        // now that we have created all of the mergers, perform the actual merging
+        vector<unique_ptr<RowGroupCollection>> final_collections;
+        final_collections.reserve(mergers.size());
+        auto &writer = storage.CreateOptimisticWriter(context);
+        for (auto &merger : mergers) {
+            final_collections.push_back(merger->Flush(writer));
+        }
+        storage.FinalizeOptimisticWriter(context, writer);
 
-
-
-
+        // finally, merge the row groups into the local storage
+        for (auto &collection : final_collections) {
+            storage.LocalMerge(context, *collection);
+        }
+    } else {
+        // we are writing a small amount of data to disk
+        // append directly to transaction local storage
+        auto &table = gstate.table;
+        auto &storage = table.GetStorage();
+        LocalAppendState append_state;
+        storage.InitializeLocalAppend(append_state, context);
+        auto &transaction = DuckTransaction::Get(context, table.catalog);
+        for (auto &entry : gstate.collections) {
+            entry.collection->Scan(transaction, [&](DataChunk &insert_chunk) {
+                storage.LocalAppend(append_state, table, context, insert_chunk);
+                return true;
+            });
+        }
+        storage.FinalizeLocalAppend(append_state);
     }
     return SinkFinalizeType::READY;
 }
package/src/duckdb/src/function/pragma/pragma_queries.cpp
CHANGED
@@ -187,9 +187,14 @@ string PragmaStorageInfo(ClientContext &context, const FunctionParameters &parameters) {
     return StringUtil::Format("SELECT * FROM pragma_storage_info('%s');", parameters.values[0].ToString());
 }
 
+string PragmaMetadataInfo(ClientContext &context, const FunctionParameters &parameters) {
+    return "SELECT * FROM pragma_metadata_info();";
+}
+
 void PragmaQueries::RegisterFunction(BuiltinFunctions &set) {
     set.AddFunction(PragmaFunction::PragmaCall("table_info", PragmaTableInfo, {LogicalType::VARCHAR}));
     set.AddFunction(PragmaFunction::PragmaCall("storage_info", PragmaStorageInfo, {LogicalType::VARCHAR}));
+    set.AddFunction(PragmaFunction::PragmaCall("metadata_info", PragmaMetadataInfo, {}));
     set.AddFunction(PragmaFunction::PragmaStatement("show_tables", PragmaShowTables));
     set.AddFunction(PragmaFunction::PragmaStatement("show_tables_expanded", PragmaShowTablesExpanded));
     set.AddFunction(PragmaFunction::PragmaStatement("show_databases", PragmaShowDatabases));
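With the pragma registered, PRAGMA metadata_info simply rewrites to SELECT * FROM pragma_metadata_info();, so both spellings work. A sketch from the node API (the database path is illustrative; the result columns are defined in the new pragma_metadata_info.cpp, which is not shown in this excerpt):

    const duckdb = require('duckdb');

    const db = new duckdb.Database('my.db'); // file-backed database

    db.all('PRAGMA metadata_info;', (err, rows) => {
        if (err) throw err;
        // presumably one row per metadata block tracked by the storage
        // manager, per the vector<MetadataBlockInfo> plumbing above
        console.log(rows);
    });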
package/src/duckdb/src/function/table/arrow.cpp
CHANGED
@@ -208,18 +208,10 @@ void ArrowTableFunction::RenameArrowColumns(vector<string> &names) {
     }
 }
 
-unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &context, TableFunctionBindInput &input,
-                                                           vector<LogicalType> &return_types, vector<string> &names) {
-    auto stream_factory_ptr = input.inputs[0].GetPointer();
-    auto stream_factory_produce = (stream_factory_produce_t)input.inputs[1].GetPointer(); // NOLINT
-    auto stream_factory_get_schema = (stream_factory_get_schema_t)input.inputs[2].GetPointer(); // NOLINT
-
-    auto res = make_uniq<ArrowScanFunctionData>(stream_factory_produce, stream_factory_ptr);
-
-    auto &data = *res;
-    stream_factory_get_schema(stream_factory_ptr, data.schema_root);
-    for (idx_t col_idx = 0; col_idx < (idx_t)data.schema_root.arrow_schema.n_children; col_idx++) {
-        auto &schema = *data.schema_root.arrow_schema.children[col_idx];
+void ArrowTableFunction::PopulateArrowTableType(ArrowTableType &arrow_table, ArrowSchemaWrapper &schema_p,
+                                                vector<string> &names, vector<LogicalType> &return_types) {
+    for (idx_t col_idx = 0; col_idx < (idx_t)schema_p.arrow_schema.n_children; col_idx++) {
+        auto &schema = *schema_p.arrow_schema.children[col_idx];
         if (!schema.release) {
             throw InvalidInputException("arrow_scan: released schema passed");
         }
@@ -233,7 +225,7 @@ unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &context,
         } else {
             return_types.emplace_back(arrow_type->GetDuckType());
         }
-
+        arrow_table.AddColumn(col_idx, std::move(arrow_type));
         auto format = string(schema.format);
         auto name = string(schema.name);
         if (name.empty()) {
@@ -241,6 +233,19 @@ unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &context,
         }
         names.push_back(name);
     }
+}
+
+unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &context, TableFunctionBindInput &input,
+                                                           vector<LogicalType> &return_types, vector<string> &names) {
+    auto stream_factory_ptr = input.inputs[0].GetPointer();
+    auto stream_factory_produce = (stream_factory_produce_t)input.inputs[1].GetPointer(); // NOLINT
+    auto stream_factory_get_schema = (stream_factory_get_schema_t)input.inputs[2].GetPointer(); // NOLINT
+
+    auto res = make_uniq<ArrowScanFunctionData>(stream_factory_produce, stream_factory_ptr);
+
+    auto &data = *res;
+    stream_factory_get_schema(stream_factory_ptr, data.schema_root);
+    PopulateArrowTableType(res->arrow_table, data.schema_root, names, return_types);
     RenameArrowColumns(names);
     res->all_types = return_types;
     return std::move(res);