duckdb 0.8.2-dev4514.0 → 0.8.2-dev4623.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/lib/duckdb.js +11 -1
  2. package/package.json +3 -1
  3. package/src/connection.cpp +48 -7
  4. package/src/duckdb/src/catalog/catalog.cpp +5 -0
  5. package/src/duckdb/src/catalog/duck_catalog.cpp +4 -0
  6. package/src/duckdb/src/common/enum_util.cpp +24 -0
  7. package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +213 -2
  8. package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +59 -38
  9. package/src/duckdb/src/function/pragma/pragma_queries.cpp +5 -0
  10. package/src/duckdb/src/function/table/arrow.cpp +18 -13
  11. package/src/duckdb/src/function/table/read_csv.cpp +3 -130
  12. package/src/duckdb/src/function/table/system/pragma_metadata_info.cpp +83 -0
  13. package/src/duckdb/src/function/table/system/pragma_storage_info.cpp +5 -0
  14. package/src/duckdb/src/function/table/system_functions.cpp +1 -0
  15. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  16. package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +2 -0
  17. package/src/duckdb/src/include/duckdb/catalog/duck_catalog.hpp +1 -0
  18. package/src/duckdb/src/include/duckdb/common/box_renderer.hpp +1 -1
  19. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
  20. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +36 -0
  21. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +24 -0
  22. package/src/duckdb/src/include/duckdb/function/compression_function.hpp +36 -4
  23. package/src/duckdb/src/include/duckdb/function/table/arrow.hpp +2 -0
  24. package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
  25. package/src/duckdb/src/include/duckdb/main/connection.hpp +1 -1
  26. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +10 -4
  27. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +3 -3
  28. package/src/duckdb/src/include/duckdb/main/relation/table_function_relation.hpp +1 -0
  29. package/src/duckdb/src/include/duckdb/storage/checkpoint/string_checkpoint_state.hpp +27 -4
  30. package/src/duckdb/src/include/duckdb/storage/checkpoint/write_overflow_strings_to_disk.hpp +4 -2
  31. package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +22 -1
  32. package/src/duckdb/src/include/duckdb/storage/database_size.hpp +6 -0
  33. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -0
  34. package/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +2 -0
  35. package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +6 -1
  36. package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +7 -3
  37. package/src/duckdb/src/include/duckdb/storage/table_storage_info.hpp +1 -0
  38. package/src/duckdb/src/main/connection.cpp +4 -6
  39. package/src/duckdb/src/main/extension/extension_install.cpp +2 -1
  40. package/src/duckdb/src/main/relation/read_csv_relation.cpp +28 -9
  41. package/src/duckdb/src/main/relation/table_function_relation.cpp +8 -2
  42. package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +1 -4
  43. package/src/duckdb/src/storage/checkpoint/row_group_writer.cpp +1 -4
  44. package/src/duckdb/src/storage/checkpoint/write_overflow_strings_to_disk.cpp +47 -10
  45. package/src/duckdb/src/storage/checkpoint_manager.cpp +0 -2
  46. package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +6 -1
  47. package/src/duckdb/src/storage/compression/string_uncompressed.cpp +62 -12
  48. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -1
  49. package/src/duckdb/src/storage/data_pointer.cpp +20 -0
  50. package/src/duckdb/src/storage/local_storage.cpp +3 -7
  51. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +29 -15
  52. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +4 -0
  53. package/src/duckdb/src/storage/single_file_block_manager.cpp +15 -9
  54. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  55. package/src/duckdb/src/storage/storage_manager.cpp +5 -0
  56. package/src/duckdb/src/storage/table/column_checkpoint_state.cpp +3 -0
  57. package/src/duckdb/src/storage/table/column_data.cpp +17 -14
  58. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +4 -8
  59. package/src/duckdb/src/storage/table/column_segment.cpp +21 -12
  60. package/src/duckdb/ub_src_function_table_system.cpp +2 -0
  61. package/src/duckdb/ub_src_storage.cpp +2 -0
  62. package/src/duckdb_node.hpp +1 -0
  63. package/test/close_hang.test.ts +39 -0
package/lib/duckdb.js CHANGED
@@ -412,6 +412,13 @@ Connection.prototype.register_buffer;
  */
 Connection.prototype.unregister_buffer;
 
+/**
+ * Closes connection
+ * @method
+ * @param callback
+ * @return {void}
+ */
+Connection.prototype.close;
 
 /**
  * Closes database instance
@@ -420,7 +427,10 @@ Connection.prototype.unregister_buffer;
  * @return {void}
  */
 Database.prototype.close = function() {
-    this.default_connection = null
+    if (this.default_connection) {
+        this.default_connection.close(); // this queues up a job in the internals, which blocks the below close call
+        this.default_connection = null;
+    }
     this.close_internal.apply(this, arguments);
 };
 
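The changes above add an explicit close() to Connection and make Database.close() close its default connection first. A minimal usage sketch of the new API (the in-memory database and query are illustrative, not part of the diff):

const duckdb = require('duckdb');

const db = new duckdb.Database(':memory:');
const con = db.connect();

con.all('SELECT 42 AS answer', (err, rows) => {
    if (err) throw err;
    console.log(rows); // [ { answer: 42 } ]
    // Close the connection explicitly; the callback receives an error
    // if the connection was already closed.
    con.close((closeErr) => {
        if (closeErr) throw closeErr;
        // Database.close() now closes its default connection first,
        // which is the behavior the new test/close_hang.test.ts exercises.
        db.close();
    });
});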
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.8.2-dev4514.0",
+  "version": "0.8.2-dev4623.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
@@ -29,12 +29,14 @@
   "devDependencies": {
     "@types/chai": "^4.3.4",
     "@types/chai-as-promised": "^7.1.5",
+    "@types/fs-extra": "^11.0.1",
     "@types/mocha": "^10.0.0",
     "@types/node": "^18.11.0",
     "apache-arrow": "^9.0.0",
     "aws-sdk": "^2.790.0",
     "chai": "^4.3.6",
     "chai-as-promised": "^7.1.1",
+    "fs-extra": "^11.1.1",
     "jsdoc3-parser": "^2.0.0",
     "mocha": "^8.3.0",
     "ts-node": "^10.9.1",
package/src/connection.cpp CHANGED
@@ -12,13 +12,13 @@ namespace node_duckdb {
 Napi::FunctionReference Connection::Init(Napi::Env env, Napi::Object exports) {
     Napi::HandleScope scope(env);
 
-    Napi::Function t =
-        DefineClass(env, "Connection",
-                    {InstanceMethod("prepare", &Connection::Prepare), InstanceMethod("exec", &Connection::Exec),
-                     InstanceMethod("register_udf_bulk", &Connection::RegisterUdf),
-                     InstanceMethod("register_buffer", &Connection::RegisterBuffer),
-                     InstanceMethod("unregister_udf", &Connection::UnregisterUdf),
-                     InstanceMethod("unregister_buffer", &Connection::UnRegisterBuffer)});
+    Napi::Function t = DefineClass(
+        env, "Connection",
+        {InstanceMethod("prepare", &Connection::Prepare), InstanceMethod("exec", &Connection::Exec),
+         InstanceMethod("register_udf_bulk", &Connection::RegisterUdf),
+         InstanceMethod("register_buffer", &Connection::RegisterBuffer),
+         InstanceMethod("unregister_udf", &Connection::UnregisterUdf), InstanceMethod("close", &Connection::Close),
+         InstanceMethod("unregister_buffer", &Connection::UnRegisterBuffer)});
 
     exports.Set("Connection", t);
 
@@ -407,6 +407,36 @@ struct ExecTaskWithCallback : public ExecTask {
     std::function<void(void)> cpp_callback;
 };
 
+struct CloseConnectionTask : public Task {
+    CloseConnectionTask(Connection &connection, Napi::Function callback) : Task(connection, callback) {
+    }
+
+    void DoWork() override {
+        auto &connection = Get<Connection>();
+        if (connection.connection) {
+            connection.connection.reset();
+            success = true;
+        } else {
+            success = false;
+        }
+    }
+
+    void Callback() override {
+        auto &connection = Get<Connection>();
+        auto env = connection.Env();
+        Napi::HandleScope scope(env);
+
+        auto cb = callback.Value();
+        if (!success) {
+            cb.MakeCallback(connection.Value(), {Utils::CreateError(env, "Connection was already closed")});
+            return;
+        }
+        cb.MakeCallback(connection.Value(), {env.Null(), connection.Value()});
+    }
+
+    bool success = false;
+};
+
 Napi::Value Connection::Exec(const Napi::CallbackInfo &info) {
     auto env = info.Env();
 
@@ -512,6 +542,17 @@ Napi::Value Connection::UnRegisterBuffer(const Napi::CallbackInfo &info) {
     return Value();
 }
 
+Napi::Value Connection::Close(const Napi::CallbackInfo &info) {
+    Napi::Function callback;
+    if (info.Length() > 0 && info[0].IsFunction()) {
+        callback = info[0].As<Napi::Function>();
+    }
+
+    database_ref->Schedule(info.Env(), duckdb::make_uniq<CloseConnectionTask>(*this, callback));
+
+    return info.Env().Undefined();
+}
+
 Napi::Object Connection::NewInstance(const Napi::Value &db) {
     return NodeDuckDB::GetData(db.Env())->connection_constructor.New({db});
 }
package/src/duckdb/src/catalog/catalog.cpp CHANGED
@@ -35,6 +35,7 @@
 #include "duckdb/main/database_manager.hpp"
 #include "duckdb/function/built_in_functions.hpp"
 #include "duckdb/catalog/similar_catalog_entry.hpp"
+#include "duckdb/storage/database_size.hpp"
 #include <algorithm>
 
 namespace duckdb {
@@ -831,6 +832,10 @@ void Catalog::Alter(ClientContext &context, AlterInfo &info) {
     return lookup.schema->Alter(context, info);
 }
 
+vector<MetadataBlockInfo> Catalog::GetMetadataInfo(ClientContext &context) {
+    return vector<MetadataBlockInfo>();
+}
+
 void Catalog::Verify() {
 }
 
package/src/duckdb/src/catalog/duck_catalog.cpp CHANGED
@@ -132,6 +132,10 @@ DatabaseSize DuckCatalog::GetDatabaseSize(ClientContext &context) {
     return db.GetStorageManager().GetDatabaseSize();
 }
 
+vector<MetadataBlockInfo> DuckCatalog::GetMetadataInfo(ClientContext &context) {
+    return db.GetStorageManager().GetMetadataInfo();
+}
+
 bool DuckCatalog::InMemory() {
     return db.GetStorageManager().InMemory();
 }
package/src/duckdb/src/common/enum_util.cpp CHANGED
@@ -11,6 +11,7 @@
 
 #include "duckdb/common/enum_util.hpp"
 #include "duckdb/catalog/catalog_entry/table_column_type.hpp"
+#include "duckdb/common/box_renderer.hpp"
 #include "duckdb/common/enums/access_mode.hpp"
 #include "duckdb/common/enums/aggregate_handling.hpp"
 #include "duckdb/common/enums/catalog_type.hpp"
@@ -4797,6 +4798,29 @@ RelationType EnumUtil::FromString<RelationType>(const char *value) {
     throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
 }
 
+template<>
+const char* EnumUtil::ToChars<RenderMode>(RenderMode value) {
+    switch(value) {
+    case RenderMode::ROWS:
+        return "ROWS";
+    case RenderMode::COLUMNS:
+        return "COLUMNS";
+    default:
+        throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value));
+    }
+}
+
+template<>
+RenderMode EnumUtil::FromString<RenderMode>(const char *value) {
+    if (StringUtil::Equals(value, "ROWS")) {
+        return RenderMode::ROWS;
+    }
+    if (StringUtil::Equals(value, "COLUMNS")) {
+        return RenderMode::COLUMNS;
+    }
+    throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value));
+}
+
 template<>
 const char* EnumUtil::ToChars<ResultModifierType>(ResultModifierType value) {
     switch(value) {
package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp CHANGED
@@ -2,6 +2,8 @@
 #include "duckdb/common/bind_helpers.hpp"
 #include "duckdb/common/vector_size.hpp"
 #include "duckdb/common/string_util.hpp"
+#include "duckdb/common/enum_util.hpp"
+#include "duckdb/common/multi_file_reader.hpp"
 
 namespace duckdb {
 
@@ -60,6 +62,10 @@ static int64_t ParseInteger(const Value &value, const string &loption) {
     return value.GetValue<int64_t>();
 }
 
+bool CSVReaderOptions::GetHeader() const {
+    return this->dialect_options.header;
+}
+
 void CSVReaderOptions::SetHeader(bool input) {
     this->dialect_options.header = input;
     this->has_header = true;
@@ -69,6 +75,10 @@ void CSVReaderOptions::SetCompression(const string &compression_p) {
     this->compression = FileCompressionTypeFromString(compression_p);
 }
 
+string CSVReaderOptions::GetEscape() const {
+    return std::string(1, this->dialect_options.state_machine_options.escape);
+}
+
 void CSVReaderOptions::SetEscape(const string &input) {
     auto escape_str = input;
     if (escape_str.size() > 1) {
@@ -81,6 +91,19 @@ void CSVReaderOptions::SetEscape(const string &input) {
     this->has_escape = true;
 }
 
+int64_t CSVReaderOptions::GetSkipRows() const {
+    return this->dialect_options.skip_rows;
+}
+
+void CSVReaderOptions::SetSkipRows(int64_t skip_rows) {
+    dialect_options.skip_rows = skip_rows;
+    skip_rows_set = true;
+}
+
+string CSVReaderOptions::GetDelimiter() const {
+    return std::string(1, this->dialect_options.state_machine_options.delimiter);
+}
+
 void CSVReaderOptions::SetDelimiter(const string &input) {
     auto delim_str = StringUtil::Replace(input, "\\t", "\t");
     if (delim_str.size() > 1) {
@@ -93,6 +116,10 @@ void CSVReaderOptions::SetDelimiter(const string &input) {
     this->dialect_options.state_machine_options.delimiter = delim_str[0];
 }
 
+string CSVReaderOptions::GetQuote() const {
+    return std::string(1, this->dialect_options.state_machine_options.quote);
+}
+
 void CSVReaderOptions::SetQuote(const string &quote_p) {
     auto quote_str = quote_p;
     if (quote_str.size() > 1) {
@@ -105,6 +132,10 @@ void CSVReaderOptions::SetQuote(const string &quote_p) {
     this->has_quote = true;
 }
 
+NewLineIdentifier CSVReaderOptions::GetNewline() const {
+    return dialect_options.new_line;
+}
+
 void CSVReaderOptions::SetNewline(const string &input) {
     if (input == "\\n" || input == "\\r") {
         dialect_options.new_line = NewLineIdentifier::SINGLE;
@@ -152,8 +183,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
             sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
         }
     } else if (loption == "skip") {
-        dialect_options.skip_rows = ParseInteger(value, loption);
-        skip_rows_set = true;
+        SetSkipRows(ParseInteger(value, loption));
     } else if (loption == "max_line_size" || loption == "maximum_line_size") {
         maximum_line_size = ParseInteger(value, loption);
     } else if (loption == "sample_chunk_size") {
@@ -296,4 +326,185 @@ string CSVReaderOptions::ToString() const {
            "\n ignore_errors=" + std::to_string(ignore_errors) + "\n all_varchar=" + std::to_string(all_varchar);
 }
 
+static Value StringVectorToValue(const vector<string> &vec) {
+    vector<Value> content;
+    content.reserve(vec.size());
+    for (auto &item : vec) {
+        content.push_back(Value(item));
+    }
+    return Value::LIST(std::move(content));
+}
+
+static uint8_t GetCandidateSpecificity(const LogicalType &candidate_type) {
+    //! Const ht with accepted auto_types and their weights in specificity
+    const duckdb::unordered_map<uint8_t, uint8_t> auto_type_candidates_specificity {
+        {(uint8_t)LogicalTypeId::VARCHAR, 0},  {(uint8_t)LogicalTypeId::TIMESTAMP, 1},
+        {(uint8_t)LogicalTypeId::DATE, 2},     {(uint8_t)LogicalTypeId::TIME, 3},
+        {(uint8_t)LogicalTypeId::DOUBLE, 4},   {(uint8_t)LogicalTypeId::FLOAT, 5},
+        {(uint8_t)LogicalTypeId::BIGINT, 6},   {(uint8_t)LogicalTypeId::INTEGER, 7},
+        {(uint8_t)LogicalTypeId::SMALLINT, 8}, {(uint8_t)LogicalTypeId::TINYINT, 9},
+        {(uint8_t)LogicalTypeId::BOOLEAN, 10}, {(uint8_t)LogicalTypeId::SQLNULL, 11}};
+
+    auto id = (uint8_t)candidate_type.id();
+    auto it = auto_type_candidates_specificity.find(id);
+    if (it == auto_type_candidates_specificity.end()) {
+        throw BinderException("Auto Type Candidate of type %s is not accepted as a valid input",
+                              EnumUtil::ToString(candidate_type.id()));
+    }
+    return it->second;
+}
+
+void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
+                                           vector<LogicalType> &return_types, vector<string> &names) {
+    for (auto &kv : in) {
+        if (MultiFileReader::ParseOption(kv.first, kv.second, file_options, context)) {
+            continue;
+        }
+        auto loption = StringUtil::Lower(kv.first);
+        if (loption == "columns") {
+            explicitly_set_columns = true;
+            auto &child_type = kv.second.type();
+            if (child_type.id() != LogicalTypeId::STRUCT) {
+                throw BinderException("read_csv columns requires a struct as input");
+            }
+            auto &struct_children = StructValue::GetChildren(kv.second);
+            D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
+            for (idx_t i = 0; i < struct_children.size(); i++) {
+                auto &name = StructType::GetChildName(child_type, i);
+                auto &val = struct_children[i];
+                names.push_back(name);
+                if (val.type().id() != LogicalTypeId::VARCHAR) {
+                    throw BinderException("read_csv requires a type specification as string");
+                }
+                return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
+            }
+            if (names.empty()) {
+                throw BinderException("read_csv requires at least a single column as input!");
+            }
+        } else if (loption == "auto_type_candidates") {
+            auto_type_candidates.clear();
+            map<uint8_t, LogicalType> candidate_types;
+            // We always have the extremes of Null and Varchar, so we can default to varchar if the
+            // sniffer is not able to confidently detect that column type
+            candidate_types[GetCandidateSpecificity(LogicalType::VARCHAR)] = LogicalType::VARCHAR;
+            candidate_types[GetCandidateSpecificity(LogicalType::SQLNULL)] = LogicalType::SQLNULL;
+
+            auto &child_type = kv.second.type();
+            if (child_type.id() != LogicalTypeId::LIST) {
+                throw BinderException("read_csv auto_types requires a list as input");
+            }
+            auto &list_children = ListValue::GetChildren(kv.second);
+            if (list_children.empty()) {
+                throw BinderException("auto_type_candidates requires at least one type");
+            }
+            for (auto &child : list_children) {
+                if (child.type().id() != LogicalTypeId::VARCHAR) {
+                    throw BinderException("auto_type_candidates requires a type specification as string");
+                }
+                auto candidate_type = TransformStringToLogicalType(StringValue::Get(child), context);
+                candidate_types[GetCandidateSpecificity(candidate_type)] = candidate_type;
+            }
+            for (auto &candidate_type : candidate_types) {
+                auto_type_candidates.emplace_back(candidate_type.second);
+            }
+        } else if (loption == "column_names" || loption == "names") {
+            if (!name_list.empty()) {
+                throw BinderException("read_csv_auto column_names/names can only be supplied once");
+            }
+            if (kv.second.IsNull()) {
+                throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
+            }
+            auto &children = ListValue::GetChildren(kv.second);
+            for (auto &child : children) {
+                name_list.push_back(StringValue::Get(child));
+            }
+        } else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
+            auto &child_type = kv.second.type();
+            if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
+                throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
+            }
+            if (!sql_type_list.empty()) {
+                throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
+            }
+            vector<string> sql_type_names;
+            if (child_type.id() == LogicalTypeId::STRUCT) {
+                auto &struct_children = StructValue::GetChildren(kv.second);
+                D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
+                for (idx_t i = 0; i < struct_children.size(); i++) {
+                    auto &name = StructType::GetChildName(child_type, i);
+                    auto &val = struct_children[i];
+                    if (val.type().id() != LogicalTypeId::VARCHAR) {
+                        throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
+                    }
+                    sql_type_names.push_back(StringValue::Get(val));
+                    sql_types_per_column[name] = i;
+                }
+            } else {
+                auto &list_child = ListType::GetChildType(child_type);
+                if (list_child.id() != LogicalTypeId::VARCHAR) {
+                    throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
+                }
+                auto &children = ListValue::GetChildren(kv.second);
+                for (auto &child : children) {
+                    sql_type_names.push_back(StringValue::Get(child));
+                }
+            }
+            sql_type_list.reserve(sql_type_names.size());
+            for (auto &sql_type : sql_type_names) {
+                auto def_type = TransformStringToLogicalType(sql_type);
+                if (def_type.id() == LogicalTypeId::USER) {
+                    throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
+                                          kv.first);
+                }
+                sql_type_list.push_back(std::move(def_type));
+            }
+        } else if (loption == "all_varchar") {
+            all_varchar = BooleanValue::Get(kv.second);
+        } else if (loption == "normalize_names") {
+            normalize_names = BooleanValue::Get(kv.second);
+        } else {
+            SetReadOption(loption, kv.second, names);
+        }
+    }
+}
+
+//! This function is used to remember options set by the sniffer, for use in ReadCSVRelation
+void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
+    if (has_delimiter) {
+        named_params["delim"] = Value(GetDelimiter());
+    }
+    if (has_newline) {
+        named_params["newline"] = Value(EnumUtil::ToString(GetNewline()));
+    }
+    if (has_quote) {
+        named_params["quote"] = Value(GetQuote());
+    }
+    if (has_escape) {
+        named_params["escape"] = Value(GetEscape());
+    }
+    if (has_header) {
+        named_params["header"] = Value(GetHeader());
+    }
+    named_params["max_line_size"] = Value::BIGINT(maximum_line_size);
+    if (skip_rows_set) {
+        named_params["skip"] = Value::BIGINT(GetSkipRows());
+    }
+    named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
+    named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
+    named_params["null_padding"] = Value::BOOLEAN(null_padding);
+    if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
+        named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
+    }
+    if (!date_format.at(LogicalType::TIMESTAMP).format_specifier.empty()) {
+        named_params["timestampformat"] = Value(date_format.at(LogicalType::TIMESTAMP).format_specifier);
+    }
+
+    named_params["normalize_names"] = Value::BOOLEAN(normalize_names);
+    if (!name_list.empty()) {
+        named_params["column_names"] = StringVectorToValue(name_list);
+    }
+    named_params["all_varchar"] = Value::BOOLEAN(all_varchar);
+    named_params["maximum_line_size"] = Value::BIGINT(maximum_line_size);
+}
+
 } // namespace duckdb
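The named parameters handled by FromNamedParameters above are the same options read_csv accepts in SQL, and ToNamedParameters round-trips sniffer results back into that form for ReadCSVRelation. A hedged sketch of exercising a few of these options through the node API (the file name and column layout are hypothetical):

const duckdb = require('duckdb');

const db = new duckdb.Database(':memory:');
// delim, header and columns are among the options routed through
// CSVReaderOptions::FromNamedParameters at bind time.
db.all(
    "SELECT * FROM read_csv('people.csv', delim=',', header=true, " +
    "columns={'name': 'VARCHAR', 'age': 'INTEGER'})",
    (err, rows) => {
        if (err) throw err;
        console.log(rows);
    });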
package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp CHANGED
@@ -6,6 +6,7 @@
 #include "duckdb/storage/table_io_manager.hpp"
 #include "duckdb/transaction/local_storage.hpp"
 #include "duckdb/catalog/catalog_entry/duck_table_entry.hpp"
+#include "duckdb/transaction/duck_transaction.hpp"
 #include "duckdb/storage/table/append_state.hpp"
 #include "duckdb/storage/table/scan_state.hpp"
 
@@ -119,6 +120,7 @@ public:
     idx_t insert_count;
     vector<RowGroupBatchEntry> collections;
     idx_t next_start = 0;
+    bool optimistically_written = false;
 
     void FindMergeCollections(idx_t min_batch_index, optional_idx &merged_batch_index,
                               vector<unique_ptr<RowGroupCollection>> &result) {
@@ -176,10 +178,12 @@ public:
     unique_ptr<RowGroupCollection> MergeCollections(ClientContext &context,
                                                     vector<unique_ptr<RowGroupCollection>> merge_collections,
                                                     OptimisticDataWriter &writer) {
+        D_ASSERT(!merge_collections.empty());
         CollectionMerger merger(context);
         for (auto &collection : merge_collections) {
            merger.AddCollection(std::move(collection));
         }
+        optimistically_written = true;
         return merger.Flush(writer);
     }
 
@@ -373,48 +377,65 @@ SinkFinalizeType PhysicalBatchInsert::Finalize(Pipeline &pipeline, Event &event,
                                                OperatorSinkFinalizeInput &input) const {
     auto &gstate = input.global_state.Cast<BatchInsertGlobalState>();
 
-    // in the finalize, do a final pass over all of the collections we created and try to merge smaller collections
-    // together
-    vector<unique_ptr<CollectionMerger>> mergers;
-    unique_ptr<CollectionMerger> current_merger;
-
-    auto &storage = gstate.table.GetStorage();
-    for (auto &entry : gstate.collections) {
-        if (entry.type == RowGroupBatchType::NOT_FLUSHED) {
-            // this collection has not been flushed: add it to the merge set
-            if (!current_merger) {
-                current_merger = make_uniq<CollectionMerger>(context);
-            }
-            current_merger->AddCollection(std::move(entry.collection));
-        } else {
-            // this collection has been flushed: it does not need to be merged
-            // create a separate collection merger only for this entry
-            if (current_merger) {
-                // we have small collections remaining: flush them
-                mergers.push_back(std::move(current_merger));
-                current_merger.reset();
+    if (gstate.optimistically_written || gstate.insert_count >= LocalStorage::MERGE_THRESHOLD) {
+        // we have written data to disk optimistically or are inserting a large amount of data
+        // perform a final pass over all of the row groups and merge them together
+        vector<unique_ptr<CollectionMerger>> mergers;
+        unique_ptr<CollectionMerger> current_merger;
+
+        auto &storage = gstate.table.GetStorage();
+        for (auto &entry : gstate.collections) {
+            if (entry.type == RowGroupBatchType::NOT_FLUSHED) {
+                // this collection has not been flushed: add it to the merge set
+                if (!current_merger) {
+                    current_merger = make_uniq<CollectionMerger>(context);
+                }
+                current_merger->AddCollection(std::move(entry.collection));
+            } else {
+                // this collection has been flushed: it does not need to be merged
+                // create a separate collection merger only for this entry
+                if (current_merger) {
+                    // we have small collections remaining: flush them
+                    mergers.push_back(std::move(current_merger));
+                    current_merger.reset();
+                }
+                auto larger_merger = make_uniq<CollectionMerger>(context);
+                larger_merger->AddCollection(std::move(entry.collection));
+                mergers.push_back(std::move(larger_merger));
             }
-            auto larger_merger = make_uniq<CollectionMerger>(context);
-            larger_merger->AddCollection(std::move(entry.collection));
-            mergers.push_back(std::move(larger_merger));
         }
-    }
-    if (current_merger) {
-        mergers.push_back(std::move(current_merger));
-    }
+        if (current_merger) {
+            mergers.push_back(std::move(current_merger));
+        }
 
-    // now that we have created all of the mergers, perform the actual merging
-    vector<unique_ptr<RowGroupCollection>> final_collections;
-    final_collections.reserve(mergers.size());
-    auto &writer = storage.CreateOptimisticWriter(context);
-    for (auto &merger : mergers) {
-        final_collections.push_back(merger->Flush(writer));
-    }
-    storage.FinalizeOptimisticWriter(context, writer);
+        // now that we have created all of the mergers, perform the actual merging
+        vector<unique_ptr<RowGroupCollection>> final_collections;
+        final_collections.reserve(mergers.size());
+        auto &writer = storage.CreateOptimisticWriter(context);
+        for (auto &merger : mergers) {
+            final_collections.push_back(merger->Flush(writer));
+        }
+        storage.FinalizeOptimisticWriter(context, writer);
 
-    // finally, merge the row groups into the local storage
-    for (auto &collection : final_collections) {
-        storage.LocalMerge(context, *collection);
+        // finally, merge the row groups into the local storage
+        for (auto &collection : final_collections) {
+            storage.LocalMerge(context, *collection);
+        }
+    } else {
+        // we are writing a small amount of data to disk
+        // append directly to transaction local storage
+        auto &table = gstate.table;
+        auto &storage = table.GetStorage();
+        LocalAppendState append_state;
+        storage.InitializeLocalAppend(append_state, context);
+        auto &transaction = DuckTransaction::Get(context, table.catalog);
+        for (auto &entry : gstate.collections) {
+            entry.collection->Scan(transaction, [&](DataChunk &insert_chunk) {
+                storage.LocalAppend(append_state, table, context, insert_chunk);
+                return true;
+            });
+        }
+        storage.FinalizeLocalAppend(append_state);
     }
     return SinkFinalizeType::READY;
 }
package/src/duckdb/src/function/pragma/pragma_queries.cpp CHANGED
@@ -187,9 +187,14 @@ string PragmaStorageInfo(ClientContext &context, const FunctionParameters &param
     return StringUtil::Format("SELECT * FROM pragma_storage_info('%s');", parameters.values[0].ToString());
 }
 
+string PragmaMetadataInfo(ClientContext &context, const FunctionParameters &parameters) {
+    return "SELECT * FROM pragma_metadata_info();";
+}
+
 void PragmaQueries::RegisterFunction(BuiltinFunctions &set) {
     set.AddFunction(PragmaFunction::PragmaCall("table_info", PragmaTableInfo, {LogicalType::VARCHAR}));
     set.AddFunction(PragmaFunction::PragmaCall("storage_info", PragmaStorageInfo, {LogicalType::VARCHAR}));
+    set.AddFunction(PragmaFunction::PragmaCall("metadata_info", PragmaMetadataInfo, {}));
     set.AddFunction(PragmaFunction::PragmaStatement("show_tables", PragmaShowTables));
     set.AddFunction(PragmaFunction::PragmaStatement("show_tables_expanded", PragmaShowTablesExpanded));
     set.AddFunction(PragmaFunction::PragmaStatement("show_databases", PragmaShowDatabases));
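The new metadata_info pragma expands to the query returned by PragmaMetadataInfo, i.e. the pragma_metadata_info() table function added in this release. A small sketch through the node API (the on-disk path is illustrative; the result columns are defined in pragma_metadata_info.cpp and not shown here):

const duckdb = require('duckdb');

const db = new duckdb.Database('example.db');
// The pragma form is rewritten to this SELECT by PragmaMetadataInfo above,
// so querying the table function directly is equivalent.
db.all('SELECT * FROM pragma_metadata_info();', (err, rows) => {
    if (err) throw err;
    console.log(rows);
});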
package/src/duckdb/src/function/table/arrow.cpp CHANGED
@@ -208,18 +208,10 @@ void ArrowTableFunction::RenameArrowColumns(vector<string> &names) {
     }
 }
 
-unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &context, TableFunctionBindInput &input,
-                                                           vector<LogicalType> &return_types, vector<string> &names) {
-    auto stream_factory_ptr = input.inputs[0].GetPointer();
-    auto stream_factory_produce = (stream_factory_produce_t)input.inputs[1].GetPointer();       // NOLINT
-    auto stream_factory_get_schema = (stream_factory_get_schema_t)input.inputs[2].GetPointer(); // NOLINT
-
-    auto res = make_uniq<ArrowScanFunctionData>(stream_factory_produce, stream_factory_ptr);
-
-    auto &data = *res;
-    stream_factory_get_schema(stream_factory_ptr, data.schema_root);
-    for (idx_t col_idx = 0; col_idx < (idx_t)data.schema_root.arrow_schema.n_children; col_idx++) {
-        auto &schema = *data.schema_root.arrow_schema.children[col_idx];
+void ArrowTableFunction::PopulateArrowTableType(ArrowTableType &arrow_table, ArrowSchemaWrapper &schema_p,
+                                                vector<string> &names, vector<LogicalType> &return_types) {
+    for (idx_t col_idx = 0; col_idx < (idx_t)schema_p.arrow_schema.n_children; col_idx++) {
+        auto &schema = *schema_p.arrow_schema.children[col_idx];
         if (!schema.release) {
             throw InvalidInputException("arrow_scan: released schema passed");
         }
@@ -233,7 +225,7 @@ unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &contex
         } else {
             return_types.emplace_back(arrow_type->GetDuckType());
         }
-        res->arrow_table.AddColumn(col_idx, std::move(arrow_type));
+        arrow_table.AddColumn(col_idx, std::move(arrow_type));
         auto format = string(schema.format);
         auto name = string(schema.name);
         if (name.empty()) {
@@ -241,6 +233,19 @@ unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &contex
         }
         names.push_back(name);
     }
+}
+
+unique_ptr<FunctionData> ArrowTableFunction::ArrowScanBind(ClientContext &context, TableFunctionBindInput &input,
+                                                           vector<LogicalType> &return_types, vector<string> &names) {
+    auto stream_factory_ptr = input.inputs[0].GetPointer();
+    auto stream_factory_produce = (stream_factory_produce_t)input.inputs[1].GetPointer();       // NOLINT
+    auto stream_factory_get_schema = (stream_factory_get_schema_t)input.inputs[2].GetPointer(); // NOLINT
+
+    auto res = make_uniq<ArrowScanFunctionData>(stream_factory_produce, stream_factory_ptr);
+
+    auto &data = *res;
+    stream_factory_get_schema(stream_factory_ptr, data.schema_root);
+    PopulateArrowTableType(res->arrow_table, data.schema_root, names, return_types);
     RenameArrowColumns(names);
     res->all_types = return_types;
     return std::move(res);