duckdb 0.7.2-dev3117.0 → 0.7.2-dev3154.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +7 -0
  3. package/src/duckdb/extension/parquet/parquet-extension.cpp +42 -0
  4. package/src/duckdb/extension/parquet/parquet_writer.cpp +23 -9
  5. package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
  6. package/src/duckdb/src/common/types/vector.cpp +4 -5
  7. package/src/duckdb/src/common/types/vector_buffer.cpp +1 -1
  8. package/src/duckdb/src/core_functions/function_list.cpp +1 -0
  9. package/src/duckdb/src/core_functions/scalar/map/map_concat.cpp +186 -0
  10. package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +65 -21
  11. package/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp +494 -0
  12. package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +16 -6
  13. package/src/duckdb/src/execution/window_segment_tree.cpp +17 -13
  14. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  15. package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
  16. package/src/duckdb/src/include/duckdb/core_functions/scalar/map_functions.hpp +14 -1
  17. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp +13 -0
  18. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp +72 -0
  19. package/src/duckdb/src/include/duckdb/function/copy_function.hpp +3 -1
  20. package/src/duckdb/src/planner/operator/logical_delete.cpp +2 -0
  21. package/src/duckdb/src/planner/operator/logical_update.cpp +2 -0
  22. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +1 -0
  23. package/src/duckdb/ub_src_core_functions_scalar_map.cpp +2 -0
  24. package/src/duckdb/ub_src_execution_operator_persistent.cpp +2 -0
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "duckdb",
  "main": "./lib/duckdb.js",
  "types": "./lib/duckdb.d.ts",
- "version": "0.7.2-dev3117.0",
+ "version": "0.7.2-dev3154.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
package/src/duckdb/extension/parquet/include/parquet_writer.hpp CHANGED
@@ -25,12 +25,19 @@ namespace duckdb {
  class FileSystem;
  class FileOpener;

+ struct PreparedRowGroup {
+ duckdb_parquet::format::RowGroup row_group;
+ vector<duckdb::unique_ptr<ColumnWriterState>> states;
+ };
+
  class ParquetWriter {
  public:
  ParquetWriter(FileSystem &fs, string file_name, FileOpener *file_opener, vector<LogicalType> types,
  vector<string> names, duckdb_parquet::format::CompressionCodec::type codec);

  public:
+ void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result);
+ void FlushRowGroup(PreparedRowGroup &row_group);
  void Flush(ColumnDataCollection &buffer);
  void Finalize();

package/src/duckdb/extension/parquet/parquet-extension.cpp CHANGED
@@ -674,9 +674,48 @@ CopyFunctionExecutionMode ParquetWriteExecutionMode(bool preserve_insertion_orde
  if (!preserve_insertion_order) {
  return CopyFunctionExecutionMode::PARALLEL_COPY_TO_FILE;
  }
+ if (supports_batch_index) {
+ return CopyFunctionExecutionMode::BATCH_COPY_TO_FILE;
+ }
  return CopyFunctionExecutionMode::REGULAR_COPY_TO_FILE;
  }
+ //===--------------------------------------------------------------------===//
+ // Prepare Batch
+ //===--------------------------------------------------------------------===//
+ struct ParquetWriteBatchData : public PreparedBatchData {
+ PreparedRowGroup prepared_row_group;
+ };
+
+ unique_ptr<PreparedBatchData> ParquetWritePrepareBatch(ClientContext &context, FunctionData &bind_data,
+ GlobalFunctionData &gstate,
+ unique_ptr<ColumnDataCollection> collection) {
+ auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
+ auto result = make_uniq<ParquetWriteBatchData>();
+ global_state.writer->PrepareRowGroup(*collection, result->prepared_row_group);
+ return std::move(result);
+ }

+ //===--------------------------------------------------------------------===//
+ // Flush Batch
+ //===--------------------------------------------------------------------===//
+ void ParquetWriteFlushBatch(ClientContext &context, FunctionData &bind_data, GlobalFunctionData &gstate,
+ PreparedBatchData &batch_p) {
+ auto &global_state = gstate.Cast<ParquetWriteGlobalState>();
+ auto &batch = batch_p.Cast<ParquetWriteBatchData>();
+ global_state.writer->FlushRowGroup(batch.prepared_row_group);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Desired Batch Size
+ //===--------------------------------------------------------------------===//
+ idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_data_p) {
+ auto &bind_data = bind_data_p.Cast<ParquetWriteBindData>();
+ return bind_data.row_group_size;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Scan Replacement
+ //===--------------------------------------------------------------------===//
  unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
  ReplacementScanData *data) {
  auto lower_name = StringUtil::Lower(table_name);
@@ -719,6 +758,9 @@ void ParquetExtension::Load(DuckDB &db) {
  function.execution_mode = ParquetWriteExecutionMode;
  function.copy_from_bind = ParquetScanFunction::ParquetReadBind;
  function.copy_from_function = scan_fun.functions[0];
+ function.prepare_batch = ParquetWritePrepareBatch;
+ function.flush_batch = ParquetWriteFlushBatch;
+ function.desired_batch_size = ParquetWriteDesiredBatchSize;

  function.extension = "parquet";
  ExtensionUtil::RegisterFunction(db_instance, function);
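With prepare_batch, flush_batch, and desired_batch_size registered, an insertion-order-preserving Parquet COPY can take the new BATCH_COPY_TO_FILE path, with each prepared batch sized to the Parquet row group size. A minimal, illustrative sketch of exercising that path through DuckDB's embedded C++ API (which physical operator is actually chosen still depends on planner settings such as preserve_insertion_order):

    #include "duckdb.hpp"

    int main() {
        // Illustrative only: writes a table to Parquet; ROW_GROUP_SIZE is the
        // Parquet COPY option whose value ParquetWriteDesiredBatchSize reports.
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        con.Query("CREATE TABLE t AS SELECT range AS i FROM range(1000000)");
        con.Query("COPY t TO 'out.parquet' (FORMAT PARQUET, ROW_GROUP_SIZE 122880)");
        return 0;
    }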
package/src/duckdb/extension/parquet/parquet_writer.cpp CHANGED
@@ -261,17 +261,13 @@ ParquetWriter::ParquetWriter(FileSystem &fs, string file_name_p, FileOpener *fil
  }
  }

- void ParquetWriter::Flush(ColumnDataCollection &buffer) {
- if (buffer.Count() == 0) {
- return;
- }
-
+ void ParquetWriter::PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result) {
  // set up a new row group for this chunk collection
- ParquetRowGroup row_group;
+ auto &row_group = result.row_group;
  row_group.num_rows = buffer.Count();
  row_group.__isset.file_offset = true;

- vector<duckdb::unique_ptr<ColumnWriterState>> states;
+ auto &states = result.states;
  // iterate over each of the columns of the chunk collection and write them
  D_ASSERT(buffer.ColumnCount() == column_writers.size());
  for (idx_t col_idx = 0; col_idx < buffer.ColumnCount(); col_idx++) {
@@ -292,10 +288,17 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
  }
  states.push_back(std::move(write_state));
  }
+ }

+ void ParquetWriter::FlushRowGroup(PreparedRowGroup &prepared) {
  lock_guard<mutex> glock(lock);
+ auto &row_group = prepared.row_group;
+ auto &states = prepared.states;
+ if (states.empty()) {
+ throw InternalException("Attempting to flush a row group with no rows");
+ }
  row_group.file_offset = writer->GetTotalWritten();
- for (idx_t col_idx = 0; col_idx < buffer.ColumnCount(); col_idx++) {
+ for (idx_t col_idx = 0; col_idx < states.size(); col_idx++) {
  const auto &col_writer = column_writers[col_idx];
  auto write_state = std::move(states[col_idx]);
  col_writer->FinalizeWrite(*write_state);
@@ -303,7 +306,18 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {

  // append the row group to the file meta data
  file_meta_data.row_groups.push_back(row_group);
- file_meta_data.num_rows += buffer.Count();
+ file_meta_data.num_rows += row_group.num_rows;
+ }
+
+ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
+ if (buffer.Count() == 0) {
+ return;
+ }
+
+ PreparedRowGroup prepared_row_group;
+ PrepareRowGroup(buffer, prepared_row_group);
+
+ FlushRowGroup(prepared_row_group);
  }

  void ParquetWriter::Finalize() {
package/src/duckdb/src/common/enums/physical_operator_type.cpp CHANGED
@@ -49,6 +49,8 @@ string PhysicalOperatorToString(PhysicalOperatorType type) {
  return "COPY_TO_FILE";
  case PhysicalOperatorType::BATCH_COPY_TO_FILE:
  return "BATCH_COPY_TO_FILE";
+ case PhysicalOperatorType::FIXED_BATCH_COPY_TO_FILE:
+ return "FIXED_BATCH_COPY_TO_FILE";
  case PhysicalOperatorType::DELIM_JOIN:
  return "DELIM_JOIN";
  case PhysicalOperatorType::BLOCKWISE_NL_JOIN:
package/src/duckdb/src/common/types/vector.cpp CHANGED
@@ -1763,15 +1763,14 @@ MapInvalidReason MapVector::CheckMapValidity(Vector &map, idx_t count, const Sel

  for (idx_t row = 0; row < count; row++) {
  auto mapped_row = sel.get_index(row);
- auto row_idx = map_vdata.sel->get_index(mapped_row);
+ auto map_idx = map_vdata.sel->get_index(mapped_row);
  // map is allowed to be NULL
- if (!map_validity.RowIsValid(row_idx)) {
+ if (!map_validity.RowIsValid(map_idx)) {
  continue;
  }
- row_idx = key_vdata.sel->get_index(row);
  value_set_t unique_keys;
- for (idx_t i = 0; i < list_data[row_idx].length; i++) {
- auto index = list_data[row_idx].offset + i;
+ for (idx_t i = 0; i < list_data[map_idx].length; i++) {
+ auto index = list_data[map_idx].offset + i;
  index = key_vdata.sel->get_index(index);
  if (!key_validity.RowIsValid(index)) {
  return MapInvalidReason::NULL_KEY;
package/src/duckdb/src/common/types/vector_buffer.cpp CHANGED
@@ -89,7 +89,7 @@ void VectorListBuffer::Append(const Vector &to_append, const SelectionVector &se
  }

  void VectorListBuffer::PushBack(const Value &insert) {
- if (size + 1 > capacity) {
+ while (size + 1 > capacity) {
  child->Resize(capacity, capacity * 2);
  capacity *= 2;
  }
package/src/duckdb/src/core_functions/function_list.cpp CHANGED
@@ -212,6 +212,7 @@ static StaticFunctionDefinition internal_functions[] = {
  DUCKDB_SCALAR_FUNCTION(MakeTimeFun),
  DUCKDB_SCALAR_FUNCTION(MakeTimestampFun),
  DUCKDB_SCALAR_FUNCTION(MapFun),
+ DUCKDB_SCALAR_FUNCTION(MapConcatFun),
  DUCKDB_SCALAR_FUNCTION(MapEntriesFun),
  DUCKDB_SCALAR_FUNCTION(MapExtractFun),
  DUCKDB_SCALAR_FUNCTION(MapFromEntriesFun),
package/src/duckdb/src/core_functions/scalar/map/map_concat.cpp ADDED
@@ -0,0 +1,186 @@
+ #include "duckdb/planner/expression/bound_function_expression.hpp"
+ #include "duckdb/common/string_util.hpp"
+ #include "duckdb/parser/expression/bound_expression.hpp"
+ #include "duckdb/function/scalar/nested_functions.hpp"
+ #include "duckdb/common/types/data_chunk.hpp"
+ #include "duckdb/common/pair.hpp"
+ #include "duckdb/common/types.hpp"
+ #include "duckdb/common/unordered_map.hpp"
+ #include "duckdb/core_functions/scalar/map_functions.hpp"
+
+ namespace duckdb {
+
+ namespace {
+
+ struct MapKeyIndexPair {
+ MapKeyIndexPair(idx_t map, idx_t key) : map_index(map), key_index(key) {
+ }
+ // The index of the map that this key comes from
+ idx_t map_index;
+ // The index within the maps key_list
+ idx_t key_index;
+ };
+
+ } // namespace
+
+ vector<Value> GetListEntries(vector<Value> keys, vector<Value> values) {
+ D_ASSERT(keys.size() == values.size());
+ vector<Value> entries;
+ for (idx_t i = 0; i < keys.size(); i++) {
+ child_list_t<Value> children;
+ children.emplace_back(make_pair("key", std::move(keys[i])));
+ children.emplace_back(make_pair("value", std::move(values[i])));
+ entries.push_back(Value::STRUCT(std::move(children)));
+ }
+ return entries;
+ }
+
+ static void MapConcatFunction(DataChunk &args, ExpressionState &state, Vector &result) {
+ if (result.GetType().id() == LogicalTypeId::SQLNULL) {
+ // All inputs are NULL, just return NULL
+ auto &validity = FlatVector::Validity(result);
+ validity.SetInvalid(0);
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
+ return;
+ }
+ D_ASSERT(result.GetType().id() == LogicalTypeId::MAP);
+ auto count = args.size();
+
+ auto map_count = args.ColumnCount();
+ vector<UnifiedVectorFormat> map_formats(map_count);
+ for (idx_t i = 0; i < map_count; i++) {
+ auto &map = args.data[i];
+ map.ToUnifiedFormat(count, map_formats[i]);
+ }
+ auto result_data = FlatVector::GetData<list_entry_t>(result);
+
+ for (idx_t i = 0; i < count; i++) {
+ // Loop through all the maps per list
+ // we cant do better because all the entries of the child vector have to be contiguous
+ // so we cant start the next row before we have finished the one before it
+ auto &result_entry = result_data[i];
+ vector<MapKeyIndexPair> index_to_map;
+ vector<Value> keys_list;
+ for (idx_t map_idx = 0; map_idx < map_count; map_idx++) {
+ if (args.data[map_idx].GetType().id() == LogicalTypeId::SQLNULL) {
+ continue;
+ }
+ auto &map_format = map_formats[map_idx];
+ auto &keys = MapVector::GetKeys(args.data[map_idx]);
+
+ auto index = map_format.sel->get_index(i);
+ auto entry = ((list_entry_t *)map_format.data)[index];
+
+ // Update the list for this row
+ for (idx_t list_idx = 0; list_idx < entry.length; list_idx++) {
+ auto key_index = entry.offset + list_idx;
+ auto key = keys.GetValue(key_index);
+ auto entry = std::find(keys_list.begin(), keys_list.end(), key);
+ if (entry == keys_list.end()) {
+ // Result list does not contain this value yet
+ keys_list.push_back(key);
+ index_to_map.emplace_back(map_idx, key_index);
+ } else {
+ // Result list already contains this, update where to find the value at
+ auto distance = std::distance(keys_list.begin(), entry);
+ auto &mapping = *(index_to_map.begin() + distance);
+ mapping.key_index = key_index;
+ mapping.map_index = map_idx;
+ }
+ }
+ }
+ vector<Value> values_list;
+ D_ASSERT(keys_list.size() == index_to_map.size());
+ // Get the values from the mapping
+ for (auto &mapping : index_to_map) {
+ auto &map = args.data[mapping.map_index];
+ auto &values = MapVector::GetValues(map);
+ values_list.push_back(values.GetValue(mapping.key_index));
+ }
+ idx_t entries_count = keys_list.size();
+ D_ASSERT(values_list.size() == keys_list.size());
+ result_entry.offset = ListVector::GetListSize(result);
+ result_entry.length = values_list.size();
+ auto list_entries = GetListEntries(std::move(keys_list), std::move(values_list));
+ for (auto &list_entry : list_entries) {
+ ListVector::PushBack(result, list_entry);
+ }
+ ListVector::SetListSize(result, ListVector::GetListSize(result) + entries_count);
+ }
+
+ if (args.AllConstant()) {
+ result.SetVectorType(VectorType::CONSTANT_VECTOR);
+ }
+ result.Verify(count);
+ }
+
+ static bool IsEmptyMap(const LogicalType &map) {
+ D_ASSERT(map.id() == LogicalTypeId::MAP);
+ auto &key_type = MapType::KeyType(map);
+ auto &value_type = MapType::ValueType(map);
+ return key_type.id() == LogicalType::SQLNULL && value_type.id() == LogicalType::SQLNULL;
+ }
+
+ static unique_ptr<FunctionData> MapConcatBind(ClientContext &context, ScalarFunction &bound_function,
+ vector<unique_ptr<Expression>> &arguments) {
+
+ auto arg_count = arguments.size();
+ if (arg_count < 2) {
+ throw InvalidInputException("The provided amount of arguments is incorrect, please provide 2 or more maps");
+ }
+
+ if (arguments[0]->return_type.id() == LogicalTypeId::UNKNOWN) {
+ // Prepared statement
+ bound_function.arguments.emplace_back(LogicalTypeId::UNKNOWN);
+ bound_function.return_type = LogicalType(LogicalTypeId::SQLNULL);
+ return nullptr;
+ }
+
+ LogicalType expected = LogicalType::SQLNULL;
+
+ bool is_null = true;
+ // Check and verify that all the maps are of the same type
+ for (idx_t i = 0; i < arg_count; i++) {
+ auto &arg = arguments[i];
+ auto &map = arg->return_type;
+ if (map.id() == LogicalTypeId::UNKNOWN) {
+ // Prepared statement
+ bound_function.arguments.emplace_back(LogicalTypeId::UNKNOWN);
+ bound_function.return_type = LogicalType(LogicalTypeId::SQLNULL);
+ return nullptr;
+ }
+ if (map.id() == LogicalTypeId::SQLNULL) {
+ // The maps are allowed to be NULL
+ continue;
+ }
+ is_null = false;
+ if (IsEmptyMap(map)) {
+ // Map is allowed to be empty
+ continue;
+ }
+
+ if (expected.id() == LogicalTypeId::SQLNULL) {
+ expected = map;
+ } else if (map != expected) {
+ throw InvalidInputException(
+ "'value' type of map differs between arguments, expected '%s', found '%s' instead", expected.ToString(),
+ map.ToString());
+ }
+ }
+
+ if (expected.id() == LogicalTypeId::SQLNULL && is_null == false) {
+ expected = LogicalType::MAP(LogicalType::SQLNULL, LogicalType::SQLNULL);
+ }
+ bound_function.return_type = expected;
+ return make_uniq<VariableReturnBindData>(bound_function.return_type);
+ }
+
+ ScalarFunction MapConcatFun::GetFunction() {
+ //! the arguments and return types are actually set in the binder function
+ ScalarFunction fun("map_concat", {}, LogicalTypeId::LIST, MapConcatFunction, MapConcatBind);
+ fun.null_handling = FunctionNullHandling::SPECIAL_HANDLING;
+ fun.varargs = LogicalType::ANY;
+ return fun;
+ }
+
+ } // namespace duckdb
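For orientation, a small illustrative sketch of calling the new function through DuckDB's embedded C++ API (assumed usage, not part of this diff): MapConcatBind requires at least two map arguments, and MapConcatFunction keeps the value from the last map that contains a duplicate key.

    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        // Key 2 appears in both maps, so the second map's value wins.
        auto result = con.Query("SELECT map_concat(map([1, 2], ['a', 'b']), map([2, 3], ['c', 'd'])) AS m");
        result->Print(); // expected: {1=a, 2=c, 3=d}
        return 0;
    }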
package/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp CHANGED
@@ -1,18 +1,17 @@
  #include "duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp"
  #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp"
+ #include "duckdb/parallel/base_pipeline_event.hpp"
  #include "duckdb/common/vector_operations/vector_operations.hpp"
  #include "duckdb/common/types/batched_data_collection.hpp"
- #include "duckdb/common/file_system.hpp"
- #include "duckdb/common/file_opener.hpp"
  #include "duckdb/common/allocator.hpp"
  #include <algorithm>

  namespace duckdb {

  PhysicalBatchCopyToFile::PhysicalBatchCopyToFile(vector<LogicalType> types, CopyFunction function_p,
- unique_ptr<FunctionData> bind_data, idx_t estimated_cardinality)
+ unique_ptr<FunctionData> bind_data_p, idx_t estimated_cardinality)
  : PhysicalOperator(PhysicalOperatorType::BATCH_COPY_TO_FILE, std::move(types), estimated_cardinality),
- function(std::move(function_p)), bind_data(std::move(bind_data)) {
+ function(std::move(function_p)), bind_data(std::move(bind_data_p)) {
  if (!function.flush_batch || !function.prepare_batch) {
  throw InternalException(
  "PhysicalBatchCopyToFile created for copy function that does not have prepare_batch/flush_batch defined");
@@ -20,32 +19,52 @@ PhysicalBatchCopyToFile::PhysicalBatchCopyToFile(vector<LogicalType> types, Copy
  }

  //===--------------------------------------------------------------------===//
- // Sink
+ // States
  //===--------------------------------------------------------------------===//
  class BatchCopyToGlobalState : public GlobalSinkState {
  public:
  explicit BatchCopyToGlobalState(unique_ptr<GlobalFunctionData> global_state)
- : rows_copied(0), global_state(std::move(global_state)) {
+ : rows_copied(0), global_state(std::move(global_state)), any_flushing(false) {
  }

  mutex lock;
- mutex flush_lock;
+ //! The total number of rows copied to the file
  atomic<idx_t> rows_copied;
+ //! Global copy state
  unique_ptr<GlobalFunctionData> global_state;
+ //! The prepared batch data by batch index - ready to flush
  map<idx_t, unique_ptr<PreparedBatchData>> batch_data;
+ //! Lock for flushing to disk
+ mutex flush_lock;
+ //! Whether or not any threads are flushing (only one thread can flush at a time)
+ atomic<bool> any_flushing;
+
+ void AddBatchData(idx_t batch_index, unique_ptr<PreparedBatchData> new_batch) {
+ // move the batch data to the set of prepared batch data
+ lock_guard<mutex> l(lock);
+ auto entry = batch_data.insert(make_pair(batch_index, std::move(new_batch)));
+ if (!entry.second) {
+ throw InternalException("Duplicate batch index %llu encountered in PhysicalBatchCopyToFile", batch_index);
+ }
+ }
  };

  class BatchCopyToLocalState : public LocalSinkState {
  public:
  explicit BatchCopyToLocalState(unique_ptr<LocalFunctionData> local_state_p)
- : local_state(std::move(local_state_p)), rows_copied(0), batch_index(0) {
+ : local_state(std::move(local_state_p)), rows_copied(0) {
  }

+ //! Local copy state
  unique_ptr<LocalFunctionData> local_state;
+ //! The current collection we are appending to
  unique_ptr<ColumnDataCollection> collection;
+ //! The append state of the collection
  ColumnDataAppendState append_state;
+ //! How many rows have been copied in total
  idx_t rows_copied;
- idx_t batch_index;
+ //! The current batch index
+ optional_idx batch_index;

  void InitializeCollection(ClientContext &context, const PhysicalOperator &op) {
  collection = make_uniq<ColumnDataCollection>(Allocator::Get(context), op.children[0]->types);
@@ -53,11 +72,15 @@ public:
  }
  };

+ //===--------------------------------------------------------------------===//
+ // Sink
+ //===--------------------------------------------------------------------===//
  SinkResultType PhysicalBatchCopyToFile::Sink(ExecutionContext &context, DataChunk &chunk,
  OperatorSinkInput &input) const {
  auto &state = input.local_state.Cast<BatchCopyToLocalState>();
  if (!state.collection) {
  state.InitializeCollection(context.client, *this);
+ state.batch_index = state.partition_info.batch_index.GetIndex();
  }
  state.rows_copied += chunk.size();
  state.collection->Append(state.append_state, chunk);
@@ -71,10 +94,13 @@ void PhysicalBatchCopyToFile::Combine(ExecutionContext &context, GlobalSinkState
  gstate.rows_copied += state.rows_copied;
  }

- SinkFinalizeType PhysicalBatchCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
- GlobalSinkState &gstate_p) const {
+ //===--------------------------------------------------------------------===//
+ // Finalize
+ //===--------------------------------------------------------------------===//
+ SinkFinalizeType PhysicalBatchCopyToFile::FinalFlush(ClientContext &context, GlobalSinkState &gstate_p) const {
  auto &gstate = gstate_p.Cast<BatchCopyToGlobalState>();
- FlushBatchData(context, gstate_p, NumericLimits<int64_t>::Maximum());
+ idx_t min_batch_index = idx_t(NumericLimits<int64_t>::Maximum());
+ FlushBatchData(context, gstate_p, min_batch_index);
  if (function.copy_to_finalize) {
  function.copy_to_finalize(context, *bind_data, *gstate.global_state);

@@ -85,25 +111,39 @@ SinkFinalizeType PhysicalBatchCopyToFile::Finalize(Pipeline &pipeline, Event &ev
  return SinkFinalizeType::READY;
  }

+ SinkFinalizeType PhysicalBatchCopyToFile::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
+ GlobalSinkState &gstate_p) const {
+ FinalFlush(context, gstate_p);
+ return SinkFinalizeType::READY;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Batch Data Handling
+ //===--------------------------------------------------------------------===//
  void PhysicalBatchCopyToFile::PrepareBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t batch_index,
  unique_ptr<ColumnDataCollection> collection) const {
  auto &gstate = gstate_p.Cast<BatchCopyToGlobalState>();

  // prepare the batch
  auto batch_data = function.prepare_batch(context, *bind_data, *gstate.global_state, std::move(collection));
- // move the batch data to the set of prepared batch data
- lock_guard<mutex> l(gstate.lock);
- gstate.batch_data[batch_index] = std::move(batch_data);
+ gstate.AddBatchData(batch_index, std::move(batch_data));
  }

  void PhysicalBatchCopyToFile::FlushBatchData(ClientContext &context, GlobalSinkState &gstate_p, idx_t min_index) const {
  auto &gstate = gstate_p.Cast<BatchCopyToGlobalState>();

  // flush batch data to disk (if there are any to flush)
- while (true) {
- // grab the flush lock - we can only call flush_batch with this lock
- // otherwise the data might end up in the wrong order
+ // grab the flush lock - we can only call flush_batch with this lock
+ // otherwise the data might end up in the wrong order
+ {
  lock_guard<mutex> l(gstate.flush_lock);
+ if (gstate.any_flushing) {
+ return;
+ }
+ gstate.any_flushing = true;
+ }
+ ActiveFlushGuard active_flush(gstate.any_flushing);
+ while (true) {
  unique_ptr<PreparedBatchData> batch_data;
  {
  // fetch the next batch to flush (if any)
@@ -128,14 +168,18 @@ void PhysicalBatchCopyToFile::FlushBatchData(ClientContext &context, GlobalSinkS
  }
  }

+ //===--------------------------------------------------------------------===//
+ // Next Batch
+ //===--------------------------------------------------------------------===//
  void PhysicalBatchCopyToFile::NextBatch(ExecutionContext &context, GlobalSinkState &gstate_p,
  LocalSinkState &lstate) const {
  auto &state = lstate.Cast<BatchCopyToLocalState>();
- if (state.collection) {
+ if (state.collection && state.collection->Count() > 0) {
  // we finished processing this batch
  // start flushing data
- PrepareBatchData(context.client, gstate_p, state.batch_index, std::move(state.collection));
- FlushBatchData(context.client, gstate_p, lstate.partition_info.min_batch_index.GetIndex());
+ auto min_batch_index = lstate.partition_info.min_batch_index.GetIndex();
+ PrepareBatchData(context.client, gstate_p, state.batch_index.GetIndex(), std::move(state.collection));
+ FlushBatchData(context.client, gstate_p, min_batch_index);
  }
  state.batch_index = lstate.partition_info.batch_index.GetIndex();
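The pattern these hunks implement is: any worker may prepare a batch, but flushing must happen in batch-index order and by at most one thread at a time (flush_lock plus the any_flushing flag, released through ActiveFlushGuard). A standalone, simplified sketch of that idea with hypothetical names — not DuckDB's classes — flushing contiguous batch indexes below a given minimum:

    #include <atomic>
    #include <cstddef>
    #include <map>
    #include <memory>
    #include <mutex>

    // Hypothetical stand-in for a prepared batch (e.g. an encoded row group).
    struct PreparedBatch {};

    struct OrderedFlusher {
        std::mutex lock;                                        // protects 'ready'
        std::map<size_t, std::unique_ptr<PreparedBatch>> ready; // prepared batches by batch index
        std::mutex flush_lock;                                  // guards the any_flushing hand-off
        std::atomic<bool> any_flushing {false};
        size_t next_to_flush = 0;

        // Called by any worker thread once its batch is prepared.
        void Add(size_t batch_index, std::unique_ptr<PreparedBatch> batch) {
            std::lock_guard<std::mutex> l(lock);
            ready.emplace(batch_index, std::move(batch));
        }

        // Flush prepared batches in order, up to (but not including) min_index.
        // Only one thread flushes at a time; everyone else returns immediately.
        void Flush(size_t min_index) {
            {
                std::lock_guard<std::mutex> l(flush_lock);
                if (any_flushing) {
                    return;
                }
                any_flushing = true;
            }
            while (next_to_flush < min_index) {
                std::unique_ptr<PreparedBatch> batch;
                {
                    std::lock_guard<std::mutex> l(lock);
                    auto entry = ready.find(next_to_flush);
                    if (entry == ready.end()) {
                        break; // the next batch in order is not prepared yet
                    }
                    batch = std::move(entry->second);
                    ready.erase(entry);
                }
                // ... write 'batch' to the output file here, outside the map lock ...
                next_to_flush++;
            }
            any_flushing = false; // DuckDB resets this with an RAII guard (ActiveFlushGuard)
        }
    };

Preparing (the expensive encoding work) happens outside any lock, which is what lets PrepareBatchData run on many threads while FlushBatchData keeps the file writes in order.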