duckdb 0.7.2-dev1901.0 → 0.7.2-dev2233.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
- package/src/duckdb/extension/parquet/include/parquet_writer.hpp +1 -1
- package/src/duckdb/extension/parquet/parquet_metadata.cpp +4 -2
- package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +1 -1
- package/src/duckdb/src/common/arrow/arrow_appender.cpp +69 -44
- package/src/duckdb/src/common/arrow/arrow_converter.cpp +1 -1
- package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +20 -2
- package/src/duckdb/src/common/box_renderer.cpp +4 -2
- package/src/duckdb/src/common/constants.cpp +10 -1
- package/src/duckdb/src/common/filename_pattern.cpp +41 -0
- package/src/duckdb/src/common/hive_partitioning.cpp +144 -15
- package/src/duckdb/src/common/radix_partitioning.cpp +101 -369
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +8 -9
- package/src/duckdb/src/common/row_operations/row_external.cpp +1 -1
- package/src/duckdb/src/common/row_operations/row_gather.cpp +5 -3
- package/src/duckdb/src/common/row_operations/row_match.cpp +117 -22
- package/src/duckdb/src/common/row_operations/row_scatter.cpp +2 -2
- package/src/duckdb/src/common/sort/partition_state.cpp +1 -1
- package/src/duckdb/src/common/sort/sort_state.cpp +2 -1
- package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
- package/src/duckdb/src/common/types/{column_data_allocator.cpp → column/column_data_allocator.cpp} +2 -2
- package/src/duckdb/src/common/types/{column_data_collection.cpp → column/column_data_collection.cpp} +29 -6
- package/src/duckdb/src/common/types/{column_data_collection_segment.cpp → column/column_data_collection_segment.cpp} +2 -1
- package/src/duckdb/src/common/types/{column_data_consumer.cpp → column/column_data_consumer.cpp} +1 -1
- package/src/duckdb/src/common/types/{partitioned_column_data.cpp → column/partitioned_column_data.cpp} +11 -9
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +316 -0
- package/src/duckdb/src/common/types/{row_data_collection.cpp → row/row_data_collection.cpp} +1 -1
- package/src/duckdb/src/common/types/{row_data_collection_scanner.cpp → row/row_data_collection_scanner.cpp} +2 -2
- package/src/duckdb/src/common/types/{row_layout.cpp → row/row_layout.cpp} +1 -1
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +465 -0
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +511 -0
- package/src/duckdb/src/common/types/row/tuple_data_iterator.cpp +96 -0
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +119 -0
- package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +1200 -0
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +170 -0
- package/src/duckdb/src/common/types/vector.cpp +1 -1
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +252 -290
- package/src/duckdb/src/execution/join_hashtable.cpp +192 -328
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +4 -4
- package/src/duckdb/src/execution/operator/helper/physical_execute.cpp +3 -3
- package/src/duckdb/src/execution/operator/helper/physical_limit_percent.cpp +2 -3
- package/src/duckdb/src/execution/operator/helper/physical_result_collector.cpp +2 -3
- package/src/duckdb/src/execution/operator/join/perfect_hash_join_executor.cpp +36 -21
- package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +2 -2
- package/src/duckdb/src/execution/operator/join/physical_cross_product.cpp +1 -1
- package/src/duckdb/src/execution/operator/join/physical_delim_join.cpp +2 -2
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +166 -144
- package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +5 -5
- package/src/duckdb/src/execution/operator/join/physical_join.cpp +2 -10
- package/src/duckdb/src/execution/operator/join/physical_positional_join.cpp +0 -1
- package/src/duckdb/src/execution/operator/order/physical_top_n.cpp +2 -2
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
- package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +13 -11
- package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +3 -2
- package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +25 -24
- package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -1
- package/src/duckdb/src/execution/operator/persistent/physical_update.cpp +4 -3
- package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +3 -3
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +9 -37
- package/src/duckdb/src/execution/physical_operator.cpp +1 -1
- package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +19 -18
- package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +2 -1
- package/src/duckdb/src/execution/physical_plan/plan_execute.cpp +2 -2
- package/src/duckdb/src/execution/physical_plan/plan_explain.cpp +5 -6
- package/src/duckdb/src/execution/physical_plan/plan_expression_get.cpp +2 -2
- package/src/duckdb/src/execution/physical_plan/plan_recursive_cte.cpp +3 -3
- package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -1
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +39 -17
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +2 -2
- package/src/duckdb/src/function/table/pragma_detailed_profiling_output.cpp +5 -5
- package/src/duckdb/src/function/table/pragma_last_profiling_output.cpp +2 -2
- package/src/duckdb/src/function/table/read_csv.cpp +124 -58
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/fast_mem.hpp +528 -0
- package/src/duckdb/src/include/duckdb/common/filename_pattern.hpp +34 -0
- package/src/duckdb/src/include/duckdb/common/helper.hpp +10 -0
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +13 -3
- package/src/duckdb/src/include/duckdb/common/optional_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +34 -0
- package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +80 -27
- package/src/duckdb/src/include/duckdb/common/reference_map.hpp +38 -0
- package/src/duckdb/src/include/duckdb/common/row_operations/row_operations.hpp +7 -6
- package/src/duckdb/src/include/duckdb/common/sort/comparators.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/sort/sort.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/sort/sorted_block.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/batched_data_collection.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/types/{column_data_allocator.hpp → column/column_data_allocator.hpp} +4 -4
- package/src/duckdb/src/include/duckdb/common/types/{column_data_collection.hpp → column/column_data_collection.hpp} +4 -4
- package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_iterators.hpp → column/column_data_collection_iterators.hpp} +2 -2
- package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_segment.hpp → column/column_data_collection_segment.hpp} +3 -3
- package/src/duckdb/src/include/duckdb/common/types/{column_data_consumer.hpp → column/column_data_consumer.hpp} +8 -4
- package/src/duckdb/src/include/duckdb/common/types/{column_data_scan_states.hpp → column/column_data_scan_states.hpp} +1 -1
- package/src/duckdb/src/include/duckdb/common/types/{partitioned_column_data.hpp → column/partitioned_column_data.hpp} +15 -7
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +140 -0
- package/src/duckdb/src/include/duckdb/common/types/{row_data_collection.hpp → row/row_data_collection.hpp} +1 -1
- package/src/duckdb/src/include/duckdb/common/types/{row_data_collection_scanner.hpp → row/row_data_collection_scanner.hpp} +2 -2
- package/src/duckdb/src/include/duckdb/common/types/{row_layout.hpp → row/row_layout.hpp} +3 -1
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +116 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +239 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_iterator.hpp +64 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +113 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +124 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +74 -0
- package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/types/value.hpp +4 -12
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -31
- package/src/duckdb/src/include/duckdb/execution/base_aggregate_hashtable.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/execution_context.hpp +3 -2
- package/src/duckdb/src/include/duckdb/execution/expression_executor.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +41 -67
- package/src/duckdb/src/include/duckdb/execution/nested_loop_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_execute.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_result_collector.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/join/outer_join_marker.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_cross_product.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_hash_join.hpp +0 -2
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_index_join.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/operator/join/physical_positional_join.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +4 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/physical_column_data_scan.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/set/physical_recursive_cte.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -2
- package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
- package/src/duckdb/src/include/duckdb/main/materialized_query_result.hpp +1 -1
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +14 -1
- package/src/duckdb/src/include/duckdb/optimizer/expression_rewriter.hpp +3 -3
- package/src/duckdb/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +16 -16
- package/src/duckdb/src/include/duckdb/optimizer/join_order/join_node.hpp +8 -8
- package/src/duckdb/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +23 -15
- package/src/duckdb/src/include/duckdb/optimizer/join_order/join_relation.hpp +9 -10
- package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph.hpp +18 -11
- package/src/duckdb/src/include/duckdb/parallel/meta_pipeline.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +5 -1
- package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +3 -2
- package/src/duckdb/src/include/duckdb/parser/query_error_context.hpp +4 -2
- package/src/duckdb/src/include/duckdb/parser/transformer.hpp +9 -35
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +24 -23
- package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +3 -3
- package/src/duckdb/src/include/duckdb/planner/operator/logical_column_data_get.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +3 -1
- package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +1 -1
- package/src/duckdb/src/main/appender.cpp +6 -6
- package/src/duckdb/src/main/client_context.cpp +1 -1
- package/src/duckdb/src/main/connection.cpp +2 -2
- package/src/duckdb/src/main/query_result.cpp +13 -0
- package/src/duckdb/src/main/settings/settings.cpp +3 -4
- package/src/duckdb/src/optimizer/expression_rewriter.cpp +4 -4
- package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +91 -105
- package/src/duckdb/src/optimizer/join_order/join_node.cpp +5 -8
- package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +163 -160
- package/src/duckdb/src/optimizer/join_order/join_relation_set.cpp +30 -30
- package/src/duckdb/src/optimizer/join_order/query_graph.cpp +37 -38
- package/src/duckdb/src/parallel/executor.cpp +1 -1
- package/src/duckdb/src/parallel/meta_pipeline.cpp +2 -2
- package/src/duckdb/src/parser/transform/helpers/transform_cte.cpp +1 -1
- package/src/duckdb/src/parser/transform/tableref/transform_subquery.cpp +1 -1
- package/src/duckdb/src/parser/transformer.cpp +50 -9
- package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +15 -5
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +19 -17
- package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -4
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +20 -21
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +24 -22
- package/src/duckdb/src/planner/binder/tableref/bind_subqueryref.cpp +2 -2
- package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
- package/src/duckdb/src/planner/binder.cpp +16 -19
- package/src/duckdb/src/planner/expression_binder.cpp +8 -8
- package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +3 -3
- package/src/duckdb/src/storage/checkpoint_manager.cpp +23 -23
- package/src/duckdb/src/storage/standard_buffer_manager.cpp +1 -1
- package/src/duckdb/src/storage/table_index_list.cpp +3 -3
- package/src/duckdb/src/verification/statement_verifier.cpp +1 -1
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5552 -5598
- package/src/duckdb/ub_src_common.cpp +2 -0
- package/src/duckdb/ub_src_common_types.cpp +0 -16
- package/src/duckdb/ub_src_common_types_column.cpp +10 -0
- package/src/duckdb/ub_src_common_types_row.cpp +20 -0
@@ -12,6 +12,7 @@
|
|
12
12
|
#include "duckdb/planner/operator/logical_get.hpp"
|
13
13
|
#include "duckdb/main/extension_helper.hpp"
|
14
14
|
#include "duckdb/common/multi_file_reader.hpp"
|
15
|
+
#include "duckdb/main/client_data.hpp"
|
15
16
|
|
16
17
|
#include <limits>
|
17
18
|
|
@@ -23,21 +24,22 @@ unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const string &file_path, FileCompress
|
|
23
24
|
auto opener = FileSystem::GetFileOpener(context);
|
24
25
|
auto file_handle =
|
25
26
|
fs.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression, opener);
|
27
|
+
if (file_handle->CanSeek()) {
|
28
|
+
file_handle->Reset();
|
29
|
+
}
|
26
30
|
return make_uniq<CSVFileHandle>(std::move(file_handle));
|
27
31
|
}
|
28
32
|
|
29
33
|
void ReadCSVData::FinalizeRead(ClientContext &context) {
|
30
34
|
BaseCSVData::Finalize();
|
31
|
-
|
32
|
-
single_threaded = !config.options.experimental_parallel_csv_reader;
|
33
|
-
if (options.has_parallel) {
|
34
|
-
// Override the option set in the config
|
35
|
-
single_threaded = !options.use_parallel;
|
36
|
-
}
|
35
|
+
// Here we identify if we can run this CSV file on parallel or not.
|
37
36
|
bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
|
38
37
|
options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
|
39
38
|
bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
|
40
|
-
|
39
|
+
bool not_supported_options = options.null_padding;
|
40
|
+
|
41
|
+
if (!options.run_parallel || null_or_empty || not_supported_options || complex_options ||
|
42
|
+
options.new_line == NewLineIdentifier::MIX) {
|
41
43
|
// not supported for parallel CSV reading
|
42
44
|
single_threaded = true;
|
43
45
|
}
|
@@ -175,6 +177,8 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
175
177
|
options.all_varchar = BooleanValue::Get(kv.second);
|
176
178
|
} else if (loption == "normalize_names") {
|
177
179
|
options.normalize_names = BooleanValue::Get(kv.second);
|
180
|
+
} else if (loption == "parallel") {
|
181
|
+
options.run_parallel = BooleanValue::Get(kv.second);
|
178
182
|
} else {
|
179
183
|
options.SetReadOption(loption, kv.second, names);
|
180
184
|
}
|
@@ -214,6 +218,13 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
|
|
214
218
|
if (options.file_options.union_by_name) {
|
215
219
|
result->reader_bind =
|
216
220
|
MultiFileReader::BindUnionReader<BufferedCSVReader>(context, return_types, names, *result, options);
|
221
|
+
if (result->union_readers.size() > 1) {
|
222
|
+
result->column_info.emplace_back(result->csv_names, result->csv_types);
|
223
|
+
for (idx_t i = 1; i < result->union_readers.size(); i++) {
|
224
|
+
result->column_info.emplace_back(result->union_readers[i]->names,
|
225
|
+
result->union_readers[i]->return_types);
|
226
|
+
}
|
227
|
+
}
|
217
228
|
if (!options.sql_types_per_column.empty()) {
|
218
229
|
auto exception = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
|
219
230
|
if (!exception.empty()) {
|
@@ -253,17 +264,27 @@ public:
|
|
253
264
|
file_size = file_handle->FileSize();
|
254
265
|
first_file_size = file_size;
|
255
266
|
bytes_read = 0;
|
256
|
-
if (buffer_size < file_size) {
|
267
|
+
if (buffer_size < file_size || file_size == 0) {
|
257
268
|
bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
|
258
269
|
} else {
|
259
270
|
bytes_per_local_state = file_size / MaxThreads();
|
260
271
|
}
|
261
|
-
|
262
|
-
|
263
|
-
|
272
|
+
if (bytes_per_local_state == 0) {
|
273
|
+
// In practice, I think this won't happen, it only happens because we are mocking up test scenarios
|
274
|
+
// this boy needs to be at least one.
|
275
|
+
bytes_per_local_state = 1;
|
276
|
+
}
|
277
|
+
for (idx_t i = 0; i < rows_to_skip; i++) {
|
278
|
+
file_handle->ReadLine();
|
279
|
+
}
|
280
|
+
first_position = current_csv_position;
|
281
|
+
current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
|
282
|
+
next_buffer = shared_ptr<CSVBuffer>(
|
283
|
+
current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
|
264
284
|
running_threads = MaxThreads();
|
265
285
|
}
|
266
286
|
ParallelCSVGlobalState() {
|
287
|
+
running_threads = MaxThreads();
|
267
288
|
}
|
268
289
|
|
269
290
|
~ParallelCSVGlobalState() override {
|
@@ -281,7 +302,7 @@ public:
|
|
281
302
|
//! Verify if the CSV File was read correctly
|
282
303
|
void Verify();
|
283
304
|
|
284
|
-
void UpdateVerification(VerificationPositions positions);
|
305
|
+
void UpdateVerification(VerificationPositions positions, idx_t file_number);
|
285
306
|
|
286
307
|
void IncrementThread();
|
287
308
|
|
@@ -332,14 +353,18 @@ private:
|
|
332
353
|
//! Current batch index
|
333
354
|
idx_t batch_index = 0;
|
334
355
|
//! Forces parallelism for small CSV Files, should only be used for testing.
|
335
|
-
bool force_parallelism;
|
356
|
+
bool force_parallelism = false;
|
336
357
|
//! Current (Global) position of CSV
|
337
358
|
idx_t current_csv_position = 0;
|
359
|
+
//! First Position of First Buffer
|
360
|
+
idx_t first_position = 0;
|
361
|
+
//! Current File Number
|
362
|
+
idx_t file_number = 0;
|
338
363
|
idx_t max_tuple_end = 0;
|
339
364
|
//! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
|
340
365
|
//! positions where they started reading the first line.
|
341
|
-
vector<idx_t
|
342
|
-
set<idx_t
|
366
|
+
vector<vector<idx_t>> tuple_end;
|
367
|
+
vector<set<idx_t>> tuple_start;
|
343
368
|
idx_t running_threads = 0;
|
344
369
|
//! The column ids to read
|
345
370
|
vector<column_t> column_ids;
|
@@ -349,10 +374,9 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
|
|
349
374
|
if (force_parallelism) {
|
350
375
|
return system_threads;
|
351
376
|
}
|
352
|
-
|
353
377
|
idx_t one_mb = 1000000; // We initialize max one thread per Mb
|
354
378
|
idx_t threads_per_mb = first_file_size / one_mb + 1;
|
355
|
-
if (threads_per_mb < system_threads) {
|
379
|
+
if (threads_per_mb < system_threads || threads_per_mb == 1) {
|
356
380
|
return threads_per_mb;
|
357
381
|
}
|
358
382
|
|
@@ -378,25 +402,36 @@ bool ParallelCSVGlobalState::Finished() {
|
|
378
402
|
void ParallelCSVGlobalState::Verify() {
|
379
403
|
// All threads are done, we run some magic sweet verification code
|
380
404
|
if (running_threads == 0) {
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
405
|
+
D_ASSERT(tuple_end.size() == tuple_start.size());
|
406
|
+
for (idx_t i = 0; i < tuple_start.size(); i++) {
|
407
|
+
auto &current_tuple_end = tuple_end[i];
|
408
|
+
auto &current_tuple_start = tuple_start[i];
|
409
|
+
// figure out max value of last_pos
|
410
|
+
if (current_tuple_end.empty()) {
|
411
|
+
return;
|
386
412
|
}
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
413
|
+
auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
|
414
|
+
for (auto &last_pos : current_tuple_end) {
|
415
|
+
auto first_pos = current_tuple_start.find(last_pos);
|
416
|
+
if (first_pos == current_tuple_start.end()) {
|
417
|
+
// this might be necessary due to carriage returns outside buffer scopes.
|
418
|
+
first_pos = current_tuple_start.find(last_pos + 1);
|
393
419
|
}
|
394
|
-
|
395
|
-
|
396
|
-
|
420
|
+
if (first_pos == current_tuple_start.end() && last_pos != max_value) {
|
421
|
+
string error =
|
422
|
+
"Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
|
423
|
+
" does not have a match\n";
|
424
|
+
error += "End Lines: \n";
|
425
|
+
for (auto &end_line : current_tuple_end) {
|
426
|
+
error += to_string(end_line) + "\n";
|
427
|
+
}
|
428
|
+
error += "Start Lines: \n";
|
429
|
+
for (auto &start_line : current_tuple_start) {
|
430
|
+
error += to_string(start_line) + "\n";
|
431
|
+
}
|
432
|
+
throw InvalidInputException(
|
433
|
+
"CSV File not supported for multithreading. Please run single-threaded CSV Reading");
|
397
434
|
}
|
398
|
-
throw InvalidInputException(
|
399
|
-
"CSV File not supported for multithreading. Please run single-threaded CSV Reading");
|
400
435
|
}
|
401
436
|
}
|
402
437
|
}
|
@@ -411,9 +446,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
411
446
|
current_file_path = bind_data.files[file_index++];
|
412
447
|
file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
|
413
448
|
current_csv_position = 0;
|
414
|
-
|
415
|
-
|
416
|
-
|
449
|
+
file_number++;
|
450
|
+
current_buffer =
|
451
|
+
make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
|
452
|
+
next_buffer = shared_ptr<CSVBuffer>(
|
453
|
+
current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
|
417
454
|
} else {
|
418
455
|
// We are done scanning.
|
419
456
|
reader.reset();
|
@@ -433,8 +470,8 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
433
470
|
current_buffer = next_buffer;
|
434
471
|
if (next_buffer) {
|
435
472
|
// Next buffer gets the next-next buffer
|
436
|
-
next_buffer =
|
437
|
-
|
473
|
+
next_buffer = shared_ptr<CSVBuffer>(
|
474
|
+
next_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
|
438
475
|
}
|
439
476
|
}
|
440
477
|
if (!reader || reader->options.file_path != current_file_path) {
|
@@ -443,13 +480,18 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
443
480
|
if (file_index > 0 && file_index <= bind_data.union_readers.size() && bind_data.union_readers[file_index - 1]) {
|
444
481
|
// we are doing UNION BY NAME - fetch the options from the union reader for this file
|
445
482
|
auto &union_reader = *bind_data.union_readers[file_index - 1];
|
446
|
-
reader =
|
447
|
-
|
483
|
+
reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
|
484
|
+
union_reader.GetTypes());
|
448
485
|
reader->names = union_reader.GetNames();
|
486
|
+
} else if (file_index <= bind_data.column_info.size()) {
|
487
|
+
// Serialized Union By name
|
488
|
+
reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
|
489
|
+
bind_data.column_info[file_index - 1].types);
|
490
|
+
reader->names = bind_data.column_info[file_index - 1].names;
|
449
491
|
} else {
|
450
492
|
// regular file - use the standard options
|
451
|
-
reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result),
|
452
|
-
|
493
|
+
reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
|
494
|
+
bind_data.csv_types);
|
453
495
|
reader->names = bind_data.csv_names;
|
454
496
|
}
|
455
497
|
reader->options.file_path = current_file_path;
|
@@ -461,14 +503,20 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
|
|
461
503
|
}
|
462
504
|
return true;
|
463
505
|
}
|
464
|
-
void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
|
506
|
+
void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
|
465
507
|
lock_guard<mutex> parallel_lock(main_mutex);
|
466
508
|
if (positions.beginning_of_first_line < positions.end_of_last_line) {
|
467
509
|
if (positions.end_of_last_line > max_tuple_end) {
|
468
510
|
max_tuple_end = positions.end_of_last_line;
|
469
511
|
}
|
470
|
-
tuple_start.
|
471
|
-
|
512
|
+
while (file_number_p >= tuple_start.size()) {
|
513
|
+
vector<idx_t> empty_tuple_end;
|
514
|
+
set<idx_t> empty_set;
|
515
|
+
tuple_start.emplace_back(empty_set);
|
516
|
+
tuple_end.emplace_back(empty_tuple_end);
|
517
|
+
}
|
518
|
+
tuple_start[file_number_p].insert(positions.beginning_of_first_line);
|
519
|
+
tuple_end[file_number_p].push_back(positions.end_of_last_line);
|
472
520
|
}
|
473
521
|
}
|
474
522
|
|
@@ -483,11 +531,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext
|
|
483
531
|
|
484
532
|
bind_data.options.file_path = bind_data.files[0];
|
485
533
|
file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
context.db->NumberOfThreads(), bind_data.options.buffer_size, rows_to_skip,
|
490
|
-
ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
|
534
|
+
return make_uniq<ParallelCSVGlobalState>(
|
535
|
+
context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
|
536
|
+
bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
|
491
537
|
}
|
492
538
|
|
493
539
|
//===--------------------------------------------------------------------===//
|
@@ -534,11 +580,10 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
|
|
534
580
|
}
|
535
581
|
if (csv_local_state.csv_reader->finished) {
|
536
582
|
auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
|
537
|
-
if (
|
538
|
-
|
539
|
-
|
583
|
+
if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
|
584
|
+
csv_global_state.UpdateVerification(verification_updates,
|
585
|
+
csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
|
540
586
|
}
|
541
|
-
csv_global_state.UpdateVerification(verification_updates);
|
542
587
|
auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
|
543
588
|
if (!has_next) {
|
544
589
|
csv_global_state.DecrementThread();
|
@@ -642,14 +687,17 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
|
|
642
687
|
TableFunctionInitInput &input) {
|
643
688
|
auto &bind_data = (ReadCSVData &)*input.bind_data;
|
644
689
|
auto result = make_uniq<SingleThreadedCSVState>(bind_data.files.size());
|
645
|
-
if (bind_data.
|
646
|
-
result->initial_reader = std::move(bind_data.initial_reader);
|
647
|
-
} else if (bind_data.files.empty()) {
|
690
|
+
if (bind_data.files.empty()) {
|
648
691
|
// This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
|
649
692
|
return std::move(result);
|
650
693
|
} else {
|
651
694
|
bind_data.options.file_path = bind_data.files[0];
|
652
|
-
|
695
|
+
if (bind_data.initial_reader && !bind_data.file_exists) {
|
696
|
+
// If this is not an on disk file we gotta reuse the reader.
|
697
|
+
result->initial_reader = std::move(bind_data.initial_reader);
|
698
|
+
} else {
|
699
|
+
result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
|
700
|
+
}
|
653
701
|
if (!bind_data.options.file_options.union_by_name) {
|
654
702
|
result->initial_reader->names = bind_data.csv_names;
|
655
703
|
}
|
@@ -741,6 +789,14 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
|
|
741
789
|
//===--------------------------------------------------------------------===//
|
742
790
|
static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
|
743
791
|
auto &bind_data = (ReadCSVData &)*input.bind_data;
|
792
|
+
auto &fs = FileSystem::GetFileSystem(context);
|
793
|
+
for (auto &file : bind_data.files) {
|
794
|
+
if (!fs.FileExists(file)) {
|
795
|
+
bind_data.file_exists = false;
|
796
|
+
break;
|
797
|
+
}
|
798
|
+
}
|
799
|
+
bind_data.single_threaded = bind_data.single_threaded || !bind_data.file_exists;
|
744
800
|
if (bind_data.single_threaded) {
|
745
801
|
return SingleThreadedCSVInit(context, input);
|
746
802
|
} else {
|
@@ -863,6 +919,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
|
|
863
919
|
writer.WriteField<idx_t>(buffer_sample_size);
|
864
920
|
writer.WriteString(null_str);
|
865
921
|
writer.WriteField<FileCompressionType>(compression);
|
922
|
+
writer.WriteField<NewLineIdentifier>(new_line);
|
866
923
|
// read options
|
867
924
|
writer.WriteField<idx_t>(skip_rows);
|
868
925
|
writer.WriteField<bool>(skip_rows_set);
|
@@ -896,6 +953,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
|
|
896
953
|
buffer_sample_size = reader.ReadRequired<idx_t>();
|
897
954
|
null_str = reader.ReadRequired<string>();
|
898
955
|
compression = reader.ReadRequired<FileCompressionType>();
|
956
|
+
new_line = reader.ReadRequired<NewLineIdentifier>();
|
899
957
|
// read options
|
900
958
|
skip_rows = reader.ReadRequired<idx_t>();
|
901
959
|
skip_rows_set = reader.ReadRequired<bool>();
|
@@ -926,6 +984,10 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
|
|
926
984
|
bind_data.options.Serialize(writer);
|
927
985
|
writer.WriteField<bool>(bind_data.single_threaded);
|
928
986
|
writer.WriteSerializable(bind_data.reader_bind);
|
987
|
+
writer.WriteField<uint32_t>(bind_data.column_info.size());
|
988
|
+
for (auto &col : bind_data.column_info) {
|
989
|
+
col.Serialize(writer);
|
990
|
+
}
|
929
991
|
}
|
930
992
|
|
931
993
|
static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,
|
@@ -941,6 +1003,10 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
|
|
941
1003
|
result_data->options.Deserialize(reader);
|
942
1004
|
result_data->single_threaded = reader.ReadField<bool>(true);
|
943
1005
|
result_data->reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
|
1006
|
+
uint32_t file_number = reader.ReadRequired<uint32_t>();
|
1007
|
+
for (idx_t i = 0; i < file_number; i++) {
|
1008
|
+
result_data->column_info.emplace_back(ColumnInfo::Deserialize(reader));
|
1009
|
+
}
|
944
1010
|
return std::move(result_data);
|
945
1011
|
}
|
946
1012
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.7.2-
|
2
|
+
#define DUCKDB_VERSION "0.7.2-dev2233"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "c81600ed51"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -27,7 +27,7 @@ public:
|
|
27
27
|
//! Create an IndexCatalogEntry and initialize storage for it
|
28
28
|
IndexCatalogEntry(Catalog *catalog, SchemaCatalogEntry *schema, CreateIndexInfo *info);
|
29
29
|
|
30
|
-
Index
|
30
|
+
optional_ptr<Index> index;
|
31
31
|
string sql;
|
32
32
|
vector<unique_ptr<ParsedExpression>> expressions;
|
33
33
|
vector<unique_ptr<ParsedExpression>> parsed_expressions;
|
@@ -23,7 +23,7 @@ public:
|
|
23
23
|
DUCKDB_API ~ArrowAppender();
|
24
24
|
|
25
25
|
//! Append a data chunk to the underlying arrow array
|
26
|
-
DUCKDB_API void Append(DataChunk &input);
|
26
|
+
DUCKDB_API void Append(DataChunk &input, idx_t from, idx_t to, idx_t input_size);
|
27
27
|
//! Returns the underlying arrow array
|
28
28
|
DUCKDB_API ArrowArray Finalize();
|
29
29
|
|
@@ -121,6 +121,9 @@ public:
|
|
121
121
|
DUCKDB_API static bool UncaughtException();
|
122
122
|
|
123
123
|
DUCKDB_API static string GetStackTrace(int max_depth = 120);
|
124
|
+
DUCKDB_API static string FormatStackTrace(string message = "") {
|
125
|
+
return (message + "\n" + GetStackTrace());
|
126
|
+
}
|
124
127
|
|
125
128
|
private:
|
126
129
|
string exception_message_;
|