duckdb 0.7.2-dev1901.0 → 0.7.2-dev2233.0

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (198)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
  4. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +1 -1
  5. package/src/duckdb/extension/parquet/parquet_metadata.cpp +4 -2
  6. package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +1 -1
  7. package/src/duckdb/src/common/arrow/arrow_appender.cpp +69 -44
  8. package/src/duckdb/src/common/arrow/arrow_converter.cpp +1 -1
  9. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +20 -2
  10. package/src/duckdb/src/common/box_renderer.cpp +4 -2
  11. package/src/duckdb/src/common/constants.cpp +10 -1
  12. package/src/duckdb/src/common/filename_pattern.cpp +41 -0
  13. package/src/duckdb/src/common/hive_partitioning.cpp +144 -15
  14. package/src/duckdb/src/common/radix_partitioning.cpp +101 -369
  15. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +8 -9
  16. package/src/duckdb/src/common/row_operations/row_external.cpp +1 -1
  17. package/src/duckdb/src/common/row_operations/row_gather.cpp +5 -3
  18. package/src/duckdb/src/common/row_operations/row_match.cpp +117 -22
  19. package/src/duckdb/src/common/row_operations/row_scatter.cpp +2 -2
  20. package/src/duckdb/src/common/sort/partition_state.cpp +1 -1
  21. package/src/duckdb/src/common/sort/sort_state.cpp +2 -1
  22. package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
  23. package/src/duckdb/src/common/types/{column_data_allocator.cpp → column/column_data_allocator.cpp} +2 -2
  24. package/src/duckdb/src/common/types/{column_data_collection.cpp → column/column_data_collection.cpp} +29 -6
  25. package/src/duckdb/src/common/types/{column_data_collection_segment.cpp → column/column_data_collection_segment.cpp} +2 -1
  26. package/src/duckdb/src/common/types/{column_data_consumer.cpp → column/column_data_consumer.cpp} +1 -1
  27. package/src/duckdb/src/common/types/{partitioned_column_data.cpp → column/partitioned_column_data.cpp} +11 -9
  28. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +316 -0
  29. package/src/duckdb/src/common/types/{row_data_collection.cpp → row/row_data_collection.cpp} +1 -1
  30. package/src/duckdb/src/common/types/{row_data_collection_scanner.cpp → row/row_data_collection_scanner.cpp} +2 -2
  31. package/src/duckdb/src/common/types/{row_layout.cpp → row/row_layout.cpp} +1 -1
  32. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +465 -0
  33. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +511 -0
  34. package/src/duckdb/src/common/types/row/tuple_data_iterator.cpp +96 -0
  35. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +119 -0
  36. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +1200 -0
  37. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +170 -0
  38. package/src/duckdb/src/common/types/vector.cpp +1 -1
  39. package/src/duckdb/src/execution/aggregate_hashtable.cpp +252 -290
  40. package/src/duckdb/src/execution/join_hashtable.cpp +192 -328
  41. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +4 -4
  42. package/src/duckdb/src/execution/operator/helper/physical_execute.cpp +3 -3
  43. package/src/duckdb/src/execution/operator/helper/physical_limit_percent.cpp +2 -3
  44. package/src/duckdb/src/execution/operator/helper/physical_result_collector.cpp +2 -3
  45. package/src/duckdb/src/execution/operator/join/perfect_hash_join_executor.cpp +36 -21
  46. package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +2 -2
  47. package/src/duckdb/src/execution/operator/join/physical_cross_product.cpp +1 -1
  48. package/src/duckdb/src/execution/operator/join/physical_delim_join.cpp +2 -2
  49. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +166 -144
  50. package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +5 -5
  51. package/src/duckdb/src/execution/operator/join/physical_join.cpp +2 -10
  52. package/src/duckdb/src/execution/operator/join/physical_positional_join.cpp +0 -1
  53. package/src/duckdb/src/execution/operator/order/physical_top_n.cpp +2 -2
  54. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
  55. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
  56. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
  57. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
  58. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
  59. package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +13 -11
  60. package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +3 -2
  61. package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +25 -24
  62. package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -1
  63. package/src/duckdb/src/execution/operator/persistent/physical_update.cpp +4 -3
  64. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  65. package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +1 -1
  66. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +3 -3
  67. package/src/duckdb/src/execution/partitionable_hashtable.cpp +9 -37
  68. package/src/duckdb/src/execution/physical_operator.cpp +1 -1
  69. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +19 -18
  70. package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +2 -1
  71. package/src/duckdb/src/execution/physical_plan/plan_execute.cpp +2 -2
  72. package/src/duckdb/src/execution/physical_plan/plan_explain.cpp +5 -6
  73. package/src/duckdb/src/execution/physical_plan/plan_expression_get.cpp +2 -2
  74. package/src/duckdb/src/execution/physical_plan/plan_recursive_cte.cpp +3 -3
  75. package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -1
  76. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +39 -17
  77. package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +2 -2
  78. package/src/duckdb/src/function/table/pragma_detailed_profiling_output.cpp +5 -5
  79. package/src/duckdb/src/function/table/pragma_last_profiling_output.cpp +2 -2
  80. package/src/duckdb/src/function/table/read_csv.cpp +124 -58
  81. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  82. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
  83. package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +1 -1
  84. package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
  85. package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -0
  86. package/src/duckdb/src/include/duckdb/common/fast_mem.hpp +528 -0
  87. package/src/duckdb/src/include/duckdb/common/filename_pattern.hpp +34 -0
  88. package/src/duckdb/src/include/duckdb/common/helper.hpp +10 -0
  89. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +13 -3
  90. package/src/duckdb/src/include/duckdb/common/optional_ptr.hpp +8 -0
  91. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +34 -0
  92. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +80 -27
  93. package/src/duckdb/src/include/duckdb/common/reference_map.hpp +38 -0
  94. package/src/duckdb/src/include/duckdb/common/row_operations/row_operations.hpp +7 -6
  95. package/src/duckdb/src/include/duckdb/common/sort/comparators.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +1 -1
  97. package/src/duckdb/src/include/duckdb/common/sort/sort.hpp +1 -1
  98. package/src/duckdb/src/include/duckdb/common/sort/sorted_block.hpp +2 -2
  99. package/src/duckdb/src/include/duckdb/common/types/batched_data_collection.hpp +1 -1
  100. package/src/duckdb/src/include/duckdb/common/types/{column_data_allocator.hpp → column/column_data_allocator.hpp} +4 -4
  101. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection.hpp → column/column_data_collection.hpp} +4 -4
  102. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_iterators.hpp → column/column_data_collection_iterators.hpp} +2 -2
  103. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_segment.hpp → column/column_data_collection_segment.hpp} +3 -3
  104. package/src/duckdb/src/include/duckdb/common/types/{column_data_consumer.hpp → column/column_data_consumer.hpp} +8 -4
  105. package/src/duckdb/src/include/duckdb/common/types/{column_data_scan_states.hpp → column/column_data_scan_states.hpp} +1 -1
  106. package/src/duckdb/src/include/duckdb/common/types/{partitioned_column_data.hpp → column/partitioned_column_data.hpp} +15 -7
  107. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +140 -0
  108. package/src/duckdb/src/include/duckdb/common/types/{row_data_collection.hpp → row/row_data_collection.hpp} +1 -1
  109. package/src/duckdb/src/include/duckdb/common/types/{row_data_collection_scanner.hpp → row/row_data_collection_scanner.hpp} +2 -2
  110. package/src/duckdb/src/include/duckdb/common/types/{row_layout.hpp → row/row_layout.hpp} +3 -1
  111. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +116 -0
  112. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +239 -0
  113. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_iterator.hpp +64 -0
  114. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +113 -0
  115. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +124 -0
  116. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +74 -0
  117. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/common/types/value.hpp +4 -12
  119. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -31
  120. package/src/duckdb/src/include/duckdb/execution/base_aggregate_hashtable.hpp +2 -2
  121. package/src/duckdb/src/include/duckdb/execution/execution_context.hpp +3 -2
  122. package/src/duckdb/src/include/duckdb/execution/expression_executor.hpp +1 -1
  123. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +41 -67
  124. package/src/duckdb/src/include/duckdb/execution/nested_loop_join.hpp +1 -1
  125. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_execute.hpp +2 -2
  126. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_result_collector.hpp +1 -1
  127. package/src/duckdb/src/include/duckdb/execution/operator/join/outer_join_marker.hpp +2 -2
  128. package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +1 -1
  129. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_cross_product.hpp +1 -1
  130. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_hash_join.hpp +0 -2
  131. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_index_join.hpp +2 -2
  132. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_positional_join.hpp +1 -1
  133. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
  134. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
  135. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
  136. package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
  137. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +4 -1
  138. package/src/duckdb/src/include/duckdb/execution/operator/scan/physical_column_data_scan.hpp +1 -1
  139. package/src/duckdb/src/include/duckdb/execution/operator/set/physical_recursive_cte.hpp +1 -1
  140. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -2
  141. package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
  142. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
  143. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
  144. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
  145. package/src/duckdb/src/include/duckdb/main/materialized_query_result.hpp +1 -1
  146. package/src/duckdb/src/include/duckdb/main/query_result.hpp +14 -1
  147. package/src/duckdb/src/include/duckdb/optimizer/expression_rewriter.hpp +3 -3
  148. package/src/duckdb/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +16 -16
  149. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_node.hpp +8 -8
  150. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +23 -15
  151. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_relation.hpp +9 -10
  152. package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph.hpp +18 -11
  153. package/src/duckdb/src/include/duckdb/parallel/meta_pipeline.hpp +1 -1
  154. package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +5 -1
  155. package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +3 -2
  156. package/src/duckdb/src/include/duckdb/parser/query_error_context.hpp +4 -2
  157. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +9 -35
  158. package/src/duckdb/src/include/duckdb/planner/binder.hpp +24 -23
  159. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +3 -3
  160. package/src/duckdb/src/include/duckdb/planner/operator/logical_column_data_get.hpp +1 -1
  161. package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +3 -1
  162. package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +1 -1
  163. package/src/duckdb/src/main/appender.cpp +6 -6
  164. package/src/duckdb/src/main/client_context.cpp +1 -1
  165. package/src/duckdb/src/main/connection.cpp +2 -2
  166. package/src/duckdb/src/main/query_result.cpp +13 -0
  167. package/src/duckdb/src/main/settings/settings.cpp +3 -4
  168. package/src/duckdb/src/optimizer/expression_rewriter.cpp +4 -4
  169. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +91 -105
  170. package/src/duckdb/src/optimizer/join_order/join_node.cpp +5 -8
  171. package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +163 -160
  172. package/src/duckdb/src/optimizer/join_order/join_relation_set.cpp +30 -30
  173. package/src/duckdb/src/optimizer/join_order/query_graph.cpp +37 -38
  174. package/src/duckdb/src/parallel/executor.cpp +1 -1
  175. package/src/duckdb/src/parallel/meta_pipeline.cpp +2 -2
  176. package/src/duckdb/src/parser/transform/helpers/transform_cte.cpp +1 -1
  177. package/src/duckdb/src/parser/transform/tableref/transform_subquery.cpp +1 -1
  178. package/src/duckdb/src/parser/transformer.cpp +50 -9
  179. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
  180. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +15 -5
  181. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +19 -17
  182. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -4
  183. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +20 -21
  184. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +24 -22
  185. package/src/duckdb/src/planner/binder/tableref/bind_subqueryref.cpp +2 -2
  186. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
  187. package/src/duckdb/src/planner/binder.cpp +16 -19
  188. package/src/duckdb/src/planner/expression_binder.cpp +8 -8
  189. package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +3 -3
  190. package/src/duckdb/src/storage/checkpoint_manager.cpp +23 -23
  191. package/src/duckdb/src/storage/standard_buffer_manager.cpp +1 -1
  192. package/src/duckdb/src/storage/table_index_list.cpp +3 -3
  193. package/src/duckdb/src/verification/statement_verifier.cpp +1 -1
  194. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5552 -5598
  195. package/src/duckdb/ub_src_common.cpp +2 -0
  196. package/src/duckdb/ub_src_common_types.cpp +0 -16
  197. package/src/duckdb/ub_src_common_types_column.cpp +10 -0
  198. package/src/duckdb/ub_src_common_types_row.cpp +20 -0
@@ -12,6 +12,7 @@
  #include "duckdb/planner/operator/logical_get.hpp"
  #include "duckdb/main/extension_helper.hpp"
  #include "duckdb/common/multi_file_reader.hpp"
+ #include "duckdb/main/client_data.hpp"

  #include <limits>

@@ -23,21 +24,22 @@ unique_ptr<CSVFileHandle> ReadCSV::OpenCSV(const string &file_path, FileCompress
  auto opener = FileSystem::GetFileOpener(context);
  auto file_handle =
  fs.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression, opener);
+ if (file_handle->CanSeek()) {
+ file_handle->Reset();
+ }
  return make_uniq<CSVFileHandle>(std::move(file_handle));
  }

  void ReadCSVData::FinalizeRead(ClientContext &context) {
  BaseCSVData::Finalize();
- auto &config = DBConfig::GetConfig(context);
- single_threaded = !config.options.experimental_parallel_csv_reader;
- if (options.has_parallel) {
- // Override the option set in the config
- single_threaded = !options.use_parallel;
- }
+ // Here we identify if we can run this CSV file on parallel or not.
  bool null_or_empty = options.delimiter.empty() || options.escape.empty() || options.quote.empty() ||
  options.delimiter[0] == '\0' || options.escape[0] == '\0' || options.quote[0] == '\0';
  bool complex_options = options.delimiter.size() > 1 || options.escape.size() > 1 || options.quote.size() > 1;
- if (null_or_empty || complex_options || options.new_line == NewLineIdentifier::MIX) {
+ bool not_supported_options = options.null_padding;
+
+ if (!options.run_parallel || null_or_empty || not_supported_options || complex_options ||
+ options.new_line == NewLineIdentifier::MIX) {
  // not supported for parallel CSV reading
  single_threaded = true;
  }
@@ -175,6 +177,8 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
  options.all_varchar = BooleanValue::Get(kv.second);
  } else if (loption == "normalize_names") {
  options.normalize_names = BooleanValue::Get(kv.second);
+ } else if (loption == "parallel") {
+ options.run_parallel = BooleanValue::Get(kv.second);
  } else {
  options.SetReadOption(loption, kv.second, names);
  }
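The hunk above wires a new "parallel" named parameter into the CSV reader bind, mapping it onto options.run_parallel. As a minimal usage sketch (not part of this diff, and assuming the parameter is registered for read_csv/read_csv_auto), it could be toggled through DuckDB's C++ client API roughly like this:

    // Illustrative sketch only; the file name and query are hypothetical.
    #include "duckdb.hpp"

    int main() {
        duckdb::DuckDB db(nullptr); // in-memory database
        duckdb::Connection con(db);
        // Force the single-threaded code path for a file the parallel reader
        // rejects; omitting the option keeps the default behaviour.
        auto result = con.Query("SELECT * FROM read_csv_auto('data.csv', parallel=false)");
        result->Print();
        return 0;
    }
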
@@ -214,6 +218,13 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
  if (options.file_options.union_by_name) {
  result->reader_bind =
  MultiFileReader::BindUnionReader<BufferedCSVReader>(context, return_types, names, *result, options);
+ if (result->union_readers.size() > 1) {
+ result->column_info.emplace_back(result->csv_names, result->csv_types);
+ for (idx_t i = 1; i < result->union_readers.size(); i++) {
+ result->column_info.emplace_back(result->union_readers[i]->names,
+ result->union_readers[i]->return_types);
+ }
+ }
  if (!options.sql_types_per_column.empty()) {
  auto exception = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
  if (!exception.empty()) {
@@ -253,17 +264,27 @@ public:
  file_size = file_handle->FileSize();
  first_file_size = file_size;
  bytes_read = 0;
- if (buffer_size < file_size) {
+ if (buffer_size < file_size || file_size == 0) {
  bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
  } else {
  bytes_per_local_state = file_size / MaxThreads();
  }
- current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
- next_buffer =
- shared_ptr<CSVBuffer>(current_buffer->Next(*file_handle, buffer_size, current_csv_position).release());
+ if (bytes_per_local_state == 0) {
+ // In practice, I think this won't happen, it only happens because we are mocking up test scenarios
+ // this boy needs to be at least one.
+ bytes_per_local_state = 1;
+ }
+ for (idx_t i = 0; i < rows_to_skip; i++) {
+ file_handle->ReadLine();
+ }
+ first_position = current_csv_position;
+ current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+ next_buffer = shared_ptr<CSVBuffer>(
+ current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
  running_threads = MaxThreads();
  }
  ParallelCSVGlobalState() {
+ running_threads = MaxThreads();
  }

  ~ParallelCSVGlobalState() override {
@@ -281,7 +302,7 @@ public:
  //! Verify if the CSV File was read correctly
  void Verify();

- void UpdateVerification(VerificationPositions positions);
+ void UpdateVerification(VerificationPositions positions, idx_t file_number);
  void IncrementThread();


@@ -332,14 +353,18 @@ private:
  //! Current batch index
  idx_t batch_index = 0;
  //! Forces parallelism for small CSV Files, should only be used for testing.
- bool force_parallelism;
+ bool force_parallelism = false;
  //! Current (Global) position of CSV
  idx_t current_csv_position = 0;
+ //! First Position of First Buffer
+ idx_t first_position = 0;
+ //! Current File Number
+ idx_t file_number = 0;
  idx_t max_tuple_end = 0;
  //! the vector stores positions where threads ended the last line they read in the CSV File, and the set stores
  //! positions where they started reading the first line.
- vector<idx_t> tuple_end;
- set<idx_t> tuple_start;
+ vector<vector<idx_t>> tuple_end;
+ vector<set<idx_t>> tuple_start;
  idx_t running_threads = 0;
  //! The column ids to read
  vector<column_t> column_ids;
@@ -349,10 +374,9 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
  if (force_parallelism) {
  return system_threads;
  }
-
  idx_t one_mb = 1000000; // We initialize max one thread per Mb
  idx_t threads_per_mb = first_file_size / one_mb + 1;
- if (threads_per_mb < system_threads) {
+ if (threads_per_mb < system_threads || threads_per_mb == 1) {
  return threads_per_mb;
  }

@@ -378,25 +402,36 @@ bool ParallelCSVGlobalState::Finished() {
  void ParallelCSVGlobalState::Verify() {
  // All threads are done, we run some magic sweet verification code
  if (running_threads == 0) {
- for (auto &last_pos : tuple_end) {
- auto first_pos = tuple_start.find(last_pos);
- if (first_pos == tuple_start.end()) {
- // this might be necessary due to carriage returns outside buffer scopes.
- first_pos = tuple_start.find(last_pos + 1);
+ D_ASSERT(tuple_end.size() == tuple_start.size());
+ for (idx_t i = 0; i < tuple_start.size(); i++) {
+ auto &current_tuple_end = tuple_end[i];
+ auto &current_tuple_start = tuple_start[i];
+ // figure out max value of last_pos
+ if (current_tuple_end.empty()) {
+ return;
  }
- if (first_pos == tuple_start.end() && last_pos != NumericLimits<uint64_t>::Maximum()) {
- string error = "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
- " does not have a match\n";
- error += "End Lines: \n";
- for (auto &end_line : tuple_end) {
- error += to_string(end_line) + "\n";
+ auto max_value = *max_element(std::begin(current_tuple_end), std::end(current_tuple_end));
+ for (auto &last_pos : current_tuple_end) {
+ auto first_pos = current_tuple_start.find(last_pos);
+ if (first_pos == current_tuple_start.end()) {
+ // this might be necessary due to carriage returns outside buffer scopes.
+ first_pos = current_tuple_start.find(last_pos + 1);
  }
- error += "Start Lines: \n";
- for (auto &start_line : tuple_start) {
- error += to_string(start_line) + "\n";
+ if (first_pos == current_tuple_start.end() && last_pos != max_value) {
+ string error =
+ "Not possible to read this CSV File with multithreading. Tuple: " + to_string(last_pos) +
+ " does not have a match\n";
+ error += "End Lines: \n";
+ for (auto &end_line : current_tuple_end) {
+ error += to_string(end_line) + "\n";
+ }
+ error += "Start Lines: \n";
+ for (auto &start_line : current_tuple_start) {
+ error += to_string(start_line) + "\n";
+ }
+ throw InvalidInputException(
+ "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
  }
- throw InvalidInputException(
- "CSV File not supported for multithreading. Please run single-threaded CSV Reading");
  }
  }
  }
@@ -411,9 +446,11 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
  current_file_path = bind_data.files[file_index++];
  file_handle = ReadCSV::OpenCSV(current_file_path, bind_data.options.compression, context);
  current_csv_position = 0;
- current_buffer = make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position);
- next_buffer =
- shared_ptr<CSVBuffer>(current_buffer->Next(*file_handle, buffer_size, current_csv_position).release());
+ file_number++;
+ current_buffer =
+ make_shared<CSVBuffer>(context, buffer_size, *file_handle, current_csv_position, file_number);
+ next_buffer = shared_ptr<CSVBuffer>(
+ current_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
  } else {
  // We are done scanning.
  reader.reset();
@@ -433,8 +470,8 @@ public:
  current_buffer = next_buffer;
  if (next_buffer) {
  // Next buffer gets the next-next buffer
- next_buffer =
- shared_ptr<CSVBuffer>(next_buffer->Next(*file_handle, buffer_size, current_csv_position).release());
+ next_buffer = shared_ptr<CSVBuffer>(
+ next_buffer->Next(*file_handle, buffer_size, current_csv_position, file_number).release());
  }
  }
  if (!reader || reader->options.file_path != current_file_path) {
@@ -443,13 +480,18 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
  if (file_index > 0 && file_index <= bind_data.union_readers.size() && bind_data.union_readers[file_index - 1]) {
  // we are doing UNION BY NAME - fetch the options from the union reader for this file
  auto &union_reader = *bind_data.union_readers[file_index - 1];
- reader =
- make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), union_reader.GetTypes());
+ reader = make_uniq<ParallelCSVReader>(context, union_reader.options, std::move(result), first_position,
+ union_reader.GetTypes());
  reader->names = union_reader.GetNames();
+ } else if (file_index <= bind_data.column_info.size()) {
+ // Serialized Union By name
+ reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+ bind_data.column_info[file_index - 1].types);
+ reader->names = bind_data.column_info[file_index - 1].names;
  } else {
  // regular file - use the standard options
- reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), bind_data.csv_types);
- reader->options.file_path = current_file_path;
+ reader = make_uniq<ParallelCSVReader>(context, bind_data.options, std::move(result), first_position,
+ bind_data.csv_types);
  reader->names = bind_data.csv_names;
  }
  reader->options.file_path = current_file_path;
@@ -461,14 +503,20 @@
  }
  return true;
  }
- void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions) {
+ void ParallelCSVGlobalState::UpdateVerification(VerificationPositions positions, idx_t file_number_p) {
  lock_guard<mutex> parallel_lock(main_mutex);
  if (positions.beginning_of_first_line < positions.end_of_last_line) {
  if (positions.end_of_last_line > max_tuple_end) {
  max_tuple_end = positions.end_of_last_line;
  }
- tuple_start.insert(positions.beginning_of_first_line);
- tuple_end.push_back(positions.end_of_last_line);
+ while (file_number_p >= tuple_start.size()) {
+ vector<idx_t> empty_tuple_end;
+ set<idx_t> empty_set;
+ tuple_start.emplace_back(empty_set);
+ tuple_end.emplace_back(empty_tuple_end);
+ }
+ tuple_start[file_number_p].insert(positions.beginning_of_first_line);
+ tuple_end[file_number_p].push_back(positions.end_of_last_line);
  }
  }

@@ -483,11 +531,9 @@ static unique_ptr<GlobalTableFunctionState> ParallelCSVInitGlobal(ClientContext

  bind_data.options.file_path = bind_data.files[0];
  file_handle = ReadCSV::OpenCSV(bind_data.options.file_path, bind_data.options.compression, context);
- idx_t rows_to_skip =
- bind_data.options.skip_rows + (bind_data.options.has_header && bind_data.options.header ? 1 : 0);
- return make_uniq<ParallelCSVGlobalState>(context, std::move(file_handle), bind_data.files,
- context.db->NumberOfThreads(), bind_data.options.buffer_size, rows_to_skip,
- ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
+ return make_uniq<ParallelCSVGlobalState>(
+ context, std::move(file_handle), bind_data.files, context.db->NumberOfThreads(), bind_data.options.buffer_size,
+ bind_data.options.skip_rows, ClientConfig::GetConfig(context).verify_parallelism, input.column_ids);
  }

  //===--------------------------------------------------------------------===//
@@ -534,11 +580,10 @@ static void ParallelReadCSVFunction(ClientContext &context, TableFunctionInput &
  }
  if (csv_local_state.csv_reader->finished) {
  auto verification_updates = csv_local_state.csv_reader->GetVerificationPositions();
- if (!csv_local_state.csv_reader->buffer->next_buffer) {
- // if it's the last line of the file we mark as the maximum
- verification_updates.end_of_last_line = NumericLimits<uint64_t>::Maximum();
+ if (verification_updates.beginning_of_first_line != verification_updates.end_of_last_line) {
+ csv_global_state.UpdateVerification(verification_updates,
+ csv_local_state.csv_reader->buffer->buffer->GetFileNumber());
  }
- csv_global_state.UpdateVerification(verification_updates);
  auto has_next = csv_global_state.Next(context, bind_data, csv_local_state.csv_reader);
  if (!has_next) {
  csv_global_state.DecrementThread();
@@ -642,14 +687,17 @@ static unique_ptr<GlobalTableFunctionState> SingleThreadedCSVInit(ClientContext
  TableFunctionInitInput &input) {
  auto &bind_data = (ReadCSVData &)*input.bind_data;
  auto result = make_uniq<SingleThreadedCSVState>(bind_data.files.size());
- if (bind_data.initial_reader) {
- result->initial_reader = std::move(bind_data.initial_reader);
- } else if (bind_data.files.empty()) {
+ if (bind_data.files.empty()) {
  // This can happen when a filename based filter pushdown has eliminated all possible files for this scan.
  return std::move(result);
  } else {
  bind_data.options.file_path = bind_data.files[0];
- result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
+ if (bind_data.initial_reader && !bind_data.file_exists) {
+ // If this is not an on disk file we gotta reuse the reader.
+ result->initial_reader = std::move(bind_data.initial_reader);
+ } else {
+ result->initial_reader = make_uniq<BufferedCSVReader>(context, bind_data.options, bind_data.csv_types);
+ }
  if (!bind_data.options.file_options.union_by_name) {
  result->initial_reader->names = bind_data.csv_names;
  }
@@ -741,6 +789,14 @@ static void SingleThreadedCSVFunction(ClientContext &context, TableFunctionInput
  //===--------------------------------------------------------------------===//
  static unique_ptr<GlobalTableFunctionState> ReadCSVInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
  auto &bind_data = (ReadCSVData &)*input.bind_data;
+ auto &fs = FileSystem::GetFileSystem(context);
+ for (auto &file : bind_data.files) {
+ if (!fs.FileExists(file)) {
+ bind_data.file_exists = false;
+ break;
+ }
+ }
+ bind_data.single_threaded = bind_data.single_threaded || !bind_data.file_exists;
  if (bind_data.single_threaded) {
  return SingleThreadedCSVInit(context, input);
  } else {
@@ -863,6 +919,7 @@ void BufferedCSVReaderOptions::Serialize(FieldWriter &writer) const {
  writer.WriteField<idx_t>(buffer_sample_size);
  writer.WriteString(null_str);
  writer.WriteField<FileCompressionType>(compression);
+ writer.WriteField<NewLineIdentifier>(new_line);
  // read options
  writer.WriteField<idx_t>(skip_rows);
  writer.WriteField<bool>(skip_rows_set);
@@ -896,6 +953,7 @@ void BufferedCSVReaderOptions::Deserialize(FieldReader &reader) {
  buffer_sample_size = reader.ReadRequired<idx_t>();
  null_str = reader.ReadRequired<string>();
  compression = reader.ReadRequired<FileCompressionType>();
+ new_line = reader.ReadRequired<NewLineIdentifier>();
  // read options
  skip_rows = reader.ReadRequired<idx_t>();
  skip_rows_set = reader.ReadRequired<bool>();
@@ -926,6 +984,10 @@ static void CSVReaderSerialize(FieldWriter &writer, const FunctionData *bind_dat
  bind_data.options.Serialize(writer);
  writer.WriteField<bool>(bind_data.single_threaded);
  writer.WriteSerializable(bind_data.reader_bind);
+ writer.WriteField<uint32_t>(bind_data.column_info.size());
+ for (auto &col : bind_data.column_info) {
+ col.Serialize(writer);
+ }
  }

  static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, FieldReader &reader,
@@ -941,6 +1003,10 @@ static unique_ptr<FunctionData> CSVReaderDeserialize(ClientContext &context, Fie
  result_data->options.Deserialize(reader);
  result_data->single_threaded = reader.ReadField<bool>(true);
  result_data->reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
+ uint32_t file_number = reader.ReadRequired<uint32_t>();
+ for (idx_t i = 0; i < file_number; i++) {
+ result_data->column_info.emplace_back(ColumnInfo::Deserialize(reader));
+ }
  return std::move(result_data);
  }

@@ -1,8 +1,8 @@
  #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.7.2-dev1901"
+ #define DUCKDB_VERSION "0.7.2-dev2233"
  #endif
  #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "5aa369b4b1"
+ #define DUCKDB_SOURCE_ID "c81600ed51"
  #endif
  #include "duckdb/function/table/system_functions.hpp"
  #include "duckdb/main/database.hpp"
@@ -27,7 +27,7 @@ public:
  //! Create an IndexCatalogEntry and initialize storage for it
  IndexCatalogEntry(Catalog *catalog, SchemaCatalogEntry *schema, CreateIndexInfo *info);

- Index *index;
+ optional_ptr<Index> index;
  string sql;
  vector<unique_ptr<ParsedExpression>> expressions;
  vector<unique_ptr<ParsedExpression>> parsed_expressions;
@@ -23,7 +23,7 @@ public:
  DUCKDB_API ~ArrowAppender();

  //! Append a data chunk to the underlying arrow array
- DUCKDB_API void Append(DataChunk &input);
+ DUCKDB_API void Append(DataChunk &input, idx_t from, idx_t to, idx_t input_size);
  //! Returns the underlying arrow array
  DUCKDB_API ArrowArray Finalize();

@@ -110,6 +110,8 @@ struct PhysicalIndex {
  }
  };

+ DUCKDB_API bool IsPowerOfTwo(uint64_t v);
  DUCKDB_API uint64_t NextPowerOfTwo(uint64_t v);
+ DUCKDB_API uint64_t PreviousPowerOfTwo(uint64_t v);

  } // namespace duckdb
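This hunk declares IsPowerOfTwo and PreviousPowerOfTwo alongside the existing NextPowerOfTwo; the file list above shows matching definitions landing in src/common/constants.cpp. A minimal sketch of the usual bit-twiddling implementations, offered as an assumption rather than DuckDB's exact code:

    // Sketch of common implementations; DuckDB's actual definitions may differ.
    #include <cstdint>

    bool IsPowerOfTwo(uint64_t v) {
        // A power of two has exactly one bit set; v & (v - 1) clears that bit.
        return v != 0 && (v & (v - 1)) == 0;
    }

    uint64_t PreviousPowerOfTwo(uint64_t v) {
        // Smear the highest set bit downwards, then keep only the top bit,
        // yielding the largest power of two <= v (0 maps to 0).
        v |= v >> 1;
        v |= v >> 2;
        v |= v >> 4;
        v |= v >> 8;
        v |= v >> 16;
        v |= v >> 32;
        return v - (v >> 1);
    }
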
@@ -121,6 +121,9 @@ public:
  DUCKDB_API static bool UncaughtException();

  DUCKDB_API static string GetStackTrace(int max_depth = 120);
+ DUCKDB_API static string FormatStackTrace(string message = "") {
+ return (message + "\n" + GetStackTrace());
+ }

  private:
  string exception_message_;
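The new FormatStackTrace helper shown above simply prepends a message to GetStackTrace(). A hypothetical call site, purely for illustration (the function below is not from the diff):

    // Hypothetical caller, sketch only.
    #include "duckdb/common/exception.hpp"

    #include <iostream>
    #include <string>

    static void ReportInternalError(const std::string &what) {
        // Per the hunk above, this prints "<message>\n<stack trace>".
        std::cerr << duckdb::Exception::FormatStackTrace(what) << std::endl;
    }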