duckdb 0.7.2-dev1901.0 → 0.7.2-dev2233.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/parquet/column_reader.cpp +3 -0
  4. package/src/duckdb/extension/parquet/include/parquet_writer.hpp +1 -1
  5. package/src/duckdb/extension/parquet/parquet_metadata.cpp +4 -2
  6. package/src/duckdb/src/catalog/catalog_entry/duck_index_entry.cpp +1 -1
  7. package/src/duckdb/src/common/arrow/arrow_appender.cpp +69 -44
  8. package/src/duckdb/src/common/arrow/arrow_converter.cpp +1 -1
  9. package/src/duckdb/src/common/arrow/arrow_wrapper.cpp +20 -2
  10. package/src/duckdb/src/common/box_renderer.cpp +4 -2
  11. package/src/duckdb/src/common/constants.cpp +10 -1
  12. package/src/duckdb/src/common/filename_pattern.cpp +41 -0
  13. package/src/duckdb/src/common/hive_partitioning.cpp +144 -15
  14. package/src/duckdb/src/common/radix_partitioning.cpp +101 -369
  15. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +8 -9
  16. package/src/duckdb/src/common/row_operations/row_external.cpp +1 -1
  17. package/src/duckdb/src/common/row_operations/row_gather.cpp +5 -3
  18. package/src/duckdb/src/common/row_operations/row_match.cpp +117 -22
  19. package/src/duckdb/src/common/row_operations/row_scatter.cpp +2 -2
  20. package/src/duckdb/src/common/sort/partition_state.cpp +1 -1
  21. package/src/duckdb/src/common/sort/sort_state.cpp +2 -1
  22. package/src/duckdb/src/common/sort/sorted_block.cpp +1 -1
  23. package/src/duckdb/src/common/types/{column_data_allocator.cpp → column/column_data_allocator.cpp} +2 -2
  24. package/src/duckdb/src/common/types/{column_data_collection.cpp → column/column_data_collection.cpp} +29 -6
  25. package/src/duckdb/src/common/types/{column_data_collection_segment.cpp → column/column_data_collection_segment.cpp} +2 -1
  26. package/src/duckdb/src/common/types/{column_data_consumer.cpp → column/column_data_consumer.cpp} +1 -1
  27. package/src/duckdb/src/common/types/{partitioned_column_data.cpp → column/partitioned_column_data.cpp} +11 -9
  28. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +316 -0
  29. package/src/duckdb/src/common/types/{row_data_collection.cpp → row/row_data_collection.cpp} +1 -1
  30. package/src/duckdb/src/common/types/{row_data_collection_scanner.cpp → row/row_data_collection_scanner.cpp} +2 -2
  31. package/src/duckdb/src/common/types/{row_layout.cpp → row/row_layout.cpp} +1 -1
  32. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +465 -0
  33. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +511 -0
  34. package/src/duckdb/src/common/types/row/tuple_data_iterator.cpp +96 -0
  35. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +119 -0
  36. package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +1200 -0
  37. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +170 -0
  38. package/src/duckdb/src/common/types/vector.cpp +1 -1
  39. package/src/duckdb/src/execution/aggregate_hashtable.cpp +252 -290
  40. package/src/duckdb/src/execution/join_hashtable.cpp +192 -328
  41. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +4 -4
  42. package/src/duckdb/src/execution/operator/helper/physical_execute.cpp +3 -3
  43. package/src/duckdb/src/execution/operator/helper/physical_limit_percent.cpp +2 -3
  44. package/src/duckdb/src/execution/operator/helper/physical_result_collector.cpp +2 -3
  45. package/src/duckdb/src/execution/operator/join/perfect_hash_join_executor.cpp +36 -21
  46. package/src/duckdb/src/execution/operator/join/physical_blockwise_nl_join.cpp +2 -2
  47. package/src/duckdb/src/execution/operator/join/physical_cross_product.cpp +1 -1
  48. package/src/duckdb/src/execution/operator/join/physical_delim_join.cpp +2 -2
  49. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +166 -144
  50. package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +5 -5
  51. package/src/duckdb/src/execution/operator/join/physical_join.cpp +2 -10
  52. package/src/duckdb/src/execution/operator/join/physical_positional_join.cpp +0 -1
  53. package/src/duckdb/src/execution/operator/order/physical_top_n.cpp +2 -2
  54. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +3 -0
  55. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +71 -22
  56. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +17 -13
  57. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +0 -7
  58. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +124 -29
  59. package/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +13 -11
  60. package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +3 -2
  61. package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +25 -24
  62. package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -1
  63. package/src/duckdb/src/execution/operator/persistent/physical_update.cpp +4 -3
  64. package/src/duckdb/src/execution/operator/scan/physical_table_scan.cpp +1 -1
  65. package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +1 -1
  66. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +3 -3
  67. package/src/duckdb/src/execution/partitionable_hashtable.cpp +9 -37
  68. package/src/duckdb/src/execution/physical_operator.cpp +1 -1
  69. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +19 -18
  70. package/src/duckdb/src/execution/physical_plan/plan_copy_to_file.cpp +2 -1
  71. package/src/duckdb/src/execution/physical_plan/plan_execute.cpp +2 -2
  72. package/src/duckdb/src/execution/physical_plan/plan_explain.cpp +5 -6
  73. package/src/duckdb/src/execution/physical_plan/plan_expression_get.cpp +2 -2
  74. package/src/duckdb/src/execution/physical_plan/plan_recursive_cte.cpp +3 -3
  75. package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -1
  76. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +39 -17
  77. package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +2 -2
  78. package/src/duckdb/src/function/table/pragma_detailed_profiling_output.cpp +5 -5
  79. package/src/duckdb/src/function/table/pragma_last_profiling_output.cpp +2 -2
  80. package/src/duckdb/src/function/table/read_csv.cpp +124 -58
  81. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  82. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
  83. package/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +1 -1
  84. package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
  85. package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -0
  86. package/src/duckdb/src/include/duckdb/common/fast_mem.hpp +528 -0
  87. package/src/duckdb/src/include/duckdb/common/filename_pattern.hpp +34 -0
  88. package/src/duckdb/src/include/duckdb/common/helper.hpp +10 -0
  89. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +13 -3
  90. package/src/duckdb/src/include/duckdb/common/optional_ptr.hpp +8 -0
  91. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +34 -0
  92. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +80 -27
  93. package/src/duckdb/src/include/duckdb/common/reference_map.hpp +38 -0
  94. package/src/duckdb/src/include/duckdb/common/row_operations/row_operations.hpp +7 -6
  95. package/src/duckdb/src/include/duckdb/common/sort/comparators.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +1 -1
  97. package/src/duckdb/src/include/duckdb/common/sort/sort.hpp +1 -1
  98. package/src/duckdb/src/include/duckdb/common/sort/sorted_block.hpp +2 -2
  99. package/src/duckdb/src/include/duckdb/common/types/batched_data_collection.hpp +1 -1
  100. package/src/duckdb/src/include/duckdb/common/types/{column_data_allocator.hpp → column/column_data_allocator.hpp} +4 -4
  101. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection.hpp → column/column_data_collection.hpp} +4 -4
  102. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_iterators.hpp → column/column_data_collection_iterators.hpp} +2 -2
  103. package/src/duckdb/src/include/duckdb/common/types/{column_data_collection_segment.hpp → column/column_data_collection_segment.hpp} +3 -3
  104. package/src/duckdb/src/include/duckdb/common/types/{column_data_consumer.hpp → column/column_data_consumer.hpp} +8 -4
  105. package/src/duckdb/src/include/duckdb/common/types/{column_data_scan_states.hpp → column/column_data_scan_states.hpp} +1 -1
  106. package/src/duckdb/src/include/duckdb/common/types/{partitioned_column_data.hpp → column/partitioned_column_data.hpp} +15 -7
  107. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +140 -0
  108. package/src/duckdb/src/include/duckdb/common/types/{row_data_collection.hpp → row/row_data_collection.hpp} +1 -1
  109. package/src/duckdb/src/include/duckdb/common/types/{row_data_collection_scanner.hpp → row/row_data_collection_scanner.hpp} +2 -2
  110. package/src/duckdb/src/include/duckdb/common/types/{row_layout.hpp → row/row_layout.hpp} +3 -1
  111. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +116 -0
  112. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +239 -0
  113. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_iterator.hpp +64 -0
  114. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +113 -0
  115. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +124 -0
  116. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +74 -0
  117. package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/common/types/value.hpp +4 -12
  119. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -31
  120. package/src/duckdb/src/include/duckdb/execution/base_aggregate_hashtable.hpp +2 -2
  121. package/src/duckdb/src/include/duckdb/execution/execution_context.hpp +3 -2
  122. package/src/duckdb/src/include/duckdb/execution/expression_executor.hpp +1 -1
  123. package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +41 -67
  124. package/src/duckdb/src/include/duckdb/execution/nested_loop_join.hpp +1 -1
  125. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_execute.hpp +2 -2
  126. package/src/duckdb/src/include/duckdb/execution/operator/helper/physical_result_collector.hpp +1 -1
  127. package/src/duckdb/src/include/duckdb/execution/operator/join/outer_join_marker.hpp +2 -2
  128. package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +1 -1
  129. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_cross_product.hpp +1 -1
  130. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_hash_join.hpp +0 -2
  131. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_index_join.hpp +2 -2
  132. package/src/duckdb/src/include/duckdb/execution/operator/join/physical_positional_join.hpp +1 -1
  133. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +4 -1
  134. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +8 -3
  135. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +5 -7
  136. package/src/duckdb/src/include/duckdb/execution/operator/persistent/parallel_csv_reader.hpp +5 -1
  137. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_copy_to_file.hpp +4 -1
  138. package/src/duckdb/src/include/duckdb/execution/operator/scan/physical_column_data_scan.hpp +1 -1
  139. package/src/duckdb/src/include/duckdb/execution/operator/set/physical_recursive_cte.hpp +1 -1
  140. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -2
  141. package/src/duckdb/src/include/duckdb/function/function.hpp +2 -0
  142. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +25 -0
  143. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -0
  144. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -2
  145. package/src/duckdb/src/include/duckdb/main/materialized_query_result.hpp +1 -1
  146. package/src/duckdb/src/include/duckdb/main/query_result.hpp +14 -1
  147. package/src/duckdb/src/include/duckdb/optimizer/expression_rewriter.hpp +3 -3
  148. package/src/duckdb/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +16 -16
  149. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_node.hpp +8 -8
  150. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +23 -15
  151. package/src/duckdb/src/include/duckdb/optimizer/join_order/join_relation.hpp +9 -10
  152. package/src/duckdb/src/include/duckdb/optimizer/join_order/query_graph.hpp +18 -11
  153. package/src/duckdb/src/include/duckdb/parallel/meta_pipeline.hpp +1 -1
  154. package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +5 -1
  155. package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +3 -2
  156. package/src/duckdb/src/include/duckdb/parser/query_error_context.hpp +4 -2
  157. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +9 -35
  158. package/src/duckdb/src/include/duckdb/planner/binder.hpp +24 -23
  159. package/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +3 -3
  160. package/src/duckdb/src/include/duckdb/planner/operator/logical_column_data_get.hpp +1 -1
  161. package/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +3 -1
  162. package/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +1 -1
  163. package/src/duckdb/src/main/appender.cpp +6 -6
  164. package/src/duckdb/src/main/client_context.cpp +1 -1
  165. package/src/duckdb/src/main/connection.cpp +2 -2
  166. package/src/duckdb/src/main/query_result.cpp +13 -0
  167. package/src/duckdb/src/main/settings/settings.cpp +3 -4
  168. package/src/duckdb/src/optimizer/expression_rewriter.cpp +4 -4
  169. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +91 -105
  170. package/src/duckdb/src/optimizer/join_order/join_node.cpp +5 -8
  171. package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +163 -160
  172. package/src/duckdb/src/optimizer/join_order/join_relation_set.cpp +30 -30
  173. package/src/duckdb/src/optimizer/join_order/query_graph.cpp +37 -38
  174. package/src/duckdb/src/parallel/executor.cpp +1 -1
  175. package/src/duckdb/src/parallel/meta_pipeline.cpp +2 -2
  176. package/src/duckdb/src/parser/transform/helpers/transform_cte.cpp +1 -1
  177. package/src/duckdb/src/parser/transform/tableref/transform_subquery.cpp +1 -1
  178. package/src/duckdb/src/parser/transformer.cpp +50 -9
  179. package/src/duckdb/src/planner/binder/expression/bind_operator_expression.cpp +13 -0
  180. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +15 -5
  181. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +19 -17
  182. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -4
  183. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +20 -21
  184. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +24 -22
  185. package/src/duckdb/src/planner/binder/tableref/bind_subqueryref.cpp +2 -2
  186. package/src/duckdb/src/planner/binder/tableref/bind_table_function.cpp +9 -0
  187. package/src/duckdb/src/planner/binder.cpp +16 -19
  188. package/src/duckdb/src/planner/expression_binder.cpp +8 -8
  189. package/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +3 -3
  190. package/src/duckdb/src/storage/checkpoint_manager.cpp +23 -23
  191. package/src/duckdb/src/storage/standard_buffer_manager.cpp +1 -1
  192. package/src/duckdb/src/storage/table_index_list.cpp +3 -3
  193. package/src/duckdb/src/verification/statement_verifier.cpp +1 -1
  194. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5552 -5598
  195. package/src/duckdb/ub_src_common.cpp +2 -0
  196. package/src/duckdb/ub_src_common_types.cpp +0 -16
  197. package/src/duckdb/ub_src_common_types_column.cpp +10 -0
  198. package/src/duckdb/ub_src_common_types_row.cpp +20 -0
@@ -60,7 +60,7 @@ public:
60
60
  PhysicalIndexJoin::PhysicalIndexJoin(LogicalOperator &op, unique_ptr<PhysicalOperator> left,
61
61
  unique_ptr<PhysicalOperator> right, vector<JoinCondition> cond, JoinType join_type,
62
62
  const vector<idx_t> &left_projection_map_p, vector<idx_t> right_projection_map_p,
63
- vector<column_t> column_ids_p, Index *index_p, bool lhs_first,
63
+ vector<column_t> column_ids_p, Index &index_p, bool lhs_first,
64
64
  idx_t estimated_cardinality)
65
65
  : CachingPhysicalOperator(PhysicalOperatorType::INDEX_JOIN, std::move(op.types), estimated_cardinality),
66
66
  left_projection_map(left_projection_map_p), right_projection_map(std::move(right_projection_map_p)),
@@ -74,7 +74,7 @@ PhysicalIndexJoin::PhysicalIndexJoin(LogicalOperator &op, unique_ptr<PhysicalOpe
74
74
  condition_types.push_back(condition.left->return_type);
75
75
  }
76
76
  //! Only add to fetch_ids columns that are not indexed
77
- for (auto &index_id : index->column_ids) {
77
+ for (auto &index_id : index.column_ids) {
78
78
  index_ids.insert(index_id);
79
79
  }
80
80
 
@@ -165,7 +165,7 @@ void PhysicalIndexJoin::Output(ExecutionContext &context, DataChunk &input, Data
165
165
  void PhysicalIndexJoin::GetRHSMatches(ExecutionContext &context, DataChunk &input, OperatorState &state_p) const {
166
166
 
167
167
  auto &state = state_p.Cast<IndexJoinOperatorState>();
168
- auto &art = index->Cast<ART>();
168
+ auto &art = index.Cast<ART>();
169
169
  ;
170
170
 
171
171
  // generate the keys for this chunk
@@ -177,11 +177,11 @@ void PhysicalIndexJoin::GetRHSMatches(ExecutionContext &context, DataChunk &inpu
177
177
  if (!state.keys[i].Empty()) {
178
178
  if (fetch_types.empty()) {
179
179
  IndexLock lock;
180
- index->InitializeLock(lock);
180
+ index.InitializeLock(lock);
181
181
  art.SearchEqualJoinNoFetch(state.keys[i], state.result_sizes[i]);
182
182
  } else {
183
183
  IndexLock lock;
184
- index->InitializeLock(lock);
184
+ index.InitializeLock(lock);
185
185
  art.SearchEqual(state.keys[i], (idx_t)-1, state.rhs_rows[i]);
186
186
  state.result_sizes[i] = state.rhs_rows[i].size();
187
187
  }
@@ -41,7 +41,7 @@ void PhysicalJoin::BuildJoinPipelines(Pipeline &current, MetaPipeline &meta_pipe
41
41
 
42
42
  // on the RHS (build side), we construct a child MetaPipeline with this operator as its sink
43
43
  auto child_meta_pipeline = meta_pipeline.CreateChildMetaPipeline(current, &op);
44
- child_meta_pipeline->Build(op.children[1].get());
44
+ child_meta_pipeline->Build(*op.children[1]);
45
45
 
46
46
  // continue building the current pipeline on the LHS (probe side)
47
47
  op.children[0]->BuildPipelines(current, meta_pipeline);
@@ -60,18 +60,10 @@ void PhysicalJoin::BuildJoinPipelines(Pipeline &current, MetaPipeline &meta_pipe
60
60
  // Join can become a source operator if it's RIGHT/OUTER, or if the hash join goes out-of-core
61
61
  bool add_child_pipeline = false;
62
62
  auto &join_op = op.Cast<PhysicalJoin>();
63
- if (IsRightOuterJoin(join_op.join_type)) {
63
+ if (IsRightOuterJoin(join_op.join_type) || join_op.type == PhysicalOperatorType::HASH_JOIN) {
64
64
  add_child_pipeline = true;
65
65
  }
66
66
 
67
- if (join_op.type == PhysicalOperatorType::HASH_JOIN) {
68
- auto &hash_join_op = join_op.Cast<PhysicalHashJoin>();
69
- hash_join_op.can_go_external = !meta_pipeline.HasRecursiveCTE();
70
- if (hash_join_op.can_go_external) {
71
- add_child_pipeline = true;
72
- }
73
- }
74
-
75
67
  if (add_child_pipeline) {
76
68
  meta_pipeline.CreateChildPipeline(current, &op, last_pipeline);
77
69
  }
@@ -1,6 +1,5 @@
1
1
  #include "duckdb/execution/operator/join/physical_positional_join.hpp"
2
2
 
3
- #include "duckdb/common/types/column_data_collection.hpp"
4
3
  #include "duckdb/common/vector_operations/vector_operations.hpp"
5
4
  #include "duckdb/execution/operator/join/physical_join.hpp"
6
5
 
@@ -1,12 +1,12 @@
1
1
  #include "duckdb/execution/operator/order/physical_top_n.hpp"
2
2
 
3
3
  #include "duckdb/common/assert.hpp"
4
+ #include "duckdb/common/sort/sort.hpp"
5
+ #include "duckdb/common/types/row/row_layout.hpp"
4
6
  #include "duckdb/common/value_operations/value_operations.hpp"
5
7
  #include "duckdb/common/vector_operations/vector_operations.hpp"
6
8
  #include "duckdb/execution/expression_executor.hpp"
7
9
  #include "duckdb/storage/data_table.hpp"
8
- #include "duckdb/common/sort/sort.hpp"
9
- #include "duckdb/common/types/row_layout.hpp"
10
10
 
11
11
  namespace duckdb {
12
12
 
@@ -42,6 +42,9 @@ BaseCSVReader::~BaseCSVReader() {
42
42
  unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) {
43
43
  auto file_handle = fs.OpenFile(options_p.file_path.c_str(), FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK,
44
44
  options_p.compression, this->opener);
45
+ if (file_handle->CanSeek()) {
46
+ file_handle->Reset();
47
+ }
45
48
  return make_uniq<CSVFileHandle>(std::move(file_handle));
46
49
  }
47
50
 
@@ -15,6 +15,7 @@
15
15
  #include "utf8proc.hpp"
16
16
  #include "duckdb/parser/keyword_helper.hpp"
17
17
  #include "duckdb/main/error_manager.hpp"
18
+ #include "duckdb/main/client_data.hpp"
18
19
 
19
20
  #include <algorithm>
20
21
  #include <cctype>
@@ -954,10 +955,10 @@ bool BufferedCSVReader::TryParseComplexCSV(DataChunk &insert_chunk, string &erro
954
955
  bool has_quotes = false;
955
956
  uint8_t delimiter_pos = 0, escape_pos = 0, quote_pos = 0;
956
957
  idx_t offset = 0;
957
-
958
+ idx_t line_start = 0;
958
959
  // read values into the buffer (if any)
959
960
  if (position >= buffer_size) {
960
- if (!ReadBuffer(start)) {
961
+ if (!ReadBuffer(start, line_start)) {
961
962
  return true;
962
963
  }
963
964
  }
@@ -994,7 +995,7 @@ value_start:
994
995
  goto in_quotes;
995
996
  }
996
997
  }
997
- } while (ReadBuffer(start));
998
+ } while (ReadBuffer(start, line_start));
998
999
  // file ends while scanning for quote/delimiter, go to final state
999
1000
  goto final_state;
1000
1001
  normal:
@@ -1011,7 +1012,7 @@ normal:
1011
1012
  goto add_row;
1012
1013
  }
1013
1014
  }
1014
- } while (ReadBuffer(start));
1015
+ } while (ReadBuffer(start, line_start));
1015
1016
  goto final_state;
1016
1017
  add_value:
1017
1018
  AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
@@ -1019,7 +1020,7 @@ add_value:
1019
1020
  offset = 0;
1020
1021
  has_quotes = false;
1021
1022
  start = ++position;
1022
- if (position >= buffer_size && !ReadBuffer(start)) {
1023
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1023
1024
  // file ends right after delimiter, go to final state
1024
1025
  goto final_state;
1025
1026
  }
@@ -1029,14 +1030,17 @@ add_row : {
1029
1030
  bool carriage_return = buffer[position] == '\r';
1030
1031
  AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1031
1032
  finished_chunk = AddRow(insert_chunk, column, error_message);
1033
+
1032
1034
  if (!error_message.empty()) {
1033
1035
  return false;
1034
1036
  }
1035
1037
  // increase position by 1 and move start to the new position
1036
1038
  offset = 0;
1037
1039
  has_quotes = false;
1038
- start = ++position;
1039
- if (position >= buffer_size && !ReadBuffer(start)) {
1040
+ position++;
1041
+ SkipEmptyLines();
1042
+ start = position;
1043
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1040
1044
  // file ends right after newline, go to final state
1041
1045
  goto final_state;
1042
1046
  }
@@ -1069,7 +1073,7 @@ in_quotes:
1069
1073
  goto handle_escape;
1070
1074
  }
1071
1075
  }
1072
- } while (ReadBuffer(start));
1076
+ } while (ReadBuffer(start, line_start));
1073
1077
  // still in quoted state at the end of the file, error:
1074
1078
  error_message = StringUtil::Format("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
1075
1079
  GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
@@ -1082,7 +1086,7 @@ unquote:
1082
1086
  delimiter_pos = 0;
1083
1087
  quote_pos = 0;
1084
1088
  position++;
1085
- if (position >= buffer_size && !ReadBuffer(start)) {
1089
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1086
1090
  // file ends right after unquote, go to final state
1087
1091
  offset = options.quote.size();
1088
1092
  goto final_state;
@@ -1116,7 +1120,7 @@ unquote:
1116
1120
  goto in_quotes;
1117
1121
  }
1118
1122
  }
1119
- } while (ReadBuffer(start));
1123
+ } while (ReadBuffer(start, line_start));
1120
1124
  error_message = StringUtil::Format(
1121
1125
  "Error in file \"%s\" on line %s: quote should be followed by end of value, end of row or another quote. (%s)",
1122
1126
  options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
@@ -1142,7 +1146,7 @@ handle_escape:
1142
1146
  goto in_quotes;
1143
1147
  }
1144
1148
  }
1145
- } while (ReadBuffer(start));
1149
+ } while (ReadBuffer(start, line_start));
1146
1150
  error_message =
1147
1151
  StringUtil::Format("Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)",
1148
1152
  options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
@@ -1153,7 +1157,7 @@ carriage_return:
1153
1157
  if (buffer[position] == '\n') {
1154
1158
  // newline after carriage return: skip
1155
1159
  start = ++position;
1156
- if (position >= buffer_size && !ReadBuffer(start)) {
1160
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1157
1161
  // file ends right after newline, go to final state
1158
1162
  goto final_state;
1159
1163
  }
@@ -1170,6 +1174,7 @@ final_state:
1170
1174
  // remaining values to be added to the chunk
1171
1175
  AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1172
1176
  finished_chunk = AddRow(insert_chunk, column, error_message);
1177
+ SkipEmptyLines();
1173
1178
  if (!error_message.empty()) {
1174
1179
  return false;
1175
1180
  }
@@ -1184,6 +1189,18 @@ final_state:
1184
1189
  return true;
1185
1190
  }
1186
1191
 
1192
+ void BufferedCSVReader::SkipEmptyLines() {
1193
+ if (parse_chunk.data.size() == 1) {
1194
+ // Empty lines are null data.
1195
+ return;
1196
+ }
1197
+ for (; position < buffer_size; position++) {
1198
+ if (!StringUtil::CharacterIsNewline(buffer[position])) {
1199
+ return;
1200
+ }
1201
+ }
1202
+ }
1203
+
1187
1204
  bool BufferedCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message) {
1188
1205
  // used for parsing algorithm
1189
1206
  bool finished_chunk = false;
@@ -1192,12 +1209,14 @@ bool BufferedCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error
1192
1209
  bool has_quotes = false;
1193
1210
  vector<idx_t> escape_positions;
1194
1211
 
1212
+ idx_t line_start = position;
1195
1213
  // read values into the buffer (if any)
1196
1214
  if (position >= buffer_size) {
1197
- if (!ReadBuffer(start)) {
1215
+ if (!ReadBuffer(start, line_start)) {
1198
1216
  return true;
1199
1217
  }
1200
1218
  }
1219
+
1201
1220
  // start parsing the first value
1202
1221
  goto value_start;
1203
1222
  value_start:
@@ -1227,7 +1246,7 @@ normal:
1227
1246
  goto add_row;
1228
1247
  }
1229
1248
  }
1230
- } while (ReadBuffer(start));
1249
+ } while (ReadBuffer(start, line_start));
1231
1250
  // file ends during normal scan: go to end state
1232
1251
  goto final_state;
1233
1252
  add_value:
@@ -1236,7 +1255,7 @@ add_value:
1236
1255
  offset = 0;
1237
1256
  has_quotes = false;
1238
1257
  start = ++position;
1239
- if (position >= buffer_size && !ReadBuffer(start)) {
1258
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1240
1259
  // file ends right after delimiter, go to final state
1241
1260
  goto final_state;
1242
1261
  }
@@ -1249,14 +1268,19 @@ add_row : {
1249
1268
  return false;
1250
1269
  }
1251
1270
  finished_chunk = AddRow(insert_chunk, column, error_message);
1271
+ if (context.client_data->max_line_length < position - line_start) {
1272
+ context.client_data->max_line_length = position - line_start;
1273
+ }
1252
1274
  if (!error_message.empty()) {
1253
1275
  return false;
1254
1276
  }
1255
1277
  // increase position by 1 and move start to the new position
1256
1278
  offset = 0;
1257
1279
  has_quotes = false;
1258
- start = ++position;
1259
- if (position >= buffer_size && !ReadBuffer(start)) {
1280
+ position++;
1281
+ start = position;
1282
+ line_start = position;
1283
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1260
1284
  // file ends right after delimiter, go to final state
1261
1285
  goto final_state;
1262
1286
  }
@@ -1265,6 +1289,14 @@ add_row : {
1265
1289
  goto carriage_return;
1266
1290
  } else {
1267
1291
  SetNewLineDelimiter();
1292
+ SkipEmptyLines();
1293
+
1294
+ start = position;
1295
+ line_start = position;
1296
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1297
+ // file ends right after delimiter, go to final state
1298
+ goto final_state;
1299
+ }
1268
1300
  // \n newline, move to value start
1269
1301
  if (finished_chunk) {
1270
1302
  return true;
@@ -1288,7 +1320,7 @@ in_quotes:
1288
1320
  goto handle_escape;
1289
1321
  }
1290
1322
  }
1291
- } while (ReadBuffer(start));
1323
+ } while (ReadBuffer(start, line_start));
1292
1324
  // still in quoted state at the end of the file, error:
1293
1325
  throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
1294
1326
  GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
@@ -1298,7 +1330,7 @@ unquote:
1298
1330
  // in this state we expect either another quote (entering the quoted state again, and escaping the quote)
1299
1331
  // or a delimiter/newline, ending the current value and moving on to the next value
1300
1332
  position++;
1301
- if (position >= buffer_size && !ReadBuffer(start)) {
1333
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1302
1334
  // file ends right after unquote, go to final state
1303
1335
  offset = 1;
1304
1336
  goto final_state;
@@ -1325,7 +1357,7 @@ handle_escape:
1325
1357
  /* state: handle_escape */
1326
1358
  // escape should be followed by a quote or another escape character
1327
1359
  position++;
1328
- if (position >= buffer_size && !ReadBuffer(start)) {
1360
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1329
1361
  error_message = StringUtil::Format(
1330
1362
  "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
1331
1363
  GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
@@ -1347,7 +1379,7 @@ carriage_return:
1347
1379
  // newline after carriage return: skip
1348
1380
  // increase position by 1 and move start to the new position
1349
1381
  start = ++position;
1350
- if (position >= buffer_size && !ReadBuffer(start)) {
1382
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1351
1383
  // file ends right after delimiter, go to final state
1352
1384
  goto final_state;
1353
1385
  }
@@ -1357,6 +1389,14 @@ carriage_return:
1357
1389
  if (finished_chunk) {
1358
1390
  return true;
1359
1391
  }
1392
+ SkipEmptyLines();
1393
+ start = position;
1394
+ line_start = position;
1395
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1396
+ // file ends right after delimiter, go to final state
1397
+ goto final_state;
1398
+ }
1399
+
1360
1400
  goto value_start;
1361
1401
  final_state:
1362
1402
  if (finished_chunk) {
@@ -1367,6 +1407,10 @@ final_state:
1367
1407
  // remaining values to be added to the chunk
1368
1408
  AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1369
1409
  finished_chunk = AddRow(insert_chunk, column, error_message);
1410
+ SkipEmptyLines();
1411
+ if (context.client_data->max_line_length < position - line_start) {
1412
+ context.client_data->max_line_length = position - line_start;
1413
+ }
1370
1414
  if (!error_message.empty()) {
1371
1415
  return false;
1372
1416
  }
@@ -1382,7 +1426,10 @@ final_state:
1382
1426
  return true;
1383
1427
  }
1384
1428
 
1385
- bool BufferedCSVReader::ReadBuffer(idx_t &start) {
1429
+ bool BufferedCSVReader::ReadBuffer(idx_t &start, idx_t &line_start) {
1430
+ if (start > buffer_size) {
1431
+ return false;
1432
+ }
1386
1433
  auto old_buffer = std::move(buffer);
1387
1434
 
1388
1435
  // the remaining part of the last buffer
@@ -1420,9 +1467,11 @@ bool BufferedCSVReader::ReadBuffer(idx_t &start) {
1420
1467
  if (!bom_checked) {
1421
1468
  bom_checked = true;
1422
1469
  if (read_count >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
1470
+ start += 3;
1423
1471
  position += 3;
1424
1472
  }
1425
1473
  }
1474
+ line_start = start;
1426
1475
 
1427
1476
  return read_count > 0;
1428
1477
  }
@@ -4,8 +4,8 @@
4
4
  namespace duckdb {
5
5
 
6
6
  CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
7
- idx_t &global_csv_current_position)
8
- : context(context), first_buffer(true) {
7
+ idx_t &global_csv_current_position, idx_t file_number_p)
8
+ : context(context), first_buffer(true), file_number(file_number_p) {
9
9
  this->handle = AllocateBuffer(buffer_size_p);
10
10
 
11
11
  auto buffer = Ptr();
@@ -19,23 +19,23 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
19
19
  }
20
20
 
21
21
  CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
22
- bool final_buffer, idx_t global_csv_current_position)
22
+ bool final_buffer, idx_t global_csv_current_position, idx_t file_number_p)
23
23
  : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer),
24
- global_csv_start(global_csv_current_position) {
24
+ global_csv_start(global_csv_current_position), file_number(file_number_p) {
25
25
  }
26
26
 
27
- unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size,
28
- idx_t &global_csv_current_position) {
29
- if (file_handle.FinishedReading()) {
30
- // this was the last buffer
31
- return nullptr;
32
- }
33
-
27
+ unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
28
+ idx_t file_number_p) {
34
29
  auto next_buffer = AllocateBuffer(buffer_size);
35
30
  idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), buffer_size);
31
+ if (next_buffer_actual_size == 0) {
32
+ // We are done reading
33
+ return nullptr;
34
+ }
36
35
 
37
- auto next_csv_buffer = make_uniq<CSVBuffer>(context, std::move(next_buffer), buffer_size, next_buffer_actual_size,
38
- file_handle.FinishedReading(), global_csv_current_position);
36
+ auto next_csv_buffer =
37
+ make_uniq<CSVBuffer>(context, std::move(next_buffer), buffer_size, next_buffer_actual_size,
38
+ file_handle.FinishedReading(), global_csv_current_position, file_number_p);
39
39
  global_csv_current_position += next_buffer_actual_size;
40
40
  return next_csv_buffer;
41
41
  }
@@ -65,4 +65,8 @@ idx_t CSVBuffer::GetCSVGlobalStart() {
65
65
  return global_csv_start;
66
66
  }
67
67
 
68
+ idx_t CSVBuffer::GetFileNumber() {
69
+ return file_number;
70
+ }
71
+
68
72
  } // namespace duckdb
@@ -74,11 +74,6 @@ void BufferedCSVReaderOptions::SetEscape(const string &input) {
74
74
  this->has_escape = true;
75
75
  }
76
76
 
77
- void BufferedCSVReaderOptions::SetParallel(bool use_parallel) {
78
- this->has_parallel = true;
79
- this->use_parallel = use_parallel;
80
- }
81
-
82
77
  void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
83
78
  this->delimiter = StringUtil::Replace(input, "\\t", "\t");
84
79
  this->has_delimiter = true;
@@ -126,8 +121,6 @@ void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value
126
121
  }
127
122
  if (loption == "auto_detect") {
128
123
  auto_detect = ParseBoolean(value, loption);
129
- } else if (loption == "parallel") {
130
- SetParallel(ParseBoolean(value, loption));
131
124
  } else if (loption == "sample_size") {
132
125
  int64_t sample_size = ParseInteger(value, loption);
133
126
  if (sample_size < 1 && sample_size != -1) {