duckdb 1.1.2-dev4.0 → 1.1.2-dev6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/icu/third_party/icu/common/putil.cpp +0 -5
  3. package/src/duckdb/extension/icu/third_party/icu/common/rbbiscan.cpp +1 -1
  4. package/src/duckdb/extension/icu/third_party/icu/common/rbbitblb.cpp +1 -1
  5. package/src/duckdb/extension/icu/third_party/icu/common/ucurr.cpp +1 -1
  6. package/src/duckdb/extension/icu/third_party/icu/common/uresbund.cpp +1 -1
  7. package/src/duckdb/extension/icu/third_party/icu/common/uresimp.h +31 -31
  8. package/src/duckdb/extension/icu/third_party/icu/common/ustring.cpp +1 -1
  9. package/src/duckdb/extension/icu/third_party/icu/common/uvector.cpp +1 -1
  10. package/src/duckdb/extension/icu/third_party/icu/i18n/coleitr.cpp +12 -12
  11. package/src/duckdb/extension/icu/third_party/icu/i18n/format.cpp +1 -1
  12. package/src/duckdb/extension/icu/third_party/icu/i18n/listformatter.cpp +4 -4
  13. package/src/duckdb/extension/icu/third_party/icu/i18n/number_decimalquantity.h +1 -1
  14. package/src/duckdb/extension/icu/third_party/icu/i18n/tzgnames.cpp +1 -1
  15. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/coleitr.h +28 -28
  16. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/format.h +7 -7
  17. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/ucol.h +1 -1
  18. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/ucoleitr.h +41 -41
  19. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/umsg.h +41 -41
  20. package/src/duckdb/extension/icu/third_party/icu/i18n/usrchimp.h +3 -3
  21. package/src/duckdb/extension/json/include/json_common.hpp +1 -1
  22. package/src/duckdb/extension/json/json_functions/json_structure.cpp +13 -7
  23. package/src/duckdb/extension/parquet/column_writer.cpp +2 -1
  24. package/src/duckdb/extension/parquet/geo_parquet.cpp +24 -9
  25. package/src/duckdb/extension/parquet/include/geo_parquet.hpp +3 -1
  26. package/src/duckdb/extension/parquet/include/parquet_reader.hpp +1 -0
  27. package/src/duckdb/extension/parquet/include/parquet_rle_bp_decoder.hpp +1 -1
  28. package/src/duckdb/extension/parquet/include/templated_column_reader.hpp +0 -4
  29. package/src/duckdb/extension/parquet/parquet_extension.cpp +20 -6
  30. package/src/duckdb/extension/parquet/parquet_reader.cpp +1 -2
  31. package/src/duckdb/extension/parquet/parquet_writer.cpp +1 -1
  32. package/src/duckdb/extension/parquet/serialize_parquet.cpp +0 -2
  33. package/src/duckdb/src/catalog/catalog_entry/duck_schema_entry.cpp +8 -1
  34. package/src/duckdb/src/catalog/default/default_functions.cpp +5 -5
  35. package/src/duckdb/src/common/allocator.cpp +3 -2
  36. package/src/duckdb/src/common/arrow/arrow_appender.cpp +1 -0
  37. package/src/duckdb/src/common/arrow/arrow_converter.cpp +11 -0
  38. package/src/duckdb/src/common/arrow/schema_metadata.cpp +6 -4
  39. package/src/duckdb/src/common/enum_util.cpp +33 -0
  40. package/src/duckdb/src/common/exception.cpp +3 -0
  41. package/src/duckdb/src/common/extra_type_info.cpp +1 -44
  42. package/src/duckdb/src/common/field_writer.cpp +97 -0
  43. package/src/duckdb/src/common/render_tree.cpp +7 -5
  44. package/src/duckdb/src/common/row_operations/row_match.cpp +359 -0
  45. package/src/duckdb/src/common/serializer/buffered_deserializer.cpp +27 -0
  46. package/src/duckdb/src/common/serializer/buffered_serializer.cpp +36 -0
  47. package/src/duckdb/src/common/serializer/format_serializer.cpp +15 -0
  48. package/src/duckdb/src/common/serializer.cpp +24 -0
  49. package/src/duckdb/src/common/sort/comparators.cpp +2 -2
  50. package/src/duckdb/src/common/types/bit.cpp +57 -34
  51. package/src/duckdb/src/common/types/data_chunk.cpp +32 -29
  52. package/src/duckdb/src/common/types/vector_cache.cpp +12 -6
  53. package/src/duckdb/src/common/vector_operations/comparison_operators.cpp +14 -0
  54. package/src/duckdb/src/core_functions/aggregate/distributive/bitstring_agg.cpp +20 -1
  55. package/src/duckdb/src/core_functions/aggregate/distributive/minmax.cpp +2 -2
  56. package/src/duckdb/src/core_functions/aggregate/holistic/approx_top_k.cpp +32 -7
  57. package/src/duckdb/src/core_functions/function_list.cpp +1 -2
  58. package/src/duckdb/src/core_functions/scalar/bit/bitstring.cpp +23 -5
  59. package/src/duckdb/src/core_functions/scalar/date/date_diff.cpp +12 -6
  60. package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +1 -1
  61. package/src/duckdb/src/execution/expression_executor/execute_between.cpp +4 -3
  62. package/src/duckdb/src/execution/expression_executor/execute_case.cpp +4 -3
  63. package/src/duckdb/src/execution/expression_executor/execute_cast.cpp +2 -1
  64. package/src/duckdb/src/execution/expression_executor/execute_comparison.cpp +3 -2
  65. package/src/duckdb/src/execution/expression_executor/execute_conjunction.cpp +2 -1
  66. package/src/duckdb/src/execution/expression_executor/execute_function.cpp +2 -1
  67. package/src/duckdb/src/execution/expression_executor/execute_operator.cpp +3 -2
  68. package/src/duckdb/src/execution/expression_executor/execute_reference.cpp +1 -1
  69. package/src/duckdb/src/execution/expression_executor.cpp +9 -3
  70. package/src/duckdb/src/execution/expression_executor_state.cpp +11 -9
  71. package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +238 -0
  72. package/src/duckdb/src/execution/index/art/plan_art.cpp +94 -0
  73. package/src/duckdb/src/execution/index/index_type_set.cpp +4 -1
  74. package/src/duckdb/src/execution/join_hashtable.cpp +7 -8
  75. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +6 -4
  76. package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp +4 -4
  77. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +1 -1
  78. package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +44 -5
  79. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +28 -24
  80. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +25 -26
  81. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +5 -3
  82. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +4 -4
  83. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +2 -2
  84. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -1
  85. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +1 -1
  86. package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine.cpp +1 -1
  87. package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +2 -2
  88. package/src/duckdb/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +1 -1
  89. package/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +1 -1
  90. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +73 -27
  91. package/src/duckdb/src/execution/operator/helper/physical_buffered_collector.cpp +1 -1
  92. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +695 -0
  93. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +1487 -0
  94. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +72 -0
  95. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +158 -0
  96. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +280 -0
  97. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +666 -0
  98. package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +14 -4
  99. package/src/duckdb/src/execution/operator/schema/physical_create_index.cpp +207 -0
  100. package/src/duckdb/src/execution/partitionable_hashtable.cpp +207 -0
  101. package/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp +6 -1
  102. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -4
  103. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +14 -87
  104. package/src/duckdb/src/execution/physical_plan/plan_export.cpp +1 -1
  105. package/src/duckdb/src/execution/physical_plan/plan_get.cpp +1 -1
  106. package/src/duckdb/src/execution/reservoir_sample.cpp +1 -1
  107. package/src/duckdb/src/execution/window_executor.cpp +3 -3
  108. package/src/duckdb/src/function/pragma/pragma_queries.cpp +1 -1
  109. package/src/duckdb/src/function/scalar/strftime_format.cpp +1 -2
  110. package/src/duckdb/src/function/scalar/string/concat.cpp +118 -151
  111. package/src/duckdb/src/function/table/arrow.cpp +13 -0
  112. package/src/duckdb/src/function/table/arrow_conversion.cpp +12 -7
  113. package/src/duckdb/src/function/table/copy_csv.cpp +1 -1
  114. package/src/duckdb/src/function/table/read_csv.cpp +2 -30
  115. package/src/duckdb/src/function/table/sniff_csv.cpp +2 -1
  116. package/src/duckdb/src/function/table/system/duckdb_secrets.cpp +15 -7
  117. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  118. package/src/duckdb/src/include/duckdb/catalog/catalog_entry_retriever.hpp +1 -1
  119. package/src/duckdb/src/include/duckdb/common/atomic.hpp +13 -1
  120. package/src/duckdb/src/include/duckdb/common/bitpacking.hpp +3 -4
  121. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
  122. package/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp +2 -0
  123. package/src/duckdb/src/include/duckdb/common/exception.hpp +10 -0
  124. package/src/duckdb/src/include/duckdb/common/extra_type_info/enum_type_info.hpp +53 -0
  125. package/src/duckdb/src/include/duckdb/common/insertion_order_preserving_map.hpp +5 -5
  126. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +5 -0
  127. package/src/duckdb/src/include/duckdb/common/types/bit.hpp +36 -33
  128. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +10 -13
  129. package/src/duckdb/src/include/duckdb/common/types/uhugeint.hpp +1 -1
  130. package/src/duckdb/src/include/duckdb/common/types/vector_cache.hpp +7 -5
  131. package/src/duckdb/src/include/duckdb/common/windows_undefs.hpp +2 -1
  132. package/src/duckdb/src/include/duckdb/core_functions/aggregate/minmax_n_helpers.hpp +2 -0
  133. package/src/duckdb/src/include/duckdb/core_functions/scalar/bit_functions.hpp +1 -1
  134. package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +0 -6
  135. package/src/duckdb/src/include/duckdb/core_functions/scalar/math_functions.hpp +1 -1
  136. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +3 -2
  137. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +3 -0
  138. package/src/duckdb/src/include/duckdb/execution/index/index_type.hpp +16 -1
  139. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp +4 -4
  140. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +4 -2
  141. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +3 -2
  142. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +91 -36
  143. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +36 -0
  144. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +1 -1
  145. package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +0 -1
  146. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_export.hpp +2 -5
  147. package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
  148. package/src/duckdb/src/include/duckdb/main/database.hpp +5 -0
  149. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
  150. package/src/duckdb/src/include/duckdb/main/profiling_info.hpp +20 -22
  151. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +7 -9
  152. package/src/duckdb/src/include/duckdb/main/secret/secret.hpp +8 -1
  153. package/src/duckdb/src/include/duckdb/main/table_description.hpp +14 -0
  154. package/src/duckdb/src/include/duckdb/optimizer/unnest_rewriter.hpp +5 -5
  155. package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +15 -5
  156. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +2 -0
  157. package/src/duckdb/src/include/duckdb/planner/expression_binder/order_binder.hpp +4 -0
  158. package/src/duckdb/src/include/duckdb/planner/operator/logical_export.hpp +10 -13
  159. package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +1 -0
  160. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -2
  161. package/src/duckdb/src/include/duckdb/storage/standard_buffer_manager.hpp +1 -1
  162. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +0 -2
  163. package/src/duckdb/src/include/duckdb/storage/table/segment_tree.hpp +1 -0
  164. package/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +5 -1
  165. package/src/duckdb/src/include/duckdb.h +2 -2
  166. package/src/duckdb/src/main/appender.cpp +3 -0
  167. package/src/duckdb/src/main/capi/profiling_info-c.cpp +5 -2
  168. package/src/duckdb/src/main/client_context.cpp +8 -2
  169. package/src/duckdb/src/main/connection.cpp +1 -1
  170. package/src/duckdb/src/main/database.cpp +13 -0
  171. package/src/duckdb/src/main/extension/extension_helper.cpp +1 -1
  172. package/src/duckdb/src/main/extension/extension_install.cpp +9 -1
  173. package/src/duckdb/src/main/extension/extension_load.cpp +3 -2
  174. package/src/duckdb/src/main/extension_install_info.cpp +1 -1
  175. package/src/duckdb/src/main/profiling_info.cpp +78 -58
  176. package/src/duckdb/src/main/query_profiler.cpp +79 -89
  177. package/src/duckdb/src/main/relation/read_csv_relation.cpp +1 -1
  178. package/src/duckdb/src/main/secret/secret.cpp +2 -1
  179. package/src/duckdb/src/main/secret/secret_manager.cpp +14 -0
  180. package/src/duckdb/src/optimizer/cte_filter_pusher.cpp +4 -2
  181. package/src/duckdb/src/optimizer/deliminator.cpp +0 -7
  182. package/src/duckdb/src/optimizer/in_clause_rewriter.cpp +7 -0
  183. package/src/duckdb/src/optimizer/pushdown/pushdown_left_join.cpp +4 -1
  184. package/src/duckdb/src/optimizer/unnest_rewriter.cpp +21 -21
  185. package/src/duckdb/src/parallel/task_scheduler.cpp +9 -0
  186. package/src/duckdb/src/parser/parsed_data/exported_table_data.cpp +22 -0
  187. package/src/duckdb/src/parser/parsed_expression_iterator.cpp +3 -0
  188. package/src/duckdb/src/parser/statement/insert_statement.cpp +7 -1
  189. package/src/duckdb/src/parser/transform/expression/transform_boolean_test.cpp +1 -1
  190. package/src/duckdb/src/parser/transform/helpers/transform_typename.cpp +89 -87
  191. package/src/duckdb/src/parser/transform/statement/transform_pivot_stmt.cpp +2 -2
  192. package/src/duckdb/src/planner/binder/expression/bind_macro_expression.cpp +4 -9
  193. package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +4 -0
  194. package/src/duckdb/src/planner/binder/query_node/plan_setop.cpp +2 -2
  195. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -1
  196. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +4 -3
  197. package/src/duckdb/src/planner/expression_binder/order_binder.cpp +13 -3
  198. package/src/duckdb/src/planner/expression_binder.cpp +1 -1
  199. package/src/duckdb/src/planner/operator/logical_export.cpp +28 -0
  200. package/src/duckdb/src/planner/table_binding.cpp +1 -2
  201. package/src/duckdb/src/planner/table_filter.cpp +6 -2
  202. package/src/duckdb/src/storage/buffer/buffer_pool.cpp +2 -1
  203. package/src/duckdb/src/storage/checkpoint_manager.cpp +1 -1
  204. package/src/duckdb/src/storage/compression/bitpacking.cpp +7 -3
  205. package/src/duckdb/src/storage/compression/dictionary_compression.cpp +1 -1
  206. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +2 -2
  207. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +16 -0
  208. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +29 -0
  209. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +15 -0
  210. package/src/duckdb/src/storage/single_file_block_manager.cpp +2 -1
  211. package/src/duckdb/src/storage/statistics/distinct_statistics.cpp +3 -5
  212. package/src/duckdb/src/storage/storage_info.cpp +4 -4
  213. package/src/duckdb/src/storage/table/row_group_collection.cpp +1 -1
  214. package/src/duckdb/src/storage/table/row_version_manager.cpp +5 -1
  215. package/src/duckdb/src/storage/temporary_file_manager.cpp +1 -1
  216. package/src/duckdb/src/transaction/duck_transaction.cpp +15 -14
  217. package/src/duckdb/third_party/brotli/common/brotli_platform.h +1 -1
  218. package/src/duckdb/third_party/brotli/dec/decode.cpp +1 -1
  219. package/src/duckdb/third_party/brotli/enc/memory.cpp +4 -4
  220. package/src/duckdb/third_party/fsst/libfsst.cpp +1 -1
  221. package/src/duckdb/third_party/hyperloglog/sds.cpp +1 -1
  222. package/src/duckdb/third_party/hyperloglog/sds.hpp +1 -1
  223. package/src/duckdb/third_party/libpg_query/include/common/keywords.hpp +1 -1
  224. package/src/duckdb/third_party/libpg_query/include/datatype/timestamp.hpp +1 -1
  225. package/src/duckdb/third_party/libpg_query/include/mb/pg_wchar.hpp +1 -1
  226. package/src/duckdb/third_party/libpg_query/include/nodes/bitmapset.hpp +1 -1
  227. package/src/duckdb/third_party/libpg_query/include/nodes/lockoptions.hpp +1 -1
  228. package/src/duckdb/third_party/libpg_query/include/nodes/makefuncs.hpp +1 -1
  229. package/src/duckdb/third_party/libpg_query/include/nodes/pg_list.hpp +1 -1
  230. package/src/duckdb/third_party/libpg_query/include/nodes/value.hpp +1 -1
  231. package/src/duckdb/third_party/libpg_query/include/parser/gramparse.hpp +1 -1
  232. package/src/duckdb/third_party/libpg_query/include/parser/parser.hpp +1 -1
  233. package/src/duckdb/third_party/libpg_query/include/parser/scanner.hpp +1 -1
  234. package/src/duckdb/third_party/libpg_query/include/parser/scansup.hpp +1 -1
  235. package/src/duckdb/third_party/libpg_query/include/pg_functions.hpp +1 -1
  236. package/src/duckdb/third_party/libpg_query/pg_functions.cpp +1 -1
  237. package/src/duckdb/third_party/libpg_query/src_backend_nodes_list.cpp +1 -1
  238. package/src/duckdb/third_party/libpg_query/src_backend_nodes_makefuncs.cpp +1 -1
  239. package/src/duckdb/third_party/libpg_query/src_backend_nodes_value.cpp +1 -1
  240. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +1964 -1964
  241. package/src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp +1 -1
  242. package/src/duckdb/third_party/libpg_query/src_backend_parser_scansup.cpp +1 -1
  243. package/src/duckdb/third_party/libpg_query/src_common_keywords.cpp +1 -1
  244. package/src/duckdb/third_party/lz4/lz4.cpp +1 -1
  245. package/src/duckdb/third_party/mbedtls/include/des_alt.h +1 -1
  246. package/src/duckdb/third_party/mbedtls/include/mbedtls/aes_alt.h +1 -1
  247. package/src/duckdb/third_party/mbedtls/include/mbedtls/aria_alt.h +1 -1
  248. package/src/duckdb/third_party/mbedtls/include/mbedtls/asn1write.h +1 -1
  249. package/src/duckdb/third_party/mbedtls/include/mbedtls/camellia_alt.h +1 -1
  250. package/src/duckdb/third_party/mbedtls/include/mbedtls/ccm_alt.h +1 -1
  251. package/src/duckdb/third_party/mbedtls/include/mbedtls/chacha20.h +1 -1
  252. package/src/duckdb/third_party/mbedtls/include/mbedtls/chachapoly.h +1 -1
  253. package/src/duckdb/third_party/mbedtls/include/mbedtls/cmac.h +1 -1
  254. package/src/duckdb/third_party/mbedtls/include/mbedtls/config_psa.h +1 -1
  255. package/src/duckdb/third_party/mbedtls/include/mbedtls/ecdsa.h +1 -1
  256. package/src/duckdb/third_party/mbedtls/include/mbedtls/ecp.h +1 -1
  257. package/src/duckdb/third_party/mbedtls/include/mbedtls/gcm_alt.h +1 -1
  258. package/src/duckdb/third_party/mbedtls/include/mbedtls/md5.h +1 -1
  259. package/src/duckdb/third_party/mbedtls/include/mbedtls/nist_kw.h +1 -1
  260. package/src/duckdb/third_party/mbedtls/include/mbedtls/pkcs12.h +1 -1
  261. package/src/duckdb/third_party/mbedtls/include/mbedtls/pkcs5.h +1 -1
  262. package/src/duckdb/third_party/mbedtls/include/mbedtls/psa_util.h +1 -1
  263. package/src/duckdb/third_party/mbedtls/include/mbedtls/ripemd160.h +1 -1
  264. package/src/duckdb/third_party/mbedtls/include/mbedtls/threading.h +1 -1
  265. package/src/duckdb/third_party/mbedtls/include/mbedtls/timing.h +1 -1
  266. package/src/duckdb/third_party/mbedtls/include/platform_alt.h +1 -1
  267. package/src/duckdb/third_party/mbedtls/include/psa/crypto.h +1 -1
  268. package/src/duckdb/third_party/mbedtls/include/rsa_alt.h +1 -1
  269. package/src/duckdb/third_party/mbedtls/include/sha1_alt.h +1 -1
  270. package/src/duckdb/third_party/mbedtls/include/sha256_alt.h +1 -1
  271. package/src/duckdb/third_party/mbedtls/include/sha512_alt.h +1 -1
  272. package/src/duckdb/third_party/mbedtls/include/ssl_misc.h +1 -1
  273. package/src/duckdb/third_party/mbedtls/library/aesni.h +1 -1
  274. package/src/duckdb/third_party/mbedtls/library/padlock.h +1 -1
  275. package/src/duckdb/third_party/miniz/miniz.cpp +1 -1
  276. package/src/duckdb/third_party/parquet/parquet_types.cpp +1 -1
  277. package/src/duckdb/third_party/parquet/windows_compatibility.h +1 -1
  278. package/src/duckdb/third_party/pcg/pcg_extras.hpp +1 -1
  279. package/src/duckdb/third_party/pcg/pcg_uint128.hpp +1 -1
  280. package/src/duckdb/third_party/skiplist/Node.h +4 -4
  281. package/src/duckdb/third_party/snappy/snappy.cc +1 -1
  282. package/src/duckdb/third_party/snappy/snappy_version.hpp +1 -1
  283. package/src/duckdb/third_party/thrift/thrift/thrift-config.h +1 -1
  284. package/src/duckdb/third_party/zstd/decompress/zstd_decompress_block.cpp +1 -1
  285. package/src/duckdb/third_party/zstd/include/zstd_static.h +1 -1
  286. package/src/duckdb/ub_src_execution_index_art.cpp +2 -0
  287. package/src/duckdb/ub_src_parser_parsed_data.cpp +2 -0
  288. package/src/duckdb/ub_src_planner_operator.cpp +2 -0
@@ -0,0 +1,695 @@
1
+ #include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
2
+ #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
3
+ #include "duckdb/common/file_system.hpp"
4
+ #include "duckdb/common/string_util.hpp"
5
+ #include "duckdb/common/to_string.hpp"
6
+ #include "duckdb/common/types/cast_helpers.hpp"
7
+ #include "duckdb/common/operator/cast_operators.hpp"
8
+ #include "duckdb/common/operator/decimal_cast_operators.hpp"
9
+ #include "duckdb/common/vector_operations/unary_executor.hpp"
10
+ #include "duckdb/common/vector_operations/vector_operations.hpp"
11
+ #include "duckdb/function/scalar/strftime_format.hpp"
12
+ #include "duckdb/main/appender.hpp"
13
+ #include "duckdb/main/database.hpp"
14
+ #include "duckdb/parser/column_definition.hpp"
15
+ #include "duckdb/storage/data_table.hpp"
16
+ #include "utf8proc_wrapper.hpp"
17
+ #include "utf8proc.hpp"
18
+ #include "duckdb/parser/keyword_helper.hpp"
19
+ #include "duckdb/main/error_manager.hpp"
20
+ #include "duckdb/execution/operator/persistent/parallel_csv_reader.hpp"
21
+ #include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
22
+ #include "duckdb/main/client_data.hpp"
23
+ #include <algorithm>
24
+ #include <cctype>
25
+ #include <cstring>
26
+ #include <fstream>
27
+
28
+ namespace duckdb {
29
+
30
+ string BaseCSVReader::GetLineNumberStr(idx_t line_error, bool is_line_estimated, idx_t buffer_idx) {
31
+ // If an error happens during auto-detect it is an estimated line
32
+ string estimated = (is_line_estimated ? string(" (estimated)") : string(""));
33
+ return to_string(GetLineError(line_error, buffer_idx)) + estimated;
34
+ }
35
+
36
+ BaseCSVReader::BaseCSVReader(ClientContext &context_p, BufferedCSVReaderOptions options_p,
37
+ const vector<LogicalType> &requested_types)
38
+ : context(context_p), fs(FileSystem::GetFileSystem(context)), allocator(BufferAllocator::Get(context)),
39
+ options(std::move(options_p)) {
40
+ }
41
+
42
+ BaseCSVReader::~BaseCSVReader() {
43
+ }
44
+
45
+ unique_ptr<CSVFileHandle> BaseCSVReader::OpenCSV(const BufferedCSVReaderOptions &options_p) {
46
+ return CSVFileHandle::OpenFile(fs, allocator, options_p.file_path, options_p.compression, true);
47
+ }
48
+
49
+ void BaseCSVReader::InitParseChunk(idx_t num_cols) {
50
+ // adapt not null info
51
+ if (options.force_not_null.size() != num_cols) {
52
+ options.force_not_null.resize(num_cols, false);
53
+ }
54
+ if (num_cols == parse_chunk.ColumnCount()) {
55
+ parse_chunk.Reset();
56
+ } else {
57
+ parse_chunk.Destroy();
58
+
59
+ // initialize the parse_chunk with a set of VARCHAR types
60
+ vector<LogicalType> varchar_types(num_cols, LogicalType::VARCHAR);
61
+ parse_chunk.Initialize(allocator, varchar_types);
62
+ }
63
+ }
64
+
65
+ void BaseCSVReader::InitializeProjection() {
66
+ for (idx_t i = 0; i < GetTypes().size(); i++) {
67
+ reader_data.column_ids.push_back(i);
68
+ reader_data.column_mapping.push_back(i);
69
+ }
70
+ }
71
+
72
+ void BaseCSVReader::SetDateFormat(const string &format_specifier, const LogicalTypeId &sql_type) {
73
+ options.has_format[sql_type] = true;
74
+ auto &date_format = options.date_format[sql_type];
75
+ date_format.format_specifier = format_specifier;
76
+ StrTimeFormat::ParseFormatSpecifier(date_format.format_specifier, date_format);
77
+ }
78
+
79
+ struct TryCastDecimalOperator {
80
+ template <class OP, class T>
81
+ static bool Operation(string_t input, uint8_t width, uint8_t scale) {
82
+ T result;
83
+ string error_message;
84
+ return OP::Operation(input, result, &error_message, width, scale);
85
+ }
86
+ };
87
+
88
+ struct TryCastFloatingOperator {
89
+ template <class OP, class T>
90
+ static bool Operation(string_t input) {
91
+ T result;
92
+ string error_message;
93
+ return OP::Operation(input, result, &error_message);
94
+ }
95
+ };
96
+
97
+ bool TryCastDecimalValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) {
98
+ auto width = DecimalType::GetWidth(sql_type);
99
+ auto scale = DecimalType::GetScale(sql_type);
100
+ switch (sql_type.InternalType()) {
101
+ case PhysicalType::INT16:
102
+ return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int16_t>(value_str, width, scale);
103
+ case PhysicalType::INT32:
104
+ return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int32_t>(value_str, width, scale);
105
+ case PhysicalType::INT64:
106
+ return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, int64_t>(value_str, width, scale);
107
+ case PhysicalType::INT128:
108
+ return TryCastDecimalOperator::Operation<TryCastToDecimalCommaSeparated, hugeint_t>(value_str, width, scale);
109
+ default:
110
+ throw InternalException("Unimplemented physical type for decimal");
111
+ }
112
+ }
113
+
114
+ bool TryCastFloatingValueCommaSeparated(const string_t &value_str, const LogicalType &sql_type) {
115
+ switch (sql_type.InternalType()) {
116
+ case PhysicalType::DOUBLE:
117
+ return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, double>(value_str);
118
+ case PhysicalType::FLOAT:
119
+ return TryCastFloatingOperator::Operation<TryCastErrorMessageCommaSeparated, float>(value_str);
120
+ default:
121
+ throw InternalException("Unimplemented physical type for floating");
122
+ }
123
+ }
124
+
125
+ bool BaseCSVReader::TryCastValue(const Value &value, const LogicalType &sql_type) {
126
+ if (value.IsNull()) {
127
+ return true;
128
+ }
129
+ if (options.has_format[LogicalTypeId::DATE] && sql_type.id() == LogicalTypeId::DATE) {
130
+ date_t result;
131
+ string error_message;
132
+ return options.date_format[LogicalTypeId::DATE].TryParseDate(string_t(StringValue::Get(value)), result,
133
+ error_message);
134
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type.id() == LogicalTypeId::TIMESTAMP) {
135
+ timestamp_t result;
136
+ string error_message;
137
+ return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(string_t(StringValue::Get(value)),
138
+ result, error_message);
139
+ } else if (options.decimal_separator != "." && sql_type.id() == LogicalTypeId::DECIMAL) {
140
+ return TryCastDecimalValueCommaSeparated(string_t(StringValue::Get(value)), sql_type);
141
+ } else if (options.decimal_separator != "." &&
142
+ ((sql_type.id() == LogicalTypeId::FLOAT) || (sql_type.id() == LogicalTypeId::DOUBLE))) {
143
+ return TryCastFloatingValueCommaSeparated(string_t(StringValue::Get(value)), sql_type);
144
+ } else {
145
+ Value new_value;
146
+ string error_message;
147
+ return value.TryCastAs(context, sql_type, new_value, &error_message, true);
148
+ }
149
+ }
150
+
151
+ struct TryCastDateOperator {
152
+ static bool Operation(BufferedCSVReaderOptions &options, string_t input, date_t &result, string &error_message) {
153
+ return options.date_format[LogicalTypeId::DATE].TryParseDate(input, result, error_message);
154
+ }
155
+ };
156
+
157
+ struct TryCastTimestampOperator {
158
+ static bool Operation(BufferedCSVReaderOptions &options, string_t input, timestamp_t &result,
159
+ string &error_message) {
160
+ return options.date_format[LogicalTypeId::TIMESTAMP].TryParseTimestamp(input, result, error_message);
161
+ }
162
+ };
163
+
164
+ template <class OP, class T>
165
+ static bool TemplatedTryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
166
+ idx_t count, string &error_message, idx_t &line_error) {
167
+ D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
168
+ bool all_converted = true;
169
+ idx_t cur_line = 0;
170
+ UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
171
+ T result;
172
+ if (!OP::Operation(options, input, result, error_message)) {
173
+ line_error = cur_line;
174
+ all_converted = false;
175
+ }
176
+ cur_line++;
177
+ return result;
178
+ });
179
+ return all_converted;
180
+ }
181
+
182
+ bool TryCastDateVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
183
+ string &error_message, idx_t &line_error) {
184
+ return TemplatedTryCastDateVector<TryCastDateOperator, date_t>(options, input_vector, result_vector, count,
185
+ error_message, line_error);
186
+ }
187
+
188
+ bool TryCastTimestampVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector, idx_t count,
189
+ string &error_message) {
190
+ idx_t line_error;
191
+ return TemplatedTryCastDateVector<TryCastTimestampOperator, timestamp_t>(options, input_vector, result_vector,
192
+ count, error_message, line_error);
193
+ }
194
+
195
+ template <class OP, class T>
196
+ bool TemplatedTryCastFloatingVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
197
+ idx_t count, string &error_message, idx_t &line_error) {
198
+ D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
199
+ bool all_converted = true;
200
+ idx_t row = 0;
201
+ UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
202
+ T result;
203
+ if (!OP::Operation(input, result, &error_message)) {
204
+ line_error = row;
205
+ all_converted = false;
206
+ } else {
207
+ row++;
208
+ }
209
+ return result;
210
+ });
211
+ return all_converted;
212
+ }
213
+
214
+ template <class OP, class T>
215
+ bool TemplatedTryCastDecimalVector(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
216
+ idx_t count, string &error_message, uint8_t width, uint8_t scale) {
217
+ D_ASSERT(input_vector.GetType().id() == LogicalTypeId::VARCHAR);
218
+ bool all_converted = true;
219
+ UnaryExecutor::Execute<string_t, T>(input_vector, result_vector, count, [&](string_t input) {
220
+ T result;
221
+ if (!OP::Operation(input, result, &error_message, width, scale)) {
222
+ all_converted = false;
223
+ }
224
+ return result;
225
+ });
226
+ return all_converted;
227
+ }
228
+
229
+ bool BaseCSVReader::TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type) {
230
+ // try vector-cast from string to sql_type
231
+ Vector dummy_result(sql_type);
232
+ if (options.has_format[LogicalTypeId::DATE] && sql_type == LogicalTypeId::DATE) {
233
+ // use the date format to cast the chunk
234
+ string error_message;
235
+ idx_t line_error;
236
+ return TryCastDateVector(options, parse_chunk_col, dummy_result, size, error_message, line_error);
237
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] && sql_type == LogicalTypeId::TIMESTAMP) {
238
+ // use the timestamp format to cast the chunk
239
+ string error_message;
240
+ return TryCastTimestampVector(options, parse_chunk_col, dummy_result, size, error_message);
241
+ } else {
242
+ // target type is not varchar: perform a cast
243
+ string error_message;
244
+ return VectorOperations::DefaultTryCast(parse_chunk_col, dummy_result, size, &error_message, true);
245
+ }
246
+ }
247
+
248
+ void BaseCSVReader::AddValue(string_t str_val, idx_t &column, vector<idx_t> &escape_positions, bool has_quotes,
249
+ idx_t buffer_idx) {
250
+ auto length = str_val.GetSize();
251
+ if (length == 0 && column == 0) {
252
+ row_empty = true;
253
+ } else {
254
+ row_empty = false;
255
+ }
256
+ if (!return_types.empty() && column == return_types.size() && length == 0) {
257
+ // skip a single trailing delimiter in last column
258
+ return;
259
+ }
260
+ if (mode == ParserMode::SNIFFING_DIALECT) {
261
+ column++;
262
+ return;
263
+ }
264
+ if (column >= return_types.size()) {
265
+ if (options.ignore_errors) {
266
+ error_column_overflow = true;
267
+ return;
268
+ } else {
269
+ throw InvalidInputException(
270
+ "Error in file \"%s\", on line %s: expected %lld values per row, but got more. (%s)", options.file_path,
271
+ GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(), return_types.size(),
272
+ options.ToString());
273
+ }
274
+ }
275
+
276
+ // insert the line number into the chunk
277
+ idx_t row_entry = parse_chunk.size();
278
+
279
+ // test against null string, but only if the value was not quoted
280
+ if ((!(has_quotes && !options.allow_quoted_nulls) || return_types[column].id() != LogicalTypeId::VARCHAR) &&
281
+ !options.force_not_null[column] && Equals::Operation(str_val, string_t(options.null_str))) {
282
+ FlatVector::SetNull(parse_chunk.data[column], row_entry, true);
283
+ } else {
284
+ auto &v = parse_chunk.data[column];
285
+ auto parse_data = FlatVector::GetData<string_t>(v);
286
+ if (!escape_positions.empty()) {
287
+ // remove escape characters (if any)
288
+ string old_val = str_val.GetString();
289
+ string new_val = "";
290
+ idx_t prev_pos = 0;
291
+ for (idx_t i = 0; i < escape_positions.size(); i++) {
292
+ idx_t next_pos = escape_positions[i];
293
+ new_val += old_val.substr(prev_pos, next_pos - prev_pos);
294
+
295
+ if (options.escape.empty() || options.escape == options.quote) {
296
+ prev_pos = next_pos + options.quote.size();
297
+ } else {
298
+ prev_pos = next_pos + options.escape.size();
299
+ }
300
+ }
301
+ new_val += old_val.substr(prev_pos, old_val.size() - prev_pos);
302
+ escape_positions.clear();
303
+ parse_data[row_entry] = StringVector::AddStringOrBlob(v, string_t(new_val));
304
+ } else {
305
+ parse_data[row_entry] = str_val;
306
+ }
307
+ }
308
+
309
+ // move to the next column
310
+ column++;
311
+ }
312
+
313
+ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error_message, idx_t buffer_idx) {
314
+ linenr++;
315
+
316
+ if (row_empty) {
317
+ row_empty = false;
318
+ if (return_types.size() != 1) {
319
+ if (mode == ParserMode::PARSING) {
320
+ FlatVector::SetNull(parse_chunk.data[0], parse_chunk.size(), false);
321
+ }
322
+ column = 0;
323
+ return false;
324
+ }
325
+ }
326
+
327
+ // Error forwarded by 'ignore_errors' - originally encountered in 'AddValue'
328
+ if (error_column_overflow) {
329
+ D_ASSERT(options.ignore_errors);
330
+ error_column_overflow = false;
331
+ column = 0;
332
+ return false;
333
+ }
334
+
335
+ if (column < return_types.size() && mode != ParserMode::SNIFFING_DIALECT) {
336
+ if (options.null_padding) {
337
+ for (; column < return_types.size(); column++) {
338
+ FlatVector::SetNull(parse_chunk.data[column], parse_chunk.size(), true);
339
+ }
340
+ } else if (options.ignore_errors) {
341
+ column = 0;
342
+ return false;
343
+ } else {
344
+ if (mode == ParserMode::SNIFFING_DATATYPES) {
345
+ error_message = "Error when adding line";
346
+ return false;
347
+ } else {
348
+ throw InvalidInputException(
349
+ "Error in file \"%s\" on line %s: expected %lld values per row, but got %d.\nParser options:\n%s",
350
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated, buffer_idx).c_str(),
351
+ return_types.size(), column, options.ToString());
352
+ }
353
+ }
354
+ }
355
+
356
+ if (mode == ParserMode::SNIFFING_DIALECT) {
357
+ sniffed_column_counts.push_back(column);
358
+
359
+ if (sniffed_column_counts.size() == options.sample_chunk_size) {
360
+ return true;
361
+ }
362
+ } else {
363
+ parse_chunk.SetCardinality(parse_chunk.size() + 1);
364
+ }
365
+
366
+ if (mode == ParserMode::PARSING_HEADER) {
367
+ return true;
368
+ }
369
+
370
+ if (mode == ParserMode::SNIFFING_DATATYPES && parse_chunk.size() == options.sample_chunk_size) {
371
+ return true;
372
+ }
373
+
374
+ if (mode == ParserMode::PARSING && parse_chunk.size() == STANDARD_VECTOR_SIZE) {
375
+ Flush(insert_chunk, buffer_idx);
376
+ return true;
377
+ }
378
+
379
+ column = 0;
380
+ return false;
381
+ }
382
+
383
+ void BaseCSVReader::VerifyUTF8(idx_t col_idx, idx_t row_idx, DataChunk &chunk, int64_t offset) {
384
+ D_ASSERT(col_idx < chunk.data.size());
385
+ D_ASSERT(row_idx < chunk.size());
386
+ auto &v = chunk.data[col_idx];
387
+ if (FlatVector::IsNull(v, row_idx)) {
388
+ return;
389
+ }
390
+
391
+ auto parse_data = FlatVector::GetData<string_t>(chunk.data[col_idx]);
392
+ auto s = parse_data[row_idx];
393
+ auto utf_type = Utf8Proc::Analyze(s.GetData(), s.GetSize());
394
+ if (utf_type == UnicodeType::INVALID) {
395
+ string col_name = to_string(col_idx);
396
+ if (col_idx < names.size()) {
397
+ col_name = "\"" + names[col_idx] + "\"";
398
+ }
399
+ int64_t error_line = linenr - (chunk.size() - row_idx) + 1 + offset;
400
+ D_ASSERT(error_line >= 0);
401
+ throw InvalidInputException("Error in file \"%s\" at line %llu in column \"%s\": "
402
+ "%s. Parser options:\n%s",
403
+ options.file_path, error_line, col_name,
404
+ ErrorManager::InvalidUnicodeError(s.GetString(), "CSV file"), options.ToString());
405
+ }
406
+ }
407
+
408
+ void BaseCSVReader::VerifyUTF8(idx_t col_idx) {
409
+ D_ASSERT(col_idx < parse_chunk.data.size());
410
+ for (idx_t i = 0; i < parse_chunk.size(); i++) {
411
+ VerifyUTF8(col_idx, i, parse_chunk);
412
+ }
413
+ }
414
+
415
+ bool TryCastDecimalVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
416
+ idx_t count, string &error_message, const LogicalType &result_type) {
417
+ auto width = DecimalType::GetWidth(result_type);
418
+ auto scale = DecimalType::GetScale(result_type);
419
+ switch (result_type.InternalType()) {
420
+ case PhysicalType::INT16:
421
+ return TemplatedTryCastDecimalVector<TryCastToDecimalCommaSeparated, int16_t>(
422
+ options, input_vector, result_vector, count, error_message, width, scale);
423
+ case PhysicalType::INT32:
424
+ return TemplatedTryCastDecimalVector<TryCastToDecimalCommaSeparated, int32_t>(
425
+ options, input_vector, result_vector, count, error_message, width, scale);
426
+ case PhysicalType::INT64:
427
+ return TemplatedTryCastDecimalVector<TryCastToDecimalCommaSeparated, int64_t>(
428
+ options, input_vector, result_vector, count, error_message, width, scale);
429
+ case PhysicalType::INT128:
430
+ return TemplatedTryCastDecimalVector<TryCastToDecimalCommaSeparated, hugeint_t>(
431
+ options, input_vector, result_vector, count, error_message, width, scale);
432
+ default:
433
+ throw InternalException("Unimplemented physical type for decimal");
434
+ }
435
+ }
436
+
437
+ bool TryCastFloatingVectorCommaSeparated(BufferedCSVReaderOptions &options, Vector &input_vector, Vector &result_vector,
438
+ idx_t count, string &error_message, const LogicalType &result_type,
439
+ idx_t &line_error) {
440
+ switch (result_type.InternalType()) {
441
+ case PhysicalType::DOUBLE:
442
+ return TemplatedTryCastFloatingVector<TryCastErrorMessageCommaSeparated, double>(
443
+ options, input_vector, result_vector, count, error_message, line_error);
444
+ case PhysicalType::FLOAT:
445
+ return TemplatedTryCastFloatingVector<TryCastErrorMessageCommaSeparated, float>(
446
+ options, input_vector, result_vector, count, error_message, line_error);
447
+ default:
448
+ throw InternalException("Unimplemented physical type for floating");
449
+ }
450
+ }
451
+
452
+ // Location of erroneous value in the current parse chunk
453
+ struct ErrorLocation {
454
+ idx_t row_idx;
455
+ idx_t col_idx;
456
+ idx_t row_line;
457
+
458
+ ErrorLocation(idx_t row_idx, idx_t col_idx, idx_t row_line)
459
+ : row_idx(row_idx), col_idx(col_idx), row_line(row_line) {
460
+ }
461
+ };
462
+
463
+ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_add_line) {
464
+ if (parse_chunk.size() == 0) {
465
+ return true;
466
+ }
467
+
468
+ bool conversion_error_ignored = false;
469
+
470
+ // convert the columns in the parsed chunk to the types of the table
471
+ insert_chunk.SetCardinality(parse_chunk);
472
+ if (reader_data.column_ids.empty() && !reader_data.empty_columns) {
473
+ throw InternalException("BaseCSVReader::Flush called on a CSV reader that was not correctly initialized. Call "
474
+ "MultiFileReader::InitializeReader or InitializeProjection");
475
+ }
476
+ D_ASSERT(reader_data.column_ids.size() == reader_data.column_mapping.size());
477
+ for (idx_t c = 0; c < reader_data.column_ids.size(); c++) {
478
+ auto col_idx = reader_data.column_ids[c];
479
+ auto result_idx = reader_data.column_mapping[c];
480
+ auto &parse_vector = parse_chunk.data[col_idx];
481
+ auto &result_vector = insert_chunk.data[result_idx];
482
+ auto &type = result_vector.GetType();
483
+ if (type.id() == LogicalTypeId::VARCHAR) {
484
+ // target type is varchar: no need to convert
485
+ // just test that all strings are valid utf-8 strings
486
+ VerifyUTF8(col_idx);
487
+ // reinterpret rather than reference so we can deal with user-defined types
488
+ result_vector.Reinterpret(parse_vector);
489
+ } else {
490
+ string error_message;
491
+ bool success;
492
+ idx_t line_error = 0;
493
+ bool target_type_not_varchar = false;
494
+ if (options.has_format[LogicalTypeId::DATE] && type.id() == LogicalTypeId::DATE) {
495
+ // use the date format to cast the chunk
496
+ success = TryCastDateVector(options, parse_vector, result_vector, parse_chunk.size(), error_message,
497
+ line_error);
498
+ } else if (options.has_format[LogicalTypeId::TIMESTAMP] && type.id() == LogicalTypeId::TIMESTAMP) {
499
+ // use the date format to cast the chunk
500
+ success =
501
+ TryCastTimestampVector(options, parse_vector, result_vector, parse_chunk.size(), error_message);
502
+ } else if (options.decimal_separator != "." &&
503
+ (type.id() == LogicalTypeId::FLOAT || type.id() == LogicalTypeId::DOUBLE)) {
504
+ success = TryCastFloatingVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
505
+ error_message, type, line_error);
506
+ } else if (options.decimal_separator != "." && type.id() == LogicalTypeId::DECIMAL) {
507
+ success = TryCastDecimalVectorCommaSeparated(options, parse_vector, result_vector, parse_chunk.size(),
508
+ error_message, type);
509
+ } else {
510
+ // target type is not varchar: perform a cast
511
+ target_type_not_varchar = true;
512
+ success =
513
+ VectorOperations::TryCast(context, parse_vector, result_vector, parse_chunk.size(), &error_message);
514
+ }
515
+ if (success) {
516
+ continue;
517
+ }
518
+ if (try_add_line) {
519
+ return false;
520
+ }
521
+
522
+ string col_name = to_string(col_idx);
523
+ if (col_idx < names.size()) {
524
+ col_name = "\"" + names[col_idx] + "\"";
525
+ }
526
+
527
+ // figure out the exact line number
528
+ if (target_type_not_varchar) {
529
+ UnifiedVectorFormat inserted_column_data;
530
+ result_vector.ToUnifiedFormat(parse_chunk.size(), inserted_column_data);
531
+ for (; line_error < parse_chunk.size(); line_error++) {
532
+ if (!inserted_column_data.validity.RowIsValid(line_error) &&
533
+ !FlatVector::IsNull(parse_vector, line_error)) {
534
+ break;
535
+ }
536
+ }
537
+ }
538
+
539
+ // The line_error must be summed with linenr (All lines emmited from this batch)
540
+ // But subtracted from the parse_chunk
541
+ D_ASSERT(line_error + linenr >= parse_chunk.size());
542
+ line_error += linenr;
543
+ line_error -= parse_chunk.size();
544
+
545
+ auto error_line = GetLineError(line_error, buffer_idx);
546
+
547
+ if (options.ignore_errors) {
548
+ conversion_error_ignored = true;
549
+
550
+ } else if (options.auto_detect) {
551
+ throw InvalidInputException("%s in column %s, at line %llu.\n\nParser "
552
+ "options:\n%s.\n\nConsider either increasing the sample size "
553
+ "(SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), "
554
+ "or skipping column conversion (ALL_VARCHAR=1)",
555
+ error_message, col_name, error_line, options.ToString());
556
+ } else {
557
+ throw InvalidInputException("%s at line %llu in column %s. Parser options:\n%s ", error_message,
558
+ error_line, col_name, options.ToString());
559
+ }
560
+ }
561
+ }
562
+ if (conversion_error_ignored) {
563
+ D_ASSERT(options.ignore_errors);
564
+
565
+ SelectionVector succesful_rows(parse_chunk.size());
566
+ idx_t sel_size = 0;
567
+
568
+ // Keep track of failed cells
569
+ vector<ErrorLocation> failed_cells;
570
+
571
+ for (idx_t row_idx = 0; row_idx < parse_chunk.size(); row_idx++) {
572
+
573
+ auto global_row_idx = row_idx + linenr - parse_chunk.size();
574
+ auto row_line = GetLineError(global_row_idx, buffer_idx, false);
575
+
576
+ bool row_failed = false;
577
+ for (idx_t c = 0; c < reader_data.column_ids.size(); c++) {
578
+ auto col_idx = reader_data.column_ids[c];
579
+ auto result_idx = reader_data.column_mapping[c];
580
+
581
+ auto &parse_vector = parse_chunk.data[col_idx];
582
+ auto &result_vector = insert_chunk.data[result_idx];
583
+
584
+ bool was_already_null = FlatVector::IsNull(parse_vector, row_idx);
585
+ if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) {
586
+ row_failed = true;
587
+ failed_cells.emplace_back(row_idx, col_idx, row_line);
588
+ }
589
+ }
590
+ if (!row_failed) {
591
+ succesful_rows.set_index(sel_size++, row_idx);
592
+ }
593
+ }
594
+
595
+ // Now do a second pass to produce the reject table entries
596
+ if (!failed_cells.empty() && !options.rejects_table_name.empty()) {
597
+ auto limit = options.rejects_limit;
598
+
599
+ auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_table_name);
600
+ lock_guard<mutex> lock(rejects->write_lock);
601
+
602
+ // short circuit if we already have too many rejects
603
+ if (limit == 0 || rejects->count < limit) {
604
+ auto &table = rejects->GetTable(context);
605
+ InternalAppender appender(context, table);
606
+ auto file_name = GetFileName();
607
+
608
+ for (auto &cell : failed_cells) {
609
+ if (limit != 0 && rejects->count >= limit) {
610
+ break;
611
+ }
612
+ rejects->count++;
613
+
614
+ auto row_idx = cell.row_idx;
615
+ auto col_idx = cell.col_idx;
616
+ auto row_line = cell.row_line;
617
+
618
+ auto col_name = to_string(col_idx);
619
+ if (col_idx < names.size()) {
620
+ col_name = "\"" + names[col_idx] + "\"";
621
+ }
622
+
623
+ auto &parse_vector = parse_chunk.data[col_idx];
624
+ auto parsed_str = FlatVector::GetData<string_t>(parse_vector)[row_idx];
625
+ auto &type = insert_chunk.data[col_idx].GetType();
626
+ auto row_error_msg = StringUtil::Format("Could not convert string '%s' to '%s'",
627
+ parsed_str.GetString(), type.ToString());
628
+
629
+ // Add the row to the rejects table
630
+ appender.BeginRow();
631
+ appender.Append(string_t(file_name));
632
+ appender.Append(row_line);
633
+ appender.Append(col_idx);
634
+ appender.Append(string_t(col_name));
635
+ appender.Append(parsed_str);
636
+
637
+ if (!options.rejects_recovery_columns.empty()) {
638
+ child_list_t<Value> recovery_key;
639
+ for (auto &key_idx : options.rejects_recovery_column_ids) {
640
+ // Figure out if the recovery key is valid.
641
+ // If not, error out for real.
642
+ auto &component_vector = parse_chunk.data[key_idx];
643
+ if (FlatVector::IsNull(component_vector, row_idx)) {
644
+ throw InvalidInputException("%s at line %llu in column %s. Parser options:\n%s ",
645
+ "Could not parse recovery column", row_line, col_name,
646
+ options.ToString());
647
+ }
648
+ auto component = Value(FlatVector::GetData<string_t>(component_vector)[row_idx]);
649
+ recovery_key.emplace_back(names[key_idx], component);
650
+ }
651
+ appender.Append(Value::STRUCT(recovery_key));
652
+ }
653
+
654
+ appender.Append(string_t(row_error_msg));
655
+ appender.EndRow();
656
+ }
657
+ appender.Close();
658
+ }
659
+ }
660
+
661
+ // Now slice the insert chunk to only include the succesful rows
662
+ insert_chunk.Slice(succesful_rows, sel_size);
663
+ }
664
+ parse_chunk.Reset();
665
+ return true;
666
+ }
667
+
668
+ void BaseCSVReader::SetNewLineDelimiter(bool carry, bool carry_followed_by_nl) {
669
+ if ((mode == ParserMode::SNIFFING_DIALECT && !options.has_newline) ||
670
+ options.new_line == NewLineIdentifier::NOT_SET) {
671
+ if (options.new_line == NewLineIdentifier::MIX) {
672
+ return;
673
+ }
674
+ NewLineIdentifier this_line_identifier;
675
+ if (carry) {
676
+ if (carry_followed_by_nl) {
677
+ this_line_identifier = NewLineIdentifier::CARRY_ON;
678
+ } else {
679
+ this_line_identifier = NewLineIdentifier::SINGLE;
680
+ }
681
+ } else {
682
+ this_line_identifier = NewLineIdentifier::SINGLE;
683
+ }
684
+ if (options.new_line == NewLineIdentifier::NOT_SET) {
685
+ options.new_line = this_line_identifier;
686
+ return;
687
+ }
688
+ if (options.new_line != this_line_identifier) {
689
+ options.new_line = NewLineIdentifier::MIX;
690
+ return;
691
+ }
692
+ options.new_line = this_line_identifier;
693
+ }
694
+ }
695
+ } // namespace duckdb