duckdb 1.1.2-dev4.0 → 1.1.2-dev6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288):
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/icu/third_party/icu/common/putil.cpp +0 -5
  3. package/src/duckdb/extension/icu/third_party/icu/common/rbbiscan.cpp +1 -1
  4. package/src/duckdb/extension/icu/third_party/icu/common/rbbitblb.cpp +1 -1
  5. package/src/duckdb/extension/icu/third_party/icu/common/ucurr.cpp +1 -1
  6. package/src/duckdb/extension/icu/third_party/icu/common/uresbund.cpp +1 -1
  7. package/src/duckdb/extension/icu/third_party/icu/common/uresimp.h +31 -31
  8. package/src/duckdb/extension/icu/third_party/icu/common/ustring.cpp +1 -1
  9. package/src/duckdb/extension/icu/third_party/icu/common/uvector.cpp +1 -1
  10. package/src/duckdb/extension/icu/third_party/icu/i18n/coleitr.cpp +12 -12
  11. package/src/duckdb/extension/icu/third_party/icu/i18n/format.cpp +1 -1
  12. package/src/duckdb/extension/icu/third_party/icu/i18n/listformatter.cpp +4 -4
  13. package/src/duckdb/extension/icu/third_party/icu/i18n/number_decimalquantity.h +1 -1
  14. package/src/duckdb/extension/icu/third_party/icu/i18n/tzgnames.cpp +1 -1
  15. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/coleitr.h +28 -28
  16. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/format.h +7 -7
  17. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/ucol.h +1 -1
  18. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/ucoleitr.h +41 -41
  19. package/src/duckdb/extension/icu/third_party/icu/i18n/unicode/umsg.h +41 -41
  20. package/src/duckdb/extension/icu/third_party/icu/i18n/usrchimp.h +3 -3
  21. package/src/duckdb/extension/json/include/json_common.hpp +1 -1
  22. package/src/duckdb/extension/json/json_functions/json_structure.cpp +13 -7
  23. package/src/duckdb/extension/parquet/column_writer.cpp +2 -1
  24. package/src/duckdb/extension/parquet/geo_parquet.cpp +24 -9
  25. package/src/duckdb/extension/parquet/include/geo_parquet.hpp +3 -1
  26. package/src/duckdb/extension/parquet/include/parquet_reader.hpp +1 -0
  27. package/src/duckdb/extension/parquet/include/parquet_rle_bp_decoder.hpp +1 -1
  28. package/src/duckdb/extension/parquet/include/templated_column_reader.hpp +0 -4
  29. package/src/duckdb/extension/parquet/parquet_extension.cpp +20 -6
  30. package/src/duckdb/extension/parquet/parquet_reader.cpp +1 -2
  31. package/src/duckdb/extension/parquet/parquet_writer.cpp +1 -1
  32. package/src/duckdb/extension/parquet/serialize_parquet.cpp +0 -2
  33. package/src/duckdb/src/catalog/catalog_entry/duck_schema_entry.cpp +8 -1
  34. package/src/duckdb/src/catalog/default/default_functions.cpp +5 -5
  35. package/src/duckdb/src/common/allocator.cpp +3 -2
  36. package/src/duckdb/src/common/arrow/arrow_appender.cpp +1 -0
  37. package/src/duckdb/src/common/arrow/arrow_converter.cpp +11 -0
  38. package/src/duckdb/src/common/arrow/schema_metadata.cpp +6 -4
  39. package/src/duckdb/src/common/enum_util.cpp +33 -0
  40. package/src/duckdb/src/common/exception.cpp +3 -0
  41. package/src/duckdb/src/common/extra_type_info.cpp +1 -44
  42. package/src/duckdb/src/common/field_writer.cpp +97 -0
  43. package/src/duckdb/src/common/render_tree.cpp +7 -5
  44. package/src/duckdb/src/common/row_operations/row_match.cpp +359 -0
  45. package/src/duckdb/src/common/serializer/buffered_deserializer.cpp +27 -0
  46. package/src/duckdb/src/common/serializer/buffered_serializer.cpp +36 -0
  47. package/src/duckdb/src/common/serializer/format_serializer.cpp +15 -0
  48. package/src/duckdb/src/common/serializer.cpp +24 -0
  49. package/src/duckdb/src/common/sort/comparators.cpp +2 -2
  50. package/src/duckdb/src/common/types/bit.cpp +57 -34
  51. package/src/duckdb/src/common/types/data_chunk.cpp +32 -29
  52. package/src/duckdb/src/common/types/vector_cache.cpp +12 -6
  53. package/src/duckdb/src/common/vector_operations/comparison_operators.cpp +14 -0
  54. package/src/duckdb/src/core_functions/aggregate/distributive/bitstring_agg.cpp +20 -1
  55. package/src/duckdb/src/core_functions/aggregate/distributive/minmax.cpp +2 -2
  56. package/src/duckdb/src/core_functions/aggregate/holistic/approx_top_k.cpp +32 -7
  57. package/src/duckdb/src/core_functions/function_list.cpp +1 -2
  58. package/src/duckdb/src/core_functions/scalar/bit/bitstring.cpp +23 -5
  59. package/src/duckdb/src/core_functions/scalar/date/date_diff.cpp +12 -6
  60. package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +1 -1
  61. package/src/duckdb/src/execution/expression_executor/execute_between.cpp +4 -3
  62. package/src/duckdb/src/execution/expression_executor/execute_case.cpp +4 -3
  63. package/src/duckdb/src/execution/expression_executor/execute_cast.cpp +2 -1
  64. package/src/duckdb/src/execution/expression_executor/execute_comparison.cpp +3 -2
  65. package/src/duckdb/src/execution/expression_executor/execute_conjunction.cpp +2 -1
  66. package/src/duckdb/src/execution/expression_executor/execute_function.cpp +2 -1
  67. package/src/duckdb/src/execution/expression_executor/execute_operator.cpp +3 -2
  68. package/src/duckdb/src/execution/expression_executor/execute_reference.cpp +1 -1
  69. package/src/duckdb/src/execution/expression_executor.cpp +9 -3
  70. package/src/duckdb/src/execution/expression_executor_state.cpp +11 -9
  71. package/src/duckdb/src/execution/index/art/fixed_size_allocator.cpp +238 -0
  72. package/src/duckdb/src/execution/index/art/plan_art.cpp +94 -0
  73. package/src/duckdb/src/execution/index/index_type_set.cpp +4 -1
  74. package/src/duckdb/src/execution/join_hashtable.cpp +7 -8
  75. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +6 -4
  76. package/src/duckdb/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp +4 -4
  77. package/src/duckdb/src/execution/operator/csv_scanner/scanner/base_scanner.cpp +1 -1
  78. package/src/duckdb/src/execution/operator/csv_scanner/scanner/csv_schema.cpp +44 -5
  79. package/src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +28 -24
  80. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +25 -26
  81. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +5 -3
  82. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +4 -4
  83. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +2 -2
  84. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +1 -1
  85. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +1 -1
  86. package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine.cpp +1 -1
  87. package/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +2 -2
  88. package/src/duckdb/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +1 -1
  89. package/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +1 -1
  90. package/src/duckdb/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +73 -27
  91. package/src/duckdb/src/execution/operator/helper/physical_buffered_collector.cpp +1 -1
  92. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +695 -0
  93. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +1487 -0
  94. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +72 -0
  95. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +158 -0
  96. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +280 -0
  97. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +666 -0
  98. package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +14 -4
  99. package/src/duckdb/src/execution/operator/schema/physical_create_index.cpp +207 -0
  100. package/src/duckdb/src/execution/partitionable_hashtable.cpp +207 -0
  101. package/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp +6 -1
  102. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -4
  103. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +14 -87
  104. package/src/duckdb/src/execution/physical_plan/plan_export.cpp +1 -1
  105. package/src/duckdb/src/execution/physical_plan/plan_get.cpp +1 -1
  106. package/src/duckdb/src/execution/reservoir_sample.cpp +1 -1
  107. package/src/duckdb/src/execution/window_executor.cpp +3 -3
  108. package/src/duckdb/src/function/pragma/pragma_queries.cpp +1 -1
  109. package/src/duckdb/src/function/scalar/strftime_format.cpp +1 -2
  110. package/src/duckdb/src/function/scalar/string/concat.cpp +118 -151
  111. package/src/duckdb/src/function/table/arrow.cpp +13 -0
  112. package/src/duckdb/src/function/table/arrow_conversion.cpp +12 -7
  113. package/src/duckdb/src/function/table/copy_csv.cpp +1 -1
  114. package/src/duckdb/src/function/table/read_csv.cpp +2 -30
  115. package/src/duckdb/src/function/table/sniff_csv.cpp +2 -1
  116. package/src/duckdb/src/function/table/system/duckdb_secrets.cpp +15 -7
  117. package/src/duckdb/src/function/table/version/pragma_version.cpp +3 -3
  118. package/src/duckdb/src/include/duckdb/catalog/catalog_entry_retriever.hpp +1 -1
  119. package/src/duckdb/src/include/duckdb/common/atomic.hpp +13 -1
  120. package/src/duckdb/src/include/duckdb/common/bitpacking.hpp +3 -4
  121. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +8 -0
  122. package/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp +2 -0
  123. package/src/duckdb/src/include/duckdb/common/exception.hpp +10 -0
  124. package/src/duckdb/src/include/duckdb/common/extra_type_info/enum_type_info.hpp +53 -0
  125. package/src/duckdb/src/include/duckdb/common/insertion_order_preserving_map.hpp +5 -5
  126. package/src/duckdb/src/include/duckdb/common/multi_file_reader.hpp +5 -0
  127. package/src/duckdb/src/include/duckdb/common/types/bit.hpp +36 -33
  128. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +10 -13
  129. package/src/duckdb/src/include/duckdb/common/types/uhugeint.hpp +1 -1
  130. package/src/duckdb/src/include/duckdb/common/types/vector_cache.hpp +7 -5
  131. package/src/duckdb/src/include/duckdb/common/windows_undefs.hpp +2 -1
  132. package/src/duckdb/src/include/duckdb/core_functions/aggregate/minmax_n_helpers.hpp +2 -0
  133. package/src/duckdb/src/include/duckdb/core_functions/scalar/bit_functions.hpp +1 -1
  134. package/src/duckdb/src/include/duckdb/core_functions/scalar/list_functions.hpp +0 -6
  135. package/src/duckdb/src/include/duckdb/core_functions/scalar/math_functions.hpp +1 -1
  136. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +3 -2
  137. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +3 -0
  138. package/src/duckdb/src/include/duckdb/execution/index/index_type.hpp +16 -1
  139. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp +4 -4
  140. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +4 -2
  141. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_schema.hpp +3 -2
  142. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp +91 -36
  143. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp +36 -0
  144. package/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +1 -1
  145. package/src/duckdb/src/include/duckdb/execution/operator/join/perfect_hash_join_executor.hpp +0 -1
  146. package/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_export.hpp +2 -5
  147. package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
  148. package/src/duckdb/src/include/duckdb/main/database.hpp +5 -0
  149. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
  150. package/src/duckdb/src/include/duckdb/main/profiling_info.hpp +20 -22
  151. package/src/duckdb/src/include/duckdb/main/query_profiler.hpp +7 -9
  152. package/src/duckdb/src/include/duckdb/main/secret/secret.hpp +8 -1
  153. package/src/duckdb/src/include/duckdb/main/table_description.hpp +14 -0
  154. package/src/duckdb/src/include/duckdb/optimizer/unnest_rewriter.hpp +5 -5
  155. package/src/duckdb/src/include/duckdb/parser/parsed_data/exported_table_data.hpp +15 -5
  156. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +2 -0
  157. package/src/duckdb/src/include/duckdb/planner/expression_binder/order_binder.hpp +4 -0
  158. package/src/duckdb/src/include/duckdb/planner/operator/logical_export.hpp +10 -13
  159. package/src/duckdb/src/include/duckdb/planner/table_filter.hpp +1 -0
  160. package/src/duckdb/src/include/duckdb/storage/metadata/metadata_manager.hpp +2 -2
  161. package/src/duckdb/src/include/duckdb/storage/standard_buffer_manager.hpp +1 -1
  162. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +0 -2
  163. package/src/duckdb/src/include/duckdb/storage/table/segment_tree.hpp +1 -0
  164. package/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +5 -1
  165. package/src/duckdb/src/include/duckdb.h +2 -2
  166. package/src/duckdb/src/main/appender.cpp +3 -0
  167. package/src/duckdb/src/main/capi/profiling_info-c.cpp +5 -2
  168. package/src/duckdb/src/main/client_context.cpp +8 -2
  169. package/src/duckdb/src/main/connection.cpp +1 -1
  170. package/src/duckdb/src/main/database.cpp +13 -0
  171. package/src/duckdb/src/main/extension/extension_helper.cpp +1 -1
  172. package/src/duckdb/src/main/extension/extension_install.cpp +9 -1
  173. package/src/duckdb/src/main/extension/extension_load.cpp +3 -2
  174. package/src/duckdb/src/main/extension_install_info.cpp +1 -1
  175. package/src/duckdb/src/main/profiling_info.cpp +78 -58
  176. package/src/duckdb/src/main/query_profiler.cpp +79 -89
  177. package/src/duckdb/src/main/relation/read_csv_relation.cpp +1 -1
  178. package/src/duckdb/src/main/secret/secret.cpp +2 -1
  179. package/src/duckdb/src/main/secret/secret_manager.cpp +14 -0
  180. package/src/duckdb/src/optimizer/cte_filter_pusher.cpp +4 -2
  181. package/src/duckdb/src/optimizer/deliminator.cpp +0 -7
  182. package/src/duckdb/src/optimizer/in_clause_rewriter.cpp +7 -0
  183. package/src/duckdb/src/optimizer/pushdown/pushdown_left_join.cpp +4 -1
  184. package/src/duckdb/src/optimizer/unnest_rewriter.cpp +21 -21
  185. package/src/duckdb/src/parallel/task_scheduler.cpp +9 -0
  186. package/src/duckdb/src/parser/parsed_data/exported_table_data.cpp +22 -0
  187. package/src/duckdb/src/parser/parsed_expression_iterator.cpp +3 -0
  188. package/src/duckdb/src/parser/statement/insert_statement.cpp +7 -1
  189. package/src/duckdb/src/parser/transform/expression/transform_boolean_test.cpp +1 -1
  190. package/src/duckdb/src/parser/transform/helpers/transform_typename.cpp +89 -87
  191. package/src/duckdb/src/parser/transform/statement/transform_pivot_stmt.cpp +2 -2
  192. package/src/duckdb/src/planner/binder/expression/bind_macro_expression.cpp +4 -9
  193. package/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +4 -0
  194. package/src/duckdb/src/planner/binder/query_node/plan_setop.cpp +2 -2
  195. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +4 -1
  196. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +4 -3
  197. package/src/duckdb/src/planner/expression_binder/order_binder.cpp +13 -3
  198. package/src/duckdb/src/planner/expression_binder.cpp +1 -1
  199. package/src/duckdb/src/planner/operator/logical_export.cpp +28 -0
  200. package/src/duckdb/src/planner/table_binding.cpp +1 -2
  201. package/src/duckdb/src/planner/table_filter.cpp +6 -2
  202. package/src/duckdb/src/storage/buffer/buffer_pool.cpp +2 -1
  203. package/src/duckdb/src/storage/checkpoint_manager.cpp +1 -1
  204. package/src/duckdb/src/storage/compression/bitpacking.cpp +7 -3
  205. package/src/duckdb/src/storage/compression/dictionary_compression.cpp +1 -1
  206. package/src/duckdb/src/storage/metadata/metadata_manager.cpp +2 -2
  207. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +16 -0
  208. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +29 -0
  209. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +15 -0
  210. package/src/duckdb/src/storage/single_file_block_manager.cpp +2 -1
  211. package/src/duckdb/src/storage/statistics/distinct_statistics.cpp +3 -5
  212. package/src/duckdb/src/storage/storage_info.cpp +4 -4
  213. package/src/duckdb/src/storage/table/row_group_collection.cpp +1 -1
  214. package/src/duckdb/src/storage/table/row_version_manager.cpp +5 -1
  215. package/src/duckdb/src/storage/temporary_file_manager.cpp +1 -1
  216. package/src/duckdb/src/transaction/duck_transaction.cpp +15 -14
  217. package/src/duckdb/third_party/brotli/common/brotli_platform.h +1 -1
  218. package/src/duckdb/third_party/brotli/dec/decode.cpp +1 -1
  219. package/src/duckdb/third_party/brotli/enc/memory.cpp +4 -4
  220. package/src/duckdb/third_party/fsst/libfsst.cpp +1 -1
  221. package/src/duckdb/third_party/hyperloglog/sds.cpp +1 -1
  222. package/src/duckdb/third_party/hyperloglog/sds.hpp +1 -1
  223. package/src/duckdb/third_party/libpg_query/include/common/keywords.hpp +1 -1
  224. package/src/duckdb/third_party/libpg_query/include/datatype/timestamp.hpp +1 -1
  225. package/src/duckdb/third_party/libpg_query/include/mb/pg_wchar.hpp +1 -1
  226. package/src/duckdb/third_party/libpg_query/include/nodes/bitmapset.hpp +1 -1
  227. package/src/duckdb/third_party/libpg_query/include/nodes/lockoptions.hpp +1 -1
  228. package/src/duckdb/third_party/libpg_query/include/nodes/makefuncs.hpp +1 -1
  229. package/src/duckdb/third_party/libpg_query/include/nodes/pg_list.hpp +1 -1
  230. package/src/duckdb/third_party/libpg_query/include/nodes/value.hpp +1 -1
  231. package/src/duckdb/third_party/libpg_query/include/parser/gramparse.hpp +1 -1
  232. package/src/duckdb/third_party/libpg_query/include/parser/parser.hpp +1 -1
  233. package/src/duckdb/third_party/libpg_query/include/parser/scanner.hpp +1 -1
  234. package/src/duckdb/third_party/libpg_query/include/parser/scansup.hpp +1 -1
  235. package/src/duckdb/third_party/libpg_query/include/pg_functions.hpp +1 -1
  236. package/src/duckdb/third_party/libpg_query/pg_functions.cpp +1 -1
  237. package/src/duckdb/third_party/libpg_query/src_backend_nodes_list.cpp +1 -1
  238. package/src/duckdb/third_party/libpg_query/src_backend_nodes_makefuncs.cpp +1 -1
  239. package/src/duckdb/third_party/libpg_query/src_backend_nodes_value.cpp +1 -1
  240. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +1964 -1964
  241. package/src/duckdb/third_party/libpg_query/src_backend_parser_parser.cpp +1 -1
  242. package/src/duckdb/third_party/libpg_query/src_backend_parser_scansup.cpp +1 -1
  243. package/src/duckdb/third_party/libpg_query/src_common_keywords.cpp +1 -1
  244. package/src/duckdb/third_party/lz4/lz4.cpp +1 -1
  245. package/src/duckdb/third_party/mbedtls/include/des_alt.h +1 -1
  246. package/src/duckdb/third_party/mbedtls/include/mbedtls/aes_alt.h +1 -1
  247. package/src/duckdb/third_party/mbedtls/include/mbedtls/aria_alt.h +1 -1
  248. package/src/duckdb/third_party/mbedtls/include/mbedtls/asn1write.h +1 -1
  249. package/src/duckdb/third_party/mbedtls/include/mbedtls/camellia_alt.h +1 -1
  250. package/src/duckdb/third_party/mbedtls/include/mbedtls/ccm_alt.h +1 -1
  251. package/src/duckdb/third_party/mbedtls/include/mbedtls/chacha20.h +1 -1
  252. package/src/duckdb/third_party/mbedtls/include/mbedtls/chachapoly.h +1 -1
  253. package/src/duckdb/third_party/mbedtls/include/mbedtls/cmac.h +1 -1
  254. package/src/duckdb/third_party/mbedtls/include/mbedtls/config_psa.h +1 -1
  255. package/src/duckdb/third_party/mbedtls/include/mbedtls/ecdsa.h +1 -1
  256. package/src/duckdb/third_party/mbedtls/include/mbedtls/ecp.h +1 -1
  257. package/src/duckdb/third_party/mbedtls/include/mbedtls/gcm_alt.h +1 -1
  258. package/src/duckdb/third_party/mbedtls/include/mbedtls/md5.h +1 -1
  259. package/src/duckdb/third_party/mbedtls/include/mbedtls/nist_kw.h +1 -1
  260. package/src/duckdb/third_party/mbedtls/include/mbedtls/pkcs12.h +1 -1
  261. package/src/duckdb/third_party/mbedtls/include/mbedtls/pkcs5.h +1 -1
  262. package/src/duckdb/third_party/mbedtls/include/mbedtls/psa_util.h +1 -1
  263. package/src/duckdb/third_party/mbedtls/include/mbedtls/ripemd160.h +1 -1
  264. package/src/duckdb/third_party/mbedtls/include/mbedtls/threading.h +1 -1
  265. package/src/duckdb/third_party/mbedtls/include/mbedtls/timing.h +1 -1
  266. package/src/duckdb/third_party/mbedtls/include/platform_alt.h +1 -1
  267. package/src/duckdb/third_party/mbedtls/include/psa/crypto.h +1 -1
  268. package/src/duckdb/third_party/mbedtls/include/rsa_alt.h +1 -1
  269. package/src/duckdb/third_party/mbedtls/include/sha1_alt.h +1 -1
  270. package/src/duckdb/third_party/mbedtls/include/sha256_alt.h +1 -1
  271. package/src/duckdb/third_party/mbedtls/include/sha512_alt.h +1 -1
  272. package/src/duckdb/third_party/mbedtls/include/ssl_misc.h +1 -1
  273. package/src/duckdb/third_party/mbedtls/library/aesni.h +1 -1
  274. package/src/duckdb/third_party/mbedtls/library/padlock.h +1 -1
  275. package/src/duckdb/third_party/miniz/miniz.cpp +1 -1
  276. package/src/duckdb/third_party/parquet/parquet_types.cpp +1 -1
  277. package/src/duckdb/third_party/parquet/windows_compatibility.h +1 -1
  278. package/src/duckdb/third_party/pcg/pcg_extras.hpp +1 -1
  279. package/src/duckdb/third_party/pcg/pcg_uint128.hpp +1 -1
  280. package/src/duckdb/third_party/skiplist/Node.h +4 -4
  281. package/src/duckdb/third_party/snappy/snappy.cc +1 -1
  282. package/src/duckdb/third_party/snappy/snappy_version.hpp +1 -1
  283. package/src/duckdb/third_party/thrift/thrift/thrift-config.h +1 -1
  284. package/src/duckdb/third_party/zstd/decompress/zstd_decompress_block.cpp +1 -1
  285. package/src/duckdb/third_party/zstd/include/zstd_static.h +1 -1
  286. package/src/duckdb/ub_src_execution_index_art.cpp +2 -0
  287. package/src/duckdb/ub_src_parser_parsed_data.cpp +2 -0
  288. package/src/duckdb/ub_src_planner_operator.cpp +2 -0
@@ -0,0 +1,1487 @@
1
+ #include "duckdb/execution/operator/persistent/buffered_csv_reader.hpp"
2
+
3
+ #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
4
+ #include "duckdb/common/file_system.hpp"
5
+ #include "duckdb/common/string_util.hpp"
6
+ #include "duckdb/common/to_string.hpp"
7
+ #include "duckdb/common/types/cast_helpers.hpp"
8
+ #include "duckdb/common/vector_operations/unary_executor.hpp"
9
+ #include "duckdb/common/vector_operations/vector_operations.hpp"
10
+ #include "duckdb/function/scalar/strftime_format.hpp"
11
+ #include "duckdb/main/database.hpp"
12
+ #include "duckdb/parser/column_definition.hpp"
13
+ #include "duckdb/storage/data_table.hpp"
14
+ #include "utf8proc_wrapper.hpp"
15
+ #include "utf8proc.hpp"
16
+ #include "duckdb/parser/keyword_helper.hpp"
17
+ #include "duckdb/main/error_manager.hpp"
18
+ #include "duckdb/main/client_data.hpp"
19
+
20
+ #include <algorithm>
21
+ #include <cctype>
22
+ #include <cstring>
23
+ #include <fstream>
24
+
25
+ namespace duckdb {
26
+
27
+ BufferedCSVReader::BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options_p,
28
+ const vector<LogicalType> &requested_types)
29
+ : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) {
30
+ file_handle = OpenCSV(options);
31
+ Initialize(requested_types);
32
+ }
33
+
34
+ BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options_p,
35
+ const vector<LogicalType> &requested_types)
36
+ : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) {
37
+ options.file_path = std::move(filename);
38
+ file_handle = OpenCSV(options);
39
+ Initialize(requested_types);
40
+ }
41
+
42
// Quoting conventions, stored in a single byte.
// NOTE(review): the enumerator names suggest RFC-style quoting, a non-RFC variant and
// unquoted input respectively - confirm against the code that consumes this enum.
enum class QuoteRule : uint8_t {
	QUOTES_RFC = 0,
	QUOTES_OTHER = 1,
	NO_QUOTES = 2
};
43
+
44
+ static bool StartsWithNumericDate(string &separator, const string &value) {
45
+ auto begin = value.c_str();
46
+ auto end = begin + value.size();
47
+
48
+ // StrpTimeFormat::Parse will skip whitespace, so we can too
49
+ auto field1 = std::find_if_not(begin, end, StringUtil::CharacterIsSpace);
50
+ if (field1 == end) {
51
+ return false;
52
+ }
53
+
54
+ // first numeric field must start immediately
55
+ if (!StringUtil::CharacterIsDigit(*field1)) {
56
+ return false;
57
+ }
58
+ auto literal1 = std::find_if_not(field1, end, StringUtil::CharacterIsDigit);
59
+ if (literal1 == end) {
60
+ return false;
61
+ }
62
+
63
+ // second numeric field must exist
64
+ auto field2 = std::find_if(literal1, end, StringUtil::CharacterIsDigit);
65
+ if (field2 == end) {
66
+ return false;
67
+ }
68
+ auto literal2 = std::find_if_not(field2, end, StringUtil::CharacterIsDigit);
69
+ if (literal2 == end) {
70
+ return false;
71
+ }
72
+
73
+ // third numeric field must exist
74
+ auto field3 = std::find_if(literal2, end, StringUtil::CharacterIsDigit);
75
+ if (field3 == end) {
76
+ return false;
77
+ }
78
+
79
+ // second literal must match first
80
+ if (((field3 - literal2) != (field2 - literal1)) || strncmp(literal1, literal2, (field2 - literal1)) != 0) {
81
+ return false;
82
+ }
83
+
84
+ // copy the literal as the separator, escaping percent signs
85
+ separator.clear();
86
+ while (literal1 < field2) {
87
+ const auto literal_char = *literal1++;
88
+ if (literal_char == '%') {
89
+ separator.push_back(literal_char);
90
+ }
91
+ separator.push_back(literal_char);
92
+ }
93
+
94
+ return true;
95
+ }
96
+
97
// Builds a date format string from a template by substituting every '-' in the template
// with `separator`.
//
// separator:       text inserted in place of each dash (may be empty or longer than one character).
// format_template: NUL-terminated template such as "%Y-%m-%d".
// Returns the template unchanged when it contains no dashes, otherwise the substituted copy.
string GenerateDateFormat(const string &separator, const char *format_template) {
	string pattern(format_template);
	const auto dash_count = std::count(pattern.begin(), pattern.end(), '-');
	if (dash_count == 0) {
		return pattern;
	}

	string expanded;
	// every dash disappears and is replaced by one copy of the separator
	expanded.reserve(pattern.size() - dash_count + (dash_count * separator.size()));
	for (const char symbol : pattern) {
		if (symbol != '-') {
			expanded.push_back(symbol);
			continue;
		}
		expanded.append(separator);
	}
	return expanded;
}
114
+
115
+ TextSearchShiftArray::TextSearchShiftArray() {
116
+ }
117
+
118
+ TextSearchShiftArray::TextSearchShiftArray(string search_term) : length(search_term.size()) {
119
+ if (length > 255) {
120
+ throw InvalidInputException("Size of delimiter/quote/escape in CSV reader is limited to 255 bytes");
121
+ }
122
+ // initialize the shifts array
123
+ shifts = unique_ptr<uint8_t[]>(new uint8_t[length * 255]);
124
+ memset(shifts.get(), 0, length * 255 * sizeof(uint8_t));
125
+ // iterate over each of the characters in the array
126
+ for (idx_t main_idx = 0; main_idx < length; main_idx++) {
127
+ uint8_t current_char = (uint8_t)search_term[main_idx];
128
+ // now move over all the remaining positions
129
+ for (idx_t i = main_idx; i < length; i++) {
130
+ bool is_match = true;
131
+ // check if the prefix matches at this position
132
+ // if it does, we move to this position after encountering the current character
133
+ for (idx_t j = 0; j < main_idx; j++) {
134
+ if (search_term[i - main_idx + j] != search_term[j]) {
135
+ is_match = false;
136
+ }
137
+ }
138
+ if (!is_match) {
139
+ continue;
140
+ }
141
+ shifts[i * 255 + current_char] = main_idx + 1;
142
+ }
143
+ }
144
+ }
145
+
146
+ // Helper function to generate column names
147
+ static string GenerateColumnName(const idx_t total_cols, const idx_t col_number, const string &prefix = "column") {
148
+ int max_digits = NumericHelper::UnsignedLength(total_cols - 1);
149
+ int digits = NumericHelper::UnsignedLength(col_number);
150
+ string leading_zeros = string(max_digits - digits, '0');
151
+ string value = to_string(col_number);
152
+ return string(prefix + leading_zeros + value);
153
+ }
154
+
155
+ // Helper function for UTF-8 aware space trimming
156
+ static string TrimWhitespace(const string &col_name) {
157
+ utf8proc_int32_t codepoint;
158
+ auto str = reinterpret_cast<const utf8proc_uint8_t *>(col_name.c_str());
159
+ idx_t size = col_name.size();
160
+ // Find the first character that is not left trimmed
161
+ idx_t begin = 0;
162
+ while (begin < size) {
163
+ auto bytes = utf8proc_iterate(str + begin, size - begin, &codepoint);
164
+ D_ASSERT(bytes > 0);
165
+ if (utf8proc_category(codepoint) != UTF8PROC_CATEGORY_ZS) {
166
+ break;
167
+ }
168
+ begin += bytes;
169
+ }
170
+
171
+ // Find the last character that is not right trimmed
172
+ idx_t end;
173
+ end = begin;
174
+ for (auto next = begin; next < col_name.size();) {
175
+ auto bytes = utf8proc_iterate(str + next, size - next, &codepoint);
176
+ D_ASSERT(bytes > 0);
177
+ next += bytes;
178
+ if (utf8proc_category(codepoint) != UTF8PROC_CATEGORY_ZS) {
179
+ end = next;
180
+ }
181
+ }
182
+
183
+ // return the trimmed string
184
+ return col_name.substr(begin, end - begin);
185
+ }
186
+
187
+ static string NormalizeColumnName(const string &col_name) {
188
+ // normalize UTF8 characters to NFKD
189
+ auto nfkd = utf8proc_NFKD(reinterpret_cast<const utf8proc_uint8_t *>(col_name.c_str()), col_name.size());
190
+ const string col_name_nfkd = string(const_char_ptr_cast(nfkd), strlen(const_char_ptr_cast(nfkd)));
191
+ free(nfkd);
192
+
193
+ // only keep ASCII characters 0-9 a-z A-Z and replace spaces with regular whitespace
194
+ string col_name_ascii = "";
195
+ for (idx_t i = 0; i < col_name_nfkd.size(); i++) {
196
+ if (col_name_nfkd[i] == '_' || (col_name_nfkd[i] >= '0' && col_name_nfkd[i] <= '9') ||
197
+ (col_name_nfkd[i] >= 'A' && col_name_nfkd[i] <= 'Z') ||
198
+ (col_name_nfkd[i] >= 'a' && col_name_nfkd[i] <= 'z')) {
199
+ col_name_ascii += col_name_nfkd[i];
200
+ } else if (StringUtil::CharacterIsSpace(col_name_nfkd[i])) {
201
+ col_name_ascii += " ";
202
+ }
203
+ }
204
+
205
+ // trim whitespace and replace remaining whitespace by _
206
+ string col_name_trimmed = TrimWhitespace(col_name_ascii);
207
+ string col_name_cleaned = "";
208
+ bool in_whitespace = false;
209
+ for (idx_t i = 0; i < col_name_trimmed.size(); i++) {
210
+ if (col_name_trimmed[i] == ' ') {
211
+ if (!in_whitespace) {
212
+ col_name_cleaned += "_";
213
+ in_whitespace = true;
214
+ }
215
+ } else {
216
+ col_name_cleaned += col_name_trimmed[i];
217
+ in_whitespace = false;
218
+ }
219
+ }
220
+
221
+ // don't leave string empty; if not empty, make lowercase
222
+ if (col_name_cleaned.empty()) {
223
+ col_name_cleaned = "_";
224
+ } else {
225
+ col_name_cleaned = StringUtil::Lower(col_name_cleaned);
226
+ }
227
+
228
+ // prepend _ if name starts with a digit or is a reserved keyword
229
+ if (KeywordHelper::IsKeyword(col_name_cleaned) || (col_name_cleaned[0] >= '0' && col_name_cleaned[0] <= '9')) {
230
+ col_name_cleaned = "_" + col_name_cleaned;
231
+ }
232
+ return col_name_cleaned;
233
+ }
234
+
235
+ void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
236
+ PrepareComplexParser();
237
+ if (options.auto_detect) {
238
+ return_types = SniffCSV(requested_types);
239
+ if (return_types.empty()) {
240
+ throw InvalidInputException("Failed to detect column types from CSV: is the file a valid CSV file?");
241
+ }
242
+ JumpToBeginning(options.skip_rows, options.header);
243
+ } else {
244
+ return_types = requested_types;
245
+ ResetBuffer();
246
+ SkipRowsAndReadHeader(options.skip_rows, options.header);
247
+ }
248
+ InitParseChunk(return_types.size());
249
+ }
250
+
251
+ void BufferedCSVReader::ResetBuffer() {
252
+ buffer.reset();
253
+ buffer_size = 0;
254
+ position = 0;
255
+ start = 0;
256
+ cached_buffers.clear();
257
+ }
258
+
259
+ void BufferedCSVReader::ResetStream() {
260
+ file_handle->Reset();
261
+ linenr = 0;
262
+ linenr_estimated = false;
263
+ bytes_per_line_avg = 0;
264
+ sample_chunk_idx = 0;
265
+ jumping_samples = false;
266
+ }
267
+
268
+ void BufferedCSVReader::JumpToBeginning(idx_t skip_rows = 0, bool skip_header = false) {
269
+ ResetBuffer();
270
+ ResetStream();
271
+ sample_chunk_idx = 0;
272
+ bytes_in_chunk = 0;
273
+ end_of_file_reached = false;
274
+ bom_checked = false;
275
+ SkipRowsAndReadHeader(skip_rows, skip_header);
276
+ }
277
+
278
+ void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header) {
279
+ for (idx_t i = 0; i < skip_rows; i++) {
280
+ // ignore skip rows
281
+ string read_line = file_handle->ReadLine();
282
+ linenr++;
283
+ }
284
+
285
+ if (skip_header) {
286
+ // ignore the first line as a header line
287
+ InitParseChunk(return_types.size());
288
+ ParseCSV(ParserMode::PARSING_HEADER);
289
+ }
290
+ }
291
+
292
+ void BufferedCSVReader::PrepareComplexParser() {
293
+ delimiter_search = TextSearchShiftArray(options.delimiter);
294
+ escape_search = TextSearchShiftArray(options.escape);
295
+ quote_search = TextSearchShiftArray(options.quote);
296
+ }
297
+
298
+ bool BufferedCSVReader::JumpToNextSample() {
299
+ // get bytes contained in the previously read chunk
300
+ idx_t remaining_bytes_in_buffer = buffer_size - start;
301
+ bytes_in_chunk -= remaining_bytes_in_buffer;
302
+ if (remaining_bytes_in_buffer == 0) {
303
+ return false;
304
+ }
305
+
306
+ // assess if it makes sense to jump, based on size of the first chunk relative to size of the entire file
307
+ if (sample_chunk_idx == 0) {
308
+ idx_t bytes_first_chunk = bytes_in_chunk;
309
+ double chunks_fit = (file_handle->FileSize() / (double)bytes_first_chunk);
310
+ jumping_samples = chunks_fit >= options.sample_chunks;
311
+
312
+ // jump back to the beginning
313
+ JumpToBeginning(options.skip_rows, options.header);
314
+ sample_chunk_idx++;
315
+ return true;
316
+ }
317
+
318
+ if (end_of_file_reached || sample_chunk_idx >= options.sample_chunks) {
319
+ return false;
320
+ }
321
+
322
+ // if we deal with any other sources than plaintext files, jumping_samples can be tricky. In that case
323
+ // we just read x continuous chunks from the stream TODO: make jumps possible for zipfiles.
324
+ if (!file_handle->OnDiskFile() || !jumping_samples) {
325
+ sample_chunk_idx++;
326
+ return true;
327
+ }
328
+
329
+ // update average bytes per line
330
+ double bytes_per_line = bytes_in_chunk / (double)options.sample_chunk_size;
331
+ bytes_per_line_avg = ((bytes_per_line_avg * (sample_chunk_idx)) + bytes_per_line) / (sample_chunk_idx + 1);
332
+
333
+ // if none of the previous conditions were met, we can jump
334
+ idx_t partition_size = (idx_t)round(file_handle->FileSize() / (double)options.sample_chunks);
335
+
336
+ // calculate offset to end of the current partition
337
+ int64_t offset = partition_size - bytes_in_chunk - remaining_bytes_in_buffer;
338
+ auto current_pos = file_handle->SeekPosition();
339
+
340
+ if (current_pos + offset < file_handle->FileSize()) {
341
+ // set position in stream and clear failure bits
342
+ file_handle->Seek(current_pos + offset);
343
+
344
+ // estimate linenr
345
+ linenr += (idx_t)round((offset + remaining_bytes_in_buffer) / bytes_per_line_avg);
346
+ linenr_estimated = true;
347
+ } else {
348
+ // seek backwards from the end in last chunk and hope to catch the end of the file
349
+ // TODO: actually it would be good to make sure that the end of file is being reached, because
350
+ // messy end-lines are quite common. For this case, however, we first need a skip_end detection anyways.
351
+ file_handle->Seek(file_handle->FileSize() - bytes_in_chunk);
352
+
353
+ // estimate linenr
354
+ linenr = (idx_t)round((file_handle->FileSize() - bytes_in_chunk) / bytes_per_line_avg);
355
+ linenr_estimated = true;
356
+ }
357
+
358
+ // reset buffers and parse chunk
359
+ ResetBuffer();
360
+
361
+ // seek beginning of next line
362
+ // FIXME: if this jump ends up in a quoted linebreak, we will have a problem
363
+ string read_line = file_handle->ReadLine();
364
+ linenr++;
365
+
366
+ sample_chunk_idx++;
367
+
368
+ return true;
369
+ }
370
+
371
+ void BufferedCSVReader::DetectDialect(const vector<LogicalType> &requested_types,
372
+ BufferedCSVReaderOptions &original_options,
373
+ vector<BufferedCSVReaderOptions> &info_candidates, idx_t &best_num_cols) {
374
+ // set up the candidates we consider for delimiter and quote rules based on user input
375
+ vector<string> delim_candidates;
376
+ vector<QuoteRule> quoterule_candidates;
377
+ vector<vector<string>> quote_candidates_map;
378
+ vector<vector<string>> escape_candidates_map = {{""}, {"\\"}, {""}};
379
+
380
+ if (options.has_delimiter) {
381
+ // user provided a delimiter: use that delimiter
382
+ delim_candidates = {options.delimiter};
383
+ } else {
384
+ // no delimiter provided: try standard/common delimiters
385
+ delim_candidates = {",", "|", ";", "\t"};
386
+ }
387
+ if (options.has_quote) {
388
+ // user provided quote: use that quote rule
389
+ quote_candidates_map = {{options.quote}, {options.quote}, {options.quote}};
390
+ } else {
391
+ // no quote rule provided: use standard/common quotes
392
+ quote_candidates_map = {{"\""}, {"\"", "'"}, {""}};
393
+ }
394
+ if (options.has_escape) {
395
+ // user provided escape: use that escape rule
396
+ if (options.escape.empty()) {
397
+ quoterule_candidates = {QuoteRule::QUOTES_RFC};
398
+ } else {
399
+ quoterule_candidates = {QuoteRule::QUOTES_OTHER};
400
+ }
401
+ escape_candidates_map[static_cast<uint8_t>(quoterule_candidates[0])] = {options.escape};
402
+ } else {
403
+ // no escape provided: try standard/common escapes
404
+ quoterule_candidates = {QuoteRule::QUOTES_RFC, QuoteRule::QUOTES_OTHER, QuoteRule::NO_QUOTES};
405
+ }
406
+
407
+ idx_t best_consistent_rows = 0;
408
+ idx_t prev_padding_count = 0;
409
+ for (auto quoterule : quoterule_candidates) {
410
+ const auto &quote_candidates = quote_candidates_map[static_cast<uint8_t>(quoterule)];
411
+ for (const auto &quote : quote_candidates) {
412
+ for (const auto &delim : delim_candidates) {
413
+ const auto &escape_candidates = escape_candidates_map[static_cast<uint8_t>(quoterule)];
414
+ for (const auto &escape : escape_candidates) {
415
+ BufferedCSVReaderOptions sniff_info = original_options;
416
+ sniff_info.delimiter = delim;
417
+ sniff_info.quote = quote;
418
+ sniff_info.escape = escape;
419
+
420
+ options = sniff_info;
421
+ PrepareComplexParser();
422
+
423
+ JumpToBeginning(original_options.skip_rows);
424
+ sniffed_column_counts.clear();
425
+ if (!TryParseCSV(ParserMode::SNIFFING_DIALECT)) {
426
+ continue;
427
+ }
428
+
429
+ idx_t start_row = original_options.skip_rows;
430
+ idx_t consistent_rows = 0;
431
+ idx_t num_cols = sniffed_column_counts.empty() ? 0 : sniffed_column_counts[0];
432
+ idx_t padding_count = 0;
433
+ bool allow_padding = original_options.null_padding;
434
+ for (idx_t row = 0; row < sniffed_column_counts.size(); row++) {
435
+ if (sniffed_column_counts[row] == num_cols) {
436
+ consistent_rows++;
437
+ } else if (num_cols < sniffed_column_counts[row] && !original_options.skip_rows_set) {
438
+ // we use the maximum amount of num_cols that we find
439
+ num_cols = sniffed_column_counts[row];
440
+ start_row = row + original_options.skip_rows;
441
+ consistent_rows = 1;
442
+ padding_count = 0;
443
+ } else if (num_cols >= sniffed_column_counts[row] && allow_padding) {
444
+ // we are missing some columns, we can parse this as long as we add padding
445
+ padding_count++;
446
+ }
447
+ }
448
+
449
+ // some logic
450
+ consistent_rows += padding_count;
451
+ bool more_values = (consistent_rows > best_consistent_rows && num_cols >= best_num_cols);
452
+ bool require_more_padding = padding_count > prev_padding_count;
453
+ bool require_less_padding = padding_count < prev_padding_count;
454
+ bool single_column_before = best_num_cols < 2 && num_cols > best_num_cols;
455
+ bool rows_consistent =
456
+ start_row + consistent_rows - original_options.skip_rows == sniffed_column_counts.size();
457
+ bool more_than_one_row = (consistent_rows > 1);
458
+ bool more_than_one_column = (num_cols > 1);
459
+ bool start_good = !info_candidates.empty() && (start_row <= info_candidates.front().skip_rows);
460
+
461
+ if (!requested_types.empty() && requested_types.size() != num_cols) {
462
+ continue;
463
+ } else if (rows_consistent && (single_column_before || (more_values && !require_more_padding) ||
464
+ (more_than_one_column && require_less_padding))) {
465
+ sniff_info.skip_rows = start_row;
466
+ sniff_info.num_cols = num_cols;
467
+ sniff_info.new_line = options.new_line;
468
+ best_consistent_rows = consistent_rows;
469
+ best_num_cols = num_cols;
470
+ prev_padding_count = padding_count;
471
+
472
+ info_candidates.clear();
473
+ info_candidates.push_back(sniff_info);
474
+ } else if (more_than_one_row && more_than_one_column && start_good && rows_consistent &&
475
+ !require_more_padding) {
476
+ bool same_quote_is_candidate = false;
477
+ for (auto &info_candidate : info_candidates) {
478
+ if (quote.compare(info_candidate.quote) == 0) {
479
+ same_quote_is_candidate = true;
480
+ }
481
+ }
482
+ if (!same_quote_is_candidate) {
483
+ sniff_info.skip_rows = start_row;
484
+ sniff_info.num_cols = num_cols;
485
+ sniff_info.new_line = options.new_line;
486
+ info_candidates.push_back(sniff_info);
487
+ }
488
+ }
489
+ }
490
+ }
491
+ }
492
+ }
493
+ }
494
+
495
+ void BufferedCSVReader::DetectCandidateTypes(const vector<LogicalType> &type_candidates,
496
+ const map<LogicalTypeId, vector<const char *>> &format_template_candidates,
497
+ const vector<BufferedCSVReaderOptions> &info_candidates,
498
+ BufferedCSVReaderOptions &original_options, idx_t best_num_cols,
499
+ vector<vector<LogicalType>> &best_sql_types_candidates,
500
+ std::map<LogicalTypeId, vector<string>> &best_format_candidates,
501
+ DataChunk &best_header_row) {
502
+ BufferedCSVReaderOptions best_options;
503
+ idx_t min_varchar_cols = best_num_cols + 1;
504
+
505
+ // check which info candidate leads to minimum amount of non-varchar columns...
506
+ for (const auto &t : format_template_candidates) {
507
+ best_format_candidates[t.first].clear();
508
+ }
509
+ for (auto &info_candidate : info_candidates) {
510
+ options = info_candidate;
511
+ vector<vector<LogicalType>> info_sql_types_candidates(options.num_cols, type_candidates);
512
+ std::map<LogicalTypeId, bool> has_format_candidates;
513
+ std::map<LogicalTypeId, vector<string>> format_candidates;
514
+ for (const auto &t : format_template_candidates) {
515
+ has_format_candidates[t.first] = false;
516
+ format_candidates[t.first].clear();
517
+ }
518
+
519
+ // set all return_types to VARCHAR so we can do datatype detection based on VARCHAR values
520
+ return_types.clear();
521
+ return_types.assign(options.num_cols, LogicalType::VARCHAR);
522
+
523
+ // jump to beginning and skip potential header
524
+ JumpToBeginning(options.skip_rows, true);
525
+ DataChunk header_row;
526
+ header_row.Initialize(allocator, return_types);
527
+ parse_chunk.Copy(header_row);
528
+
529
+ if (header_row.size() == 0) {
530
+ continue;
531
+ }
532
+
533
+ // init parse chunk and read csv with info candidate
534
+ InitParseChunk(return_types.size());
535
+ if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) {
536
+ continue;
537
+ }
538
+ for (idx_t row_idx = 0; row_idx <= parse_chunk.size(); row_idx++) {
539
+ bool is_header_row = row_idx == 0;
540
+ idx_t row = row_idx - 1;
541
+ for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
542
+ auto &col_type_candidates = info_sql_types_candidates[col];
543
+ while (col_type_candidates.size() > 1) {
544
+ const auto &sql_type = col_type_candidates.back();
545
+ // try cast from string to sql_type
546
+ Value dummy_val;
547
+ if (is_header_row) {
548
+ VerifyUTF8(col, 0, header_row, -int64_t(parse_chunk.size()));
549
+ dummy_val = header_row.GetValue(col, 0);
550
+ } else {
551
+ VerifyUTF8(col, row, parse_chunk);
552
+ dummy_val = parse_chunk.GetValue(col, row);
553
+ }
554
+ // try formatting for date types if the user did not specify one and it starts with numeric values.
555
+ string separator;
556
+ if (has_format_candidates.count(sql_type.id()) && !original_options.has_format[sql_type.id()] &&
557
+ !dummy_val.IsNull() && StartsWithNumericDate(separator, StringValue::Get(dummy_val))) {
558
+ // generate date format candidates the first time through
559
+ auto &type_format_candidates = format_candidates[sql_type.id()];
560
+ const auto had_format_candidates = has_format_candidates[sql_type.id()];
561
+ if (!has_format_candidates[sql_type.id()]) {
562
+ has_format_candidates[sql_type.id()] = true;
563
+ // order by preference
564
+ auto entry = format_template_candidates.find(sql_type.id());
565
+ if (entry != format_template_candidates.end()) {
566
+ const auto &format_template_list = entry->second;
567
+ for (const auto &t : format_template_list) {
568
+ const auto format_string = GenerateDateFormat(separator, t);
569
+ // don't parse ISO 8601
570
+ if (format_string.find("%Y-%m-%d") == string::npos) {
571
+ type_format_candidates.emplace_back(format_string);
572
+ }
573
+ }
574
+ }
575
+ // initialise the first candidate
576
+ options.has_format[sql_type.id()] = true;
577
+ // all formats are constructed to be valid
578
+ SetDateFormat(type_format_candidates.back(), sql_type.id());
579
+ }
580
+ // check all formats and keep the first one that works
581
+ StrpTimeFormat::ParseResult result;
582
+ auto save_format_candidates = type_format_candidates;
583
+ while (!type_format_candidates.empty()) {
584
+ // avoid using exceptions for flow control...
585
+ auto &current_format = options.date_format[sql_type.id()];
586
+ if (current_format.Parse(StringValue::Get(dummy_val), result)) {
587
+ break;
588
+ }
589
+ // doesn't work - move to the next one
590
+ type_format_candidates.pop_back();
591
+ options.has_format[sql_type.id()] = (!type_format_candidates.empty());
592
+ if (!type_format_candidates.empty()) {
593
+ SetDateFormat(type_format_candidates.back(), sql_type.id());
594
+ }
595
+ }
596
+ // if none match, then this is not a value of type sql_type,
597
+ if (type_format_candidates.empty()) {
598
+ // so restore the candidates that did work.
599
+ // or throw them out if they were generated by this value.
600
+ if (had_format_candidates) {
601
+ type_format_candidates.swap(save_format_candidates);
602
+ if (!type_format_candidates.empty()) {
603
+ SetDateFormat(type_format_candidates.back(), sql_type.id());
604
+ }
605
+ } else {
606
+ has_format_candidates[sql_type.id()] = false;
607
+ }
608
+ }
609
+ }
610
+ // try cast from string to sql_type
611
+ if (TryCastValue(dummy_val, sql_type)) {
612
+ break;
613
+ } else {
614
+ col_type_candidates.pop_back();
615
+ }
616
+ }
617
+ }
618
+ // reset type detection, because first row could be header,
619
+ // but only do it if csv has more than one line (including header)
620
+ if (parse_chunk.size() > 0 && is_header_row) {
621
+ info_sql_types_candidates = vector<vector<LogicalType>>(options.num_cols, type_candidates);
622
+ for (auto &f : format_candidates) {
623
+ f.second.clear();
624
+ }
625
+ for (auto &h : has_format_candidates) {
626
+ h.second = false;
627
+ }
628
+ }
629
+ }
630
+
631
+ idx_t varchar_cols = 0;
632
+ for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
633
+ auto &col_type_candidates = info_sql_types_candidates[col];
634
+ // check number of varchar columns
635
+ const auto &col_type = col_type_candidates.back();
636
+ if (col_type == LogicalType::VARCHAR) {
637
+ varchar_cols++;
638
+ }
639
+ }
640
+
641
+ // it's good if the dialect creates more non-varchar columns, but only if we sacrifice < 30% of best_num_cols.
642
+ if (varchar_cols < min_varchar_cols && parse_chunk.ColumnCount() > (best_num_cols * 0.7)) {
643
+ // we have a new best_options candidate
644
+ best_options = info_candidate;
645
+ min_varchar_cols = varchar_cols;
646
+ best_sql_types_candidates = info_sql_types_candidates;
647
+ best_format_candidates = format_candidates;
648
+ best_header_row.Destroy();
649
+ auto header_row_types = header_row.GetTypes();
650
+ best_header_row.Initialize(allocator, header_row_types);
651
+ header_row.Copy(best_header_row);
652
+ }
653
+ }
654
+
655
+ options = best_options;
656
+ for (const auto &best : best_format_candidates) {
657
+ if (!best.second.empty()) {
658
+ SetDateFormat(best.second.back(), best.first);
659
+ }
660
+ }
661
+ }
662
+
663
+ void BufferedCSVReader::DetectHeader(const vector<vector<LogicalType>> &best_sql_types_candidates,
664
+ const DataChunk &best_header_row) {
665
+ // information for header detection
666
+ bool first_row_consistent = true;
667
+ bool first_row_nulls = false;
668
+
669
+ // check if header row is all null and/or consistent with detected column data types
670
+ first_row_nulls = true;
671
+ for (idx_t col = 0; col < best_sql_types_candidates.size(); col++) {
672
+ auto dummy_val = best_header_row.GetValue(col, 0);
673
+ if (!dummy_val.IsNull()) {
674
+ first_row_nulls = false;
675
+ }
676
+
677
+ // try cast to sql_type of column
678
+ const auto &sql_type = best_sql_types_candidates[col].back();
679
+ if (!TryCastValue(dummy_val, sql_type)) {
680
+ first_row_consistent = false;
681
+ }
682
+ }
683
+
684
+ // update parser info, and read, generate & set col_names based on previous findings
685
+ if (((!first_row_consistent || first_row_nulls) && !options.has_header) || (options.has_header && options.header)) {
686
+ options.header = true;
687
+ case_insensitive_map_t<idx_t> name_collision_count;
688
+ // get header names from CSV
689
+ for (idx_t col = 0; col < options.num_cols; col++) {
690
+ const auto &val = best_header_row.GetValue(col, 0);
691
+ string col_name = val.ToString();
692
+
693
+ // generate name if field is empty
694
+ if (col_name.empty() || val.IsNull()) {
695
+ col_name = GenerateColumnName(options.num_cols, col);
696
+ }
697
+
698
+ // normalize names or at least trim whitespace
699
+ if (options.normalize_names) {
700
+ col_name = NormalizeColumnName(col_name);
701
+ } else {
702
+ col_name = TrimWhitespace(col_name);
703
+ }
704
+
705
+ // avoid duplicate header names
706
+ const string col_name_raw = col_name;
707
+ while (name_collision_count.find(col_name) != name_collision_count.end()) {
708
+ name_collision_count[col_name] += 1;
709
+ col_name = col_name + "_" + to_string(name_collision_count[col_name]);
710
+ }
711
+
712
+ names.push_back(col_name);
713
+ name_collision_count[col_name] = 0;
714
+ }
715
+
716
+ } else {
717
+ options.header = false;
718
+ for (idx_t col = 0; col < options.num_cols; col++) {
719
+ string column_name = GenerateColumnName(options.num_cols, col);
720
+ names.push_back(column_name);
721
+ }
722
+ }
723
+ for (idx_t i = 0; i < MinValue<idx_t>(names.size(), options.name_list.size()); i++) {
724
+ names[i] = options.name_list[i];
725
+ }
726
+ }
727
+
728
+ vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalType> &type_candidates,
729
+ const vector<LogicalType> &requested_types,
730
+ vector<vector<LogicalType>> &best_sql_types_candidates,
731
+ map<LogicalTypeId, vector<string>> &best_format_candidates) {
732
+ // for the type refine we set the SQL types to VARCHAR for all columns
733
+ return_types.clear();
734
+ return_types.assign(options.num_cols, LogicalType::VARCHAR);
735
+
736
+ vector<LogicalType> detected_types;
737
+
738
+ // if data types were provided, exit here if number of columns does not match
739
+ if (!requested_types.empty()) {
740
+ if (requested_types.size() != options.num_cols) {
741
+ throw InvalidInputException(
742
+ "Error while determining column types: found %lld columns but expected %d. (%s)", options.num_cols,
743
+ requested_types.size(), options.ToString());
744
+ } else {
745
+ detected_types = requested_types;
746
+ }
747
+ } else if (options.all_varchar) {
748
+ // return all types varchar
749
+ detected_types = return_types;
750
+ } else {
751
+ // jump through the rest of the file and continue to refine the sql type guess
752
+ while (JumpToNextSample()) {
753
+ InitParseChunk(return_types.size());
754
+ // if jump ends up a bad line, we just skip this chunk
755
+ if (!TryParseCSV(ParserMode::SNIFFING_DATATYPES)) {
756
+ continue;
757
+ }
758
+ for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) {
759
+ vector<LogicalType> &col_type_candidates = best_sql_types_candidates[col];
760
+ while (col_type_candidates.size() > 1) {
761
+ const auto &sql_type = col_type_candidates.back();
762
+ // narrow down the date formats
763
+ if (best_format_candidates.count(sql_type.id())) {
764
+ auto &best_type_format_candidates = best_format_candidates[sql_type.id()];
765
+ auto save_format_candidates = best_type_format_candidates;
766
+ while (!best_type_format_candidates.empty()) {
767
+ if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
768
+ break;
769
+ }
770
+ // doesn't work - move to the next one
771
+ best_type_format_candidates.pop_back();
772
+ options.has_format[sql_type.id()] = (!best_type_format_candidates.empty());
773
+ if (!best_type_format_candidates.empty()) {
774
+ SetDateFormat(best_type_format_candidates.back(), sql_type.id());
775
+ }
776
+ }
777
+ // if none match, then this is not a column of type sql_type,
778
+ if (best_type_format_candidates.empty()) {
779
+ // so restore the candidates that did work.
780
+ best_type_format_candidates.swap(save_format_candidates);
781
+ if (!best_type_format_candidates.empty()) {
782
+ SetDateFormat(best_type_format_candidates.back(), sql_type.id());
783
+ }
784
+ }
785
+ }
786
+
787
+ if (TryCastVector(parse_chunk.data[col], parse_chunk.size(), sql_type)) {
788
+ break;
789
+ } else {
790
+ col_type_candidates.pop_back();
791
+ }
792
+ }
793
+ }
794
+ }
795
+
796
+ // set sql types
797
+ for (auto &best_sql_types_candidate : best_sql_types_candidates) {
798
+ LogicalType d_type = best_sql_types_candidate.back();
799
+ if (best_sql_types_candidate.size() == type_candidates.size()) {
800
+ d_type = LogicalType::VARCHAR;
801
+ }
802
+ detected_types.push_back(d_type);
803
+ }
804
+ }
805
+
806
+ return detected_types;
807
+ }
808
+
809
+ string BufferedCSVReader::ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column,
810
+ const vector<string> &names) {
811
+ for (idx_t i = 0; i < names.size(); i++) {
812
+ auto it = sql_types_per_column.find(names[i]);
813
+ if (it != sql_types_per_column.end()) {
814
+ sql_types_per_column.erase(names[i]);
815
+ continue;
816
+ }
817
+ }
818
+ if (sql_types_per_column.empty()) {
819
+ return string();
820
+ }
821
+ string exception = "COLUMN_TYPES error: Columns with names: ";
822
+ for (auto &col : sql_types_per_column) {
823
+ exception += "\"" + col.first + "\",";
824
+ }
825
+ exception.pop_back();
826
+ exception += " do not exist in the CSV File";
827
+ return exception;
828
+ }
829
+
830
+ vector<LogicalType> BufferedCSVReader::SniffCSV(const vector<LogicalType> &requested_types) {
831
+ for (auto &type : requested_types) {
832
+ // auto detect for blobs not supported: there may be invalid UTF-8 in the file
833
+ if (type.id() == LogicalTypeId::BLOB) {
834
+ return requested_types;
835
+ }
836
+ }
837
+
838
+ // #######
839
+ // ### dialect detection
840
+ // #######
841
+ BufferedCSVReaderOptions original_options = options;
842
+ vector<BufferedCSVReaderOptions> info_candidates;
843
+ idx_t best_num_cols = 0;
844
+
845
+ DetectDialect(requested_types, original_options, info_candidates, best_num_cols);
846
+
847
+ // if no dialect candidate was found, then file was most likely empty and we throw an exception
848
+ if (info_candidates.empty()) {
849
+ throw InvalidInputException(
850
+ "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.",
851
+ options.file_path);
852
+ }
853
+
854
+ // #######
855
+ // ### type detection (initial)
856
+ // #######
857
+
858
+ // format template candidates, ordered by descending specificity (~ from high to low)
859
+ std::map<LogicalTypeId, vector<const char *>> format_template_candidates = {
860
+ {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
861
+ {LogicalTypeId::TIMESTAMP,
862
+ {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
863
+ "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
864
+ };
865
+ vector<vector<LogicalType>> best_sql_types_candidates;
866
+ map<LogicalTypeId, vector<string>> best_format_candidates;
867
+ DataChunk best_header_row;
868
+ DetectCandidateTypes(options.auto_type_candidates, format_template_candidates, info_candidates, original_options,
869
+ best_num_cols, best_sql_types_candidates, best_format_candidates, best_header_row);
870
+
871
+ if (best_format_candidates.empty() || best_header_row.size() == 0) {
872
+ throw InvalidInputException(
873
+ "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.",
874
+ original_options.file_path);
875
+ }
876
+
877
+ // #######
878
+ // ### header detection
879
+ // #######
880
+ options.num_cols = best_num_cols;
881
+ DetectHeader(best_sql_types_candidates, best_header_row);
882
+ if (!options.sql_type_list.empty()) {
883
+ // user-defined types were supplied for certain columns
884
+ // override the types
885
+ if (!options.sql_types_per_column.empty()) {
886
+ // types supplied as name -> value map
887
+ idx_t found = 0;
888
+ for (idx_t i = 0; i < names.size(); i++) {
889
+ auto it = options.sql_types_per_column.find(names[i]);
890
+ if (it != options.sql_types_per_column.end()) {
891
+ best_sql_types_candidates[i] = {options.sql_type_list[it->second]};
892
+ found++;
893
+ continue;
894
+ }
895
+ }
896
+ if (!options.file_options.union_by_name && found < options.sql_types_per_column.size()) {
897
+ string exception = ColumnTypesError(options.sql_types_per_column, names);
898
+ if (!exception.empty()) {
899
+ throw BinderException(exception);
900
+ }
901
+ }
902
+ } else {
903
+ // types supplied as list
904
+ if (names.size() < options.sql_type_list.size()) {
905
+ throw BinderException("read_csv: %d types were provided, but CSV file only has %d columns",
906
+ options.sql_type_list.size(), names.size());
907
+ }
908
+ for (idx_t i = 0; i < options.sql_type_list.size(); i++) {
909
+ best_sql_types_candidates[i] = {options.sql_type_list[i]};
910
+ }
911
+ }
912
+ }
913
+
914
+ // #######
915
+ // ### type detection (refining)
916
+ // #######
917
+ return RefineTypeDetection(options.auto_type_candidates, requested_types, best_sql_types_candidates,
918
+ best_format_candidates);
919
+ }
920
+
921
+ bool BufferedCSVReader::TryParseComplexCSV(DataChunk &insert_chunk, string &error_message) {
922
+ // used for parsing algorithm
923
+ bool finished_chunk = false;
924
+ idx_t column = 0;
925
+ vector<idx_t> escape_positions;
926
+ bool has_quotes = false;
927
+ uint8_t delimiter_pos = 0, escape_pos = 0, quote_pos = 0;
928
+ idx_t offset = 0;
929
+ idx_t line_start = 0;
930
+ // read values into the buffer (if any)
931
+ if (position >= buffer_size) {
932
+ if (!ReadBuffer(start, line_start)) {
933
+ return true;
934
+ }
935
+ }
936
+ // start parsing the first value
937
+ start = position;
938
+ goto value_start;
939
+ value_start:
940
+ /* state: value_start */
941
+ // this state parses the first characters of a value
942
+ offset = 0;
943
+ delimiter_pos = 0;
944
+ quote_pos = 0;
945
+ do {
946
+ idx_t count = 0;
947
+ for (; position < buffer_size; position++) {
948
+ quote_search.Match(quote_pos, buffer[position]);
949
+ delimiter_search.Match(delimiter_pos, buffer[position]);
950
+ count++;
951
+ if (delimiter_pos == options.delimiter.size()) {
952
+ // found a delimiter, add the value
953
+ offset = options.delimiter.size() - 1;
954
+ goto add_value;
955
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) {
956
+ // found a newline, add the row
957
+ goto add_row;
958
+ }
959
+ if (count > quote_pos) {
960
+ // did not find a quote directly at the start of the value, stop looking for the quote now
961
+ goto normal;
962
+ }
963
+ if (quote_pos == options.quote.size()) {
964
+ // found a quote, go to quoted loop and skip the initial quote
965
+ start += options.quote.size();
966
+ goto in_quotes;
967
+ }
968
+ }
969
+ } while (ReadBuffer(start, line_start));
970
+ // file ends while scanning for quote/delimiter, go to final state
971
+ goto final_state;
972
+ normal:
973
+ /* state: normal parsing state */
974
+ // this state parses the remainder of a non-quoted value until we reach a delimiter or newline
975
+ position++;
976
+ do {
977
+ for (; position < buffer_size; position++) {
978
+ delimiter_search.Match(delimiter_pos, buffer[position]);
979
+ if (delimiter_pos == options.delimiter.size()) {
980
+ offset = options.delimiter.size() - 1;
981
+ goto add_value;
982
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) {
983
+ goto add_row;
984
+ }
985
+ }
986
+ } while (ReadBuffer(start, line_start));
987
+ goto final_state;
988
+ add_value:
989
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
990
+ // increase position by 1 and move start to the new position
991
+ offset = 0;
992
+ has_quotes = false;
993
+ start = ++position;
994
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
995
+ // file ends right after delimiter, go to final state
996
+ goto final_state;
997
+ }
998
+ goto value_start;
999
+ add_row : {
1000
+ // check type of newline (\r or \n)
1001
+ bool carriage_return = buffer[position] == '\r';
1002
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1003
+ finished_chunk = AddRow(insert_chunk, column, error_message);
1004
+
1005
+ if (!error_message.empty()) {
1006
+ return false;
1007
+ }
1008
+ // increase position by 1 and move start to the new position
1009
+ offset = 0;
1010
+ has_quotes = false;
1011
+ position++;
1012
+ SkipEmptyLines();
1013
+ start = position;
1014
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1015
+ // file ends right after newline, go to final state
1016
+ goto final_state;
1017
+ }
1018
+ if (carriage_return) {
1019
+ // \r newline, go to special state that parses an optional \n afterwards
1020
+ goto carriage_return;
1021
+ } else {
1022
+ // \n newline, move to value start
1023
+ if (finished_chunk) {
1024
+ return true;
1025
+ }
1026
+ goto value_start;
1027
+ }
1028
+ }
1029
+ in_quotes:
1030
+ /* state: in_quotes */
1031
+ // this state parses the remainder of a quoted value
1032
+ quote_pos = 0;
1033
+ escape_pos = 0;
1034
+ has_quotes = true;
1035
+ position++;
1036
+ do {
1037
+ for (; position < buffer_size; position++) {
1038
+ quote_search.Match(quote_pos, buffer[position]);
1039
+ escape_search.Match(escape_pos, buffer[position]);
1040
+ if (quote_pos == options.quote.size()) {
1041
+ goto unquote;
1042
+ } else if (escape_pos == options.escape.size()) {
1043
+ escape_positions.push_back(position - start - (options.escape.size() - 1));
1044
+ goto handle_escape;
1045
+ }
1046
+ }
1047
+ } while (ReadBuffer(start, line_start));
1048
+ // still in quoted state at the end of the file, error:
1049
+ error_message = StringUtil::Format("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
1050
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1051
+ return false;
1052
+ unquote:
1053
+ /* state: unquote */
1054
+ // this state handles the state directly after we unquote
1055
+ // in this state we expect either another quote (entering the quoted state again, and escaping the quote)
1056
+ // or a delimiter/newline, ending the current value and moving on to the next value
1057
+ delimiter_pos = 0;
1058
+ quote_pos = 0;
1059
+ position++;
1060
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1061
+ // file ends right after unquote, go to final state
1062
+ offset = options.quote.size();
1063
+ goto final_state;
1064
+ }
1065
+ if (StringUtil::CharacterIsNewline(buffer[position])) {
1066
+ // quote followed by newline, add row
1067
+ offset = options.quote.size();
1068
+ goto add_row;
1069
+ }
1070
+ do {
1071
+ idx_t count = 0;
1072
+ for (; position < buffer_size; position++) {
1073
+ quote_search.Match(quote_pos, buffer[position]);
1074
+ delimiter_search.Match(delimiter_pos, buffer[position]);
1075
+ count++;
1076
+ if (count > delimiter_pos && count > quote_pos) {
1077
+ error_message = StringUtil::Format(
1078
+ "Error in file \"%s\" on line %s: quote should be followed by end of value, end "
1079
+ "of row or another quote. (%s)",
1080
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1081
+ return false;
1082
+ }
1083
+ if (delimiter_pos == options.delimiter.size()) {
1084
+ // quote followed by delimiter, add value
1085
+ offset = options.quote.size() + options.delimiter.size() - 1;
1086
+ goto add_value;
1087
+ } else if (quote_pos == options.quote.size() &&
1088
+ (options.escape.empty() || options.escape == options.quote)) {
1089
+ // quote followed by quote, go back to quoted state and add to escape
1090
+ escape_positions.push_back(position - start - (options.quote.size() - 1));
1091
+ goto in_quotes;
1092
+ }
1093
+ }
1094
+ } while (ReadBuffer(start, line_start));
1095
+ error_message = StringUtil::Format(
1096
+ "Error in file \"%s\" on line %s: quote should be followed by end of value, end of row or another quote. (%s)",
1097
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1098
+ return false;
1099
+ handle_escape:
1100
+ escape_pos = 0;
1101
+ quote_pos = 0;
1102
+ position++;
1103
+ do {
1104
+ idx_t count = 0;
1105
+ for (; position < buffer_size; position++) {
1106
+ quote_search.Match(quote_pos, buffer[position]);
1107
+ escape_search.Match(escape_pos, buffer[position]);
1108
+ count++;
1109
+ if (count > escape_pos && count > quote_pos) {
1110
+ error_message = StringUtil::Format(
1111
+ "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)",
1112
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1113
+ return false;
1114
+ }
1115
+ if (quote_pos == options.quote.size() || escape_pos == options.escape.size()) {
1116
+ // found quote or escape: move back to quoted state
1117
+ goto in_quotes;
1118
+ }
1119
+ }
1120
+ } while (ReadBuffer(start, line_start));
1121
+ error_message =
1122
+ StringUtil::Format("Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)",
1123
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1124
+ return false;
1125
+ carriage_return:
1126
+ /* state: carriage_return */
1127
+ // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
1128
+ if (buffer[position] == '\n') {
1129
+ // newline after carriage return: skip
1130
+ start = ++position;
1131
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1132
+ // file ends right after newline, go to final state
1133
+ goto final_state;
1134
+ }
1135
+ }
1136
+ if (finished_chunk) {
1137
+ return true;
1138
+ }
1139
+ goto value_start;
1140
+ final_state:
1141
+ if (finished_chunk) {
1142
+ return true;
1143
+ }
1144
+ if (column > 0 || position > start) {
1145
+ // remaining values to be added to the chunk
1146
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1147
+ finished_chunk = AddRow(insert_chunk, column, error_message);
1148
+ SkipEmptyLines();
1149
+ if (!error_message.empty()) {
1150
+ return false;
1151
+ }
1152
+ }
1153
+ // final stage, only reached after parsing the file is finished
1154
+ // flush the parsed chunk and finalize parsing
1155
+ if (mode == ParserMode::PARSING) {
1156
+ Flush(insert_chunk);
1157
+ }
1158
+
1159
+ end_of_file_reached = true;
1160
+ return true;
1161
+ }
1162
+
1163
+ void BufferedCSVReader::SkipEmptyLines() {
1164
+ if (parse_chunk.data.size() == 1) {
1165
+ // Empty lines are null data.
1166
+ return;
1167
+ }
1168
+ for (; position < buffer_size; position++) {
1169
+ if (!StringUtil::CharacterIsNewline(buffer[position])) {
1170
+ return;
1171
+ }
1172
+ }
1173
+ }
1174
+
1175
+ void UpdateMaxLineLength(ClientContext &context, idx_t line_length) {
1176
+ if (!context.client_data->debug_set_max_line_length) {
1177
+ return;
1178
+ }
1179
+ if (line_length < context.client_data->debug_max_line_length) {
1180
+ return;
1181
+ }
1182
+ context.client_data->debug_max_line_length = line_length;
1183
+ }
1184
+
1185
+ bool BufferedCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message) {
1186
+ // goto-based state machine; only the first character of options.delimiter/quote/escape is inspected. The locals below are the state shared between the labels
1187
+ bool finished_chunk = false; // result of the last AddRow call; once set, the function returns true at the next row boundary
1188
+ idx_t column = 0; // handed to AddValue/AddRow; presumably the index of the current value within the row - confirm
1189
+ idx_t offset = 0; // trailing bytes (the closing quote) excluded from the current value's length
1190
+ bool has_quotes = false; // set once the in_quotes state is entered for the current value
1191
+ vector<idx_t> escape_positions; // offsets relative to start of escape characters seen inside the current quoted value
1192
+
1193
+ idx_t line_start = position; // buffer position at which the current line begins; used to compute the line length
1194
+ // buffer exhausted: try to refill it; if ReadBuffer reports no new data there is nothing to parse
1195
+ if (position >= buffer_size) {
1196
+ if (!ReadBuffer(start, line_start)) {
1197
+ return true;
1198
+ }
1199
+ }
1200
+
1201
+ // start parsing the first value
1202
+ goto value_start;
1203
+ value_start:
1204
+ offset = 0;
1205
+ /* state: value_start */
1206
+ // this state parses the first character of a value
1207
+ if (buffer[position] == options.quote[0]) {
1208
+ // quote: actual value starts in the next position
1209
+ // move to in_quotes state
1210
+ start = position + 1;
1211
+ goto in_quotes;
1212
+ } else {
1213
+ // no quote, move to normal parsing state
1214
+ start = position;
1215
+ goto normal;
1216
+ }
1217
+ normal:
1218
+ /* state: normal parsing state */
1219
+ // this state parses the remainder of a non-quoted value until we reach a delimiter or newline
1220
+ do {
1221
+ for (; position < buffer_size; position++) {
1222
+ if (buffer[position] == options.delimiter[0]) {
1223
+ // delimiter: end the value and add it to the chunk
1224
+ goto add_value;
1225
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) {
1226
+ // newline: add row
1227
+ goto add_row;
1228
+ }
1229
+ }
1230
+ } while (ReadBuffer(start, line_start));
1231
+ // file ends during normal scan: go to end state
1232
+ goto final_state;
1233
+ add_value:
1234
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1235
+ // reset the per-value state, then increase position by 1 (skipping the delimiter) and move start to the new position
1236
+ offset = 0;
1237
+ has_quotes = false;
1238
+ start = ++position;
1239
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1240
+ // file ends right after delimiter, go to final state
1241
+ goto final_state;
1242
+ }
1243
+ goto value_start;
1244
+ add_row : {
1245
+ // check type of newline (\r or \n)
1246
+ bool carriage_return = buffer[position] == '\r';
1247
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1248
+ if (!error_message.empty()) { // NOTE(review): error_message is not passed to AddValue, so this check looks redundant - confirm
1249
+ return false;
1250
+ }
1251
+ finished_chunk = AddRow(insert_chunk, column, error_message);
1252
+ UpdateMaxLineLength(context, position - line_start);
1253
+ if (!error_message.empty()) {
1254
+ return false;
1255
+ }
1256
+ // reset the per-value state, then increase position by 1 (skipping the newline character) and start a new line there
1257
+ offset = 0;
1258
+ has_quotes = false;
1259
+ position++;
1260
+ start = position;
1261
+ line_start = position;
1262
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1263
+ // file ends right after the newline character, go to final state
1264
+ goto final_state;
1265
+ }
1266
+ if (carriage_return) {
1267
+ // \r newline, go to special state that parses an optional \n afterwards
1268
+ goto carriage_return;
1269
+ } else {
1270
+ SetNewLineDelimiter();
1271
+ SkipEmptyLines();
1272
+
1273
+ start = position;
1274
+ line_start = position;
1275
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1276
+ // file ends after the skipped empty lines, go to final state
1277
+ goto final_state;
1278
+ }
1279
+ // \n newline, move to value start
1280
+ if (finished_chunk) {
1281
+ return true;
1282
+ }
1283
+ goto value_start;
1284
+ }
1285
+ }
1286
+ in_quotes:
1287
+ /* state: in_quotes */
1288
+ // this state parses the remainder of a quoted value
1289
+ has_quotes = true;
1290
+ position++;
1291
+ do {
1292
+ for (; position < buffer_size; position++) {
1293
+ if (buffer[position] == options.quote[0]) {
1294
+ // quote: move to unquoted state
1295
+ goto unquote;
1296
+ } else if (buffer[position] == options.escape[0]) {
1297
+ // escape: store the escaped position and move to handle_escape state
1298
+ escape_positions.push_back(position - start);
1299
+ goto handle_escape;
1300
+ }
1301
+ }
1302
+ } while (ReadBuffer(start, line_start));
1303
+ // still in quoted state at the end of the file, error (thrown directly here, unlike the other states which set error_message and return false):
1304
+ throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
1305
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1306
+ unquote:
1307
+ /* state: unquote */
1308
+ // this state handles the state directly after we unquote
1309
+ // in this state we expect either another quote (entering the quoted state again, and escaping the quote)
1310
+ // or a delimiter/newline, ending the current value and moving on to the next value
1311
+ position++;
1312
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1313
+ // file ends right after unquote, go to final state
1314
+ offset = 1;
1315
+ goto final_state;
1316
+ }
1317
+ if (buffer[position] == options.quote[0] && (options.escape.empty() || options.escape[0] == options.quote[0])) {
1318
+ // escaped quote, return to quoted state and store escape position
1319
+ escape_positions.push_back(position - start);
1320
+ goto in_quotes;
1321
+ } else if (buffer[position] == options.delimiter[0]) {
1322
+ // delimiter, add value (offset = 1 excludes the closing quote from the value)
1323
+ offset = 1;
1324
+ goto add_value;
1325
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) { // newline, add row (offset = 1 excludes the closing quote)
1326
+ offset = 1;
1327
+ goto add_row;
1328
+ } else {
1329
+ error_message = StringUtil::Format(
1330
+ "Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
1331
+ "row or another quote. (%s)",
1332
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1333
+ return false;
1334
+ }
1335
+ handle_escape:
1336
+ /* state: handle_escape */
1337
+ // escape should be followed by a quote or another escape character
1338
+ position++;
1339
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1340
+ error_message = StringUtil::Format(
1341
+ "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
1342
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1343
+ return false;
1344
+ }
1345
+ if (buffer[position] != options.quote[0] && buffer[position] != options.escape[0]) {
1346
+ error_message = StringUtil::Format(
1347
+ "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
1348
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
1349
+ return false;
1350
+ }
1351
+ // escape was followed by quote or escape, go back to quoted state
1352
+ goto in_quotes;
1353
+ carriage_return:
1354
+ /* state: carriage_return */
1355
+ // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
1356
+ if (buffer[position] == '\n') {
1357
+ SetNewLineDelimiter(true, true);
1358
+ // newline after carriage return: skip
1359
+ // increase position by 1 and move start to the new position
1360
+ start = ++position;
1361
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1362
+ // file ends right after the \r\n pair, go to final state
1363
+ goto final_state;
1364
+ }
1365
+ } else {
1366
+ SetNewLineDelimiter(true, false);
1367
+ }
1368
+ if (finished_chunk) {
1369
+ return true;
1370
+ }
1371
+ SkipEmptyLines();
1372
+ start = position;
1373
+ line_start = position;
1374
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
1375
+ // file ends after the skipped empty lines, go to final state
1376
+ goto final_state;
1377
+ }
1378
+
1379
+ goto value_start;
1380
+ final_state:
1381
+ if (finished_chunk) {
1382
+ return true;
1383
+ }
1384
+
1385
+ if (column > 0 || position > start) {
1386
+ // remaining values to be added to the chunk
1387
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
1388
+ finished_chunk = AddRow(insert_chunk, column, error_message);
1389
+ SkipEmptyLines();
1390
+ UpdateMaxLineLength(context, position - line_start);
1391
+ if (!error_message.empty()) {
1392
+ return false;
1393
+ }
1394
+ }
1395
+
1396
+ // final stage, only reached after parsing the file is finished
1397
+ // flush the parsed chunk and finalize parsing
1398
+ if (mode == ParserMode::PARSING) {
1399
+ Flush(insert_chunk);
1400
+ }
1401
+
1402
+ end_of_file_reached = true;
1403
+ return true;
1404
+ }
1405
+
1406
+ bool BufferedCSVReader::ReadBuffer(idx_t &start, idx_t &line_start) {
1407
+ if (start > buffer_size) {
1408
+ return false;
1409
+ }
1410
+ auto old_buffer = std::move(buffer);
1411
+
1412
+ // the remaining part of the last buffer
1413
+ idx_t remaining = buffer_size - start;
1414
+
1415
+ bool large_buffers = mode == ParserMode::PARSING && !file_handle->OnDiskFile() && file_handle->CanSeek();
1416
+ idx_t buffer_read_size = large_buffers ? INITIAL_BUFFER_SIZE_LARGE : INITIAL_BUFFER_SIZE;
1417
+
1418
+ while (remaining > buffer_read_size) {
1419
+ buffer_read_size *= 2;
1420
+ }
1421
+
1422
+ // Check line length
1423
+ if (remaining > options.maximum_line_size) {
1424
+ throw InvalidInputException("Maximum line size of %llu bytes exceeded on line %s!", options.maximum_line_size,
1425
+ GetLineNumberStr(linenr, linenr_estimated));
1426
+ }
1427
+
1428
+ buffer = make_unsafe_uniq_array<char>(buffer_read_size + remaining + 1);
1429
+ buffer_size = remaining + buffer_read_size;
1430
+ if (remaining > 0) {
1431
+ // remaining from last buffer: copy it here
1432
+ memcpy(buffer.get(), old_buffer.get() + start, remaining);
1433
+ }
1434
+ idx_t read_count = file_handle->Read(buffer.get() + remaining, buffer_read_size);
1435
+
1436
+ bytes_in_chunk += read_count;
1437
+ buffer_size = remaining + read_count;
1438
+ buffer[buffer_size] = '\0';
1439
+ if (old_buffer) {
1440
+ cached_buffers.push_back(std::move(old_buffer));
1441
+ }
1442
+ start = 0;
1443
+ position = remaining;
1444
+ if (!bom_checked) {
1445
+ bom_checked = true;
1446
+ if (read_count >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
1447
+ start += 3;
1448
+ position += 3;
1449
+ }
1450
+ }
1451
+ line_start = start;
1452
+
1453
+ return read_count > 0;
1454
+ }
1455
+
1456
+ void BufferedCSVReader::ParseCSV(DataChunk &insert_chunk) {
1457
+ string error_message;
1458
+ if (!TryParseCSV(ParserMode::PARSING, insert_chunk, error_message)) {
1459
+ throw InvalidInputException(error_message);
1460
+ }
1461
+ }
1462
+
1463
+ bool BufferedCSVReader::TryParseCSV(ParserMode mode) {
1464
+ DataChunk dummy_chunk;
1465
+ string error_message;
1466
+ return TryParseCSV(mode, dummy_chunk, error_message);
1467
+ }
1468
+
1469
+ void BufferedCSVReader::ParseCSV(ParserMode mode) {
1470
+ DataChunk dummy_chunk;
1471
+ string error_message;
1472
+ if (!TryParseCSV(mode, dummy_chunk, error_message)) {
1473
+ throw InvalidInputException(error_message);
1474
+ }
1475
+ }
1476
+
1477
+ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) {
1478
+ mode = parser_mode;
1479
+
1480
+ if (options.quote.size() <= 1 && options.escape.size() <= 1 && options.delimiter.size() == 1) {
1481
+ return TryParseSimpleCSV(insert_chunk, error_message);
1482
+ } else {
1483
+ return TryParseComplexCSV(insert_chunk, error_message);
1484
+ }
1485
+ }
1486
+
1487
+ } // namespace duckdb