duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -0,0 +1,434 @@
1
+ #include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
2
+
3
+ #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
4
+ #include "duckdb/common/file_system.hpp"
5
+ #include "duckdb/common/string_util.hpp"
6
+ #include "duckdb/common/to_string.hpp"
7
+ #include "duckdb/common/types/cast_helpers.hpp"
8
+ #include "duckdb/common/vector_operations/unary_executor.hpp"
9
+ #include "duckdb/common/vector_operations/vector_operations.hpp"
10
+ #include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
11
+ #include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
12
+ #include "duckdb/function/scalar/strftime_format.hpp"
13
+ #include "duckdb/main/client_data.hpp"
14
+ #include "duckdb/main/database.hpp"
15
+ #include "duckdb/main/error_manager.hpp"
16
+ #include "duckdb/parser/column_definition.hpp"
17
+ #include "duckdb/parser/keyword_helper.hpp"
18
+ #include "duckdb/storage/data_table.hpp"
19
+ #include "utf8proc.hpp"
20
+ #include "utf8proc_wrapper.hpp"
21
+
22
+ #include <algorithm>
23
+ #include <cctype>
24
+ #include <cstring>
25
+ #include <fstream>
26
+
27
+ namespace duckdb {
28
+
29
+ BufferedCSVReader::BufferedCSVReader(ClientContext &context, CSVReaderOptions options_p,
30
+ const vector<LogicalType> &requested_types)
31
+ : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) { // Builds a reader from the options alone; the buffer cursors (buffer_size/position/start) begin at zero
32
+ file_handle = OpenCSV(context, options); // open the handle first: Initialize() reads from file_handle
33
+ Initialize(requested_types); // resolve return_types, skip rows / header, set up the parse chunk
34
+ }
35
+
36
+ BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, CSVReaderOptions options_p,
37
+ const vector<LogicalType> &requested_types)
38
+ : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) { // Same as the options-only overload, except the file path is taken from the explicit filename argument
39
+ options.file_path = std::move(filename); // overwrite whatever path the options carried before the file is opened
40
+ file_handle = OpenCSV(context, options); // open the handle first: Initialize() reads from file_handle
41
+ Initialize(requested_types); // resolve return_types, skip rows / header, set up the parse chunk
42
+ }
43
+
44
+ void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) { // Fills return_types (sniffed, or copied from requested_types), then skips rows / header and initializes the parse chunk
45
+ if (options.auto_detect && options.file_options.union_by_name) { // sniffing in this reader only happens when auto-detect is combined with union_by_name
46
+ // This is required for the sniffer to work on Union By Name
47
+ D_ASSERT(options.file_path == file_handle->GetFilePath());
48
+ auto bm_file_handle = BaseCSVReader::OpenCSV(context, options); // a separate handle for the sniffer; it is moved into the buffer manager below
49
+ auto csv_buffer_manager = make_shared<CSVBufferManager>(context, std::move(bm_file_handle), options);
50
+ CSVSniffer sniffer(options, csv_buffer_manager, state_machine_cache);
51
+ auto sniffer_result = sniffer.SniffCSV();
52
+ return_types = sniffer_result.return_types; // adopt the sniffed column types ...
53
+ names = sniffer_result.names; // ... and the sniffed column names
54
+ if (return_types.empty()) { // the sniffer produced no column types: report the file as unreadable
55
+ throw InvalidInputException("Failed to detect column types from CSV: is the file a valid CSV file?");
56
+ }
57
+ } else {
58
+ return_types = requested_types; // no sniffing: use the caller-provided types as-is
59
+ ResetBuffer(); // drop any buffer state and zero the cursors
60
+ }
61
+ SkipRowsAndReadHeader(options.dialect_options.skip_rows, options.dialect_options.header);
62
+ InitParseChunk(return_types.size()); // one parse-chunk column per return type
63
+ }
64
+
65
+ void BufferedCSVReader::ResetBuffer() { // Releases the current buffer and all cached buffers and zeroes the size/position/start cursors
66
+ buffer.reset(); // release the current buffer
67
+ buffer_size = 0;
68
+ position = 0;
69
+ start = 0;
70
+ cached_buffers.clear(); // drop the previously retained buffers as well
71
+ }
72
+
73
+ void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header) { // Discards skip_rows lines from the file handle and, if skip_header is set, parses one header pass
74
+ for (idx_t i = 0; i < skip_rows; i++) {
75
+ // read and discard one line per skipped row; only the line counter is kept up to date
76
+ string read_line = file_handle->ReadLine(); // the line content is intentionally unused
77
+ linenr++;
78
+ }
79
+
80
+ if (skip_header) {
81
+ // ignore the first line as a header line
82
+ InitParseChunk(return_types.size()); // the parse chunk must exist before ParseCSV runs
83
+ ParseCSV(ParserMode::PARSING_HEADER); // header mode: parsed through a dummy chunk (see ParseCSV(ParserMode))
84
+ }
85
+ }
86
+
87
+ string BufferedCSVReader::ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column,
88
+ const vector<string> &names) { // Returns an error message listing the keys of sql_types_per_column that match no entry of names, or an empty string if all match
89
+ for (idx_t i = 0; i < names.size(); i++) { // the map is a by-value copy, so the erasures below do not touch the caller's map
90
+ auto it = sql_types_per_column.find(names[i]);
91
+ if (it != sql_types_per_column.end()) {
92
+ sql_types_per_column.erase(names[i]); // this column name exists: remove it from the set of unmatched keys
93
+ continue; // NOTE(review): last statement of the loop body, so this has no effect
94
+ }
95
+ }
96
+ if (sql_types_per_column.empty()) { // every requested column name was found
97
+ return string();
98
+ }
99
+ string exception = "COLUMN_TYPES error: Columns with names: ";
100
+ for (auto &col : sql_types_per_column) { // whatever is left was requested but is not among names
101
+ exception += "\"" + col.first + "\",";
102
+ }
103
+ exception.pop_back(); // strip the trailing comma appended by the loop above
104
+ exception += " do not exist in the CSV File";
105
+ return exception;
106
+ }
107
+
108
+ void BufferedCSVReader::SkipEmptyLines() { // Advances position past consecutive newline characters in the current buffer (never reads a new buffer)
109
+ if (parse_chunk.data.size() == 1) {
110
+ // With a single column, empty lines are null data and must not be skipped.
111
+ return;
112
+ }
113
+ for (; position < buffer_size; position++) { // stops at the first non-newline byte or at the end of the buffer
114
+ if (!StringUtil::CharacterIsNewline(buffer[position])) {
115
+ return;
116
+ }
117
+ }
118
+ }
119
+
120
+ void UpdateMaxLineLength(ClientContext &context, idx_t line_length) { // Raises client_data->debug_max_line_length to line_length when debug tracking is switched on
121
+ if (!context.client_data->debug_set_max_line_length) { // tracking disabled: nothing to record
122
+ return;
123
+ }
124
+ if (line_length < context.client_data->debug_max_line_length) { // strictly shorter than the recorded maximum: keep the old value
125
+ return;
126
+ }
127
+ context.client_data->debug_max_line_length = line_length;
128
+ }
129
+
130
+ bool BufferedCSVReader::ReadBuffer(idx_t &start, idx_t &line_start) { // Replaces the buffer with a new one holding the unconsumed tail [start, buffer_size) plus freshly read bytes; returns whether any new bytes were read. Note: the start parameter shadows the member of the same name
131
+ if (start > buffer_size) { // guards the unsigned subtraction buffer_size - start below
132
+ return false;
133
+ }
134
+ auto old_buffer = std::move(buffer);
135
+
136
+ // the remaining part of the last buffer
137
+ idx_t remaining = buffer_size - start;
138
+
139
+ idx_t buffer_read_size = INITIAL_BUFFER_SIZE_LARGE;
140
+
141
+ while (remaining > buffer_read_size) { // double the read size until it is at least as large as the carried-over tail
142
+ buffer_read_size *= 2;
143
+ }
144
+
145
+ // Check line length: the carried-over tail alone must not exceed the configured maximum
146
+ if (remaining > options.maximum_line_size) {
147
+ throw InvalidInputException("Maximum line size of %llu bytes exceeded on line %s!", options.maximum_line_size,
148
+ GetLineNumberStr(linenr, linenr_estimated));
149
+ }
150
+
151
+ buffer = make_unsafe_uniq_array<char>(buffer_read_size + remaining + 1); // +1 leaves room for the '\0' terminator written below
152
+ buffer_size = remaining + buffer_read_size; // provisional; overwritten below once the actual read count is known
153
+ if (remaining > 0) {
154
+ // remaining from last buffer: copy it here
155
+ memcpy(buffer.get(), old_buffer.get() + start, remaining);
156
+ }
157
+ idx_t read_count = file_handle->Read(buffer.get() + remaining, buffer_read_size); // new data is appended right after the carried-over tail
158
+
159
+ bytes_in_chunk += read_count;
160
+ buffer_size = remaining + read_count; // number of valid bytes now in the buffer
161
+ buffer[buffer_size] = '\0';
162
+ if (old_buffer) {
163
+ cached_buffers.push_back(std::move(old_buffer)); // keep the old buffer alive; presumably earlier parsed values still point into it - confirm
164
+ }
165
+ start = 0; // the carried-over tail now begins at the front of the new buffer
166
+ position = remaining; // resume scanning at the first newly read byte
167
+ if (!bom_checked) { // the byte order mark check is performed on the first call only
168
+ bom_checked = true;
169
+ if (read_count >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') { // bytes EF BB BF: the UTF-8 byte order mark
170
+ start += 3;
171
+ position += 3;
172
+ }
173
+ }
174
+ line_start = start;
175
+
176
+ return read_count > 0; // false means the read produced no new bytes
177
+ }
178
+
179
+ void BufferedCSVReader::ParseCSV(DataChunk &insert_chunk) { // Parses in PARSING mode into insert_chunk; a failed TryParseCSV is turned into an InvalidInputException
180
+ string error_message;
181
+ if (!TryParseCSV(ParserMode::PARSING, insert_chunk, error_message)) {
182
+ throw InvalidInputException(error_message); // surface the parser's message to the caller
183
+ }
184
+ }
185
+
186
+ void BufferedCSVReader::ParseCSV(ParserMode mode) { // Parses in the given mode using a throwaway DataChunk; a failed TryParseCSV is turned into an InvalidInputException
187
+ DataChunk dummy_chunk; // never initialized here; it only satisfies the TryParseCSV signature
188
+ string error_message;
189
+ if (!TryParseCSV(mode, dummy_chunk, error_message)) {
190
+ throw InvalidInputException(error_message); // surface the parser's message to the caller
191
+ }
192
+ }
193
+
194
+ bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) { // Goto-based state machine over the buffered bytes: returns false with error_message set on a parse error, true when a chunk is finished or the input is exhausted; throws InvalidInputException on unterminated quotes
195
+ mode = parser_mode;
196
+ // used for parsing algorithm
197
+ bool finished_chunk = false;
198
+ idx_t column = 0;
199
+ idx_t offset = 0; // bytes to drop from the end of the current value (set to 1 after a closing quote)
200
+ bool has_quotes = false;
201
+ vector<idx_t> escape_positions; // escape offsets relative to start, handed to AddValue
202
+
203
+ idx_t line_start = position;
204
+ idx_t line_size = 0;
205
+ // read values into the buffer (if any)
206
+ if (position >= buffer_size) {
207
+ if (!ReadBuffer(start, line_start)) {
208
+ return true; // nothing (more) to read: not an error
209
+ }
210
+ }
211
+
212
+ // start parsing the first value
213
+ goto value_start;
214
+ value_start:
215
+ offset = 0;
216
+ /* state: value_start */
217
+ // this state parses the first character of a value
218
+ if (buffer[position] == options.dialect_options.state_machine_options.quote) {
219
+ // quote: actual value starts in the next position
220
+ // move to in_quotes state
221
+ start = position + 1;
222
+ line_size++;
223
+ goto in_quotes;
224
+ } else {
225
+ // no quote, move to normal parsing state
226
+ start = position;
227
+ goto normal;
228
+ }
229
+ normal:
230
+ /* state: normal parsing state */
231
+ // this state parses the remainder of a non-quoted value until we reach a delimiter or newline
232
+ do {
233
+ for (; position < buffer_size; position++) {
234
+ line_size++;
235
+ if (buffer[position] == options.dialect_options.state_machine_options.delimiter) {
236
+ // delimiter: end the value and add it to the chunk
237
+ goto add_value;
238
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) {
239
+ // newline: add row
240
+ goto add_row;
241
+ }
242
+ }
243
+ } while (ReadBuffer(start, line_start)); // buffer exhausted mid-value: refill (the unconsumed tail is carried over) and keep scanning
244
+ // file ends during normal scan: go to end state
245
+ goto final_state;
246
+ add_value:
247
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
248
+ // increase position by 1 and move start to the new position
249
+ offset = 0;
250
+ has_quotes = false;
251
+ start = ++position;
252
+ line_size++;
253
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
254
+ // file ends right after delimiter, go to final state
255
+ goto final_state;
256
+ }
257
+ goto value_start;
258
+ add_row : {
259
+ // check type of newline (\r or \n)
260
+ bool carriage_return = buffer[position] == '\r';
261
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
262
+ if (!error_message.empty()) {
263
+ return false;
264
+ }
265
+ VerifyLineLength(position - line_start);
266
+
267
+ finished_chunk = AddRow(insert_chunk, column, error_message);
268
+ UpdateMaxLineLength(context, position - line_start);
269
+ if (!error_message.empty()) {
270
+ return false;
271
+ }
272
+ // increase position by 1 and move start to the new position
273
+ offset = 0;
274
+ has_quotes = false;
275
+ position++;
276
+ line_size = 0;
277
+ start = position;
278
+ line_start = position;
279
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
280
+ // file ends right after the newline, go to final state
281
+ goto final_state;
282
+ }
283
+ if (carriage_return) {
284
+ // \r newline, go to special state that parses an optional \n afterwards
285
+ goto carriage_return;
286
+ } else {
287
+ SetNewLineDelimiter();
288
+ SkipEmptyLines();
289
+
290
+ start = position;
291
+ line_start = position;
292
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
293
+ // file ends after the skipped empty lines, go to final state
294
+ goto final_state;
295
+ }
296
+ // \n newline, move to value start
297
+ if (finished_chunk) {
298
+ return true;
299
+ }
300
+ goto value_start;
301
+ }
302
+ }
303
+ in_quotes:
304
+ /* state: in_quotes */
305
+ // this state parses the remainder of a quoted value
306
+ has_quotes = true;
307
+ position++; // step past the character that led here (opening quote, or the escaped quote/escape character)
308
+ line_size++;
309
+ do {
310
+ for (; position < buffer_size; position++) {
311
+ line_size++;
312
+ if (buffer[position] == options.dialect_options.state_machine_options.quote) {
313
+ // quote: move to unquoted state
314
+ goto unquote;
315
+ } else if (buffer[position] == options.dialect_options.state_machine_options.escape) {
316
+ // escape: store the escaped position and move to handle_escape state
317
+ escape_positions.push_back(position - start);
318
+ goto handle_escape;
319
+ }
320
+ }
321
+ } while (ReadBuffer(start, line_start));
322
+ // still in quoted state at the end of the file, error:
323
+ throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
324
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
325
+ unquote:
326
+ /* state: unquote */
327
+ // this state handles the state directly after we unquote
328
+ // in this state we expect either another quote (entering the quoted state again, and escaping the quote)
329
+ // or a delimiter/newline, ending the current value and moving on to the next value
330
+ position++;
331
+ line_size++;
332
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
333
+ // file ends right after unquote, go to final state
334
+ offset = 1; // exclude the closing quote from the value
335
+ goto final_state;
336
+ }
337
+ if (buffer[position] == options.dialect_options.state_machine_options.quote &&
338
+ (options.dialect_options.state_machine_options.escape == '\0' ||
339
+ options.dialect_options.state_machine_options.escape == options.dialect_options.state_machine_options.quote)) { // a doubled quote only counts as an escape when no escape character is set or it equals the quote character
340
+ // escaped quote, return to quoted state and store escape position
341
+ escape_positions.push_back(position - start);
342
+ goto in_quotes;
343
+ } else if (buffer[position] == options.dialect_options.state_machine_options.delimiter) {
344
+ // delimiter, add value
345
+ offset = 1; // exclude the closing quote from the value
346
+ goto add_value;
347
+ } else if (StringUtil::CharacterIsNewline(buffer[position])) {
348
+ offset = 1; // exclude the closing quote from the value
349
+ goto add_row;
350
+ } else {
351
+ error_message = StringUtil::Format(
352
+ "Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
353
+ "row or another quote. (%s)",
354
+ options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
355
+ return false;
356
+ }
357
+ handle_escape:
358
+ /* state: handle_escape */
359
+ // escape should be followed by a quote or another escape character
360
+ position++;
361
+ line_size++;
362
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) { // input ends directly after the escape character // NOTE(review): the message below says "proceeded"; the wording is runtime text and is left untouched
363
+ error_message = StringUtil::Format(
364
+ "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
365
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
366
+ return false;
367
+ }
368
+ if (buffer[position] != options.dialect_options.state_machine_options.quote &&
369
+ buffer[position] != options.dialect_options.state_machine_options.escape) { // the escape character is followed by something other than a quote or another escape
370
+ error_message = StringUtil::Format(
371
+ "Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
372
+ GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
373
+ return false;
374
+ }
375
+ // escape was followed by quote or escape, go back to quoted state
376
+ goto in_quotes;
377
+ carriage_return:
378
+ /* state: carriage_return */
379
+ // this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
380
+ if (buffer[position] == '\n') {
381
+ SetNewLineDelimiter(true, true);
382
+ // newline after carriage return: skip
383
+ // increase position by 1 and move start to the new position
384
+ start = ++position;
385
+ line_size++;
386
+
387
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
388
+ // file ends right after the \r\n pair, go to final state
389
+ goto final_state;
390
+ }
391
+ } else {
392
+ SetNewLineDelimiter(true, false);
393
+ }
394
+ if (finished_chunk) {
395
+ return true;
396
+ }
397
+ SkipEmptyLines();
398
+ start = position;
399
+ line_start = position;
400
+ if (position >= buffer_size && !ReadBuffer(start, line_start)) {
401
+ // file ends after the skipped empty lines, go to final state
402
+ goto final_state;
403
+ }
404
+
405
+ goto value_start;
406
+ final_state:
407
+ if (finished_chunk) {
408
+ return true;
409
+ }
410
+
411
+ if (column > 0 || position > start) { // a row is still in progress: values were already added to it, or bytes of a value are pending
412
+ // remaining values to be added to the chunk
413
+ AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
414
+ VerifyLineLength(position - line_start);
415
+
416
+ finished_chunk = AddRow(insert_chunk, column, error_message);
417
+ SkipEmptyLines();
418
+ UpdateMaxLineLength(context, line_size); // NOTE(review): add_row passes position - line_start here; confirm that line_size is intended
419
+ if (!error_message.empty()) {
420
+ return false;
421
+ }
422
+ }
423
+
424
+ // final stage, only reached after parsing the file is finished
425
+ // flush the parsed chunk and finalize parsing
426
+ if (mode == ParserMode::PARSING) { // other modes do not flush into insert_chunk
427
+ Flush(insert_chunk);
428
+ }
429
+
430
+ end_of_file_reached = true;
431
+ return true;
432
+ }
433
+
434
+ } // namespace duckdb
@@ -0,0 +1,80 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
2
+ #include "duckdb/common/string_util.hpp"
3
+
4
+ namespace duckdb {
5
+
6
+ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
7
+ idx_t &global_csv_current_position, idx_t file_number_p)
8
+ : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
9
+ AllocateBuffer(buffer_size_p);
10
+ auto buffer = Ptr();
11
+ file_size = file_handle.Read(buffer, buffer_size_p);
12
+ global_csv_start = global_csv_current_position;
13
+ // BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
14
+ if (file_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
15
+ start_position += 3;
16
+ }
17
+ last_buffer = file_handle.FinishedReading();
18
+ }
19
+
20
+ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size,
21
+ idx_t global_csv_current_position, idx_t file_number_p)
22
+ : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
23
+ can_seek(file_handle.CanSeek()) {
24
+ AllocateBuffer(buffer_size);
25
+ file_size = file_handle.Read(handle.Ptr(), buffer_size);
26
+ last_buffer = file_handle.FinishedReading();
27
+ }
28
+
29
+ shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
30
+ auto next_csv_buffer =
31
+ make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + file_size, file_number_p);
32
+ if (next_csv_buffer->GetBufferSize() == 0) {
33
+ // We are done reading
34
+ return nullptr;
35
+ }
36
+ return next_csv_buffer;
37
+ }
38
+
39
+ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
40
+ auto &buffer_manager = BufferManager::GetBufferManager(context);
41
+ bool can_destroy = can_seek;
42
+ handle = buffer_manager.Allocate(MaxValue<idx_t>(Storage::BLOCK_SIZE, buffer_size), can_destroy, &block);
43
+ }
44
+
45
+ idx_t CSVBuffer::GetBufferSize() {
46
+ return file_size;
47
+ }
48
+
49
+ void CSVBuffer::Reload(CSVFileHandle &file_handle) {
50
+ AllocateBuffer(file_size);
51
+ file_handle.Seek(global_csv_start);
52
+ file_handle.Read(handle.Ptr(), file_size);
53
+ }
54
+
55
+ unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
56
+ auto &buffer_manager = BufferManager::GetBufferManager(context);
57
+ if (can_seek && block->IsUnloaded()) {
58
+ // We have to reload it from disk
59
+ block = nullptr;
60
+ Reload(file_handle);
61
+ }
62
+ return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), file_size, first_buffer, last_buffer, global_csv_start,
63
+ start_position, file_number);
64
+ }
65
+
66
+ void CSVBuffer::Unpin() {
67
+ if (handle.IsValid()) {
68
+ handle.Destroy();
69
+ }
70
+ }
71
+
72
+ idx_t CSVBuffer::GetStart() {
73
+ return start_position;
74
+ }
75
+
76
+ bool CSVBuffer::IsCSVFileLastBuffer() {
77
+ return last_buffer;
78
+ }
79
+
80
+ } // namespace duckdb
@@ -0,0 +1,90 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
2
+ #include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
3
+ namespace duckdb {
4
+
5
+ CSVBufferManager::CSVBufferManager(ClientContext &context_p, unique_ptr<CSVFileHandle> file_handle_p,
6
+ const CSVReaderOptions &options, idx_t file_idx_p)
7
+ : file_handle(std::move(file_handle_p)), context(context_p), file_idx(file_idx_p),
8
+ buffer_size(CSVBuffer::CSV_BUFFER_SIZE) {
9
+ if (options.skip_rows_set) {
10
+ // Skip rows if they are set
11
+ skip_rows = options.dialect_options.skip_rows;
12
+ }
13
+ auto file_size = file_handle->FileSize();
14
+ if (file_size > 0 && file_size < buffer_size) {
15
+ buffer_size = CSVBuffer::CSV_MINIMUM_BUFFER_SIZE;
16
+ }
17
+ if (options.buffer_size < buffer_size) {
18
+ buffer_size = options.buffer_size;
19
+ }
20
+ for (idx_t i = 0; i < skip_rows; i++) {
21
+ file_handle->ReadLine();
22
+ }
23
+ Initialize();
24
+ }
25
+
26
+ void CSVBufferManager::UnpinBuffer(idx_t cache_idx) {
27
+ if (cache_idx < cached_buffers.size()) {
28
+ cached_buffers[cache_idx]->Unpin();
29
+ }
30
+ }
31
+
32
+ void CSVBufferManager::Initialize() {
33
+ if (cached_buffers.empty()) {
34
+ cached_buffers.emplace_back(
35
+ make_shared<CSVBuffer>(context, buffer_size, *file_handle, global_csv_pos, file_idx));
36
+ last_buffer = cached_buffers.front();
37
+ }
38
+ start_pos = last_buffer->GetStart();
39
+ }
40
+
41
+ idx_t CSVBufferManager::GetStartPos() {
42
+ return start_pos;
43
+ }
44
+ bool CSVBufferManager::ReadNextAndCacheIt() {
45
+ D_ASSERT(last_buffer);
46
+ if (!last_buffer->IsCSVFileLastBuffer()) {
47
+ auto maybe_last_buffer = last_buffer->Next(*file_handle, buffer_size, file_idx);
48
+ if (!maybe_last_buffer) {
49
+ last_buffer->last_buffer = true;
50
+ return false;
51
+ }
52
+ last_buffer = std::move(maybe_last_buffer);
53
+ cached_buffers.emplace_back(last_buffer);
54
+ return true;
55
+ }
56
+ return false;
57
+ }
58
+
59
+ unique_ptr<CSVBufferHandle> CSVBufferManager::GetBuffer(const idx_t pos) {
60
+ while (pos >= cached_buffers.size()) {
61
+ if (done) {
62
+ return nullptr;
63
+ }
64
+ if (!ReadNextAndCacheIt()) {
65
+ done = true;
66
+ }
67
+ }
68
+ if (pos != 0) {
69
+ cached_buffers[pos - 1]->Unpin();
70
+ }
71
+ return cached_buffers[pos]->Pin(*file_handle);
72
+ }
73
+
74
+ bool CSVBufferIterator::Finished() {
75
+ return !cur_buffer_handle;
76
+ }
77
+
78
+ void CSVBufferIterator::Reset() {
79
+ if (cur_buffer_handle) {
80
+ cur_buffer_handle.reset();
81
+ }
82
+ if (cur_buffer_idx > 0) {
83
+ buffer_manager->UnpinBuffer(cur_buffer_idx - 1);
84
+ }
85
+ cur_buffer_idx = 0;
86
+ buffer_manager->Initialize();
87
+ cur_pos = buffer_manager->GetStartPos();
88
+ }
89
+
90
+ } // namespace duckdb
@@ -0,0 +1,95 @@
1
+ #include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
2
+
3
+ namespace duckdb {
4
+
5
+ CSVFileHandle::CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptr<FileHandle> file_handle_p,
6
+ const string &path_p, FileCompressionType compression)
7
+ : file_handle(std::move(file_handle_p)), path(path_p) {
8
+ can_seek = file_handle->CanSeek();
9
+ on_disk_file = file_handle->OnDiskFile();
10
+ file_size = file_handle->GetFileSize();
11
+ }
12
+
13
+ unique_ptr<FileHandle> CSVFileHandle::OpenFileHandle(FileSystem &fs, Allocator &allocator, const string &path,
14
+ FileCompressionType compression) {
15
+ auto file_handle = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression);
16
+ if (file_handle->CanSeek()) {
17
+ file_handle->Reset();
18
+ }
19
+ return file_handle;
20
+ }
21
+
22
+ unique_ptr<CSVFileHandle> CSVFileHandle::OpenFile(FileSystem &fs, Allocator &allocator, const string &path,
23
+ FileCompressionType compression) {
24
+ auto file_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression);
25
+ return make_uniq<CSVFileHandle>(fs, allocator, std::move(file_handle), path, compression);
26
+ }
27
+
28
+ bool CSVFileHandle::CanSeek() {
29
+ return can_seek;
30
+ }
31
+
32
+ void CSVFileHandle::Seek(idx_t position) {
33
+ if (!can_seek) {
34
+ throw InternalException("Cannot seek in this file");
35
+ }
36
+ file_handle->Seek(position);
37
+ }
38
+
39
+ bool CSVFileHandle::OnDiskFile() {
40
+ return on_disk_file;
41
+ }
42
+
43
+ idx_t CSVFileHandle::FileSize() {
44
+ return file_size;
45
+ }
46
+
47
+ bool CSVFileHandle::FinishedReading() {
48
+ return finished;
49
+ }
50
+
51
+ idx_t CSVFileHandle::Read(void *buffer, idx_t nr_bytes) {
52
+ requested_bytes += nr_bytes;
53
+ // if this is a plain file source OR we can seek we are not caching anything
54
+ auto bytes_read = file_handle->Read(buffer, nr_bytes);
55
+ if (!finished) {
56
+ finished = bytes_read == 0;
57
+ }
58
+ return bytes_read;
59
+ }
60
+
61
+ string CSVFileHandle::ReadLine() {
62
+ bool carriage_return = false;
63
+ string result;
64
+ char buffer[1];
65
+ while (true) {
66
+ idx_t bytes_read = Read(buffer, 1);
67
+ if (bytes_read == 0) {
68
+ return result;
69
+ }
70
+ if (carriage_return) {
71
+ if (buffer[0] != '\n') {
72
+ if (!file_handle->CanSeek()) {
73
+ throw BinderException(
74
+ "Carriage return newlines not supported when reading CSV files in which we cannot seek");
75
+ }
76
+ file_handle->Seek(file_handle->SeekPosition() - 1);
77
+ return result;
78
+ }
79
+ }
80
+ if (buffer[0] == '\n') {
81
+ return result;
82
+ }
83
+ if (buffer[0] != '\r') {
84
+ result += buffer[0];
85
+ } else {
86
+ carriage_return = true;
87
+ }
88
+ }
89
+ }
90
+
91
+ string CSVFileHandle::GetFilePath() {
92
+ return path;
93
+ }
94
+
95
+ } // namespace duckdb