duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. package/binding.gyp +2 -0
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
  4. package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
  5. package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
  6. package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
  7. package/src/duckdb/extension/json/json_scan.cpp +1 -1
  8. package/src/duckdb/extension/json/json_serializer.cpp +26 -69
  9. package/src/duckdb/src/common/enum_util.cpp +119 -7
  10. package/src/duckdb/src/common/extra_type_info.cpp +7 -3
  11. package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
  12. package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
  13. package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
  14. package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
  15. package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
  16. package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
  17. package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
  18. package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
  19. package/src/duckdb/src/common/types/interval.cpp +3 -0
  20. package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
  21. package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
  22. package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
  23. package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
  24. package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
  25. package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
  26. package/src/duckdb/src/common/types/value.cpp +63 -42
  27. package/src/duckdb/src/common/types/vector.cpp +33 -67
  28. package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
  29. package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
  30. package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
  31. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
  32. package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
  33. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
  34. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
  35. package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
  36. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
  37. package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
  38. package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
  39. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
  40. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
  41. package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
  42. package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
  43. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
  44. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
  45. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
  46. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
  47. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
  48. package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
  49. package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
  50. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
  51. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
  52. package/src/duckdb/src/execution/window_executor.cpp +6 -5
  53. package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
  54. package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
  55. package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
  56. package/src/duckdb/src/function/table/read_csv.cpp +150 -136
  57. package/src/duckdb/src/function/table/table_scan.cpp +0 -2
  58. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  59. package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
  60. package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
  61. package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
  62. package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
  64. package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
  65. package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
  66. package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
  67. package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
  68. package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
  69. package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
  70. package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
  71. package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
  72. package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
  73. package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
  74. package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
  75. package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
  76. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
  77. package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
  78. package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
  79. package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
  80. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
  81. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
  82. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
  83. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
  84. package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
  85. package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
  86. package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
  87. package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
  88. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
  89. package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
  90. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
  91. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
  92. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
  93. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
  94. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
  95. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
  96. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
  97. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
  98. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
  99. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
  100. package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
  101. package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
  102. package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
  103. package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
  104. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
  105. package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
  106. package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
  107. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
  108. package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
  109. package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
  110. package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
  111. package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
  112. package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
  113. package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
  114. package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
  115. package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
  116. package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
  117. package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
  118. package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
  119. package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
  120. package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
  121. package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
  122. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
  123. package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
  124. package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
  125. package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
  126. package/src/duckdb/src/include/duckdb.h +12 -0
  127. package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
  128. package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
  129. package/src/duckdb/src/main/client_verify.cpp +1 -0
  130. package/src/duckdb/src/main/config.cpp +2 -2
  131. package/src/duckdb/src/main/connection.cpp +3 -3
  132. package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
  133. package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
  134. package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
  135. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
  136. package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
  137. package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
  138. package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
  139. package/src/duckdb/src/planner/logical_operator.cpp +1 -1
  140. package/src/duckdb/src/planner/planner.cpp +1 -1
  141. package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
  142. package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
  143. package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
  144. package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
  145. package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
  146. package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
  147. package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
  148. package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
  149. package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
  150. package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
  151. package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
  152. package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
  153. package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
  154. package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
  155. package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
  156. package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
  157. package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
  158. package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
  159. package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
  160. package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
  161. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  162. package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
  163. package/src/duckdb/src/storage/table/row_group.cpp +68 -1
  164. package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
  165. package/src/duckdb/src/storage/wal_replay.cpp +2 -2
  166. package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
  167. package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
  168. package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
  169. package/src/duckdb/ub_src_execution.cpp +0 -2
  170. package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
  171. package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
  172. package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
  173. package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
  174. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
  175. package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
  176. package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
  177. package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
  178. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
  179. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
  180. package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp
@@ -1,72 +0,0 @@
- #include "duckdb/execution/operator/persistent/csv_buffer.hpp"
- #include "duckdb/common/string_util.hpp"
-
- namespace duckdb {
-
- CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
- idx_t &global_csv_current_position, idx_t file_number_p)
- : context(context), first_buffer(true), file_number(file_number_p) {
- this->handle = AllocateBuffer(buffer_size_p);
-
- auto buffer = Ptr();
- actual_size = file_handle.Read(buffer, buffer_size_p);
- global_csv_start = global_csv_current_position;
- global_csv_current_position += actual_size;
- if (actual_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
- start_position += 3;
- }
- last_buffer = file_handle.FinishedReading();
- }
-
- CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
- bool final_buffer, idx_t global_csv_current_position, idx_t file_number_p)
- : context(context), handle(std::move(buffer_p)), actual_size(actual_size_p), last_buffer(final_buffer),
- global_csv_start(global_csv_current_position), file_number(file_number_p) {
- }
-
- unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
- idx_t file_number_p) {
- auto next_buffer = AllocateBuffer(buffer_size);
- idx_t next_buffer_actual_size = file_handle.Read(next_buffer.Ptr(), buffer_size);
- if (next_buffer_actual_size == 0) {
- // We are done reading
- return nullptr;
- }
-
- auto next_csv_buffer =
- make_uniq<CSVBuffer>(context, std::move(next_buffer), buffer_size, next_buffer_actual_size,
- file_handle.FinishedReading(), global_csv_current_position, file_number_p);
- global_csv_current_position += next_buffer_actual_size;
- return next_csv_buffer;
- }
-
- BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
- auto &buffer_manager = BufferManager::GetBufferManager(context);
- return buffer_manager.Allocate(MaxValue<idx_t>(Storage::BLOCK_SIZE, buffer_size));
- }
-
- idx_t CSVBuffer::GetBufferSize() {
- return actual_size;
- }
-
- idx_t CSVBuffer::GetStart() {
- return start_position;
- }
-
- bool CSVBuffer::IsCSVFileLastBuffer() {
- return last_buffer;
- }
-
- bool CSVBuffer::IsCSVFileFirstBuffer() {
- return first_buffer;
- }
-
- idx_t CSVBuffer::GetCSVGlobalStart() {
- return global_csv_start;
- }
-
- idx_t CSVBuffer::GetFileNumber() {
- return file_number;
- }
-
- } // namespace duckdb
package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp
@@ -1,158 +0,0 @@
- #include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
-
- namespace duckdb {
-
- CSVFileHandle::CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptr<FileHandle> file_handle_p,
- const string &path_p, FileCompressionType compression, bool enable_reset)
- : fs(fs), allocator(allocator), file_handle(std::move(file_handle_p)), path(path_p), compression(compression),
- reset_enabled(enable_reset) {
- can_seek = file_handle->CanSeek();
- on_disk_file = file_handle->OnDiskFile();
- file_size = file_handle->GetFileSize();
- }
-
- unique_ptr<FileHandle> CSVFileHandle::OpenFileHandle(FileSystem &fs, Allocator &allocator, const string &path,
- FileCompressionType compression) {
- auto file_handle = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression);
- if (file_handle->CanSeek()) {
- file_handle->Reset();
- }
- return file_handle;
- }
-
- unique_ptr<CSVFileHandle> CSVFileHandle::OpenFile(FileSystem &fs, Allocator &allocator, const string &path,
- FileCompressionType compression, bool enable_reset) {
- auto file_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression);
- return make_uniq<CSVFileHandle>(fs, allocator, std::move(file_handle), path, compression, enable_reset);
- }
-
- bool CSVFileHandle::CanSeek() {
- return can_seek;
- }
-
- void CSVFileHandle::Seek(idx_t position) {
- if (!can_seek) {
- throw InternalException("Cannot seek in this file");
- }
- file_handle->Seek(position);
- }
-
- idx_t CSVFileHandle::SeekPosition() {
- if (!can_seek) {
- throw InternalException("Cannot seek in this file");
- }
- return file_handle->SeekPosition();
- }
-
- void CSVFileHandle::Reset() {
- requested_bytes = 0;
- read_position = 0;
- if (can_seek) {
- // we can seek - reset the file handle
- file_handle->Reset();
- } else if (on_disk_file) {
- // we cannot seek but it is an on-disk file - re-open the file
- file_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression);
- } else {
- if (!reset_enabled) {
- throw InternalException("Reset called but reset is not enabled for this CSV Handle");
- }
- read_position = 0;
- }
- }
- bool CSVFileHandle::OnDiskFile() {
- return on_disk_file;
- }
-
- idx_t CSVFileHandle::FileSize() {
- return file_size;
- }
-
- bool CSVFileHandle::FinishedReading() {
- return requested_bytes >= file_size;
- }
-
- idx_t CSVFileHandle::Read(void *buffer, idx_t nr_bytes) {
- requested_bytes += nr_bytes;
- if (on_disk_file || can_seek) {
- // if this is a plain file source OR we can seek we are not caching anything
- return file_handle->Read(buffer, nr_bytes);
- }
- // not a plain file source: we need to do some bookkeeping around the reset functionality
- idx_t result_offset = 0;
- if (read_position < buffer_size) {
- // we need to read from our cached buffer
- auto buffer_read_count = MinValue<idx_t>(nr_bytes, buffer_size - read_position);
- memcpy(buffer, cached_buffer.get() + read_position, buffer_read_count);
- result_offset += buffer_read_count;
- read_position += buffer_read_count;
- if (result_offset == nr_bytes) {
- return nr_bytes;
- }
- } else if (!reset_enabled && cached_buffer.IsSet()) {
- // reset is disabled, but we still have cached data
- // we can remove any cached data
- cached_buffer.Reset();
- buffer_size = 0;
- buffer_capacity = 0;
- read_position = 0;
- }
- // we have data left to read from the file
- // read directly into the buffer
- auto bytes_read = file_handle->Read(char_ptr_cast(buffer) + result_offset, nr_bytes - result_offset);
- file_size = file_handle->GetFileSize();
- read_position += bytes_read;
- if (reset_enabled) {
- // if reset caching is enabled, we need to cache the bytes that we have read
- if (buffer_size + bytes_read >= buffer_capacity) {
- // no space; first enlarge the buffer
- buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(buffer_size + bytes_read), buffer_capacity * 2);
-
- auto new_buffer = allocator.Allocate(buffer_capacity);
- if (buffer_size > 0) {
- memcpy(new_buffer.get(), cached_buffer.get(), buffer_size);
- }
- cached_buffer = std::move(new_buffer);
- }
- memcpy(cached_buffer.get() + buffer_size, char_ptr_cast(buffer) + result_offset, bytes_read);
- buffer_size += bytes_read;
- }
-
- return result_offset + bytes_read;
- }
-
- string CSVFileHandle::ReadLine() {
- bool carriage_return = false;
- string result;
- char buffer[1];
- while (true) {
- idx_t bytes_read = Read(buffer, 1);
- if (bytes_read == 0) {
- return result;
- }
- if (carriage_return) {
- if (buffer[0] != '\n') {
- if (!file_handle->CanSeek()) {
- throw BinderException(
- "Carriage return newlines not supported when reading CSV files in which we cannot seek");
- }
- file_handle->Seek(file_handle->SeekPosition() - 1);
- return result;
- }
- }
- if (buffer[0] == '\n') {
- return result;
- }
- if (buffer[0] != '\r') {
- result += buffer[0];
- } else {
- carriage_return = true;
- }
- }
- }
-
- void CSVFileHandle::DisableReset() {
- this->reset_enabled = false;
- }
-
- } // namespace duckdb
package/src/duckdb/src/execution/partitionable_hashtable.cpp
@@ -1,207 +0,0 @@
- #include "duckdb/execution/partitionable_hashtable.hpp"
-
- #include "duckdb/common/radix_partitioning.hpp"
-
- namespace duckdb {
-
- RadixPartitionInfo::RadixPartitionInfo(const idx_t n_partitions_upper_bound)
- : n_partitions(PreviousPowerOfTwo(n_partitions_upper_bound)),
- radix_bits(RadixPartitioning::RadixBits(n_partitions)), radix_mask(RadixPartitioning::Mask(radix_bits)),
- radix_shift(RadixPartitioning::Shift(radix_bits)) {
-
- D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
- D_ASSERT(n_partitions > 0);
- D_ASSERT(n_partitions == RadixPartitioning::NumberOfPartitions(radix_bits));
- D_ASSERT(IsPowerOfTwo(n_partitions));
- }
-
- PartitionableHashTable::PartitionableHashTable(ClientContext &context, Allocator &allocator,
- RadixPartitionInfo &partition_info_p, vector<LogicalType> group_types_p,
- vector<LogicalType> payload_types_p,
- vector<BoundAggregateExpression *> bindings_p)
- : context(context), allocator(allocator), group_types(std::move(group_types_p)),
- payload_types(std::move(payload_types_p)), bindings(std::move(bindings_p)), is_partitioned(false),
- partition_info(partition_info_p), hashes(LogicalType::HASH), hashes_subset(LogicalType::HASH) {
-
- sel_vectors.resize(partition_info.n_partitions);
- sel_vector_sizes.resize(partition_info.n_partitions);
- group_subset.Initialize(allocator, group_types);
- if (!payload_types.empty()) {
- payload_subset.Initialize(allocator, payload_types);
- }
-
- for (hash_t r = 0; r < partition_info.n_partitions; r++) {
- sel_vectors[r].Initialize();
- }
-
- RowLayout layout;
- layout.Initialize(group_types, AggregateObject::CreateAggregateObjects(bindings));
- tuple_size = layout.GetRowWidth();
- }
-
- HtEntryType PartitionableHashTable::GetHTEntrySize() {
- // we need at least STANDARD_VECTOR_SIZE entries to fit in the hash table
- if (GroupedAggregateHashTable::GetMaxCapacity(HtEntryType::HT_WIDTH_32, tuple_size) < STANDARD_VECTOR_SIZE) {
- return HtEntryType::HT_WIDTH_64;
- }
- return HtEntryType::HT_WIDTH_32;
- }
-
- bool OverMemoryLimit(ClientContext &context, const bool is_partitioned, const RadixPartitionInfo &partition_info,
- const GroupedAggregateHashTable &ht) {
- const auto n_partitions = is_partitioned ? partition_info.n_partitions : 1;
- const auto max_memory = BufferManager::GetBufferManager(context).GetMaxMemory();
- const auto num_threads = TaskScheduler::GetScheduler(context).NumberOfThreads();
- const auto memory_per_partition = 0.6 * max_memory / num_threads / n_partitions;
- return ht.TotalSize() > memory_per_partition;
- }
-
- idx_t PartitionableHashTable::ListAddChunk(HashTableList &list, DataChunk &groups, Vector &group_hashes,
- DataChunk &payload, const unsafe_vector<idx_t> &filter) {
- // If this is false, a single AddChunk would overflow the max capacity
- D_ASSERT(list.empty() || groups.size() <= list.back()->MaxCapacity());
- if (list.empty() || list.back()->Count() + groups.size() >= list.back()->MaxCapacity() ||
- OverMemoryLimit(context, is_partitioned, partition_info, *list.back())) {
- idx_t new_capacity = GroupedAggregateHashTable::InitialCapacity();
- if (!list.empty()) {
- new_capacity = list.back()->Capacity();
- // early release first part of ht and prevent adding of more data
- list.back()->Finalize();
- }
- list.push_back(make_uniq<GroupedAggregateHashTable>(context, allocator, group_types, payload_types, bindings,
- GetHTEntrySize(), new_capacity));
- }
- return list.back()->AddChunk(append_state, groups, group_hashes, payload, filter);
- }
-
- idx_t PartitionableHashTable::AddChunk(DataChunk &groups, DataChunk &payload, bool do_partition,
- const unsafe_vector<idx_t> &filter) {
- groups.Hash(hashes);
-
- // we partition when we are asked to or when the unpartitioned ht runs out of space
- if (!IsPartitioned() && do_partition) {
- Partition(false);
- }
-
- if (!IsPartitioned()) {
- return ListAddChunk(unpartitioned_hts, groups, hashes, payload, filter);
- }
-
- // makes no sense to do this with 1 partition
- D_ASSERT(partition_info.n_partitions > 0);
-
- for (hash_t r = 0; r < partition_info.n_partitions; r++) {
- sel_vector_sizes[r] = 0;
- }
-
- hashes.Flatten(groups.size());
- auto hashes_ptr = FlatVector::GetData<hash_t>(hashes);
-
- // Determine for every partition how much data will be sinked into it
- for (idx_t i = 0; i < groups.size(); i++) {
- auto partition = partition_info.GetHashPartition(hashes_ptr[i]);
- D_ASSERT(partition < partition_info.n_partitions);
- sel_vectors[partition].set_index(sel_vector_sizes[partition]++, i);
- }
-
- #ifdef DEBUG
- // make sure we have lost no rows
- idx_t total_count = 0;
- for (idx_t r = 0; r < partition_info.n_partitions; r++) {
- total_count += sel_vector_sizes[r];
- }
- D_ASSERT(total_count == groups.size());
- #endif
- idx_t group_count = 0;
- for (hash_t r = 0; r < partition_info.n_partitions; r++) {
- group_subset.Slice(groups, sel_vectors[r], sel_vector_sizes[r]);
- if (!payload_types.empty()) {
- payload_subset.Slice(payload, sel_vectors[r], sel_vector_sizes[r]);
- } else {
- payload_subset.SetCardinality(sel_vector_sizes[r]);
- }
- hashes_subset.Slice(hashes, sel_vectors[r], sel_vector_sizes[r]);
-
- group_count += ListAddChunk(radix_partitioned_hts[r], group_subset, hashes_subset, payload_subset, filter);
- }
- return group_count;
- }
-
- void PartitionableHashTable::Partition(bool sink_done) {
- D_ASSERT(!IsPartitioned());
- D_ASSERT(radix_partitioned_hts.empty());
- D_ASSERT(partition_info.n_partitions > 1);
-
- vector<GroupedAggregateHashTable *> partition_hts(partition_info.n_partitions);
- radix_partitioned_hts.resize(partition_info.n_partitions);
- for (auto &unpartitioned_ht : unpartitioned_hts) {
- for (idx_t r = 0; r < partition_info.n_partitions; r++) {
- radix_partitioned_hts[r].push_back(make_uniq<GroupedAggregateHashTable>(
- context, allocator, group_types, payload_types, bindings, GetHTEntrySize()));
- partition_hts[r] = radix_partitioned_hts[r].back().get();
- }
- unpartitioned_ht->Partition(partition_hts, partition_info.radix_bits, sink_done);
- unpartitioned_ht.reset();
- }
- unpartitioned_hts.clear();
- is_partitioned = true;
- }
-
- bool PartitionableHashTable::IsPartitioned() {
- return is_partitioned;
- }
-
- HashTableList PartitionableHashTable::GetPartition(idx_t partition) {
- D_ASSERT(IsPartitioned());
- D_ASSERT(partition < partition_info.n_partitions);
- D_ASSERT(radix_partitioned_hts.size() > partition);
- return std::move(radix_partitioned_hts[partition]);
- }
-
- HashTableList PartitionableHashTable::GetUnpartitioned() {
- D_ASSERT(!IsPartitioned());
- return std::move(unpartitioned_hts);
- }
-
- idx_t PartitionableHashTable::GetPartitionCount(idx_t partition) const {
- idx_t total_size = 0;
- for (const auto &ht : radix_partitioned_hts[partition]) {
- total_size += ht->Count();
- }
- return total_size;
- }
-
- idx_t PartitionableHashTable::GetPartitionSize(idx_t partition) const {
- idx_t total_size = 0;
- for (const auto &ht : radix_partitioned_hts[partition]) {
- total_size += ht->DataSize();
- }
- return total_size;
- }
-
- void PartitionableHashTable::Finalize() {
- if (IsPartitioned()) {
- for (auto &ht_list : radix_partitioned_hts) {
- for (auto &ht : ht_list) {
- D_ASSERT(ht);
- ht->Finalize();
- }
- }
- } else {
- for (auto &ht : unpartitioned_hts) {
- D_ASSERT(ht);
- ht->Finalize();
- }
- }
- }
-
- void PartitionableHashTable::Append(GroupedAggregateHashTable &ht) {
- if (unpartitioned_hts.empty()) {
- unpartitioned_hts.push_back(make_uniq<GroupedAggregateHashTable>(context, allocator, group_types, payload_types,
- bindings, GetHTEntrySize(),
- GroupedAggregateHashTable::InitialCapacity()));
- }
- unpartitioned_hts.back()->Append(ht);
- }
-
- } // namespace duckdb
package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp
@@ -1,133 +0,0 @@
- //===----------------------------------------------------------------------===//
- // DuckDB
- //
- // duckdb/execution/operator/persistent/base_csv_reader.hpp
- //
- //
- //===----------------------------------------------------------------------===//
-
- #pragma once
-
- #include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
-
- namespace duckdb {
- struct CopyInfo;
- struct CSVFileHandle;
- struct FileHandle;
- struct StrpTimeFormat;
-
- class FileOpener;
- class FileSystem;
-
- //! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
- //! position given that we encounter a byte with the given value.
- /*! For example, if we have a string "ABAC", the shifts array will have the following values:
- * [0] --> ['A'] = 1, all others = 0
- * [1] --> ['B'] = 2, ['A'] = 1, all others = 0
- * [2] --> ['A'] = 3, all others = 0
- * [3] --> ['C'] = 4 (match), 'B' = 2, 'A' = 1, all others = 0
- * Suppose we then search in the following string "ABABAC", our progression will be as follows:
- * 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
- */
- struct TextSearchShiftArray {
- TextSearchShiftArray();
- explicit TextSearchShiftArray(string search_term);
-
- inline bool Match(uint8_t &position, uint8_t byte_value) {
- if (position >= length) {
- return false;
- }
- position = shifts[position * 255 + byte_value];
- return position == length;
- }
-
- idx_t length;
- unique_ptr<uint8_t[]> shifts;
- };
-
- //! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
- class BufferedCSVReader : public BaseCSVReader {
- //! Initial buffer read size; can be extended for long lines
- static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
- //! Larger buffer size for non disk files
- static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
-
- public:
- BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
- const vector<LogicalType> &requested_types = vector<LogicalType>());
- BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
- const vector<LogicalType> &requested_types = vector<LogicalType>());
- virtual ~BufferedCSVReader() {
- }
-
- unsafe_unique_array<char> buffer;
- idx_t buffer_size;
- idx_t position;
- idx_t start = 0;
-
- vector<unsafe_unique_array<char>> cached_buffers;
-
- unique_ptr<CSVFileHandle> file_handle;
-
- TextSearchShiftArray delimiter_search, escape_search, quote_search;
-
- public:
- //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
- void ParseCSV(DataChunk &insert_chunk);
- static string ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
-
- private:
- //! Initialize Parser
- void Initialize(const vector<LogicalType> &requested_types);
- //! Skips skip_rows, reads header row from input stream
- void SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header);
- //! Jumps back to the beginning of input stream and resets necessary internal states
- void JumpToBeginning(idx_t skip_rows, bool skip_header);
- //! Resets the buffer
- void ResetBuffer();
- //! Resets the steam
- void ResetStream();
- //! Reads a new buffer from the CSV file if the current one has been exhausted
- bool ReadBuffer(idx_t &start, idx_t &line_start);
- //! Jumps back to the beginning of input stream and resets necessary internal states
- bool JumpToNextSample();
- //! Initializes the TextSearchShiftArrays for complex parser
- void PrepareComplexParser();
- //! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
- void ParseCSV(ParserMode mode);
- //! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
- bool TryParseCSV(ParserMode mode);
- //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
- bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
-
- //! Parses a CSV file with a one-byte delimiter, escape and quote character
- bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message);
- //! Parses more complex CSV files with multi-byte delimiters, escapes or quotes
- bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message);
- //! Sniffs CSV dialect and determines skip rows, header row, column types and column names
- vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
-
- //! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
- void DetectDialect(const vector<LogicalType> &requested_types, BufferedCSVReaderOptions &original_options,
- vector<BufferedCSVReaderOptions> &info_candidates, idx_t &best_num_cols);
- //! Second phase of auto detection: detect candidate types for each column
- void DetectCandidateTypes(const vector<LogicalType> &type_candidates,
- const map<LogicalTypeId, vector<const char *>> &format_template_candidates,
- const vector<BufferedCSVReaderOptions> &info_candidates,
- BufferedCSVReaderOptions &original_options, idx_t best_num_cols,
- vector<vector<LogicalType>> &best_sql_types_candidates,
- std::map<LogicalTypeId, vector<string>> &best_format_candidates,
- DataChunk &best_header_row);
- //! Third phase of auto detection: detect header of CSV file
- void DetectHeader(const vector<vector<LogicalType>> &best_sql_types_candidates, const DataChunk &best_header_row);
- //! Fourth phase of auto detection: refine the types of each column and select which types to use for each column
- vector<LogicalType> RefineTypeDetection(const vector<LogicalType> &type_candidates,
- const vector<LogicalType> &requested_types,
- vector<vector<LogicalType>> &best_sql_types_candidates,
- map<LogicalTypeId, vector<string>> &best_format_candidates);
-
- //! Skip Empty lines for tables with over one column
- void SkipEmptyLines();
- };
-
- } // namespace duckdb
package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp
@@ -1,74 +0,0 @@
- //===----------------------------------------------------------------------===//
- // DuckDB
- //
- // duckdb/execution/operator/persistent/csv_buffer.hpp
- //
- //
- //===----------------------------------------------------------------------===//
-
- #pragma once
-
- #include "duckdb/common/constants.hpp"
- #include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
- #include "duckdb/storage/buffer_manager.hpp"
-
- namespace duckdb {
-
- class CSVBuffer {
- public:
- //! Colossal buffer size for multi-threading
- static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
-
- //! Constructor for Initial Buffer
- CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
- idx_t &global_csv_current_position, idx_t file_number);
-
- //! Constructor for `Next()` Buffers
- CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
- idx_t global_csv_current_position, idx_t file_number);
-
- //! Creates a new buffer with the next part of the CSV File
- unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
- idx_t file_number);
-
- //! Gets the buffer actual size
- idx_t GetBufferSize();
-
- //! Gets the start position of the buffer, only relevant for the first time it's scanned
- idx_t GetStart();
-
- //! If this buffer is the last buffer of the CSV File
- bool IsCSVFileLastBuffer();
-
- //! If this buffer is the first buffer of the CSV File
- bool IsCSVFileFirstBuffer();
-
- idx_t GetCSVGlobalStart();
-
- idx_t GetFileNumber();
-
- BufferHandle AllocateBuffer(idx_t buffer_size);
-
- char *Ptr() {
- return char_ptr_cast(handle.Ptr());
- }
-
- private:
- ClientContext &context;
-
- BufferHandle handle;
- //! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
- idx_t actual_size;
- //! We need to check for Byte Order Mark, to define the start position of this buffer
- //! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
- idx_t start_position = 0;
- //! If this is the last buffer of the CSV File
- bool last_buffer = false;
- //! If this is the first buffer of the CSV File
- bool first_buffer = false;
- //! Global position from the CSV File where this buffer starts
- idx_t global_csv_start = 0;
- //! Number of the file that is in this buffer
- idx_t file_number = 0;
- };
- } // namespace duckdb