duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -1,72 +0,0 @@
|
|
1
|
-
#include "duckdb/execution/operator/persistent/csv_buffer.hpp"
|
2
|
-
#include "duckdb/common/string_util.hpp"
|
3
|
-
|
4
|
-
namespace duckdb {
|
5
|
-
|
6
|
-
// Builds the first buffer of a CSV file: allocates storage, performs one read from
// file_handle, and records where this buffer starts in the global byte stream.
//   buffer_size_p               - number of bytes requested from the file
//   global_csv_current_position - in/out: stored as this buffer's global start, then
//                                 advanced by the number of bytes actually read
//   file_number_p               - stored as file_number
CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
                     idx_t &global_csv_current_position, idx_t file_number_p)
    : context(context), first_buffer(true), file_number(file_number_p) {
    handle = AllocateBuffer(buffer_size_p);

    auto data = Ptr();
    actual_size = file_handle.Read(data, buffer_size_p);

    global_csv_start = global_csv_current_position;
    global_csv_current_position += actual_size;

    // A leading EF BB BF sequence (the UTF-8 byte order mark) is skipped by moving the start forward
    const bool starts_with_bom = actual_size >= 3 && data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF';
    if (starts_with_bom) {
        start_position += 3;
    }

    last_buffer = file_handle.FinishedReading();
}
|
20
|
-
|
21
|
-
// Wraps an already-filled buffer handle. Every stored field comes straight from the
// arguments; buffer_size_p is accepted but not stored by this constructor.
CSVBuffer::CSVBuffer(ClientContext &context, BufferHandle buffer_p, idx_t buffer_size_p, idx_t actual_size_p,
                     bool final_buffer, idx_t global_csv_current_position, idx_t file_number_p)
    : context(context),
      handle(std::move(buffer_p)),
      actual_size(actual_size_p),
      last_buffer(final_buffer),
      global_csv_start(global_csv_current_position),
      file_number(file_number_p) {
}
|
26
|
-
|
27
|
-
// Reads the next chunk of the file into a freshly allocated buffer.
// Returns nullptr when the read yields zero bytes; otherwise returns a new CSVBuffer whose
// global start is the incoming global_csv_current_position, and advances that position by
// the number of bytes read.
unique_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
                                      idx_t file_number_p) {
    auto fresh_handle = AllocateBuffer(buffer_size);
    const idx_t bytes_read = file_handle.Read(fresh_handle.Ptr(), buffer_size);
    if (bytes_read == 0) {
        // Nothing left to read
        return nullptr;
    }

    const bool is_final = file_handle.FinishedReading();
    auto result = make_uniq<CSVBuffer>(context, std::move(fresh_handle), buffer_size, bytes_read, is_final,
                                       global_csv_current_position, file_number_p);
    global_csv_current_position += bytes_read;
    return result;
}
|
42
|
-
|
43
|
-
// Allocates a buffer through the context's buffer manager. The allocation is never smaller
// than Storage::BLOCK_SIZE, even when a smaller buffer_size is requested.
BufferHandle CSVBuffer::AllocateBuffer(idx_t buffer_size) {
    const idx_t allocation_size = MaxValue<idx_t>(Storage::BLOCK_SIZE, buffer_size);
    return BufferManager::GetBufferManager(context).Allocate(allocation_size);
}
|
47
|
-
|
48
|
-
// Returns actual_size, i.e. the number of bytes stored for this buffer (not the allocated capacity).
idx_t CSVBuffer::GetBufferSize() {
    return this->actual_size;
}
|
51
|
-
|
52
|
-
// Returns start_position; the file-reading constructor moves it forward by 3 when the
// buffer begins with a UTF-8 byte order mark.
idx_t CSVBuffer::GetStart() {
    return this->start_position;
}
|
55
|
-
|
56
|
-
bool CSVBuffer::IsCSVFileLastBuffer() {
|
57
|
-
return last_buffer;
|
58
|
-
}
|
59
|
-
|
60
|
-
bool CSVBuffer::IsCSVFileFirstBuffer() {
|
61
|
-
return first_buffer;
|
62
|
-
}
|
63
|
-
|
64
|
-
// Returns global_csv_start: the global position that was current when this buffer was created.
idx_t CSVBuffer::GetCSVGlobalStart() {
    return this->global_csv_start;
}
|
67
|
-
|
68
|
-
// Returns the file number this buffer was constructed with.
idx_t CSVBuffer::GetFileNumber() {
    return this->file_number;
}
|
71
|
-
|
72
|
-
} // namespace duckdb
|
@@ -1,158 +0,0 @@
|
|
1
|
-
#include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
|
2
|
-
|
3
|
-
namespace duckdb {
|
4
|
-
|
5
|
-
// Takes ownership of an open file handle and caches its seekability, on-disk status and
// size so later calls do not need to query the underlying handle again.
CSVFileHandle::CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptr<FileHandle> file_handle_p,
                             const string &path_p, FileCompressionType compression, bool enable_reset)
    : fs(fs),
      allocator(allocator),
      file_handle(std::move(file_handle_p)),
      path(path_p),
      compression(compression),
      reset_enabled(enable_reset) {
    // Query the member handle: file_handle_p has been moved from at this point
    auto &opened = *file_handle;
    can_seek = opened.CanSeek();
    on_disk_file = opened.OnDiskFile();
    file_size = opened.GetFileSize();
}
|
13
|
-
|
14
|
-
// Opens path for reading (no lock) with the given compression and, when the resulting handle
// is seekable, resets it before returning. The allocator argument is not used here.
unique_ptr<FileHandle> CSVFileHandle::OpenFileHandle(FileSystem &fs, Allocator &allocator, const string &path,
                                                     FileCompressionType compression) {
    auto opened = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression);
    const bool seekable = opened->CanSeek();
    if (seekable) {
        opened->Reset();
    }
    return opened;
}
|
22
|
-
|
23
|
-
// Convenience factory: opens the raw file handle via OpenFileHandle and wraps it in a CSVFileHandle.
unique_ptr<CSVFileHandle> CSVFileHandle::OpenFile(FileSystem &fs, Allocator &allocator, const string &path,
                                                  FileCompressionType compression, bool enable_reset) {
    unique_ptr<FileHandle> raw_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression);
    auto csv_handle = make_uniq<CSVFileHandle>(fs, allocator, std::move(raw_handle), path, compression, enable_reset);
    return csv_handle;
}
|
28
|
-
|
29
|
-
bool CSVFileHandle::CanSeek() {
|
30
|
-
return can_seek;
|
31
|
-
}
|
32
|
-
|
33
|
-
// Moves the underlying file handle to position.
// Throws InternalException when the handle was recorded as non-seekable.
void CSVFileHandle::Seek(idx_t position) {
    if (can_seek) {
        file_handle->Seek(position);
        return;
    }
    throw InternalException("Cannot seek in this file");
}
|
39
|
-
|
40
|
-
// Returns the current position of the underlying file handle.
// Throws InternalException when the handle was recorded as non-seekable.
idx_t CSVFileHandle::SeekPosition() {
    if (can_seek) {
        return file_handle->SeekPosition();
    }
    throw InternalException("Cannot seek in this file");
}
|
46
|
-
|
47
|
-
void CSVFileHandle::Reset() {
|
48
|
-
requested_bytes = 0;
|
49
|
-
read_position = 0;
|
50
|
-
if (can_seek) {
|
51
|
-
// we can seek - reset the file handle
|
52
|
-
file_handle->Reset();
|
53
|
-
} else if (on_disk_file) {
|
54
|
-
// we cannot seek but it is an on-disk file - re-open the file
|
55
|
-
file_handle = CSVFileHandle::OpenFileHandle(fs, allocator, path, compression);
|
56
|
-
} else {
|
57
|
-
if (!reset_enabled) {
|
58
|
-
throw InternalException("Reset called but reset is not enabled for this CSV Handle");
|
59
|
-
}
|
60
|
-
read_position = 0;
|
61
|
-
}
|
62
|
-
}
|
63
|
-
bool CSVFileHandle::OnDiskFile() {
|
64
|
-
return on_disk_file;
|
65
|
-
}
|
66
|
-
|
67
|
-
// Returns the cached file size (set by the constructor and refreshed by the caching read path).
idx_t CSVFileHandle::FileSize() {
    return this->file_size;
}
|
70
|
-
|
71
|
-
bool CSVFileHandle::FinishedReading() {
|
72
|
-
return requested_bytes >= file_size;
|
73
|
-
}
|
74
|
-
|
75
|
-
// Reads up to nr_bytes into buffer and returns the number of bytes delivered.
//
// On-disk or seekable sources are read straight from the underlying handle. Any other
// source goes through a replay cache so that Reset() can start over:
//   * bytes already in the cache are served first;
//   * the remainder is read from the file and, while reset is enabled, appended to the cache;
//   * once reset is disabled and the cache has been fully consumed, the cache is released.
// Every call adds nr_bytes to requested_bytes, regardless of how many bytes are returned.
idx_t CSVFileHandle::Read(void *buffer, idx_t nr_bytes) {
    requested_bytes += nr_bytes;
    if (on_disk_file || can_seek) {
        // plain file or seekable source: no caching required
        return file_handle->Read(buffer, nr_bytes);
    }

    // bookkeeping path for sources that can only be rewound through the cache
    idx_t served_from_cache = 0;
    if (read_position < buffer_size) {
        // part (or all) of the request can be answered from the cache
        const idx_t cached_remaining = buffer_size - read_position;
        served_from_cache = MinValue<idx_t>(nr_bytes, cached_remaining);
        memcpy(buffer, cached_buffer.get() + read_position, served_from_cache);
        read_position += served_from_cache;
        if (served_from_cache == nr_bytes) {
            return nr_bytes;
        }
    } else if (!reset_enabled && cached_buffer.IsSet()) {
        // reset has been disabled and the cache is exhausted: drop it
        cached_buffer.Reset();
        buffer_size = 0;
        buffer_capacity = 0;
        read_position = 0;
    }

    // fetch whatever is still missing directly into the caller's buffer
    auto destination = char_ptr_cast(buffer) + served_from_cache;
    auto bytes_read = file_handle->Read(destination, nr_bytes - served_from_cache);
    file_size = file_handle->GetFileSize();
    read_position += bytes_read;

    if (reset_enabled) {
        // keep a copy of the new bytes so a later Reset() can replay them
        const idx_t cache_size_after = buffer_size + bytes_read;
        if (cache_size_after >= buffer_capacity) {
            // grow the cache: at least double, and at least the next power of two that fits
            buffer_capacity = MaxValue<idx_t>(NextPowerOfTwo(cache_size_after), buffer_capacity * 2);

            auto enlarged = allocator.Allocate(buffer_capacity);
            if (buffer_size > 0) {
                memcpy(enlarged.get(), cached_buffer.get(), buffer_size);
            }
            cached_buffer = std::move(enlarged);
        }
        memcpy(cached_buffer.get() + buffer_size, destination, bytes_read);
        buffer_size = cache_size_after;
    }

    return served_from_cache + bytes_read;
}
|
123
|
-
|
124
|
-
// Reads one line, one byte at a time, and returns it without its line terminator.
// Recognised terminators: "\n", "\r\n", and a lone "\r". End of input also ends the line.
//
// A lone "\r" can only be detected by reading one byte too many; that byte is handed back
// by seeking the underlying handle one position backwards. If the handle cannot seek, a
// BinderException is thrown.
string CSVFileHandle::ReadLine() {
    string line;
    bool pending_carriage_return = false;
    char current;
    while (true) {
        if (Read(&current, 1) == 0) {
            // end of input
            return line;
        }
        if (pending_carriage_return && current != '\n') {
            // previous byte was a lone '\r': un-read the byte we just consumed
            if (!file_handle->CanSeek()) {
                throw BinderException(
                    "Carriage return newlines not supported when reading CSV files in which we cannot seek");
            }
            file_handle->Seek(file_handle->SeekPosition() - 1);
            // Read() counted the look-ahead byte as requested; take it back so that
            // FinishedReading() does not report end-of-file one byte too early.
            requested_bytes--;
            return line;
        }
        if (current == '\n') {
            return line;
        }
        if (current == '\r') {
            pending_carriage_return = true;
        } else {
            line += current;
        }
    }
}
|
153
|
-
|
154
|
-
void CSVFileHandle::DisableReset() {
|
155
|
-
this->reset_enabled = false;
|
156
|
-
}
|
157
|
-
|
158
|
-
} // namespace duckdb
|
@@ -1,207 +0,0 @@
|
|
1
|
-
#include "duckdb/execution/partitionable_hashtable.hpp"
|
2
|
-
|
3
|
-
#include "duckdb/common/radix_partitioning.hpp"
|
4
|
-
|
5
|
-
namespace duckdb {
|
6
|
-
|
7
|
-
// Derives the radix partitioning parameters from an upper bound on the partition count.
// The bound is rounded down to a power of two; bits, mask and shift are derived from that.
RadixPartitionInfo::RadixPartitionInfo(const idx_t n_partitions_upper_bound)
    : n_partitions(PreviousPowerOfTwo(n_partitions_upper_bound)),
      radix_bits(RadixPartitioning::RadixBits(n_partitions)),
      radix_mask(RadixPartitioning::Mask(radix_bits)),
      radix_shift(RadixPartitioning::Shift(radix_bits)) {

    // sanity checks on the derived values
    D_ASSERT(n_partitions > 0);
    D_ASSERT(IsPowerOfTwo(n_partitions));
    D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
    D_ASSERT(n_partitions == RadixPartitioning::NumberOfPartitions(radix_bits));
}
|
17
|
-
|
18
|
-
// Sets up an (initially unpartitioned) hash table wrapper: one selection vector and one
// counter per partition, scratch chunks for slicing groups/payload, and the row width of
// the group + aggregate layout (used by GetHTEntrySize).
PartitionableHashTable::PartitionableHashTable(ClientContext &context, Allocator &allocator,
                                               RadixPartitionInfo &partition_info_p, vector<LogicalType> group_types_p,
                                               vector<LogicalType> payload_types_p,
                                               vector<BoundAggregateExpression *> bindings_p)
    : context(context), allocator(allocator), group_types(std::move(group_types_p)),
      payload_types(std::move(payload_types_p)), bindings(std::move(bindings_p)), is_partitioned(false),
      partition_info(partition_info_p), hashes(LogicalType::HASH), hashes_subset(LogicalType::HASH) {

    const auto partition_count = partition_info.n_partitions;
    sel_vectors.resize(partition_count);
    sel_vector_sizes.resize(partition_count);

    group_subset.Initialize(allocator, group_types);
    const bool has_payload = !payload_types.empty();
    if (has_payload) {
        payload_subset.Initialize(allocator, payload_types);
    }

    for (auto &sel_vector : sel_vectors) {
        sel_vector.Initialize();
    }

    // only the row width is needed; the layout itself is discarded
    RowLayout row_layout;
    row_layout.Initialize(group_types, AggregateObject::CreateAggregateObjects(bindings));
    tuple_size = row_layout.GetRowWidth();
}
|
41
|
-
|
42
|
-
// Picks the hash table entry width. 32-bit entries are used unless a 32-bit table could
// hold fewer than STANDARD_VECTOR_SIZE rows of tuple_size bytes, in which case 64-bit is used.
HtEntryType PartitionableHashTable::GetHTEntrySize() {
    const auto capacity_32 = GroupedAggregateHashTable::GetMaxCapacity(HtEntryType::HT_WIDTH_32, tuple_size);
    return capacity_32 < STANDARD_VECTOR_SIZE ? HtEntryType::HT_WIDTH_64 : HtEntryType::HT_WIDTH_32;
}
|
49
|
-
|
50
|
-
// Returns true when ht is larger than its share of the memory budget. The share is
// 60% of the buffer manager's maximum memory, divided by the number of scheduler threads
// and by the number of partitions (1 when not partitioned).
bool OverMemoryLimit(ClientContext &context, const bool is_partitioned, const RadixPartitionInfo &partition_info,
                     const GroupedAggregateHashTable &ht) {
    const auto thread_count = TaskScheduler::GetScheduler(context).NumberOfThreads();
    const auto memory_limit = BufferManager::GetBufferManager(context).GetMaxMemory();
    const auto partition_count = is_partitioned ? partition_info.n_partitions : 1;

    const auto budget = 0.6 * memory_limit / thread_count / partition_count;
    return ht.TotalSize() > budget;
}
|
58
|
-
|
59
|
-
// Adds a chunk to the last hash table in list, first starting a new hash table when the
// list is empty, the last table would reach its maximum capacity, or it exceeds its memory
// share. A replaced table is finalized, and the new table starts at the old one's capacity.
// Returns whatever the hash table's AddChunk returns.
idx_t PartitionableHashTable::ListAddChunk(HashTableList &list, DataChunk &groups, Vector &group_hashes,
                                           DataChunk &payload, const unsafe_vector<idx_t> &filter) {
    // If this is false, a single AddChunk would overflow the max capacity
    D_ASSERT(list.empty() || groups.size() <= list.back()->MaxCapacity());

    bool start_new_table = list.empty();
    if (!start_new_table) {
        auto &current = *list.back();
        start_new_table = current.Count() + groups.size() >= current.MaxCapacity() ||
                          OverMemoryLimit(context, is_partitioned, partition_info, current);
    }

    if (start_new_table) {
        idx_t capacity = GroupedAggregateHashTable::InitialCapacity();
        if (!list.empty()) {
            auto &previous = *list.back();
            capacity = previous.Capacity();
            // early release first part of ht and prevent adding of more data
            previous.Finalize();
        }
        list.push_back(make_uniq<GroupedAggregateHashTable>(context, allocator, group_types, payload_types, bindings,
                                                            GetHTEntrySize(), capacity));
    }

    return list.back()->AddChunk(append_state, groups, group_hashes, payload, filter);
}
|
76
|
-
|
77
|
-
// Hashes the group columns and adds the chunk to the hash table(s).
//   do_partition - when true and the table is not yet partitioned, partition it first
// Unpartitioned: the whole chunk goes to the unpartitioned list.
// Partitioned: rows are routed by hash into per-partition selection vectors, and each
// partition receives its slice. Returns the sum of the per-call AddChunk results.
idx_t PartitionableHashTable::AddChunk(DataChunk &groups, DataChunk &payload, bool do_partition,
                                       const unsafe_vector<idx_t> &filter) {
    groups.Hash(hashes);

    // switch to partitioned mode on request
    if (do_partition && !IsPartitioned()) {
        Partition(false);
    }

    if (!IsPartitioned()) {
        return ListAddChunk(unpartitioned_hts, groups, hashes, payload, filter);
    }

    D_ASSERT(partition_info.n_partitions > 0);
    const auto partition_count = partition_info.n_partitions;
    const auto row_count = groups.size();

    // start every partition's selection vector empty
    for (hash_t p = 0; p < partition_count; p++) {
        sel_vector_sizes[p] = 0;
    }

    hashes.Flatten(row_count);
    auto hash_values = FlatVector::GetData<hash_t>(hashes);

    // route each row to the partition selected by its hash
    for (idx_t row = 0; row < row_count; row++) {
        const auto target = partition_info.GetHashPartition(hash_values[row]);
        D_ASSERT(target < partition_count);
        const auto slot = sel_vector_sizes[target]++;
        sel_vectors[target].set_index(slot, row);
    }

#ifdef DEBUG
    // every row must have landed in exactly one partition
    idx_t routed_rows = 0;
    for (idx_t p = 0; p < partition_count; p++) {
        routed_rows += sel_vector_sizes[p];
    }
    D_ASSERT(routed_rows == row_count);
#endif

    idx_t added_groups = 0;
    for (hash_t p = 0; p < partition_count; p++) {
        auto &selection = sel_vectors[p];
        const auto selected = sel_vector_sizes[p];

        group_subset.Slice(groups, selection, selected);
        if (payload_types.empty()) {
            // no payload columns: only the cardinality needs to match
            payload_subset.SetCardinality(selected);
        } else {
            payload_subset.Slice(payload, selection, selected);
        }
        hashes_subset.Slice(hashes, selection, selected);

        added_groups += ListAddChunk(radix_partitioned_hts[p], group_subset, hashes_subset, payload_subset, filter);
    }
    return added_groups;
}
|
129
|
-
|
130
|
-
void PartitionableHashTable::Partition(bool sink_done) {
|
131
|
-
D_ASSERT(!IsPartitioned());
|
132
|
-
D_ASSERT(radix_partitioned_hts.empty());
|
133
|
-
D_ASSERT(partition_info.n_partitions > 1);
|
134
|
-
|
135
|
-
vector<GroupedAggregateHashTable *> partition_hts(partition_info.n_partitions);
|
136
|
-
radix_partitioned_hts.resize(partition_info.n_partitions);
|
137
|
-
for (auto &unpartitioned_ht : unpartitioned_hts) {
|
138
|
-
for (idx_t r = 0; r < partition_info.n_partitions; r++) {
|
139
|
-
radix_partitioned_hts[r].push_back(make_uniq<GroupedAggregateHashTable>(
|
140
|
-
context, allocator, group_types, payload_types, bindings, GetHTEntrySize()));
|
141
|
-
partition_hts[r] = radix_partitioned_hts[r].back().get();
|
142
|
-
}
|
143
|
-
unpartitioned_ht->Partition(partition_hts, partition_info.radix_bits, sink_done);
|
144
|
-
unpartitioned_ht.reset();
|
145
|
-
}
|
146
|
-
unpartitioned_hts.clear();
|
147
|
-
is_partitioned = true;
|
148
|
-
}
|
149
|
-
|
150
|
-
bool PartitionableHashTable::IsPartitioned() {
|
151
|
-
return is_partitioned;
|
152
|
-
}
|
153
|
-
|
154
|
-
// Hands over the hash table list of one partition. The list is moved out of this object,
// so the partition's slot is left in a moved-from state afterwards.
HashTableList PartitionableHashTable::GetPartition(idx_t partition) {
    D_ASSERT(IsPartitioned());
    D_ASSERT(partition < partition_info.n_partitions);
    D_ASSERT(radix_partitioned_hts.size() > partition);

    auto &partition_list = radix_partitioned_hts[partition];
    return std::move(partition_list);
}
|
160
|
-
|
161
|
-
// Hands over the unpartitioned hash table list by moving it out of this object.
// Only valid while the table has not been partitioned.
HashTableList PartitionableHashTable::GetUnpartitioned() {
    D_ASSERT(!IsPartitioned());
    auto &unpartitioned_list = unpartitioned_hts;
    return std::move(unpartitioned_list);
}
|
165
|
-
|
166
|
-
// Sums Count() over all hash tables of the given partition.
idx_t PartitionableHashTable::GetPartitionCount(idx_t partition) const {
    const auto &partition_list = radix_partitioned_hts[partition];
    idx_t entry_count = 0;
    for (auto it = partition_list.begin(); it != partition_list.end(); ++it) {
        entry_count += (*it)->Count();
    }
    return entry_count;
}
|
173
|
-
|
174
|
-
// Sums DataSize() over all hash tables of the given partition.
idx_t PartitionableHashTable::GetPartitionSize(idx_t partition) const {
    const auto &partition_list = radix_partitioned_hts[partition];
    idx_t data_size = 0;
    for (auto it = partition_list.begin(); it != partition_list.end(); ++it) {
        data_size += (*it)->DataSize();
    }
    return data_size;
}
|
181
|
-
|
182
|
-
void PartitionableHashTable::Finalize() {
|
183
|
-
if (IsPartitioned()) {
|
184
|
-
for (auto &ht_list : radix_partitioned_hts) {
|
185
|
-
for (auto &ht : ht_list) {
|
186
|
-
D_ASSERT(ht);
|
187
|
-
ht->Finalize();
|
188
|
-
}
|
189
|
-
}
|
190
|
-
} else {
|
191
|
-
for (auto &ht : unpartitioned_hts) {
|
192
|
-
D_ASSERT(ht);
|
193
|
-
ht->Finalize();
|
194
|
-
}
|
195
|
-
}
|
196
|
-
}
|
197
|
-
|
198
|
-
void PartitionableHashTable::Append(GroupedAggregateHashTable &ht) {
|
199
|
-
if (unpartitioned_hts.empty()) {
|
200
|
-
unpartitioned_hts.push_back(make_uniq<GroupedAggregateHashTable>(context, allocator, group_types, payload_types,
|
201
|
-
bindings, GetHTEntrySize(),
|
202
|
-
GroupedAggregateHashTable::InitialCapacity()));
|
203
|
-
}
|
204
|
-
unpartitioned_hts.back()->Append(ht);
|
205
|
-
}
|
206
|
-
|
207
|
-
} // namespace duckdb
|
@@ -1,133 +0,0 @@
|
|
1
|
-
//===----------------------------------------------------------------------===//
|
2
|
-
// DuckDB
|
3
|
-
//
|
4
|
-
// duckdb/execution/operator/persistent/buffered_csv_reader.hpp
|
5
|
-
//
|
6
|
-
//
|
7
|
-
//===----------------------------------------------------------------------===//
|
8
|
-
|
9
|
-
#pragma once
|
10
|
-
|
11
|
-
#include "duckdb/execution/operator/persistent/base_csv_reader.hpp"
|
12
|
-
|
13
|
-
namespace duckdb {
|
14
|
-
struct CopyInfo;
|
15
|
-
struct CSVFileHandle;
|
16
|
-
struct FileHandle;
|
17
|
-
struct StrpTimeFormat;
|
18
|
-
|
19
|
-
class FileOpener;
|
20
|
-
class FileSystem;
|
21
|
-
|
22
|
-
//! The shifts array allows for linear searching of multi-byte values. For each position, it determines the next
|
23
|
-
//! position given that we encounter a byte with the given value.
|
24
|
-
/*! For example, if we have a string "ABAC", the shifts array will have the following values:
|
25
|
-
* [0] --> ['A'] = 1, all others = 0
|
26
|
-
* [1] --> ['B'] = 2, ['A'] = 1, all others = 0
|
27
|
-
* [2] --> ['A'] = 3, all others = 0
|
28
|
-
* [3] --> ['C'] = 4 (match), ['B'] = 2, ['A'] = 1, all others = 0
|
29
|
-
* Suppose we then search in the following string "ABABAC", our progression will be as follows:
|
30
|
-
* 'A' -> [1], 'B' -> [2], 'A' -> [3], 'B' -> [2], 'A' -> [3], 'C' -> [4] (match!)
|
31
|
-
*/
|
32
|
-
struct TextSearchShiftArray {
|
33
|
-
TextSearchShiftArray();
|
34
|
-
explicit TextSearchShiftArray(string search_term);
|
35
|
-
|
36
|
-
inline bool Match(uint8_t &position, uint8_t byte_value) {
|
37
|
-
if (position >= length) {
|
38
|
-
return false;
|
39
|
-
}
|
40
|
-
position = shifts[position * 255 + byte_value];
|
41
|
-
return position == length;
|
42
|
-
}
|
43
|
-
|
44
|
-
idx_t length;
|
45
|
-
unique_ptr<uint8_t[]> shifts;
|
46
|
-
};
|
47
|
-
|
48
|
-
//! Buffered CSV reader is a class that reads values from a stream and parses them as a CSV file
|
49
|
-
class BufferedCSVReader : public BaseCSVReader {
|
50
|
-
//! Initial buffer read size; can be extended for long lines
|
51
|
-
static constexpr idx_t INITIAL_BUFFER_SIZE = 16384;
|
52
|
-
//! Larger buffer size for non-disk files
|
53
|
-
static constexpr idx_t INITIAL_BUFFER_SIZE_LARGE = 10000000; // 10MB
|
54
|
-
|
55
|
-
public:
|
56
|
-
BufferedCSVReader(ClientContext &context, BufferedCSVReaderOptions options,
|
57
|
-
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
58
|
-
BufferedCSVReader(ClientContext &context, string filename, BufferedCSVReaderOptions options,
|
59
|
-
const vector<LogicalType> &requested_types = vector<LogicalType>());
|
60
|
-
virtual ~BufferedCSVReader() {
|
61
|
-
}
|
62
|
-
|
63
|
-
unsafe_unique_array<char> buffer;
|
64
|
-
idx_t buffer_size;
|
65
|
-
idx_t position;
|
66
|
-
idx_t start = 0;
|
67
|
-
|
68
|
-
vector<unsafe_unique_array<char>> cached_buffers;
|
69
|
-
|
70
|
-
unique_ptr<CSVFileHandle> file_handle;
|
71
|
-
|
72
|
-
TextSearchShiftArray delimiter_search, escape_search, quote_search;
|
73
|
-
|
74
|
-
public:
|
75
|
-
//! Extracts a single DataChunk from the CSV file and stores it in insert_chunk
|
76
|
-
void ParseCSV(DataChunk &insert_chunk);
|
77
|
-
static string ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
|
78
|
-
|
79
|
-
private:
|
80
|
-
//! Initialize Parser
|
81
|
-
void Initialize(const vector<LogicalType> &requested_types);
|
82
|
-
//! Skips skip_rows, reads header row from input stream
|
83
|
-
void SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header);
|
84
|
-
//! Jumps back to the beginning of input stream and resets necessary internal states
|
85
|
-
void JumpToBeginning(idx_t skip_rows, bool skip_header);
|
86
|
-
//! Resets the buffer
|
87
|
-
void ResetBuffer();
|
88
|
-
//! Resets the stream
|
89
|
-
void ResetStream();
|
90
|
-
//! Reads a new buffer from the CSV file if the current one has been exhausted
|
91
|
-
bool ReadBuffer(idx_t &start, idx_t &line_start);
|
92
|
-
//! Jumps to the next sample of the input stream (NOTE(review): the previous comment duplicated JumpToBeginning's; confirm the meaning of the returned bool)
|
93
|
-
bool JumpToNextSample();
|
94
|
-
//! Initializes the TextSearchShiftArrays for complex parser
|
95
|
-
void PrepareComplexParser();
|
96
|
-
//! Try to parse a single datachunk from the file. Throws an exception if anything goes wrong.
|
97
|
-
void ParseCSV(ParserMode mode);
|
98
|
-
//! Try to parse a single datachunk from the file. Returns whether or not the parsing is successful
|
99
|
-
bool TryParseCSV(ParserMode mode);
|
100
|
-
//! Extracts a single DataChunk from the CSV file and stores it in insert_chunk
|
101
|
-
bool TryParseCSV(ParserMode mode, DataChunk &insert_chunk, string &error_message);
|
102
|
-
|
103
|
-
//! Parses a CSV file with a one-byte delimiter, escape and quote character
|
104
|
-
bool TryParseSimpleCSV(DataChunk &insert_chunk, string &error_message);
|
105
|
-
//! Parses more complex CSV files with multi-byte delimiters, escapes or quotes
|
106
|
-
bool TryParseComplexCSV(DataChunk &insert_chunk, string &error_message);
|
107
|
-
//! Sniffs CSV dialect and determines skip rows, header row, column types and column names
|
108
|
-
vector<LogicalType> SniffCSV(const vector<LogicalType> &requested_types);
|
109
|
-
|
110
|
-
//! First phase of auto detection: detect CSV dialect (i.e. delimiter, quote rules, etc)
|
111
|
-
void DetectDialect(const vector<LogicalType> &requested_types, BufferedCSVReaderOptions &original_options,
|
112
|
-
vector<BufferedCSVReaderOptions> &info_candidates, idx_t &best_num_cols);
|
113
|
-
//! Second phase of auto detection: detect candidate types for each column
|
114
|
-
void DetectCandidateTypes(const vector<LogicalType> &type_candidates,
|
115
|
-
const map<LogicalTypeId, vector<const char *>> &format_template_candidates,
|
116
|
-
const vector<BufferedCSVReaderOptions> &info_candidates,
|
117
|
-
BufferedCSVReaderOptions &original_options, idx_t best_num_cols,
|
118
|
-
vector<vector<LogicalType>> &best_sql_types_candidates,
|
119
|
-
std::map<LogicalTypeId, vector<string>> &best_format_candidates,
|
120
|
-
DataChunk &best_header_row);
|
121
|
-
//! Third phase of auto detection: detect header of CSV file
|
122
|
-
void DetectHeader(const vector<vector<LogicalType>> &best_sql_types_candidates, const DataChunk &best_header_row);
|
123
|
-
//! Fourth phase of auto detection: refine the types of each column and select which types to use for each column
|
124
|
-
vector<LogicalType> RefineTypeDetection(const vector<LogicalType> &type_candidates,
|
125
|
-
const vector<LogicalType> &requested_types,
|
126
|
-
vector<vector<LogicalType>> &best_sql_types_candidates,
|
127
|
-
map<LogicalTypeId, vector<string>> &best_format_candidates);
|
128
|
-
|
129
|
-
//! Skips empty lines for tables with more than one column
|
130
|
-
void SkipEmptyLines();
|
131
|
-
};
|
132
|
-
|
133
|
-
} // namespace duckdb
|
@@ -1,74 +0,0 @@
|
|
1
|
-
//===----------------------------------------------------------------------===//
|
2
|
-
// DuckDB
|
3
|
-
//
|
4
|
-
// duckdb/execution/operator/persistent/csv_buffer.hpp
|
5
|
-
//
|
6
|
-
//
|
7
|
-
//===----------------------------------------------------------------------===//
|
8
|
-
|
9
|
-
#pragma once
|
10
|
-
|
11
|
-
#include "duckdb/common/constants.hpp"
|
12
|
-
#include "duckdb/execution/operator/persistent/csv_file_handle.hpp"
|
13
|
-
#include "duckdb/storage/buffer_manager.hpp"
|
14
|
-
|
15
|
-
namespace duckdb {
|
16
|
-
|
17
|
-
class CSVBuffer {
|
18
|
-
public:
|
19
|
-
//! Colossal buffer size for multi-threading
|
20
|
-
static constexpr idx_t INITIAL_BUFFER_SIZE_COLOSSAL = 32000000; // 32MB
|
21
|
-
|
22
|
-
//! Constructor for Initial Buffer
|
23
|
-
CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
|
24
|
-
idx_t &global_csv_current_position, idx_t file_number);
|
25
|
-
|
26
|
-
//! Constructor for `Next()` Buffers
|
27
|
-
CSVBuffer(ClientContext &context, BufferHandle handle, idx_t buffer_size_p, idx_t actual_size_p, bool final_buffer,
|
28
|
-
idx_t global_csv_current_position, idx_t file_number);
|
29
|
-
|
30
|
-
//! Creates a new buffer with the next part of the CSV File
|
31
|
-
unique_ptr<CSVBuffer> Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t &global_csv_current_position,
|
32
|
-
idx_t file_number);
|
33
|
-
|
34
|
-
//! Gets the buffer's actual size
|
35
|
-
idx_t GetBufferSize();
|
36
|
-
|
37
|
-
//! Gets the start position of the buffer, only relevant for the first time it's scanned
|
38
|
-
idx_t GetStart();
|
39
|
-
|
40
|
-
//! If this buffer is the last buffer of the CSV File
|
41
|
-
bool IsCSVFileLastBuffer();
|
42
|
-
|
43
|
-
//! If this buffer is the first buffer of the CSV File
|
44
|
-
bool IsCSVFileFirstBuffer();
|
45
|
-
|
46
|
-
idx_t GetCSVGlobalStart();
|
47
|
-
|
48
|
-
idx_t GetFileNumber();
|
49
|
-
|
50
|
-
BufferHandle AllocateBuffer(idx_t buffer_size);
|
51
|
-
|
52
|
-
char *Ptr() {
|
53
|
-
return char_ptr_cast(handle.Ptr());
|
54
|
-
}
|
55
|
-
|
56
|
-
private:
|
57
|
-
ClientContext &context;
|
58
|
-
|
59
|
-
BufferHandle handle;
|
60
|
-
//! Actual size can be smaller than the buffer size in case we allocate it too optimistically.
|
61
|
-
idx_t actual_size;
|
62
|
-
//! We need to check for Byte Order Mark, to define the start position of this buffer
|
63
|
-
//! https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8
|
64
|
-
idx_t start_position = 0;
|
65
|
-
//! If this is the last buffer of the CSV File
|
66
|
-
bool last_buffer = false;
|
67
|
-
//! If this is the first buffer of the CSV File
|
68
|
-
bool first_buffer = false;
|
69
|
-
//! Global position from the CSV File where this buffer starts
|
70
|
-
idx_t global_csv_start = 0;
|
71
|
-
//! Number of the file that is in this buffer
|
72
|
-
idx_t file_number = 0;
|
73
|
-
};
|
74
|
-
} // namespace duckdb
|