duckdb 0.8.2-dev3458.0 → 0.8.2-dev3949.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/package.json +1 -1
- package/src/duckdb/extension/icu/icu_extension.cpp +5 -5
- package/src/duckdb/extension/json/include/json_deserializer.hpp +7 -16
- package/src/duckdb/extension/json/include/json_serializer.hpp +9 -15
- package/src/duckdb/extension/json/json_deserializer.cpp +29 -67
- package/src/duckdb/extension/json/json_scan.cpp +1 -1
- package/src/duckdb/extension/json/json_serializer.cpp +26 -69
- package/src/duckdb/src/common/enum_util.cpp +119 -7
- package/src/duckdb/src/common/extra_type_info.cpp +7 -3
- package/src/duckdb/src/common/radix_partitioning.cpp +8 -31
- package/src/duckdb/src/common/row_operations/row_aggregate.cpp +18 -3
- package/src/duckdb/src/common/serializer/binary_deserializer.cpp +62 -77
- package/src/duckdb/src/common/serializer/binary_serializer.cpp +84 -84
- package/src/duckdb/src/common/serializer/format_serializer.cpp +1 -1
- package/src/duckdb/src/common/sort/partition_state.cpp +41 -33
- package/src/duckdb/src/common/types/data_chunk.cpp +44 -8
- package/src/duckdb/src/common/types/hyperloglog.cpp +21 -0
- package/src/duckdb/src/common/types/interval.cpp +3 -0
- package/src/duckdb/src/common/types/row/partitioned_tuple_data.cpp +252 -126
- package/src/duckdb/src/common/types/row/row_layout.cpp +3 -31
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +40 -32
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +39 -26
- package/src/duckdb/src/common/types/row/tuple_data_layout.cpp +11 -1
- package/src/duckdb/src/common/types/row/tuple_data_segment.cpp +21 -16
- package/src/duckdb/src/common/types/value.cpp +63 -42
- package/src/duckdb/src/common/types/vector.cpp +33 -67
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +3 -2
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +222 -364
- package/src/duckdb/src/execution/join_hashtable.cpp +5 -6
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +240 -310
- package/src/duckdb/src/execution/operator/aggregate/physical_ungrouped_aggregate.cpp +202 -173
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +36 -2
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/base_csv_reader.cpp +58 -162
- package/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +434 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +80 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer_manager.cpp +90 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_file_handle.cpp +95 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/csv_reader_options.cpp +47 -28
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine.cpp +35 -0
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +107 -0
- package/src/duckdb/src/execution/operator/{persistent → csv_scanner}/parallel_csv_reader.cpp +44 -44
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +52 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +336 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +165 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +398 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +175 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +39 -0
- package/src/duckdb/src/execution/operator/join/physical_asof_join.cpp +1 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +1 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +614 -574
- package/src/duckdb/src/execution/window_executor.cpp +6 -5
- package/src/duckdb/src/function/cast/cast_function_set.cpp +1 -0
- package/src/duckdb/src/function/scalar/strftime_format.cpp +4 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +94 -96
- package/src/duckdb/src/function/table/read_csv.cpp +150 -136
- package/src/duckdb/src/function/table/table_scan.cpp +0 -2
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enum_util.hpp +24 -0
- package/src/duckdb/src/include/duckdb/common/file_opener.hpp +9 -0
- package/src/duckdb/src/include/duckdb/common/fixed_size_map.hpp +208 -0
- package/src/duckdb/src/include/duckdb/common/optional_idx.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/perfect_map_set.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/printer.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/serializer/binary_deserializer.hpp +43 -30
- package/src/duckdb/src/include/duckdb/common/serializer/binary_serializer.hpp +36 -35
- package/src/duckdb/src/include/duckdb/common/serializer/deserialization_data.hpp +18 -0
- package/src/duckdb/src/include/duckdb/common/serializer/encoding_util.hpp +132 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +125 -150
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +119 -107
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/shared_ptr.hpp +8 -0
- package/src/duckdb/src/include/duckdb/common/sort/partition_state.hpp +13 -7
- package/src/duckdb/src/include/duckdb/common/types/data_chunk.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/hyperloglog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/common/types/interval.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp +41 -9
- package/src/duckdb/src/include/duckdb/common/types/row/row_data_collection_scanner.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/types/row/row_layout.hpp +1 -23
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_allocator.hpp +14 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +6 -3
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_layout.hpp +7 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_segment.hpp +13 -8
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/vector.hpp +2 -2
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +125 -146
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_hash_aggregate.hpp +5 -4
- package/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_window.hpp +4 -3
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/base_csv_reader.hpp +17 -17
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp +72 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +110 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp +103 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_file_handle.hpp +8 -15
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_line_info.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/csv_reader_options.hpp +52 -28
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +127 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +75 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +51 -0
- package/src/duckdb/src/include/duckdb/execution/operator/{persistent → scan/csv}/parallel_csv_reader.hpp +21 -27
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/quote_rules.hpp +21 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +18 -27
- package/src/duckdb/src/include/duckdb/function/function_serialization.hpp +5 -6
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +4 -4
- package/src/duckdb/src/include/duckdb/function/table/read_csv.hpp +17 -12
- package/src/duckdb/src/include/duckdb/main/client_context_file_opener.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -1
- package/src/duckdb/src/include/duckdb/main/config.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/connection.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/relation/read_csv_relation.hpp +6 -6
- package/src/duckdb/src/include/duckdb/parallel/event.hpp +12 -1
- package/src/duckdb/src/include/duckdb/storage/block.hpp +6 -0
- package/src/duckdb/src/include/duckdb/storage/buffer/block_handle.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/base_statistics.hpp +7 -3
- package/src/duckdb/src/include/duckdb/storage/statistics/column_statistics.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/distinct_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/list_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/numeric_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/string_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/statistics/struct_stats.hpp +3 -0
- package/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +15 -3
- package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +4 -0
- package/src/duckdb/src/include/duckdb/storage/table/table_statistics.hpp +5 -0
- package/src/duckdb/src/include/duckdb/verification/deserialized_statement_verifier_v2.hpp +6 -0
- package/src/duckdb/src/include/duckdb/verification/statement_verifier.hpp +1 -0
- package/src/duckdb/src/include/duckdb.h +12 -0
- package/src/duckdb/src/main/capi/logical_types-c.cpp +22 -0
- package/src/duckdb/src/main/client_context_file_opener.cpp +17 -0
- package/src/duckdb/src/main/client_verify.cpp +1 -0
- package/src/duckdb/src/main/config.cpp +2 -2
- package/src/duckdb/src/main/connection.cpp +3 -3
- package/src/duckdb/src/main/relation/read_csv_relation.cpp +19 -13
- package/src/duckdb/src/parallel/pipeline_finish_event.cpp +1 -1
- package/src/duckdb/src/parser/tableref/pivotref.cpp +0 -16
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_export.cpp +41 -25
- package/src/duckdb/src/planner/expression/bound_aggregate_expression.cpp +4 -4
- package/src/duckdb/src/planner/expression/bound_window_expression.cpp +10 -10
- package/src/duckdb/src/planner/logical_operator.cpp +1 -1
- package/src/duckdb/src/planner/planner.cpp +1 -1
- package/src/duckdb/src/storage/checkpoint_manager.cpp +4 -3
- package/src/duckdb/src/storage/serialization/serialize_constraint.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_create_info.cpp +5 -5
- package/src/duckdb/src/storage/serialization/serialize_expression.cpp +10 -10
- package/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +20 -20
- package/src/duckdb/src/storage/serialization/serialize_macro_function.cpp +2 -2
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +118 -89
- package/src/duckdb/src/storage/serialization/serialize_parse_info.cpp +3 -3
- package/src/duckdb/src/storage/serialization/serialize_parsed_expression.cpp +27 -27
- package/src/duckdb/src/storage/serialization/serialize_query_node.cpp +16 -16
- package/src/duckdb/src/storage/serialization/serialize_result_modifier.cpp +8 -8
- package/src/duckdb/src/storage/serialization/serialize_statement.cpp +1 -1
- package/src/duckdb/src/storage/serialization/serialize_storage.cpp +39 -0
- package/src/duckdb/src/storage/serialization/serialize_tableref.cpp +9 -9
- package/src/duckdb/src/storage/statistics/base_statistics.cpp +67 -4
- package/src/duckdb/src/storage/statistics/column_statistics.cpp +16 -0
- package/src/duckdb/src/storage/statistics/list_stats.cpp +21 -0
- package/src/duckdb/src/storage/statistics/numeric_stats.cpp +126 -1
- package/src/duckdb/src/storage/statistics/string_stats.cpp +23 -0
- package/src/duckdb/src/storage/statistics/struct_stats.cpp +27 -0
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/src/storage/table/chunk_info.cpp +82 -3
- package/src/duckdb/src/storage/table/row_group.cpp +68 -1
- package/src/duckdb/src/storage/table/table_statistics.cpp +21 -0
- package/src/duckdb/src/storage/wal_replay.cpp +2 -2
- package/src/duckdb/src/verification/deserialized_statement_verifier_v2.cpp +15 -1
- package/src/duckdb/src/verification/statement_verifier.cpp +2 -0
- package/src/duckdb/third_party/utf8proc/include/utf8proc_wrapper.hpp +8 -0
- package/src/duckdb/ub_src_execution.cpp +0 -2
- package/src/duckdb/ub_src_execution_operator_csv_scanner.cpp +18 -0
- package/src/duckdb/ub_src_execution_operator_csv_scanner_sniffer.cpp +12 -0
- package/src/duckdb/ub_src_execution_operator_persistent.cpp +0 -12
- package/src/duckdb/ub_src_storage_serialization.cpp +2 -0
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +0 -1487
- package/src/duckdb/src/execution/operator/persistent/csv_buffer.cpp +0 -72
- package/src/duckdb/src/execution/operator/persistent/csv_file_handle.cpp +0 -158
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +0 -207
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -133
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_buffer.hpp +0 -74
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +0 -73
@@ -0,0 +1,434 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/buffered_csv_reader.hpp"
|
2
|
+
|
3
|
+
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
|
4
|
+
#include "duckdb/common/file_system.hpp"
|
5
|
+
#include "duckdb/common/string_util.hpp"
|
6
|
+
#include "duckdb/common/to_string.hpp"
|
7
|
+
#include "duckdb/common/types/cast_helpers.hpp"
|
8
|
+
#include "duckdb/common/vector_operations/unary_executor.hpp"
|
9
|
+
#include "duckdb/common/vector_operations/vector_operations.hpp"
|
10
|
+
#include "duckdb/execution/operator/scan/csv/csv_sniffer.hpp"
|
11
|
+
#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp"
|
12
|
+
#include "duckdb/function/scalar/strftime_format.hpp"
|
13
|
+
#include "duckdb/main/client_data.hpp"
|
14
|
+
#include "duckdb/main/database.hpp"
|
15
|
+
#include "duckdb/main/error_manager.hpp"
|
16
|
+
#include "duckdb/parser/column_definition.hpp"
|
17
|
+
#include "duckdb/parser/keyword_helper.hpp"
|
18
|
+
#include "duckdb/storage/data_table.hpp"
|
19
|
+
#include "utf8proc.hpp"
|
20
|
+
#include "utf8proc_wrapper.hpp"
|
21
|
+
|
22
|
+
#include <algorithm>
|
23
|
+
#include <cctype>
|
24
|
+
#include <cstring>
|
25
|
+
#include <fstream>
|
26
|
+
|
27
|
+
namespace duckdb {
|
28
|
+
|
29
|
+
// Constructs a reader for the file referenced by the supplied options.
// The options are moved into the base reader; afterwards the file is opened and
// Initialize() sets up the column types and consumes skipped rows / the header.
BufferedCSVReader::BufferedCSVReader(ClientContext &context, CSVReaderOptions options_p,
                                     const vector<LogicalType> &requested_types)
    : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) {
	// `options` is the member populated by the base constructor (the parameter is options_p)
	this->file_handle = OpenCSV(context, this->options);
	this->Initialize(requested_types);
}
|
35
|
+
|
36
|
+
// Constructs a reader for an explicitly named file.
// `filename` replaces options.file_path before the file is opened; the remaining
// steps match the options-only constructor (open the file, then Initialize()).
BufferedCSVReader::BufferedCSVReader(ClientContext &context, string filename, CSVReaderOptions options_p,
                                     const vector<LogicalType> &requested_types)
    : BaseCSVReader(context, std::move(options_p), requested_types), buffer_size(0), position(0), start(0) {
	this->options.file_path = std::move(filename);
	this->file_handle = OpenCSV(context, this->options);
	this->Initialize(requested_types);
}
|
43
|
+
|
44
|
+
void BufferedCSVReader::Initialize(const vector<LogicalType> &requested_types) {
|
45
|
+
if (options.auto_detect && options.file_options.union_by_name) {
|
46
|
+
// This is required for the sniffer to work on Union By Name
|
47
|
+
D_ASSERT(options.file_path == file_handle->GetFilePath());
|
48
|
+
auto bm_file_handle = BaseCSVReader::OpenCSV(context, options);
|
49
|
+
auto csv_buffer_manager = make_shared<CSVBufferManager>(context, std::move(bm_file_handle), options);
|
50
|
+
CSVSniffer sniffer(options, csv_buffer_manager, state_machine_cache);
|
51
|
+
auto sniffer_result = sniffer.SniffCSV();
|
52
|
+
return_types = sniffer_result.return_types;
|
53
|
+
names = sniffer_result.names;
|
54
|
+
if (return_types.empty()) {
|
55
|
+
throw InvalidInputException("Failed to detect column types from CSV: is the file a valid CSV file?");
|
56
|
+
}
|
57
|
+
} else {
|
58
|
+
return_types = requested_types;
|
59
|
+
ResetBuffer();
|
60
|
+
}
|
61
|
+
SkipRowsAndReadHeader(options.dialect_options.skip_rows, options.dialect_options.header);
|
62
|
+
InitParseChunk(return_types.size());
|
63
|
+
}
|
64
|
+
|
65
|
+
// Drops the current buffer and all cached buffers and rewinds the buffer cursors to zero.
void BufferedCSVReader::ResetBuffer() {
	buffer.reset();
	// size, read position and value start all go back to the beginning
	start = position = buffer_size = 0;
	cached_buffers.clear();
}
|
72
|
+
|
73
|
+
// Consumes `skip_rows` raw lines from the file handle (counting each in linenr) and,
// when `skip_header` is set, parses one more line in PARSING_HEADER mode.
void BufferedCSVReader::SkipRowsAndReadHeader(idx_t skip_rows, bool skip_header) {
	idx_t skipped = 0;
	while (skipped < skip_rows) {
		// the content of a skipped line is not used
		string discarded_line = file_handle->ReadLine();
		linenr++;
		skipped++;
	}

	if (!skip_header) {
		return;
	}
	// the header line is run through the parser with a freshly initialized parse chunk
	InitParseChunk(return_types.size());
	ParseCSV(ParserMode::PARSING_HEADER);
}
|
86
|
+
|
87
|
+
// Builds the error text for COLUMN_TYPES entries that do not match any column of the file.
// sql_types_per_column: requested column name -> index map (taken by value, consumed here).
// names: column names found in the CSV file.
// Returns an empty string when every requested column exists, otherwise a message that
// lists the unknown column names, quoted and comma separated.
string BufferedCSVReader::ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column,
                                           const vector<string> &names) {
	// remove every requested column that is present in the file; the leftovers are unknown
	for (const auto &existing_name : names) {
		sql_types_per_column.erase(existing_name);
	}
	if (sql_types_per_column.empty()) {
		return string();
	}

	string unknown_columns;
	for (auto &entry : sql_types_per_column) {
		if (!unknown_columns.empty()) {
			unknown_columns += ",";
		}
		unknown_columns += "\"" + entry.first + "\"";
	}
	return "COLUMN_TYPES error: Columns with names: " + unknown_columns + " do not exist in the CSV File";
}
|
107
|
+
|
108
|
+
void BufferedCSVReader::SkipEmptyLines() {
|
109
|
+
if (parse_chunk.data.size() == 1) {
|
110
|
+
// Empty lines are null data.
|
111
|
+
return;
|
112
|
+
}
|
113
|
+
for (; position < buffer_size; position++) {
|
114
|
+
if (!StringUtil::CharacterIsNewline(buffer[position])) {
|
115
|
+
return;
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
// Records `line_length` in the client's debug_max_line_length when the debug flag
// debug_set_max_line_length is set and the new length is not smaller than the stored one.
void UpdateMaxLineLength(ClientContext &context, idx_t line_length) {
	auto &client_data = *context.client_data;
	if (client_data.debug_set_max_line_length && line_length >= client_data.debug_max_line_length) {
		client_data.debug_max_line_length = line_length;
	}
}
|
129
|
+
|
130
|
+
// Replaces the current buffer with a newly read one.
// The bytes of the old buffer from `start` onwards are copied to the front of the new buffer,
// followed by the bytes read from the file and a terminating '\0'.
// On return `start` is 0 (3 if a UTF-8 BOM was skipped on the first read), `line_start` equals
// `start`, and `position` is the offset of the first newly read byte (also moved past a BOM).
// Returns false if `start` lies beyond the buffer or if no bytes were read.
// Throws InvalidInputException if the carried-over part exceeds options.maximum_line_size.
bool BufferedCSVReader::ReadBuffer(idx_t &start, idx_t &line_start) {
	if (start > buffer_size) {
		return false;
	}
	auto previous = std::move(buffer);

	// unconsumed tail of the previous buffer
	const idx_t carried = buffer_size - start;

	// double the read size until it is at least as large as the carried-over tail
	idx_t read_size = INITIAL_BUFFER_SIZE_LARGE;
	while (read_size < carried) {
		read_size *= 2;
	}

	// a tail longer than the maximum line size means the line itself is too long
	if (carried > options.maximum_line_size) {
		throw InvalidInputException("Maximum line size of %llu bytes exceeded on line %s!", options.maximum_line_size,
		                            GetLineNumberStr(linenr, linenr_estimated));
	}

	// one extra byte holds the terminating '\0'
	buffer = make_unsafe_uniq_array<char>(read_size + carried + 1);
	buffer_size = carried + read_size;
	if (carried != 0) {
		memcpy(buffer.get(), previous.get() + start, carried);
	}
	const idx_t bytes_read = file_handle->Read(buffer.get() + carried, read_size);

	bytes_in_chunk += bytes_read;
	buffer_size = carried + bytes_read;
	buffer[buffer_size] = '\0';
	if (previous) {
		// the old allocation is kept in cached_buffers instead of being freed here
		// NOTE(review): presumably because already parsed values still point into it - confirm
		cached_buffers.push_back(std::move(previous));
	}

	start = 0;
	position = carried;
	if (!bom_checked) {
		// the byte order mark is only looked for once per reader
		bom_checked = true;
		const bool has_bom = bytes_read >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF';
		if (has_bom) {
			start = 3;
			position += 3;
		}
	}
	line_start = start;

	return bytes_read != 0;
}
|
178
|
+
|
179
|
+
// Parses rows into insert_chunk in PARSING mode.
// Throws InvalidInputException carrying the parser's error message if TryParseCSV fails.
void BufferedCSVReader::ParseCSV(DataChunk &insert_chunk) {
	string error_message;
	const bool parsed = TryParseCSV(ParserMode::PARSING, insert_chunk, error_message);
	if (parsed) {
		return;
	}
	throw InvalidInputException(error_message);
}
|
185
|
+
|
186
|
+
// Runs the parser in the given mode with a throw-away output chunk.
// Throws InvalidInputException carrying the parser's error message if TryParseCSV fails.
void BufferedCSVReader::ParseCSV(ParserMode mode) {
	string error_message;
	DataChunk dummy_chunk;
	const bool parsed = TryParseCSV(mode, dummy_chunk, error_message);
	if (parsed) {
		return;
	}
	throw InvalidInputException(error_message);
}
|
193
|
+
|
194
|
+
// Parses rows from the buffered input with a goto-driven state machine. The states are the labels
// value_start, normal, add_value, add_row, in_quotes, unquote, handle_escape, carriage_return and final_state.
// parser_mode is stored in `mode`; insert_chunk is passed to AddRow and, in PARSING mode at the end of
// the input, to Flush.
// Returns true when AddRow reports a finished chunk or when the input is exhausted.
// Returns false when error_message is non-empty after adding a row or when a quote/escape error is detected
// (error_message is filled in that case).
// Throws InvalidInputException when the input ends inside a quoted value.
bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) {
|
195
|
+
mode = parser_mode;
|
196
|
+
// used for parsing algorithm
|
197
|
+
bool finished_chunk = false;
|
198
|
+
idx_t column = 0;
|
199
|
+
// offset: number of trailing bytes left out of the value length (set to 1 after a closing quote)
idx_t offset = 0;
|
200
|
+
bool has_quotes = false;
|
201
|
+
// positions, relative to start, recorded for escapes inside the current quoted value
vector<idx_t> escape_positions;
|
202
|
+
|
203
|
+
idx_t line_start = position;
|
204
|
+
// characters counted on the current line; passed to UpdateMaxLineLength in final_state
idx_t line_size = 0;
|
205
|
+
// read values into the buffer (if any)
|
206
|
+
if (position >= buffer_size) {
|
207
|
+
if (!ReadBuffer(start, line_start)) {
|
208
|
+
// no input could be read: there is nothing to parse
return true;
|
209
|
+
}
|
210
|
+
}
|
211
|
+
|
212
|
+
// start parsing the first value
|
213
|
+
// (the label follows directly; the jump only names the entry state)
goto value_start;
|
214
|
+
value_start:
|
215
|
+
offset = 0;
|
216
|
+
/* state: value_start */
|
217
|
+
// this state parses the first character of a value
|
218
|
+
if (buffer[position] == options.dialect_options.state_machine_options.quote) {
|
219
|
+
// quote: actual value starts in the next position
|
220
|
+
// move to in_quotes state
|
221
|
+
start = position + 1;
|
222
|
+
line_size++;
|
223
|
+
goto in_quotes;
|
224
|
+
} else {
|
225
|
+
// no quote, move to normal parsing state
|
226
|
+
start = position;
|
227
|
+
goto normal;
|
228
|
+
}
|
229
|
+
normal:
|
230
|
+
/* state: normal parsing state */
|
231
|
+
// this state parses the remainder of a non-quoted value until we reach a delimiter or newline
|
232
|
+
do {
|
233
|
+
for (; position < buffer_size; position++) {
|
234
|
+
line_size++;
|
235
|
+
if (buffer[position] == options.dialect_options.state_machine_options.delimiter) {
|
236
|
+
// delimiter: end the value and add it to the chunk
|
237
|
+
goto add_value;
|
238
|
+
} else if (StringUtil::CharacterIsNewline(buffer[position])) {
|
239
|
+
// newline: add row
|
240
|
+
goto add_row;
|
241
|
+
}
|
242
|
+
}
|
243
|
+
} while (ReadBuffer(start, line_start));
|
244
|
+
// file ends during normal scan: go to end state
|
245
|
+
goto final_state;
|
246
|
+
add_value:
|
247
|
+
AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
|
248
|
+
// increase position by 1 and move start to the new position
|
249
|
+
offset = 0;
|
250
|
+
has_quotes = false;
|
251
|
+
start = ++position;
|
252
|
+
line_size++;
|
253
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
254
|
+
// file ends right after delimiter, go to final state
|
255
|
+
goto final_state;
|
256
|
+
}
|
257
|
+
goto value_start;
|
258
|
+
add_row : {
|
259
|
+
// check type of newline (\r or \n)
|
260
|
+
bool carriage_return = buffer[position] == '\r';
|
261
|
+
AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
|
262
|
+
if (!error_message.empty()) {
|
263
|
+
return false;
|
264
|
+
}
|
265
|
+
VerifyLineLength(position - line_start);
|
266
|
+
|
267
|
+
finished_chunk = AddRow(insert_chunk, column, error_message);
|
268
|
+
UpdateMaxLineLength(context, position - line_start);
|
269
|
+
if (!error_message.empty()) {
|
270
|
+
return false;
|
271
|
+
}
|
272
|
+
// increase position by 1 and move start to the new position
|
273
|
+
offset = 0;
|
274
|
+
has_quotes = false;
|
275
|
+
position++;
|
276
|
+
line_size = 0;
|
277
|
+
start = position;
|
278
|
+
line_start = position;
|
279
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
280
|
+
// file ends right after delimiter, go to final state
|
281
|
+
goto final_state;
|
282
|
+
}
|
283
|
+
if (carriage_return) {
|
284
|
+
// \r newline, go to special state that parses an optional \n afterwards
|
285
|
+
goto carriage_return;
|
286
|
+
} else {
|
287
|
+
SetNewLineDelimiter();
|
288
|
+
SkipEmptyLines();
|
289
|
+
|
290
|
+
start = position;
|
291
|
+
line_start = position;
|
292
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
293
|
+
// file ends right after delimiter, go to final state
|
294
|
+
goto final_state;
|
295
|
+
}
|
296
|
+
// \n newline, move to value start
|
297
|
+
if (finished_chunk) {
|
298
|
+
return true;
|
299
|
+
}
|
300
|
+
goto value_start;
|
301
|
+
}
|
302
|
+
}
|
303
|
+
in_quotes:
|
304
|
+
/* state: in_quotes */
|
305
|
+
// this state parses the remainder of a quoted value
|
306
|
+
has_quotes = true;
|
307
|
+
position++;
|
308
|
+
line_size++;
|
309
|
+
do {
|
310
|
+
for (; position < buffer_size; position++) {
|
311
|
+
line_size++;
|
312
|
+
if (buffer[position] == options.dialect_options.state_machine_options.quote) {
|
313
|
+
// quote: move to unquoted state
|
314
|
+
goto unquote;
|
315
|
+
} else if (buffer[position] == options.dialect_options.state_machine_options.escape) {
|
316
|
+
// escape: store the escaped position and move to handle_escape state
|
317
|
+
escape_positions.push_back(position - start);
|
318
|
+
goto handle_escape;
|
319
|
+
}
|
320
|
+
}
|
321
|
+
} while (ReadBuffer(start, line_start));
|
322
|
+
// still in quoted state at the end of the file, error:
|
323
|
+
throw InvalidInputException("Error in file \"%s\" on line %s: unterminated quotes. (%s)", options.file_path,
|
324
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
325
|
+
unquote:
|
326
|
+
/* state: unquote */
|
327
|
+
// this state handles the state directly after we unquote
|
328
|
+
// in this state we expect either another quote (entering the quoted state again, and escaping the quote)
|
329
|
+
// or a delimiter/newline, ending the current value and moving on to the next value
|
330
|
+
position++;
|
331
|
+
line_size++;
|
332
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
333
|
+
// file ends right after unquote, go to final state
|
334
|
+
offset = 1;
|
335
|
+
goto final_state;
|
336
|
+
}
|
337
|
+
if (buffer[position] == options.dialect_options.state_machine_options.quote &&
|
338
|
+
(options.dialect_options.state_machine_options.escape == '\0' ||
|
339
|
+
options.dialect_options.state_machine_options.escape == options.dialect_options.state_machine_options.quote)) {
|
340
|
+
// escaped quote, return to quoted state and store escape position
|
341
|
+
escape_positions.push_back(position - start);
|
342
|
+
goto in_quotes;
|
343
|
+
} else if (buffer[position] == options.dialect_options.state_machine_options.delimiter) {
|
344
|
+
// delimiter, add value
|
345
|
+
offset = 1;
|
346
|
+
goto add_value;
|
347
|
+
} else if (StringUtil::CharacterIsNewline(buffer[position])) {
|
348
|
+
offset = 1;
|
349
|
+
goto add_row;
|
350
|
+
} else {
|
351
|
+
error_message = StringUtil::Format(
|
352
|
+
"Error in file \"%s\" on line %s: quote should be followed by end of value, end of "
|
353
|
+
"row or another quote. (%s)",
|
354
|
+
options.file_path, GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
355
|
+
return false;
|
356
|
+
}
|
357
|
+
handle_escape:
|
358
|
+
/* state: handle_escape */
|
359
|
+
// escape should be followed by a quote or another escape character
|
360
|
+
position++;
|
361
|
+
line_size++;
|
362
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
363
|
+
error_message = StringUtil::Format(
|
364
|
+
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
365
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
366
|
+
return false;
|
367
|
+
}
|
368
|
+
if (buffer[position] != options.dialect_options.state_machine_options.quote &&
|
369
|
+
buffer[position] != options.dialect_options.state_machine_options.escape) {
|
370
|
+
error_message = StringUtil::Format(
|
371
|
+
"Error in file \"%s\" on line %s: neither QUOTE nor ESCAPE is proceeded by ESCAPE. (%s)", options.file_path,
|
372
|
+
GetLineNumberStr(linenr, linenr_estimated).c_str(), options.ToString());
|
373
|
+
return false;
|
374
|
+
}
|
375
|
+
// escape was followed by quote or escape, go back to quoted state
|
376
|
+
goto in_quotes;
|
377
|
+
carriage_return:
|
378
|
+
/* state: carriage_return */
|
379
|
+
// this stage optionally skips a newline (\n) character, which allows \r\n to be interpreted as a single line
|
380
|
+
if (buffer[position] == '\n') {
|
381
|
+
SetNewLineDelimiter(true, true);
|
382
|
+
// newline after carriage return: skip
|
383
|
+
// increase position by 1 and move start to the new position
|
384
|
+
start = ++position;
|
385
|
+
line_size++;
|
386
|
+
|
387
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
388
|
+
// file ends right after delimiter, go to final state
|
389
|
+
goto final_state;
|
390
|
+
}
|
391
|
+
} else {
|
392
|
+
SetNewLineDelimiter(true, false);
|
393
|
+
}
|
394
|
+
if (finished_chunk) {
|
395
|
+
return true;
|
396
|
+
}
|
397
|
+
SkipEmptyLines();
|
398
|
+
start = position;
|
399
|
+
line_start = position;
|
400
|
+
if (position >= buffer_size && !ReadBuffer(start, line_start)) {
|
401
|
+
// file ends right after delimiter, go to final state
|
402
|
+
goto final_state;
|
403
|
+
}
|
404
|
+
|
405
|
+
goto value_start;
|
406
|
+
final_state:
|
407
|
+
if (finished_chunk) {
|
408
|
+
return true;
|
409
|
+
}
|
410
|
+
|
411
|
+
if (column > 0 || position > start) {
|
412
|
+
// remaining values to be added to the chunk
|
413
|
+
AddValue(string_t(buffer.get() + start, position - start - offset), column, escape_positions, has_quotes);
|
414
|
+
VerifyLineLength(position - line_start);
|
415
|
+
|
416
|
+
finished_chunk = AddRow(insert_chunk, column, error_message);
|
417
|
+
SkipEmptyLines();
|
418
|
+
UpdateMaxLineLength(context, line_size);
|
419
|
+
if (!error_message.empty()) {
|
420
|
+
return false;
|
421
|
+
}
|
422
|
+
}
|
423
|
+
|
424
|
+
// final stage, only reached after parsing the file is finished
|
425
|
+
// flush the parsed chunk and finalize parsing
|
426
|
+
if (mode == ParserMode::PARSING) {
|
427
|
+
Flush(insert_chunk);
|
428
|
+
}
|
429
|
+
|
430
|
+
end_of_file_reached = true;
|
431
|
+
return true;
|
432
|
+
}
|
433
|
+
|
434
|
+
} // namespace duckdb
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
|
2
|
+
#include "duckdb/common/string_util.hpp"
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
// Builds the first buffer of a CSV file: allocates the backing block, fills it from the file handle
// and, when the data starts with a UTF-8 byte order mark, moves the start position past it.
CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle,
                     idx_t &global_csv_current_position, idx_t file_number_p)
    : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
	AllocateBuffer(buffer_size_p);
	auto data = Ptr();
	// file_size is the number of bytes the read actually produced, not the requested size
	file_size = file_handle.Read(data, buffer_size_p);
	global_csv_start = global_csv_current_position;
	// UTF-8 BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
	const bool has_bom = file_size >= 3 && data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF';
	if (has_bom) {
		// the three BOM bytes are not part of the CSV content
		start_position += 3;
	}
	last_buffer = file_handle.FinishedReading();
}
|
19
|
+
|
20
|
+
// Builds a follow-up buffer: allocates the backing block and fills it with the next bytes of the file.
// No BOM handling is performed here; that only happens in the first-buffer constructor.
CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size,
                     idx_t global_csv_current_position, idx_t file_number_p)
    : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
      can_seek(file_handle.CanSeek()) {
	AllocateBuffer(buffer_size);
	auto destination = handle.Ptr();
	// file_size is the number of bytes the read actually produced
	file_size = file_handle.Read(destination, buffer_size);
	last_buffer = file_handle.FinishedReading();
}
|
28
|
+
|
29
|
+
// Reads the buffer that follows this one in the file.
// Returns nullptr when the read produced no bytes, i.e. there is nothing left to read.
shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
	// the next buffer starts right after the bytes held by this one
	const idx_t next_global_start = global_csv_start + file_size;
	auto candidate = make_shared<CSVBuffer>(file_handle, context, buffer_size, next_global_start, file_number_p);
	if (candidate->GetBufferSize() != 0) {
		return candidate;
	}
	// We are done reading
	return nullptr;
}
|
38
|
+
|
39
|
+
// Allocates the backing block for this buffer through the buffer manager.
// The allocation is never smaller than Storage::BLOCK_SIZE.
void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
	auto &manager = BufferManager::GetBufferManager(context);
	const idx_t allocation_size = MaxValue<idx_t>(Storage::BLOCK_SIZE, buffer_size);
	// NOTE(review): destroyable only when the file is seekable - presumably because Pin() can then
	// re-read the contents from the file; confirm against the buffer manager contract
	const bool can_destroy = can_seek;
	handle = manager.Allocate(allocation_size, can_destroy, &block);
}
|
44
|
+
|
45
|
+
// Returns the number of bytes that were read from the file into this buffer.
idx_t CSVBuffer::GetBufferSize() {
	const idx_t bytes_in_buffer = file_size;
	return bytes_in_buffer;
}
|
48
|
+
|
49
|
+
// Re-creates the contents of this buffer: allocates a fresh block, seeks the file to the global
// offset at which this buffer starts and reads the same number of bytes again.
void CSVBuffer::Reload(CSVFileHandle &file_handle) {
	AllocateBuffer(file_size);
	// Seek throws when the file handle cannot seek
	file_handle.Seek(global_csv_start);
	auto destination = handle.Ptr();
	file_handle.Read(destination, file_size);
}
|
54
|
+
|
55
|
+
// Pins the block of this buffer and wraps it, together with the buffer's metadata, in a CSVBufferHandle.
// If the file is seekable and the block reports itself as unloaded, the contents are first re-read
// from the file.
unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
	auto &manager = BufferManager::GetBufferManager(context);
	// the unloaded check is only evaluated for seekable files
	const bool needs_reload = can_seek && block->IsUnloaded();
	if (needs_reload) {
		// We have to reload it from disk: drop the old block, Reload() allocates a new one
		block = nullptr;
		Reload(file_handle);
	}
	return make_uniq<CSVBufferHandle>(manager.Pin(block), file_size, first_buffer, last_buffer, global_csv_start,
	                                  start_position, file_number);
}
|
65
|
+
|
66
|
+
void CSVBuffer::Unpin() {
|
67
|
+
if (handle.IsValid()) {
|
68
|
+
handle.Destroy();
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
// Returns the offset inside this buffer at which reading should begin
// (the first-buffer constructor advances it past a UTF-8 BOM).
idx_t CSVBuffer::GetStart() {
	const idx_t first_byte = start_position;
	return first_byte;
}
|
75
|
+
|
76
|
+
bool CSVBuffer::IsCSVFileLastBuffer() {
|
77
|
+
return last_buffer;
|
78
|
+
}
|
79
|
+
|
80
|
+
} // namespace duckdb
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp"
|
2
|
+
#include "duckdb/execution/operator/scan/csv/csv_buffer.hpp"
|
3
|
+
namespace duckdb {
|
4
|
+
|
5
|
+
// Takes ownership of the file handle, chooses the buffer size, consumes the rows that must be
// skipped and caches the first buffer of the file.
CSVBufferManager::CSVBufferManager(ClientContext &context_p, unique_ptr<CSVFileHandle> file_handle_p,
                                   const CSVReaderOptions &options, idx_t file_idx_p)
    : file_handle(std::move(file_handle_p)), context(context_p), file_idx(file_idx_p),
      buffer_size(CSVBuffer::CSV_BUFFER_SIZE) {
	// Only honour the skip_rows option when it was set explicitly
	if (options.skip_rows_set) {
		skip_rows = options.dialect_options.skip_rows;
	}
	// Files with a known, non-zero size below the default buffer size use the minimum buffer size
	const auto total_bytes = file_handle->FileSize();
	if (total_bytes > 0 && total_bytes < buffer_size) {
		buffer_size = CSVBuffer::CSV_MINIMUM_BUFFER_SIZE;
	}
	// A smaller buffer size from the options takes precedence
	if (options.buffer_size < buffer_size) {
		buffer_size = options.buffer_size;
	}
	// Consume the skipped rows directly from the file so they never reach a buffer
	for (idx_t skipped = 0; skipped < skip_rows; ++skipped) {
		file_handle->ReadLine();
	}
	Initialize();
}
|
25
|
+
|
26
|
+
// Unpins the cached buffer at the given index; indexes outside the cache are ignored.
void CSVBufferManager::UnpinBuffer(idx_t cache_idx) {
	if (cache_idx >= cached_buffers.size()) {
		return;
	}
	auto &cached = cached_buffers[cache_idx];
	cached->Unpin();
}
|
31
|
+
|
32
|
+
void CSVBufferManager::Initialize() {
|
33
|
+
if (cached_buffers.empty()) {
|
34
|
+
cached_buffers.emplace_back(
|
35
|
+
make_shared<CSVBuffer>(context, buffer_size, *file_handle, global_csv_pos, file_idx));
|
36
|
+
last_buffer = cached_buffers.front();
|
37
|
+
}
|
38
|
+
start_pos = last_buffer->GetStart();
|
39
|
+
}
|
40
|
+
|
41
|
+
// Returns the start offset recorded by the most recent Initialize() call.
idx_t CSVBufferManager::GetStartPos() {
	const idx_t recorded_start = start_pos;
	return recorded_start;
}
|
44
|
+
bool CSVBufferManager::ReadNextAndCacheIt() {
|
45
|
+
D_ASSERT(last_buffer);
|
46
|
+
if (!last_buffer->IsCSVFileLastBuffer()) {
|
47
|
+
auto maybe_last_buffer = last_buffer->Next(*file_handle, buffer_size, file_idx);
|
48
|
+
if (!maybe_last_buffer) {
|
49
|
+
last_buffer->last_buffer = true;
|
50
|
+
return false;
|
51
|
+
}
|
52
|
+
last_buffer = std::move(maybe_last_buffer);
|
53
|
+
cached_buffers.emplace_back(last_buffer);
|
54
|
+
return true;
|
55
|
+
}
|
56
|
+
return false;
|
57
|
+
}
|
58
|
+
|
59
|
+
// Returns a pinned handle for the buffer at index pos, reading further buffers from the file as needed.
// Returns nullptr when the file ends before that index is reached.
// The buffer directly before pos is unpinned as a side effect.
unique_ptr<CSVBufferHandle> CSVBufferManager::GetBuffer(const idx_t pos) {
	while (pos >= cached_buffers.size()) {
		if (done) {
			// a previous read attempt already hit the end of the file
			return nullptr;
		}
		// done is false here, so this only ever flips it to true when no buffer could be read
		done = !ReadNextAndCacheIt();
	}
	if (pos > 0) {
		cached_buffers[pos - 1]->Unpin();
	}
	auto &requested = cached_buffers[pos];
	return requested->Pin(*file_handle);
}
|
73
|
+
|
74
|
+
bool CSVBufferIterator::Finished() {
|
75
|
+
return !cur_buffer_handle;
|
76
|
+
}
|
77
|
+
|
78
|
+
void CSVBufferIterator::Reset() {
|
79
|
+
if (cur_buffer_handle) {
|
80
|
+
cur_buffer_handle.reset();
|
81
|
+
}
|
82
|
+
if (cur_buffer_idx > 0) {
|
83
|
+
buffer_manager->UnpinBuffer(cur_buffer_idx - 1);
|
84
|
+
}
|
85
|
+
cur_buffer_idx = 0;
|
86
|
+
buffer_manager->Initialize();
|
87
|
+
cur_pos = buffer_manager->GetStartPos();
|
88
|
+
}
|
89
|
+
|
90
|
+
} // namespace duckdb
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#include "duckdb/execution/operator/scan/csv/csv_file_handle.hpp"
|
2
|
+
|
3
|
+
namespace duckdb {
|
4
|
+
|
5
|
+
// Wraps an already opened file handle and caches the properties of it that are queried later
// (seekability, whether it is an on-disk file, and its size).
// The fs, allocator and compression arguments are not used by this constructor.
CSVFileHandle::CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptr<FileHandle> file_handle_p,
                             const string &path_p, FileCompressionType compression)
    : file_handle(std::move(file_handle_p)), path(path_p) {
	auto &opened = *file_handle;
	can_seek = opened.CanSeek();
	on_disk_file = opened.OnDiskFile();
	file_size = opened.GetFileSize();
}
|
12
|
+
|
13
|
+
// Opens the file at path for reading, without a lock and with the given compression type.
// Seekable handles are reset before being returned.
unique_ptr<FileHandle> CSVFileHandle::OpenFileHandle(FileSystem &fs, Allocator &allocator, const string &path,
                                                     FileCompressionType compression) {
	auto opened = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ, FileLockType::NO_LOCK, compression);
	const bool seekable = opened->CanSeek();
	if (seekable) {
		opened->Reset();
	}
	return opened;
}
|
21
|
+
|
22
|
+
// Opens the file at path and wraps the resulting handle in a CSVFileHandle.
unique_ptr<CSVFileHandle> CSVFileHandle::OpenFile(FileSystem &fs, Allocator &allocator, const string &path,
                                                  FileCompressionType compression) {
	return make_uniq<CSVFileHandle>(fs, allocator, CSVFileHandle::OpenFileHandle(fs, allocator, path, compression),
	                                path, compression);
}
|
27
|
+
|
28
|
+
bool CSVFileHandle::CanSeek() {
|
29
|
+
return can_seek;
|
30
|
+
}
|
31
|
+
|
32
|
+
// Moves the underlying file handle to the given position.
// Throws an InternalException when the file is not seekable.
void CSVFileHandle::Seek(idx_t position) {
	if (can_seek) {
		file_handle->Seek(position);
		return;
	}
	throw InternalException("Cannot seek in this file");
}
|
38
|
+
|
39
|
+
bool CSVFileHandle::OnDiskFile() {
|
40
|
+
return on_disk_file;
|
41
|
+
}
|
42
|
+
|
43
|
+
// Returns the file size that was cached from the underlying file handle at construction.
idx_t CSVFileHandle::FileSize() {
	const idx_t cached_size = file_size;
	return cached_size;
}
|
46
|
+
|
47
|
+
bool CSVFileHandle::FinishedReading() {
|
48
|
+
return finished;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Reads up to nr_bytes from the underlying file handle into buffer and returns the number of bytes read.
// Adds nr_bytes to requested_bytes and marks the handle as finished once a read yields zero bytes.
idx_t CSVFileHandle::Read(void *buffer, idx_t nr_bytes) {
	requested_bytes += nr_bytes;
	// the read is forwarded directly to the underlying file handle
	auto bytes_read = file_handle->Read(buffer, nr_bytes);
	if (!finished && bytes_read == 0) {
		// the flag is sticky: it is only ever switched on here
		finished = true;
	}
	return bytes_read;
}
|
60
|
+
|
61
|
+
// Reads one line from the file, one byte at a time, and returns it without its line terminator.
// "\n", "\r\n" and a lone "\r" all end a line. A lone "\r" requires seeking one byte back and
// therefore throws a BinderException on files that cannot seek.
// Returns whatever was collected so far when the end of the file is reached.
string CSVFileHandle::ReadLine() {
	string line;
	bool seen_carriage_return = false;
	char current;
	for (;;) {
		if (Read(&current, 1) == 0) {
			// end of file
			return line;
		}
		if (seen_carriage_return && current != '\n') {
			// the previous "\r" ended the line on its own; this byte belongs to the next line,
			// so it has to be handed back by stepping the file position back by one
			if (!file_handle->CanSeek()) {
				throw BinderException(
				    "Carriage return newlines not supported when reading CSV files in which we cannot seek");
			}
			file_handle->Seek(file_handle->SeekPosition() - 1);
			return line;
		}
		if (current == '\n') {
			// either a plain "\n" or the second half of "\r\n"
			return line;
		}
		if (current == '\r') {
			// decide on the next byte whether this is "\r\n" or a lone "\r"
			seen_carriage_return = true;
		} else {
			line += current;
		}
	}
}
|
90
|
+
|
91
|
+
// Returns a copy of the path this handle was created with.
string CSVFileHandle::GetFilePath() {
	return string(path);
}
|
94
|
+
|
95
|
+
} // namespace duckdb
|