duckdb 0.8.2-dev4653.0 → 0.8.2-dev4871.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +0 -1
- package/binding.gyp.in +0 -1
- package/package.json +1 -1
- package/src/connection.cpp +10 -23
- package/src/data_chunk.cpp +1 -3
- package/src/database.cpp +4 -9
- package/src/duckdb/extension/icu/icu-datepart.cpp +12 -8
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +8 -6
- package/src/duckdb/extension/json/json_functions.cpp +4 -6
- package/src/duckdb/src/common/enum_util.cpp +10 -5
- package/src/duckdb/src/common/radix_partitioning.cpp +1 -1
- package/src/duckdb/src/common/row_operations/row_matcher.cpp +408 -0
- package/src/duckdb/src/common/types/row/tuple_data_allocator.cpp +3 -3
- package/src/duckdb/src/common/types/row/tuple_data_collection.cpp +35 -17
- package/src/duckdb/src/common/types/row/tuple_data_scatter_gather.cpp +44 -43
- package/src/duckdb/src/common/vector_operations/vector_hash.cpp +1 -0
- package/src/duckdb/src/core_functions/function_list.cpp +1 -1
- package/src/duckdb/src/core_functions/scalar/date/date_part.cpp +86 -50
- package/src/duckdb/src/core_functions/scalar/generic/hash.cpp +3 -0
- package/src/duckdb/src/core_functions/scalar/string/repeat.cpp +8 -5
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +5 -4
- package/src/duckdb/src/execution/index/fixed_size_allocator.cpp +13 -0
- package/src/duckdb/src/execution/join_hashtable.cpp +71 -59
- package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +3 -3
- package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp +5 -1
- package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp +18 -9
- package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp +11 -27
- package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +1 -2
- package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +4 -0
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +11 -2
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +8 -8
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +7 -6
- package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +27 -6
- package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +9 -4
- package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +0 -2
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +49 -41
- package/src/duckdb/src/execution/reservoir_sample.cpp +3 -9
- package/src/duckdb/src/function/cast/vector_cast_helpers.cpp +8 -2
- package/src/duckdb/src/function/function_binder.cpp +10 -9
- package/src/duckdb/src/function/scalar/string/like.cpp +0 -3
- package/src/duckdb/src/function/table/read_csv.cpp +12 -9
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enums/date_part_specifier.hpp +11 -3
- package/src/duckdb/src/include/duckdb/common/row_operations/row_matcher.hpp +63 -0
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_collection.hpp +8 -2
- package/src/duckdb/src/include/duckdb/common/types/row/tuple_data_states.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/validity_mask.hpp +4 -1
- package/src/duckdb/src/include/duckdb/core_functions/scalar/string_functions.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/join_hashtable.hpp +14 -8
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_buffer.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_line_info.hpp +4 -0
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_reader_options.hpp +2 -4
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +3 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +1 -0
- package/src/duckdb/src/include/duckdb/execution/radix_partitioned_hashtable.hpp +1 -2
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +3 -0
- package/src/duckdb/src/include/duckdb/main/relation.hpp +4 -0
- package/src/duckdb/src/main/config.cpp +1 -1
- package/src/duckdb/src/main/query_result.cpp +16 -10
- package/src/duckdb/src/main/relation.cpp +10 -0
- package/src/duckdb/src/optimizer/rule/date_part_simplification.cpp +0 -3
- package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +12 -4
- package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +2 -3
- package/src/duckdb/src/storage/data_table.cpp +10 -0
- package/src/duckdb/src/storage/serialization/serialize_nodes.cpp +42 -44
- package/src/duckdb/ub_src_common_row_operations.cpp +1 -1
- package/src/statement.cpp +2 -4
- package/test/database_fail.test.ts +6 -0
- package/src/duckdb/src/common/row_operations/row_match.cpp +0 -359
package/src/duckdb/src/execution/aggregate_hashtable.cpp:

@@ -45,6 +45,7 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
 	// Append hash column to the end and initialise the row layout
 	group_types_p.emplace_back(LogicalType::HASH);
 	layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
+
 	hash_offset = layout.GetOffsets()[layout.ColumnCount() - 1];
 
 	// Partitioned data and pointer table
@@ -52,7 +53,8 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
 	Resize(initial_capacity);
 
 	// Predicates
-	predicates.resize(layout.ColumnCount() - 1, ExpressionType::…
+	predicates.resize(layout.ColumnCount() - 1, ExpressionType::COMPARE_NOT_DISTINCT_FROM);
+	row_matcher.Initialize(true, layout, predicates);
 }
 
 void GroupedAggregateHashTable::InitializePartitionedData() {
@@ -414,9 +416,8 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 	}
 
 		// Perform group comparisons
-		…
-		…
-		… no_match_count);
+		row_matcher.Match(state.group_chunk, chunk_state.vector_data, state.group_compare_vector,
+		                  need_compare_count, layout, addresses_v, &state.no_match_vector, no_match_count);
 	}
 
 	// Linear probing: each of the entries that do not match move to the next entry in the HT
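Note on the RowMatcher migration: these hunks replace the old free function in the deleted row_match.cpp (+0 -359 in the file list) with the new RowMatcher from row_matcher.cpp/hpp, which resolves its per-column comparison functions once, when the layout and predicates are known, and then reuses them for every chunk instead of re-dispatching inside the probe loop. A minimal standalone sketch of that initialize-once/match-many pattern; the types and names below are illustrative stand-ins, not DuckDB's API:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Illustrative stand-in for DuckDB's predicate expression types.
enum class Predicate { COMPARE_EQUAL, COMPARE_NOT_DISTINCT_FROM };

struct Matcher {
	// One comparison function per key column, resolved once in Initialize().
	std::vector<std::function<bool(int64_t, int64_t)>> column_matchers;

	void Initialize(const std::vector<Predicate> &predicates) {
		column_matchers.clear();
		for (auto predicate : predicates) {
			switch (predicate) {
			case Predicate::COMPARE_EQUAL:
			case Predicate::COMPARE_NOT_DISTINCT_FROM:
				// For plain non-NULL integers both predicates degenerate to ==;
				// NOT DISTINCT FROM only differs once NULLs are involved.
				column_matchers.emplace_back([](int64_t a, int64_t b) { return a == b; });
				break;
			}
		}
	}

	// Returns how many rows match on every key column.
	size_t Match(const std::vector<std::vector<int64_t>> &left,
	             const std::vector<std::vector<int64_t>> &right) {
		size_t match_count = 0;
		for (size_t row = 0; row < left.size(); row++) {
			bool match = true;
			for (size_t col = 0; col < column_matchers.size(); col++) {
				if (!column_matchers[col](left[row][col], right[row][col])) {
					match = false;
					break;
				}
			}
			match_count += match ? 1 : 0;
		}
		return match_count;
	}
};

int main() {
	Matcher matcher;
	matcher.Initialize({Predicate::COMPARE_NOT_DISTINCT_FROM, Predicate::COMPARE_NOT_DISTINCT_FROM});
	std::cout << matcher.Match({{1, 2}, {3, 4}}, {{1, 2}, {3, 5}}) << "\n"; // prints 1
	return 0;
}
```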
package/src/duckdb/src/execution/index/fixed_size_allocator.cpp:

@@ -173,6 +173,19 @@ bool FixedSizeAllocator::InitializeVacuum() {
 		return false;
 	}
 
+	// remove all empty buffers
+	auto buffer_it = buffers.begin();
+	while (buffer_it != buffers.end()) {
+		if (!buffer_it->second.segment_count) {
+			buffers_with_free_space.erase(buffer_it->first);
+			buffer_it->second.Destroy();
+			buffer_it = buffers.erase(buffer_it);
+		} else {
+			buffer_it++;
+		}
+	}
+
+	// determine if a vacuum is necessary
 	multimap<idx_t, idx_t> temporary_vacuum_buffers;
 	D_ASSERT(vacuum_buffers.empty());
 	idx_t available_segments_in_memory = 0;
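The new cleanup pass walks the buffer map and drops empty buffers in place. This is the standard erase-while-iterating idiom for associative containers: erase() invalidates only the erased element's iterator and returns the iterator to the next element, so the loop advances either through erase() or through ++. A self-contained sketch (the map contents are made up):

```cpp
#include <iostream>
#include <map>

int main() {
	// Toy stand-in for the allocator's buffer map: buffer id -> segment_count.
	std::map<int, int> buffers {{0, 3}, {1, 0}, {2, 5}, {3, 0}};

	auto it = buffers.begin();
	while (it != buffers.end()) {
		if (it->second == 0) {
			// erase() returns the iterator to the next element
			it = buffers.erase(it);
		} else {
			++it;
		}
	}

	for (auto &entry : buffers) {
		std::cout << entry.first << " -> " << entry.second << "\n"; // 0 -> 3, 2 -> 5
	}
	return 0;
}
```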
package/src/duckdb/src/execution/join_hashtable.cpp:

@@ -19,15 +19,15 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
     : buffer_manager(buffer_manager_p), conditions(conditions_p), build_types(std::move(btypes)), entry_size(0),
       tuple_size(0), vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
       external(false), radix_bits(4), partition_start(0), partition_end(0) {
+
 	for (auto &condition : conditions) {
 		D_ASSERT(condition.left->return_type == condition.right->return_type);
 		auto type = condition.left->return_type;
 		if (condition.comparison == ExpressionType::COMPARE_EQUAL ||
-		    condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM…
-
-			// all equality conditions
-			// all other conditions at the back
-			// this assert checks that…
+		    condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) {
+
+			// ensure that all equality conditions are at the front,
+			// and that all other conditions are at the back
 			D_ASSERT(equality_types.size() == condition_types.size());
 			equality_types.push_back(type);
 		}
@@ -51,6 +51,8 @@ JoinHashTable::JoinHashTable(BufferManager &buffer_manager_p, const vector<JoinC
 	}
 	layout_types.emplace_back(LogicalType::HASH);
 	layout.Initialize(layout_types, false);
+	row_matcher.Initialize(false, layout, predicates);
+	row_matcher_no_match_sel.Initialize(true, layout, predicates);
 
 	const auto &offsets = layout.GetOffsets();
 	tuple_size = offsets[condition_types.size() + build_types.size()];
@@ -142,30 +144,6 @@ static idx_t FilterNullValues(UnifiedVectorFormat &vdata, const SelectionVector
 	return result_count;
 }
 
-idx_t JoinHashTable::PrepareKeys(DataChunk &keys, unsafe_unique_array<UnifiedVectorFormat> &key_data,
-                                 const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
-	key_data = keys.ToUnifiedFormat();
-
-	// figure out which keys are NULL, and create a selection vector out of them
-	current_sel = FlatVector::IncrementalSelectionVector();
-	idx_t added_count = keys.size();
-	if (build_side && IsRightOuterJoin(join_type)) {
-		// in case of a right or full outer join, we cannot remove NULL keys from the build side
-		return added_count;
-	}
-	for (idx_t i = 0; i < keys.ColumnCount(); i++) {
-		if (!null_values_are_equal[i]) {
-			if (key_data[i].validity.AllValid()) {
-				continue;
-			}
-			added_count = FilterNullValues(key_data[i], *current_sel, added_count, sel);
-			// null values are NOT equal for this column, filter them out
-			current_sel = &sel;
-		}
-	}
-	return added_count;
-}
-
 void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChunk &keys, DataChunk &payload) {
 	D_ASSERT(!finalized);
 	D_ASSERT(keys.size() == payload.size());
@@ -194,23 +172,6 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
 		info.correlated_counts->AddChunk(info.group_chunk, info.correlated_payload, AggregateType::NON_DISTINCT);
 	}
 
-	// prepare the keys for processing
-	unsafe_unique_array<UnifiedVectorFormat> key_data;
-	const SelectionVector *current_sel;
-	SelectionVector sel(STANDARD_VECTOR_SIZE);
-	idx_t added_count = PrepareKeys(keys, key_data, current_sel, sel, true);
-	if (added_count < keys.size()) {
-		has_null = true;
-	}
-	if (added_count == 0) {
-		return;
-	}
-
-	// hash the keys and obtain an entry in the list
-	// note that we only hash the keys used in the equality comparison
-	Vector hash_values(LogicalType::HASH);
-	Hash(keys, *current_sel, added_count, hash_values);
-
 	// build a chunk to append to the data collection [keys, payload, (optional "found" boolean), hash]
 	DataChunk source_chunk;
 	source_chunk.InitializeEmpty(layout.GetTypes());
@@ -228,13 +189,58 @@ void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChu
 		source_chunk.data[col_offset].Reference(vfound);
 		col_offset++;
 	}
+	Vector hash_values(LogicalType::HASH);
 	source_chunk.data[col_offset].Reference(hash_values);
 	source_chunk.SetCardinality(keys);
 
+	// ToUnifiedFormat the source chunk
+	TupleDataCollection::ToUnifiedFormat(append_state.chunk_state, source_chunk);
+
+	// prepare the keys for processing
+	const SelectionVector *current_sel;
+	SelectionVector sel(STANDARD_VECTOR_SIZE);
+	idx_t added_count = PrepareKeys(keys, append_state.chunk_state.vector_data, current_sel, sel, true);
 	if (added_count < keys.size()) {
-		…
+		has_null = true;
+	}
+	if (added_count == 0) {
+		return;
 	}
-
+
+	// hash the keys and obtain an entry in the list
+	// note that we only hash the keys used in the equality comparison
+	Hash(keys, *current_sel, added_count, hash_values);
+
+	// Re-reference and ToUnifiedFormat the hash column after computing it
+	source_chunk.data[col_offset].Reference(hash_values);
+	hash_values.ToUnifiedFormat(source_chunk.size(), append_state.chunk_state.vector_data.back().unified);
+
+	// We already called TupleDataCollection::ToUnifiedFormat, so we can AppendUnified here
+	sink_collection->AppendUnified(append_state, source_chunk, *current_sel, added_count);
+}
+
+idx_t JoinHashTable::PrepareKeys(DataChunk &keys, vector<TupleDataVectorFormat> &vector_data,
+                                 const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
+	// figure out which keys are NULL, and create a selection vector out of them
+	current_sel = FlatVector::IncrementalSelectionVector();
+	idx_t added_count = keys.size();
+	if (build_side && IsRightOuterJoin(join_type)) {
+		// in case of a right or full outer join, we cannot remove NULL keys from the build side
+		return added_count;
+	}
+
+	for (idx_t col_idx = 0; col_idx < keys.ColumnCount(); col_idx++) {
+		if (!null_values_are_equal[col_idx]) {
+			auto &col_key_data = vector_data[col_idx].unified;
+			if (col_key_data.validity.AllValid()) {
+				continue;
+			}
+			added_count = FilterNullValues(col_key_data, *current_sel, added_count, sel);
+			// null values are NOT equal for this column, filter them out
+			current_sel = &sel;
+		}
+	}
+	return added_count;
 }
 
 template <bool PARALLEL>
@@ -322,12 +328,13 @@ void JoinHashTable::Finalize(idx_t chunk_idx_from, idx_t chunk_idx_to, bool para
 	} while (iterator.Next());
 }
 
-unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, …
+unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys, TupleDataChunkState &key_state,
+                                                                 const SelectionVector *&current_sel) {
 	D_ASSERT(Count() > 0); // should be handled before
 	D_ASSERT(finalized);
 
 	// set up the scan structure
-	auto ss = make_uniq<ScanStructure>(*this);
+	auto ss = make_uniq<ScanStructure>(*this, key_state);
 
 	if (join_type != JoinType::INNER) {
 		ss->found_match = make_unsafe_uniq_array<bool>(STANDARD_VECTOR_SIZE);
@@ -335,13 +342,15 @@ unique_ptr<ScanStructure> JoinHashTable::InitializeScanStructure(DataChunk &keys
 	}
 
 	// first prepare the keys for probing
-	…
+	TupleDataCollection::ToUnifiedFormat(key_state, keys);
+	ss->count = PrepareKeys(keys, key_state.vector_data, current_sel, ss->sel_vector, false);
 	return ss;
 }
 
-unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, …
+unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, TupleDataChunkState &key_state,
+                                               Vector *precomputed_hashes) {
 	const SelectionVector *current_sel;
-	auto ss = InitializeScanStructure(keys, current_sel);
+	auto ss = InitializeScanStructure(keys, key_state, current_sel);
 	if (ss->count == 0) {
 		return ss;
 	}
@@ -363,8 +372,9 @@ unique_ptr<ScanStructure> JoinHashTable::Probe(DataChunk &keys, Vector *precompu
 	return ss;
 }
 
-ScanStructure::ScanStructure(JoinHashTable &…
-    : pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(…
+ScanStructure::ScanStructure(JoinHashTable &ht_p, TupleDataChunkState &key_state_p)
+    : key_state(key_state_p), pointers(LogicalType::POINTER), sel_vector(STANDARD_VECTOR_SIZE), ht(ht_p),
+      finished(false) {
 }
 
 void ScanStructure::Next(DataChunk &keys, DataChunk &left, DataChunk &result) {
@@ -404,8 +414,9 @@ idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_s
 	}
 	idx_t no_match_count = 0;
 
-	…
-	…
+	auto &matcher = no_match_sel ? ht.row_matcher_no_match_sel : ht.row_matcher;
+	return matcher.Match(keys, key_state.vector_data, match_sel, this->count, ht.layout, pointers, no_match_sel,
+	                     no_match_count);
 }
 
 idx_t ScanStructure::ScanInnerJoin(DataChunk &keys, SelectionVector &result_vector) {
@@ -990,7 +1001,8 @@ static void CreateSpillChunk(DataChunk &spill_chunk, DataChunk &keys, DataChunk
 	spill_chunk.data[spill_col_idx].Reference(hashes);
 }
 
-unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, …
+unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, TupleDataChunkState &key_state,
+                                                       DataChunk &payload, ProbeSpill &probe_spill,
                                                        ProbeSpillLocalAppendState &spill_state,
                                                        DataChunk &spill_chunk) {
 	// hash all the keys
@@ -1019,7 +1031,7 @@ unique_ptr<ScanStructure> JoinHashTable::ProbeAndSpill(DataChunk &keys, DataChun
 	payload.Slice(true_sel, true_count);
 
 	const SelectionVector *current_sel;
-	auto ss = InitializeScanStructure(keys, current_sel);
+	auto ss = InitializeScanStructure(keys, key_state, current_sel);
 	if (ss->count == 0) {
 		return ss;
 	}
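PrepareKeys now reads the TupleDataChunkState's unified vector data instead of a separately allocated UnifiedVectorFormat array, but its job is unchanged: build a selection vector that skips rows whose key is NULL, unless NULLs compare as equal for that column or we are on the build side of a right/full outer join. A standalone sketch of that NULL-filtering step, using std::optional as a stand-in for a vector with a validity mask:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Builds a selection vector (indices of surviving rows) by dropping rows whose
// key is NULL, mirroring what PrepareKeys does per key column.
static std::vector<size_t> FilterNullKeys(const std::vector<std::optional<int64_t>> &keys,
                                          bool null_values_are_equal) {
	std::vector<size_t> sel;
	for (size_t i = 0; i < keys.size(); i++) {
		if (null_values_are_equal || keys[i].has_value()) {
			sel.push_back(i);
		}
	}
	return sel;
}

int main() {
	std::vector<std::optional<int64_t>> keys {42, std::nullopt, 7};
	auto sel = FilterNullKeys(keys, /*null_values_are_equal=*/false);
	for (auto row : sel) {
		std::cout << row << "\n"; // prints 0 and 2; the NULL row is filtered out
	}
	return 0;
}
```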
package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp:

@@ -782,13 +782,13 @@ public:
 	}
 
 		auto &ht_state = op.sink_state->Cast<HashAggregateGlobalSinkState>();
-		idx_t …
+		idx_t partitions = 0;
 		for (size_t sidx = 0; sidx < op.groupings.size(); ++sidx) {
 			auto &grouping = op.groupings[sidx];
 			auto &grouping_gstate = ht_state.grouping_states[sidx];
-			…
+			partitions += grouping.table_data.NumberOfPartitions(*grouping_gstate.table_state);
 		}
-		return MaxValue<idx_t>(1, …
+		return MaxValue<idx_t>(1, partitions);
 	}
 };
 
package/src/duckdb/src/execution/operator/csv_scanner/base_csv_reader.cpp:

@@ -263,7 +263,7 @@ bool BaseCSVReader::AddRow(DataChunk &insert_chunk, idx_t &column, string &error
 		return true;
 	}
 
-	if (mode == ParserMode::SNIFFING_DATATYPES…
+	if (mode == ParserMode::SNIFFING_DATATYPES) {
 		return true;
 	}
 
@@ -480,6 +480,10 @@ bool BaseCSVReader::Flush(DataChunk &insert_chunk, idx_t buffer_idx, bool try_ad
 
 			bool was_already_null = FlatVector::IsNull(parse_vector, row_idx);
 			if (!was_already_null && FlatVector::IsNull(result_vector, row_idx)) {
+				Increment(buffer_idx);
+				auto bla = GetLineError(global_row_idx, buffer_idx, false);
+				row_idx += bla;
+				row_idx -= bla;
 				row_failed = true;
 				failed_cells.emplace_back(row_idx, col_idx, row_line);
 			}
package/src/duckdb/src/execution/operator/csv_scanner/csv_buffer.cpp:

@@ -8,10 +8,14 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle
     : context(context), first_buffer(true), file_number(file_number_p), can_seek(file_handle.CanSeek()) {
 	AllocateBuffer(buffer_size_p);
 	auto buffer = Ptr();
-	…
+	actual_buffer_size = file_handle.Read(buffer, buffer_size_p);
+	while (actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) {
+		// We keep reading until this block is full
+		actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size_p - actual_buffer_size);
+	}
 	global_csv_start = global_csv_current_position;
 	// BOM check (https://en.wikipedia.org/wiki/Byte_order_mark)
-	if (…
+	if (actual_buffer_size >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') {
 		start_position += 3;
 	}
 	last_buffer = file_handle.FinishedReading();
@@ -22,13 +26,18 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b
     : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p),
      can_seek(file_handle.CanSeek()) {
 	AllocateBuffer(buffer_size);
-	…
+	auto buffer = handle.Ptr();
+	actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size);
+	while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) {
+		// We keep reading until this block is full
+		actual_buffer_size += file_handle.Read(&buffer[actual_buffer_size], buffer_size - actual_buffer_size);
+	}
 	last_buffer = file_handle.FinishedReading();
 }
 
 shared_ptr<CSVBuffer> CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p) {
 	auto next_csv_buffer =
-	    make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + …
+	    make_shared<CSVBuffer>(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p);
 	if (next_csv_buffer->GetBufferSize() == 0) {
 		// We are done reading
 		return nullptr;
@@ -43,13 +52,13 @@ void CSVBuffer::AllocateBuffer(idx_t buffer_size) {
 }
 
 idx_t CSVBuffer::GetBufferSize() {
-	return …
+	return actual_buffer_size;
 }
 
 void CSVBuffer::Reload(CSVFileHandle &file_handle) {
-	AllocateBuffer(…
+	AllocateBuffer(actual_buffer_size);
 	file_handle.Seek(global_csv_start);
-	file_handle.Read(handle.Ptr(), …
+	file_handle.Read(handle.Ptr(), actual_buffer_size);
 }
 
 unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
@@ -59,8 +68,8 @@ unique_ptr<CSVBufferHandle> CSVBuffer::Pin(CSVFileHandle &file_handle) {
 		block = nullptr;
 		Reload(file_handle);
 	}
-	return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), …
-	    … start_position, file_number);
+	return make_uniq<CSVBufferHandle>(buffer_manager.Pin(block), actual_buffer_size, first_buffer, last_buffer,
+	                                  global_csv_start, start_position, file_number);
 }
 
 void CSVBuffer::Unpin() {
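Both CSVBuffer constructors now loop on Read() instead of assuming a single call fills the buffer: one read may legitimately return fewer bytes than requested (for example on compressed or remote file handles), and the tracked actual_buffer_size is what GetBufferSize(), Reload() and Pin() report and reuse. The same short-read loop over a plain std::istream, as a sketch (the file name is hypothetical):

```cpp
#include <fstream>
#include <iostream>
#include <vector>

// Keep reading until the buffer is full or the stream is exhausted,
// mirroring the short-read loop in the CSVBuffer constructors.
static size_t ReadUntilFull(std::istream &in, char *buffer, size_t buffer_size) {
	size_t actual_size = 0;
	while (actual_size < buffer_size && in) {
		in.read(buffer + actual_size, static_cast<std::streamsize>(buffer_size - actual_size));
		actual_size += static_cast<size_t>(in.gcount());
	}
	return actual_size;
}

int main() {
	std::ifstream file("data.csv", std::ios::binary); // hypothetical input file
	std::vector<char> buffer(1 << 20);
	size_t n = ReadUntilFull(file, buffer.data(), buffer.size());
	std::cout << "read " << n << " bytes\n";
	return 0;
}
```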
package/src/duckdb/src/execution/operator/csv_scanner/csv_reader_options.cpp:

@@ -168,38 +168,24 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value,
 	if (loption == "auto_detect") {
 		auto_detect = ParseBoolean(value, loption);
 	} else if (loption == "sample_size") {
-		int64_t …
-		if (…
+		int64_t sample_size_option = ParseInteger(value, loption);
+		if (sample_size_option < 1 && sample_size_option != -1) {
 			throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
 		}
-		if (…
-		…
-		…
-		} else if (sample_size <= STANDARD_VECTOR_SIZE) {
-			sample_chunk_size = sample_size;
-			sample_chunks = 1;
+		if (sample_size_option == -1) {
+			// If -1, we basically read the whole thing
+			sample_size_chunks = NumericLimits<idx_t>().Maximum();
 		} else {
-			…
-			…
+			sample_size_chunks = sample_size_option / STANDARD_VECTOR_SIZE;
+			if (sample_size_option % STANDARD_VECTOR_SIZE != 0) {
+				sample_size_chunks++;
+			}
 		}
+
 	} else if (loption == "skip") {
 		SetSkipRows(ParseInteger(value, loption));
 	} else if (loption == "max_line_size" || loption == "maximum_line_size") {
 		maximum_line_size = ParseInteger(value, loption);
-	} else if (loption == "sample_chunk_size") {
-		sample_chunk_size = ParseInteger(value, loption);
-		if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
-			throw BinderException(
-			    "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
-			    STANDARD_VECTOR_SIZE);
-		} else if (sample_chunk_size < 1) {
-			throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
-		}
-	} else if (loption == "sample_chunks") {
-		sample_chunks = ParseInteger(value, loption);
-		if (sample_chunks < 1) {
-			throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
-		}
 	} else if (loption == "force_not_null") {
 		force_not_null = ParseColumnList(value, expected_names, loption);
 	} else if (loption == "date_format" || loption == "dateformat") {
@@ -322,7 +308,7 @@ string CSVReaderOptions::ToString() const {
 	       (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
 	       "\n  header=" + std::to_string(dialect_options.header) +
 	       (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
-	       "\n  sample_size=" + std::to_string(…
+	       "\n  sample_size=" + std::to_string(sample_size_chunks * STANDARD_VECTOR_SIZE) +
 	       "\n  ignore_errors=" + std::to_string(ignore_errors) + "\n  all_varchar=" + std::to_string(all_varchar);
 }
 
@@ -489,8 +475,6 @@ void CSVReaderOptions::ToNamedParameters(named_parameter_map_t &named_params) {
 	if (skip_rows_set) {
 		named_params["skip"] = Value::BIGINT(GetSkipRows());
 	}
-	named_params["sample_chunks"] = Value::BIGINT(sample_chunks);
-	named_params["sample_chunk_size"] = Value::BIGINT(sample_chunk_size);
 	named_params["null_padding"] = Value::BOOLEAN(null_padding);
 	if (!date_format.at(LogicalType::DATE).format_specifier.empty()) {
 		named_params["dateformat"] = Value(date_format.at(LogicalType::DATE).format_specifier);
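sample_size is now normalized into sample_size_chunks, a count of STANDARD_VECTOR_SIZE-row chunks that replaces the removed sample_chunk_size/sample_chunks pair: -1 maps to "sample everything", and any other value is divided by the vector size, rounding up so the requested row count is always covered. The arithmetic in isolation, assuming DuckDB's default vector size of 2048:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
	const int64_t STANDARD_VECTOR_SIZE = 2048; // DuckDB's default vector size
	int64_t sample_size_option = 5000;         // e.g. read_csv(..., sample_size=5000)

	uint64_t sample_size_chunks;
	if (sample_size_option == -1) {
		// -1 means: sample the whole file
		sample_size_chunks = std::numeric_limits<uint64_t>::max();
	} else {
		// ceiling division: 5000 rows -> 3 chunks of 2048 rows
		sample_size_chunks = static_cast<uint64_t>(sample_size_option / STANDARD_VECTOR_SIZE);
		if (sample_size_option % STANDARD_VECTOR_SIZE != 0) {
			sample_size_chunks++;
		}
	}
	std::cout << sample_size_chunks << "\n"; // prints 3
	return 0;
}
```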
package/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp:

@@ -29,8 +29,7 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 		InitializeTransitionArray(transition_array[i], quoted_state);
 		break;
 	case unquoted_state:
-		…
-		break;
+	case invalid_state:
 	case escape_state:
 		InitializeTransitionArray(transition_array[i], invalid_state);
 		break;
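The rewritten switch folds unquoted_state into the invalid_state/escape_state group via case fallthrough, so all three now initialize their transitions to invalid_state with a single call. A compact, self-contained illustration of the fallthrough idiom (state names abridged):

```cpp
#include <iostream>

enum State { QUOTED, UNQUOTED, INVALID, ESCAPE };

static const char *NextStateFor(State state) {
	switch (state) {
	case QUOTED:
		return "quoted";
	case UNQUOTED:
	case INVALID:
	case ESCAPE:
		// all three states share one transition target, via fallthrough
		return "invalid";
	}
	return "unreachable";
}

int main() {
	std::cout << NextStateFor(UNQUOTED) << "\n"; // prints "invalid"
	return 0;
}
```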
package/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp:

@@ -647,6 +647,10 @@ idx_t ParallelCSVReader::GetLineError(idx_t line_error, idx_t buffer_idx, bool s
 	}
 }
 
+void ParallelCSVReader::Increment(idx_t buffer_idx) {
+	return buffer->line_info->Increment(file_idx, buffer_idx);
+}
+
 bool ParallelCSVReader::TryParseCSV(ParserMode mode) {
 	DataChunk dummy_chunk;
 	string error_message;
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp:

@@ -3,8 +3,9 @@
 namespace duckdb {
 
 CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager> buffer_manager_p,
-                       CSVStateMachineCache &state_machine_cache_p)
-    : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p))…
+                       CSVStateMachineCache &state_machine_cache_p, bool explicit_set_columns_p)
+    : state_machine_cache(state_machine_cache_p), options(options_p), buffer_manager(std::move(buffer_manager_p)),
+      explicit_set_columns(explicit_set_columns_p) {
 
 	// Check if any type is BLOB
 	for (auto &type : options.sql_type_list) {
@@ -24,6 +25,14 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr<CSVBufferManager>
 SnifferResult CSVSniffer::SniffCSV() {
 	// 1. Dialect Detection
 	DetectDialect();
+	if (explicit_set_columns) {
+		if (!candidates.empty()) {
+			options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options;
+			options.dialect_options.new_line = candidates[0]->dialect_options.new_line;
+		}
+		// We do not need to run type and header detection as these were defined by the user
+		return SnifferResult(detected_types, names);
+	}
 	// 2. Type Detection
 	DetectTypes();
 	// 3. Header Detection
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp:

@@ -15,7 +15,7 @@ struct SniffDialect {
 	inline static bool Process(CSVStateMachine &machine, vector<idx_t> &sniffed_column_counts, char current_char,
 	                           idx_t current_pos) {
 
-		D_ASSERT(sniffed_column_counts.size() == …
+		D_ASSERT(sniffed_column_counts.size() == STANDARD_VECTOR_SIZE);
 
 		if (machine.state == CSVState::INVALID) {
 			sniffed_column_counts.clear();
@@ -45,7 +45,7 @@ struct SniffDialect {
 		machine.single_record_separator = ((machine.state != CSVState::RECORD_SEPARATOR && carriage_return) ||
 		                                   (machine.state == CSVState::RECORD_SEPARATOR && !carriage_return)) ||
 		                                  machine.single_record_separator;
-		if (machine.cur_rows >= …
+		if (machine.cur_rows >= STANDARD_VECTOR_SIZE) {
 			// We sniffed enough rows
 			return true;
 		}
@@ -55,10 +55,10 @@ struct SniffDialect {
 		if (machine.state == CSVState::INVALID) {
 			return;
 		}
-		if (machine.cur_rows < …
+		if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state == CSVState::DELIMITER) {
 			sniffed_column_counts[machine.cur_rows] = ++machine.column_count;
 		}
-		if (machine.cur_rows < …
+		if (machine.cur_rows < STANDARD_VECTOR_SIZE && machine.state != CSVState::EMPTY_LINE) {
 			sniffed_column_counts[machine.cur_rows++] = machine.column_count;
 		}
 		NewLineIdentifier suggested_newline;
@@ -145,7 +145,7 @@ void CSVSniffer::GenerateStateMachineSearchSpace(vector<unique_ptr<CSVStateMachi
 void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachine> state_machine, idx_t &rows_read,
                                          idx_t &best_consistent_rows, idx_t &prev_padding_count) {
 	// The sniffed_column_counts variable keeps track of the number of columns found for each row
-	vector<idx_t> sniffed_column_counts(…
+	vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
 
 	state_machine->csv_buffer_iterator.Process<SniffDialect>(*state_machine, sniffed_column_counts);
 	idx_t start_row = options.dialect_options.skip_rows;
@@ -244,7 +244,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr<CSVStateMachi
 }
 
 bool CSVSniffer::RefineCandidateNextChunk(CSVStateMachine &candidate) {
-	vector<idx_t> sniffed_column_counts(…
+	vector<idx_t> sniffed_column_counts(STANDARD_VECTOR_SIZE);
 	candidate.csv_buffer_iterator.Process<SniffDialect>(candidate, sniffed_column_counts);
 	bool allow_padding = options.null_padding;
 
@@ -268,9 +268,9 @@ void CSVSniffer::RefineCandidates() {
 		return;
 	}
 	for (auto &cur_candidate : candidates) {
-		for (idx_t i = 1; i <= options.…
+		for (idx_t i = 1; i <= options.sample_size_chunks; i++) {
 			bool finished_file = cur_candidate->csv_buffer_iterator.Finished();
-			if (finished_file || i == options.…
+			if (finished_file || i == options.sample_size_chunks) {
 				// we finished the file or our chunk sample successfully: stop
 				auto successful_candidate = std::move(cur_candidate);
 				candidates.clear();
package/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp:

@@ -283,11 +283,7 @@ void CSVSniffer::DetectTypes() {
 		candidate->Reset();
 
 		// Parse chunk and read csv with info candidate
-		…
-		if (options.sample_chunk_size == 1) {
-			sample_size++;
-		}
-		vector<TupleSniffing> tuples(sample_size);
+		vector<TupleSniffing> tuples(STANDARD_VECTOR_SIZE);
 		candidate->csv_buffer_iterator.Process<SniffValue>(*candidate, tuples);
 		// Potentially Skip empty rows (I find this dirty, but it is what the original code does)
 		idx_t true_start = 0;
@@ -311,8 +307,10 @@ void CSVSniffer::DetectTypes() {
 				break;
 			}
 		}
+		if (values_start > 0) {
+			tuples.erase(tuples.begin(), tuples.begin() + values_start);
+		}
 
-		tuples.erase(tuples.begin(), tuples.begin() + values_start);
 		idx_t row_idx = 0;
 		if (tuples.size() > 1 && (!options.has_header || (options.has_header && options.dialect_options.header))) {
 			// This means we have more than one row, hence we can use the first row to detect if we have a header
@@ -327,6 +325,9 @@ void CSVSniffer::DetectTypes() {
 		for (; row_idx < tuples.size(); row_idx++) {
 			for (idx_t col = 0; col < tuples[row_idx].values.size(); col++) {
 				auto &col_type_candidates = info_sql_types_candidates[col];
+				// col_type_candidates can't be empty since anything in a CSV file should at least be a string
+				// and we validate utf-8 compatibility when creating the type
+				D_ASSERT(!col_type_candidates.empty());
 				auto cur_top_candidate = col_type_candidates.back();
 				auto dummy_val = tuples[row_idx].values[col];
 				// try cast from string to sql_type