duckdb 0.3.5-dev992.0 → 0.4.1-dev1019.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Makefile +2 -2
- package/binding.gyp +8 -5
- package/lib/duckdb.js +92 -64
- package/package.json +1 -1
- package/src/connection.cpp +109 -130
- package/src/data_chunk.cpp +185 -0
- package/src/database.cpp +64 -12
- package/src/duckdb.cpp +58294 -25711
- package/src/duckdb.hpp +4178 -2318
- package/src/duckdb_node.hpp +23 -17
- package/src/parquet-amalgamation.cpp +37289 -36547
- package/src/parquet-amalgamation.hpp +281 -120
- package/src/statement.cpp +17 -12
- package/test/extension.test.js +1 -1
- package/test/pathnames.test.js +82 -0
- package/test/syntax_error.test.js +16 -0
- package/test/udf.test.js +172 -107
|
@@ -91,8 +91,8 @@ public:
|
|
|
91
91
|
* under the License.
|
|
92
92
|
*/
|
|
93
93
|
|
|
94
|
-
#ifndef
|
|
95
|
-
#define
|
|
94
|
+
#ifndef _DUCKDB_THRIFT_THRIFT_H_
|
|
95
|
+
#define _DUCKDB_THRIFT_THRIFT_H_ 1
|
|
96
96
|
|
|
97
97
|
|
|
98
98
|
|
|
@@ -121,8 +121,8 @@ public:
|
|
|
121
121
|
|
|
122
122
|
// clang-format off
|
|
123
123
|
|
|
124
|
-
#ifndef
|
|
125
|
-
# define
|
|
124
|
+
#ifndef _DUCKDB_THRIFT_TRANSPORT_PLATFORM_SOCKET_H_
|
|
125
|
+
# define _DUCKDB_THRIFT_TRANSPORT_PLATFORM_SOCKET_H_
|
|
126
126
|
|
|
127
127
|
#ifdef _WIN32
|
|
128
128
|
#ifdef _WINSOCKAPI_
|
|
@@ -236,7 +236,7 @@ public:
|
|
|
236
236
|
# define THRIFT_SHUT_RDWR SHUT_RDWR
|
|
237
237
|
#endif
|
|
238
238
|
|
|
239
|
-
#endif //
|
|
239
|
+
#endif // _DUCKDB_THRIFT_TRANSPORT_PLATFORM_SOCKET_H_
|
|
240
240
|
|
|
241
241
|
|
|
242
242
|
// LICENSE_CHANGE_END
|
|
@@ -329,8 +329,8 @@ public:
|
|
|
329
329
|
* under the License.
|
|
330
330
|
*/
|
|
331
331
|
|
|
332
|
-
#ifndef
|
|
333
|
-
#define
|
|
332
|
+
#ifndef _DUCKDB_THRIFT_TLOGGING_H_
|
|
333
|
+
#define _DUCKDB_THRIFT_TLOGGING_H_ 1
|
|
334
334
|
|
|
335
335
|
|
|
336
336
|
|
|
@@ -454,7 +454,7 @@ public:
|
|
|
454
454
|
#define T_GENERIC_PROTOCOL(template_class, generic_prot, specific_prot)
|
|
455
455
|
#endif
|
|
456
456
|
|
|
457
|
-
#endif // #ifndef
|
|
457
|
+
#endif // #ifndef _DUCKDB_THRIFT_TLOGGING_H_
|
|
458
458
|
|
|
459
459
|
|
|
460
460
|
// LICENSE_CHANGE_END
|
|
@@ -546,7 +546,7 @@ void profile_write_pprof(FILE* gen_calls_f, FILE* virtual_calls_f);
|
|
|
546
546
|
}
|
|
547
547
|
} // duckdb_apache::thrift
|
|
548
548
|
|
|
549
|
-
#endif // #ifndef
|
|
549
|
+
#endif // #ifndef _DUCKDB_THRIFT_THRIFT_H_
|
|
550
550
|
|
|
551
551
|
|
|
552
552
|
// LICENSE_CHANGE_END
|
|
@@ -576,8 +576,8 @@ void profile_write_pprof(FILE* gen_calls_f, FILE* virtual_calls_f);
|
|
|
576
576
|
* under the License.
|
|
577
577
|
*/
|
|
578
578
|
|
|
579
|
-
#ifndef
|
|
580
|
-
#define
|
|
579
|
+
#ifndef _DUCKDB_THRIFT_TAPPLICATIONEXCEPTION_H_
|
|
580
|
+
#define _DUCKDB_THRIFT_TAPPLICATIONEXCEPTION_H_ 1
|
|
581
581
|
|
|
582
582
|
|
|
583
583
|
|
|
@@ -671,7 +671,7 @@ protected:
|
|
|
671
671
|
}
|
|
672
672
|
} // duckdb_apache::thrift
|
|
673
673
|
|
|
674
|
-
#endif // #ifndef
|
|
674
|
+
#endif // #ifndef _DUCKDB_THRIFT_TAPPLICATIONEXCEPTION_H_
|
|
675
675
|
|
|
676
676
|
|
|
677
677
|
// LICENSE_CHANGE_END
|
|
@@ -701,8 +701,8 @@ protected:
|
|
|
701
701
|
* under the License.
|
|
702
702
|
*/
|
|
703
703
|
|
|
704
|
-
#ifndef
|
|
705
|
-
#define
|
|
704
|
+
#ifndef _DUCKDB_THRIFT_TBASE_H_
|
|
705
|
+
#define _DUCKDB_THRIFT_TBASE_H_ 1
|
|
706
706
|
|
|
707
707
|
|
|
708
708
|
|
|
@@ -730,8 +730,8 @@ protected:
|
|
|
730
730
|
* under the License.
|
|
731
731
|
*/
|
|
732
732
|
|
|
733
|
-
#ifndef
|
|
734
|
-
#define
|
|
733
|
+
#ifndef _DUCKDB_THRIFT_PROTOCOL_TPROTOCOL_H_
|
|
734
|
+
#define _DUCKDB_THRIFT_PROTOCOL_TPROTOCOL_H_ 1
|
|
735
735
|
|
|
736
736
|
#ifdef _WIN32
|
|
737
737
|
// Need to come before any Windows.h includes
|
|
@@ -763,8 +763,8 @@ protected:
|
|
|
763
763
|
* under the License.
|
|
764
764
|
*/
|
|
765
765
|
|
|
766
|
-
#ifndef
|
|
767
|
-
#define
|
|
766
|
+
#ifndef _DUCKDB_THRIFT_TRANSPORT_TTRANSPORT_H_
|
|
767
|
+
#define _DUCKDB_THRIFT_TRANSPORT_TTRANSPORT_H_ 1
|
|
768
768
|
|
|
769
769
|
|
|
770
770
|
|
|
@@ -792,8 +792,8 @@ protected:
|
|
|
792
792
|
* under the License.
|
|
793
793
|
*/
|
|
794
794
|
|
|
795
|
-
#ifndef
|
|
796
|
-
#define
|
|
795
|
+
#ifndef _DUCKDB_THRIFT_TRANSPORT_TTRANSPORTEXCEPTION_H_
|
|
796
|
+
#define _DUCKDB_THRIFT_TRANSPORT_TTRANSPORTEXCEPTION_H_ 1
|
|
797
797
|
|
|
798
798
|
// FUCK OFF #include <boost/numeric/conversion/cast.hpp>
|
|
799
799
|
#include <string>
|
|
@@ -878,7 +878,7 @@ protected:
|
|
|
878
878
|
}
|
|
879
879
|
} // duckdb_apache::thrift::transport
|
|
880
880
|
|
|
881
|
-
#endif // #ifndef
|
|
881
|
+
#endif // #ifndef _DUCKDB_THRIFT_TRANSPORT_TTRANSPORTEXCEPTION_H_
|
|
882
882
|
|
|
883
883
|
|
|
884
884
|
// LICENSE_CHANGE_END
|
|
@@ -1129,7 +1129,7 @@ public:
|
|
|
1129
1129
|
}
|
|
1130
1130
|
} // duckdb_apache::thrift::transport
|
|
1131
1131
|
|
|
1132
|
-
#endif // #ifndef
|
|
1132
|
+
#endif // #ifndef _DUCKDB_THRIFT_TRANSPORT_TTRANSPORT_H_
|
|
1133
1133
|
|
|
1134
1134
|
|
|
1135
1135
|
// LICENSE_CHANGE_END
|
|
@@ -1159,8 +1159,8 @@ public:
|
|
|
1159
1159
|
* under the License.
|
|
1160
1160
|
*/
|
|
1161
1161
|
|
|
1162
|
-
#ifndef
|
|
1163
|
-
#define
|
|
1162
|
+
#ifndef _DUCKDB_THRIFT_PROTOCOL_TPROTOCOLEXCEPTION_H_
|
|
1163
|
+
#define _DUCKDB_THRIFT_PROTOCOL_TPROTOCOLEXCEPTION_H_ 1
|
|
1164
1164
|
|
|
1165
1165
|
#include <string>
|
|
1166
1166
|
|
|
@@ -1244,7 +1244,7 @@ protected:
|
|
|
1244
1244
|
}
|
|
1245
1245
|
} // duckdb_apache::thrift::protocol
|
|
1246
1246
|
|
|
1247
|
-
#endif // #ifndef
|
|
1247
|
+
#endif // #ifndef _DUCKDB_THRIFT_PROTOCOL_TPROTOCOLEXCEPTION_H_
|
|
1248
1248
|
|
|
1249
1249
|
|
|
1250
1250
|
// LICENSE_CHANGE_END
|
|
@@ -1267,37 +1267,39 @@ protected:
|
|
|
1267
1267
|
// but that doesn't work.
|
|
1268
1268
|
// For a pretty in-depth explanation of the problem, see
|
|
1269
1269
|
// http://cellperformance.beyond3d.com/articles/2006/06/understanding-strict-aliasing.html
|
|
1270
|
+
namespace duckdb_apache { namespace thrift {
|
|
1270
1271
|
template <typename To, typename From>
|
|
1271
1272
|
static inline To bitwise_cast(From from) {
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1273
|
+
static_assert(sizeof(From) == sizeof(To), "sizeof(From) == sizeof(To)");
|
|
1274
|
+
|
|
1275
|
+
// BAD!!! These are all broken with -O2.
|
|
1276
|
+
// return *reinterpret_cast<To*>(&from); // BAD!!!
|
|
1277
|
+
// return *static_cast<To*>(static_cast<void*>(&from)); // BAD!!!
|
|
1278
|
+
// return *(To*)(void*)&from; // BAD!!!
|
|
1279
|
+
|
|
1280
|
+
// Super clean and partially blessed by section 3.9 of the standard.
|
|
1281
|
+
// unsigned char c[sizeof(from)];
|
|
1282
|
+
// memcpy(c, &from, sizeof(from));
|
|
1283
|
+
// To to;
|
|
1284
|
+
// memcpy(&to, c, sizeof(c));
|
|
1285
|
+
// return to;
|
|
1286
|
+
|
|
1287
|
+
// Slightly more questionable.
|
|
1288
|
+
// Same code emitted by GCC.
|
|
1289
|
+
// To to;
|
|
1290
|
+
// memcpy(&to, &from, sizeof(from));
|
|
1291
|
+
// return to;
|
|
1292
|
+
|
|
1293
|
+
// Technically undefined, but almost universally supported,
|
|
1294
|
+
// and the most efficient implementation.
|
|
1295
|
+
union {
|
|
1296
|
+
From f;
|
|
1297
|
+
To t;
|
|
1298
|
+
} u;
|
|
1299
|
+
u.f = from;
|
|
1300
|
+
return u.t;
|
|
1300
1301
|
}
|
|
1302
|
+
}} // namespace duckdb_apache::thrift
|
|
1301
1303
|
|
|
1302
1304
|
|
|
1303
1305
|
#ifdef HAVE_SYS_PARAM_H
|
|
@@ -1983,7 +1985,7 @@ uint32_t skip(Protocol_& prot, TType type) {
|
|
|
1983
1985
|
|
|
1984
1986
|
}}} // duckdb_apache::thrift::protocol
|
|
1985
1987
|
|
|
1986
|
-
#endif // #define
|
|
1988
|
+
#endif // #define _DUCKDB_THRIFT_PROTOCOL_TPROTOCOL_H_ 1
|
|
1987
1989
|
|
|
1988
1990
|
|
|
1989
1991
|
// LICENSE_CHANGE_END
|
|
@@ -2001,7 +2003,7 @@ public:
|
|
|
2001
2003
|
}
|
|
2002
2004
|
} // duckdb_apache::thrift
|
|
2003
2005
|
|
|
2004
|
-
#endif // #ifndef
|
|
2006
|
+
#endif // #ifndef _DUCKDB_THRIFT_TBASE_H_
|
|
2005
2007
|
|
|
2006
2008
|
|
|
2007
2009
|
// LICENSE_CHANGE_END
|
|
@@ -4625,7 +4627,7 @@ std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
|
|
|
4625
4627
|
// LICENSE_CHANGE_END
|
|
4626
4628
|
|
|
4627
4629
|
|
|
4628
|
-
|
|
4630
|
+
#include <list>
|
|
4629
4631
|
|
|
4630
4632
|
|
|
4631
4633
|
// LICENSE_CHANGE_BEGIN
|
|
@@ -4651,8 +4653,8 @@ std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
|
|
|
4651
4653
|
* under the License.
|
|
4652
4654
|
*/
|
|
4653
4655
|
|
|
4654
|
-
#ifndef
|
|
4655
|
-
#define
|
|
4656
|
+
#ifndef _DUCKDB_THRIFT_PROTOCOL_TCOMPACTPROTOCOL_H_
|
|
4657
|
+
#define _DUCKDB_THRIFT_PROTOCOL_TCOMPACTPROTOCOL_H_ 1
|
|
4656
4658
|
|
|
4657
4659
|
|
|
4658
4660
|
|
|
@@ -4679,8 +4681,8 @@ std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
|
|
|
4679
4681
|
* under the License.
|
|
4680
4682
|
*/
|
|
4681
4683
|
|
|
4682
|
-
#ifndef
|
|
4683
|
-
#define
|
|
4684
|
+
#ifndef _DUCKDB_THRIFT_PROTOCOL_TVIRTUALPROTOCOL_H_
|
|
4685
|
+
#define _DUCKDB_THRIFT_PROTOCOL_TVIRTUALPROTOCOL_H_ 1
|
|
4684
4686
|
|
|
4685
4687
|
|
|
4686
4688
|
|
|
@@ -5172,7 +5174,7 @@ protected:
|
|
|
5172
5174
|
}
|
|
5173
5175
|
} // duckdb_apache::thrift::protocol
|
|
5174
5176
|
|
|
5175
|
-
#endif // #define
|
|
5177
|
+
#endif // #define _DUCKDB_THRIFT_PROTOCOL_TVIRTUALPROTOCOL_H_ 1
|
|
5176
5178
|
|
|
5177
5179
|
|
|
5178
5180
|
// LICENSE_CHANGE_END
|
|
@@ -5441,8 +5443,8 @@ typedef TCompactProtocolFactoryT<TTransport> TCompactProtocolFactory;
|
|
|
5441
5443
|
* specific language governing permissions and limitations
|
|
5442
5444
|
* under the License.
|
|
5443
5445
|
*/
|
|
5444
|
-
#ifndef
|
|
5445
|
-
#define
|
|
5446
|
+
#ifndef _DUCKDB_THRIFT_PROTOCOL_TCOMPACTPROTOCOL_TCC_
|
|
5447
|
+
#define _DUCKDB_THRIFT_PROTOCOL_TCOMPACTPROTOCOL_TCC_ 1
|
|
5446
5448
|
|
|
5447
5449
|
#include <limits>
|
|
5448
5450
|
|
|
@@ -6248,7 +6250,7 @@ TType TCompactProtocolT<Transport_>::getTType(int8_t type) {
|
|
|
6248
6250
|
|
|
6249
6251
|
}}} // duckdb_apache::thrift::protocol
|
|
6250
6252
|
|
|
6251
|
-
#endif //
|
|
6253
|
+
#endif // _DUCKDB_THRIFT_PROTOCOL_TCOMPACTPROTOCOL_TCC_
|
|
6252
6254
|
|
|
6253
6255
|
|
|
6254
6256
|
// LICENSE_CHANGE_END
|
|
@@ -6284,8 +6286,8 @@ TType TCompactProtocolT<Transport_>::getTType(int8_t type) {
|
|
|
6284
6286
|
* under the License.
|
|
6285
6287
|
*/
|
|
6286
6288
|
|
|
6287
|
-
#ifndef
|
|
6288
|
-
#define
|
|
6289
|
+
#ifndef _DUCKDB_THRIFT_TRANSPORT_TBUFFERTRANSPORTS_H_
|
|
6290
|
+
#define _DUCKDB_THRIFT_TRANSPORT_TBUFFERTRANSPORTS_H_ 1
|
|
6289
6291
|
|
|
6290
6292
|
#include <cstdlib>
|
|
6291
6293
|
#include <cstddef>
|
|
@@ -6319,8 +6321,8 @@ TType TCompactProtocolT<Transport_>::getTType(int8_t type) {
|
|
|
6319
6321
|
* under the License.
|
|
6320
6322
|
*/
|
|
6321
6323
|
|
|
6322
|
-
#ifndef
|
|
6323
|
-
#define
|
|
6324
|
+
#ifndef _DUCKDB_THRIFT_TRANSPORT_TVIRTUALTRANSPORT_H_
|
|
6325
|
+
#define _DUCKDB_THRIFT_TRANSPORT_TVIRTUALTRANSPORT_H_ 1
|
|
6324
6326
|
|
|
6325
6327
|
|
|
6326
6328
|
|
|
@@ -6439,7 +6441,7 @@ protected:
|
|
|
6439
6441
|
}
|
|
6440
6442
|
} // duckdb_apache::thrift::transport
|
|
6441
6443
|
|
|
6442
|
-
#endif // #ifndef
|
|
6444
|
+
#endif // #ifndef _DUCKDB_THRIFT_TRANSPORT_TVIRTUALTRANSPORT_H_
|
|
6443
6445
|
|
|
6444
6446
|
|
|
6445
6447
|
// LICENSE_CHANGE_END
|
|
@@ -6902,7 +6904,7 @@ protected:
|
|
|
6902
6904
|
}
|
|
6903
6905
|
} // duckdb_apache::thrift::transport
|
|
6904
6906
|
|
|
6905
|
-
#endif // #ifndef
|
|
6907
|
+
#endif // #ifndef _DUCKDB_THRIFT_TRANSPORT_TBUFFERTRANSPORTS_H_
|
|
6906
6908
|
|
|
6907
6909
|
|
|
6908
6910
|
// LICENSE_CHANGE_END
|
|
@@ -6916,31 +6918,174 @@ protected:
|
|
|
6916
6918
|
|
|
6917
6919
|
namespace duckdb {
|
|
6918
6920
|
|
|
6921
|
+
// A ReadHead for prefetching data in a specific range
|
|
6922
|
+
struct ReadHead {
|
|
6923
|
+
ReadHead(idx_t location, uint64_t size) : location(location), size(size) {};
|
|
6924
|
+
// Hint info
|
|
6925
|
+
idx_t location;
|
|
6926
|
+
uint64_t size;
|
|
6927
|
+
|
|
6928
|
+
// Current info
|
|
6929
|
+
unique_ptr<AllocatedData> data;
|
|
6930
|
+
bool data_isset = false;
|
|
6931
|
+
|
|
6932
|
+
idx_t GetEnd() const {
|
|
6933
|
+
return size + location;
|
|
6934
|
+
}
|
|
6935
|
+
|
|
6936
|
+
void Allocate(Allocator &allocator) {
|
|
6937
|
+
data = allocator.Allocate(size);
|
|
6938
|
+
}
|
|
6939
|
+
};
|
|
6940
|
+
|
|
6941
|
+
// Comparator for ReadHeads that are either overlapping, adjacent, or within ALLOW_GAP bytes from each other
|
|
6942
|
+
struct ReadHeadComparator {
|
|
6943
|
+
static constexpr uint64_t ALLOW_GAP = 1 << 14; // 16 KiB
|
|
6944
|
+
bool operator()(const ReadHead *a, const ReadHead *b) const {
|
|
6945
|
+
auto a_start = a->location;
|
|
6946
|
+
auto a_end = a->location + a->size;
|
|
6947
|
+
auto b_start = b->location;
|
|
6948
|
+
|
|
6949
|
+
if (a_end <= NumericLimits<idx_t>::Maximum() - ALLOW_GAP) {
|
|
6950
|
+
a_end += ALLOW_GAP;
|
|
6951
|
+
}
|
|
6952
|
+
|
|
6953
|
+
return a_start < b_start && a_end < b_start;
|
|
6954
|
+
}
|
|
6955
|
+
};
|
|
6956
|
+
|
|
6957
|
+
// Two-step read ahead buffer
|
|
6958
|
+
// 1: register all ranges that will be read, merging ranges that are consecutive
|
|
6959
|
+
// 2: prefetch all registered ranges
|
|
6960
|
+
struct ReadAheadBuffer {
|
|
6961
|
+
ReadAheadBuffer(Allocator &allocator, FileHandle &handle, FileOpener &opener)
|
|
6962
|
+
: allocator(allocator), handle(handle), file_opener(opener) {
|
|
6963
|
+
}
|
|
6964
|
+
|
|
6965
|
+
// The list of read heads
|
|
6966
|
+
std::list<ReadHead> read_heads;
|
|
6967
|
+
// Set for merging consecutive ranges
|
|
6968
|
+
std::set<ReadHead *, ReadHeadComparator> merge_set;
|
|
6969
|
+
|
|
6970
|
+
Allocator &allocator;
|
|
6971
|
+
FileHandle &handle;
|
|
6972
|
+
FileOpener &file_opener;
|
|
6973
|
+
|
|
6974
|
+
idx_t total_size = 0;
|
|
6975
|
+
|
|
6976
|
+
// Add a read head to the prefetching list
|
|
6977
|
+
void AddReadHead(idx_t pos, uint64_t len, bool merge_buffers = true) {
|
|
6978
|
+
// Attempt to merge with existing
|
|
6979
|
+
if (merge_buffers) {
|
|
6980
|
+
ReadHead new_read_head {pos, len};
|
|
6981
|
+
auto lookup_set = merge_set.find(&new_read_head);
|
|
6982
|
+
if (lookup_set != merge_set.end()) {
|
|
6983
|
+
auto existing_head = *lookup_set;
|
|
6984
|
+
auto new_start = MinValue<idx_t>(existing_head->location, new_read_head.location);
|
|
6985
|
+
auto new_length = MaxValue<idx_t>(existing_head->GetEnd(), new_read_head.GetEnd()) - new_start;
|
|
6986
|
+
existing_head->location = new_start;
|
|
6987
|
+
existing_head->size = new_length;
|
|
6988
|
+
return;
|
|
6989
|
+
}
|
|
6990
|
+
}
|
|
6991
|
+
|
|
6992
|
+
read_heads.emplace_front(ReadHead(pos, len));
|
|
6993
|
+
total_size += len;
|
|
6994
|
+
auto &read_head = read_heads.front();
|
|
6995
|
+
|
|
6996
|
+
if (merge_buffers) {
|
|
6997
|
+
merge_set.insert(&read_head);
|
|
6998
|
+
}
|
|
6999
|
+
|
|
7000
|
+
if (read_head.GetEnd() > handle.GetFileSize()) {
|
|
7001
|
+
throw std::runtime_error("Prefetch registered for bytes outside file");
|
|
7002
|
+
}
|
|
7003
|
+
}
|
|
7004
|
+
|
|
7005
|
+
// Returns the relevant read head
|
|
7006
|
+
ReadHead *GetReadHead(idx_t pos) {
|
|
7007
|
+
for (auto &read_head : read_heads) {
|
|
7008
|
+
if (pos >= read_head.location && pos < read_head.GetEnd()) {
|
|
7009
|
+
return &read_head;
|
|
7010
|
+
}
|
|
7011
|
+
}
|
|
7012
|
+
return nullptr;
|
|
7013
|
+
}
|
|
7014
|
+
|
|
7015
|
+
// Prefetch all read heads
|
|
7016
|
+
void Prefetch() {
|
|
7017
|
+
for (auto &read_head : read_heads) {
|
|
7018
|
+
read_head.Allocate(allocator);
|
|
7019
|
+
|
|
7020
|
+
if (read_head.GetEnd() > handle.GetFileSize()) {
|
|
7021
|
+
throw std::runtime_error("Prefetch registered requested for bytes outside file");
|
|
7022
|
+
}
|
|
7023
|
+
|
|
7024
|
+
handle.Read(read_head.data->get(), read_head.size, read_head.location);
|
|
7025
|
+
read_head.data_isset = true;
|
|
7026
|
+
}
|
|
7027
|
+
}
|
|
7028
|
+
};
|
|
7029
|
+
|
|
6919
7030
|
class ThriftFileTransport : public duckdb_apache::thrift::transport::TVirtualTransport<ThriftFileTransport> {
|
|
6920
7031
|
public:
|
|
6921
|
-
|
|
6922
|
-
|
|
7032
|
+
static constexpr uint64_t PREFETCH_FALLBACK_BUFFERSIZE = 1000000;
|
|
7033
|
+
|
|
7034
|
+
ThriftFileTransport(Allocator &allocator, FileHandle &handle_p, FileOpener &opener, bool prefetch_mode_p)
|
|
7035
|
+
: handle(handle_p), location(0), allocator(allocator), ra_buffer(ReadAheadBuffer(allocator, handle_p, opener)),
|
|
7036
|
+
prefetch_mode(prefetch_mode_p) {
|
|
6923
7037
|
}
|
|
6924
7038
|
|
|
6925
7039
|
uint32_t read(uint8_t *buf, uint32_t len) {
|
|
6926
|
-
|
|
6927
|
-
|
|
6928
|
-
|
|
7040
|
+
auto prefetch_buffer = ra_buffer.GetReadHead(location);
|
|
7041
|
+
if (prefetch_buffer != nullptr && location - prefetch_buffer->location + len <= prefetch_buffer->size) {
|
|
7042
|
+
D_ASSERT(location - prefetch_buffer->location + len <= prefetch_buffer->size);
|
|
7043
|
+
|
|
7044
|
+
if (!prefetch_buffer->data_isset) {
|
|
7045
|
+
prefetch_buffer->Allocate(allocator);
|
|
7046
|
+
handle.Read(prefetch_buffer->data->get(), prefetch_buffer->size, prefetch_buffer->location);
|
|
7047
|
+
prefetch_buffer->data_isset = true;
|
|
7048
|
+
}
|
|
7049
|
+
memcpy(buf, prefetch_buffer->data->get() + location - prefetch_buffer->location, len);
|
|
6929
7050
|
} else {
|
|
6930
|
-
|
|
7051
|
+
if (prefetch_mode && len < PREFETCH_FALLBACK_BUFFERSIZE && len > 0) {
|
|
7052
|
+
Prefetch(location, MinValue<uint64_t>(PREFETCH_FALLBACK_BUFFERSIZE, handle.GetFileSize() - location));
|
|
7053
|
+
auto prefetch_buffer_fallback = ra_buffer.GetReadHead(location);
|
|
7054
|
+
D_ASSERT(location - prefetch_buffer_fallback->location + len <= prefetch_buffer_fallback->size);
|
|
7055
|
+
memcpy(buf, prefetch_buffer_fallback->data->get() + location - prefetch_buffer_fallback->location, len);
|
|
7056
|
+
} else {
|
|
7057
|
+
handle.Read(buf, len, location);
|
|
7058
|
+
}
|
|
6931
7059
|
}
|
|
6932
7060
|
location += len;
|
|
6933
7061
|
return len;
|
|
6934
7062
|
}
|
|
6935
7063
|
|
|
6936
|
-
|
|
6937
|
-
|
|
6938
|
-
|
|
6939
|
-
|
|
7064
|
+
// Prefetch a single buffer
|
|
7065
|
+
void Prefetch(idx_t pos, uint64_t len) {
|
|
7066
|
+
RegisterPrefetch(pos, len, false);
|
|
7067
|
+
FinalizeRegistration();
|
|
7068
|
+
PrefetchRegistered();
|
|
7069
|
+
}
|
|
7070
|
+
|
|
7071
|
+
// Register a buffer for prefetching
|
|
7072
|
+
void RegisterPrefetch(idx_t pos, uint64_t len, bool can_merge = true) {
|
|
7073
|
+
ra_buffer.AddReadHead(pos, len, can_merge);
|
|
7074
|
+
}
|
|
7075
|
+
|
|
7076
|
+
// Prevents any further merges, should be called before PrefetchRegistered
|
|
7077
|
+
void FinalizeRegistration() {
|
|
7078
|
+
ra_buffer.merge_set.clear();
|
|
7079
|
+
}
|
|
7080
|
+
|
|
7081
|
+
// Prefetch all previously registered ranges
|
|
7082
|
+
void PrefetchRegistered() {
|
|
7083
|
+
ra_buffer.Prefetch();
|
|
6940
7084
|
}
|
|
6941
7085
|
|
|
6942
7086
|
void ClearPrefetch() {
|
|
6943
|
-
|
|
7087
|
+
ra_buffer.read_heads.clear();
|
|
7088
|
+
ra_buffer.merge_set.clear();
|
|
6944
7089
|
}
|
|
6945
7090
|
|
|
6946
7091
|
void SetLocation(idx_t location_p) {
|
|
@@ -6955,12 +7100,17 @@ public:
|
|
|
6955
7100
|
}
|
|
6956
7101
|
|
|
6957
7102
|
private:
|
|
6958
|
-
Allocator &allocator;
|
|
6959
7103
|
FileHandle &handle;
|
|
6960
7104
|
idx_t location;
|
|
6961
7105
|
|
|
6962
|
-
|
|
6963
|
-
|
|
7106
|
+
Allocator &allocator;
|
|
7107
|
+
|
|
7108
|
+
// Multi-buffer prefetch
|
|
7109
|
+
ReadAheadBuffer ra_buffer;
|
|
7110
|
+
|
|
7111
|
+
// Whether the prefetch mode is enabled. In this mode the DirectIO flag of the handle will be set and the parquet
|
|
7112
|
+
// reader will manage the read buffering.
|
|
7113
|
+
bool prefetch_mode;
|
|
6964
7114
|
};
|
|
6965
7115
|
|
|
6966
7116
|
} // namespace duckdb
|
|
@@ -7417,8 +7567,13 @@ public:
|
|
|
7417
7567
|
idx_t MaxDefine() const;
|
|
7418
7568
|
idx_t MaxRepeat() const;
|
|
7419
7569
|
|
|
7570
|
+
virtual idx_t FileOffset() const;
|
|
7571
|
+
virtual uint64_t TotalCompressedSize();
|
|
7420
7572
|
virtual idx_t GroupRowsAvailable();
|
|
7421
7573
|
|
|
7574
|
+
// register the range this reader will touch for prefetching
|
|
7575
|
+
virtual void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge);
|
|
7576
|
+
|
|
7422
7577
|
virtual unique_ptr<BaseStatistics> Stats(const std::vector<ColumnChunk> &columns);
|
|
7423
7578
|
|
|
7424
7579
|
protected:
|
|
@@ -7433,6 +7588,9 @@ protected:
|
|
|
7433
7588
|
virtual void DictReference(Vector &result);
|
|
7434
7589
|
virtual void PlainReference(shared_ptr<ByteBuffer>, Vector &result);
|
|
7435
7590
|
|
|
7591
|
+
// applies any skips that were registered using Skip()
|
|
7592
|
+
virtual void ApplyPendingSkips(idx_t num_values);
|
|
7593
|
+
|
|
7436
7594
|
bool HasDefines() {
|
|
7437
7595
|
return max_define > 0;
|
|
7438
7596
|
}
|
|
@@ -7451,13 +7609,15 @@ protected:
|
|
|
7451
7609
|
ParquetReader &reader;
|
|
7452
7610
|
LogicalType type;
|
|
7453
7611
|
|
|
7612
|
+
idx_t pending_skips = 0;
|
|
7613
|
+
|
|
7454
7614
|
private:
|
|
7455
7615
|
void PrepareRead(parquet_filter_t &filter);
|
|
7456
7616
|
void PreparePage(idx_t compressed_page_size, idx_t uncompressed_page_size);
|
|
7457
7617
|
void PrepareDataPage(PageHeader &page_hdr);
|
|
7458
7618
|
void PreparePageV2(PageHeader &page_hdr);
|
|
7459
7619
|
|
|
7460
|
-
const duckdb_parquet::format::ColumnChunk *chunk;
|
|
7620
|
+
const duckdb_parquet::format::ColumnChunk *chunk = nullptr;
|
|
7461
7621
|
|
|
7462
7622
|
duckdb_apache::thrift::protocol::TProtocol *protocol;
|
|
7463
7623
|
idx_t page_rows_available;
|
|
@@ -7545,6 +7705,11 @@ class ChunkCollection;
|
|
|
7545
7705
|
class BaseStatistics;
|
|
7546
7706
|
class TableFilterSet;
|
|
7547
7707
|
|
|
7708
|
+
struct ParquetReaderPrefetchConfig {
|
|
7709
|
+
// Minimum fraction (0.0-1.0) of the data in a row group span that must be scanned to enable whole-group prefetch
|
|
7710
|
+
static constexpr double WHOLE_GROUP_PREFETCH_MINIMUM_SCAN = 0.95;
|
|
7711
|
+
};
|
|
7712
|
+
|
|
7548
7713
|
struct ParquetReaderScanState {
|
|
7549
7714
|
vector<idx_t> group_idx_list;
|
|
7550
7715
|
int64_t current_group;
|
|
@@ -7560,6 +7725,9 @@ struct ParquetReaderScanState {
|
|
|
7560
7725
|
|
|
7561
7726
|
ResizeableBuffer define_buf;
|
|
7562
7727
|
ResizeableBuffer repeat_buf;
|
|
7728
|
+
|
|
7729
|
+
bool prefetch_mode = false;
|
|
7730
|
+
bool current_group_prefetched = false;
|
|
7563
7731
|
};
|
|
7564
7732
|
|
|
7565
7733
|
struct ParquetOptions {
|
|
@@ -7568,6 +7736,8 @@ struct ParquetOptions {
|
|
|
7568
7736
|
explicit ParquetOptions(ClientContext &context);
|
|
7569
7737
|
|
|
7570
7738
|
bool binary_as_string = false;
|
|
7739
|
+
bool filename = false;
|
|
7740
|
+
bool hive_partitioning = false;
|
|
7571
7741
|
};
|
|
7572
7742
|
|
|
7573
7743
|
class ParquetReader {
|
|
@@ -7624,6 +7794,10 @@ private:
|
|
|
7624
7794
|
idx_t depth, idx_t max_define, idx_t max_repeat,
|
|
7625
7795
|
idx_t &next_schema_idx, idx_t &next_file_idx);
|
|
7626
7796
|
const duckdb_parquet::format::RowGroup &GetGroup(ParquetReaderScanState &state);
|
|
7797
|
+
uint64_t GetGroupCompressedSize(ParquetReaderScanState &state);
|
|
7798
|
+
idx_t GetGroupOffset(ParquetReaderScanState &state);
|
|
7799
|
+
// Group span is the distance between the min page offset and the max page offset plus the max page compressed size
|
|
7800
|
+
uint64_t GetGroupSpan(ParquetReaderScanState &state);
|
|
7627
7801
|
void PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t out_col_idx);
|
|
7628
7802
|
LogicalType DeriveLogicalType(const SchemaElement &s_ele);
|
|
7629
7803
|
|
|
@@ -7681,7 +7855,7 @@ namespace duckdb {
|
|
|
7681
7855
|
class BufferedSerializer;
|
|
7682
7856
|
class ParquetWriter;
|
|
7683
7857
|
class ColumnWriterPageState;
|
|
7684
|
-
class
|
|
7858
|
+
class BasicColumnWriterState;
|
|
7685
7859
|
|
|
7686
7860
|
class ColumnWriterState {
|
|
7687
7861
|
public:
|
|
@@ -7703,9 +7877,6 @@ public:
|
|
|
7703
7877
|
};
|
|
7704
7878
|
|
|
7705
7879
|
class ColumnWriter {
|
|
7706
|
-
//! We limit the uncompressed page size to 100MB
|
|
7707
|
-
// The max size in Parquet is 2GB, but we choose a more conservative limit
|
|
7708
|
-
static constexpr const idx_t MAX_UNCOMPRESSED_PAGE_SIZE = 100000000;
|
|
7709
7880
|
|
|
7710
7881
|
public:
|
|
7711
7882
|
ColumnWriter(ParquetWriter &writer, idx_t schema_idx, vector<string> schema_path, idx_t max_repeat,
|
|
@@ -7729,46 +7900,35 @@ public:
|
|
|
7729
7900
|
idx_t max_repeat = 0, idx_t max_define = 1,
|
|
7730
7901
|
bool can_have_nulls = true);
|
|
7731
7902
|
|
|
7732
|
-
virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group);
|
|
7733
|
-
virtual void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count);
|
|
7903
|
+
virtual unique_ptr<ColumnWriterState> InitializeWriteState(duckdb_parquet::format::RowGroup &row_group) = 0;
|
|
7734
7904
|
|
|
7735
|
-
|
|
7736
|
-
virtual
|
|
7737
|
-
|
|
7738
|
-
|
|
7739
|
-
protected:
|
|
7740
|
-
void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, ValidityMask &validity, idx_t count,
|
|
7741
|
-
uint16_t define_value, uint16_t null_value);
|
|
7742
|
-
void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count, idx_t max_repeat);
|
|
7743
|
-
|
|
7744
|
-
void WriteLevels(Serializer &temp_writer, const vector<uint16_t> &levels, idx_t max_value, idx_t start_offset,
|
|
7745
|
-
idx_t count);
|
|
7905
|
+
//! Indicates whether the writer needs to analyse the data before preparing it
|
|
7906
|
+
virtual bool HasAnalyze() {
|
|
7907
|
+
return false;
|
|
7908
|
+
}
|
|
7746
7909
|
|
|
7747
|
-
virtual
|
|
7910
|
+
virtual void Analyze(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) {
|
|
7911
|
+
throw NotImplementedException("Writer does not need analysis");
|
|
7912
|
+
}
|
|
7748
7913
|
|
|
7749
|
-
|
|
7750
|
-
void
|
|
7751
|
-
|
|
7914
|
+
//! Called after all data has been passed to Analyze
|
|
7915
|
+
virtual void FinalizeAnalyze(ColumnWriterState &state) {
|
|
7916
|
+
throw NotImplementedException("Writer does not need analysis");
|
|
7917
|
+
}
|
|
7752
7918
|
|
|
7753
|
-
virtual void
|
|
7919
|
+
virtual void Prepare(ColumnWriterState &state, ColumnWriterState *parent, Vector &vector, idx_t count) = 0;
|
|
7754
7920
|
|
|
7755
|
-
|
|
7756
|
-
virtual
|
|
7757
|
-
|
|
7758
|
-
virtual idx_t GetRowSize(Vector &vector, idx_t index);
|
|
7759
|
-
//! Writes a (subset of a) vector to the specified serializer. Only used for scalar types.
|
|
7760
|
-
virtual void WriteVector(Serializer &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state,
|
|
7761
|
-
Vector &vector, idx_t chunk_start, idx_t chunk_end);
|
|
7921
|
+
virtual void BeginWrite(ColumnWriterState &state) = 0;
|
|
7922
|
+
virtual void Write(ColumnWriterState &state, Vector &vector, idx_t count) = 0;
|
|
7923
|
+
virtual void FinalizeWrite(ColumnWriterState &state) = 0;
|
|
7762
7924
|
|
|
7763
|
-
|
|
7764
|
-
|
|
7765
|
-
|
|
7766
|
-
|
|
7925
|
+
protected:
|
|
7926
|
+
void HandleDefineLevels(ColumnWriterState &state, ColumnWriterState *parent, ValidityMask &validity, idx_t count,
|
|
7927
|
+
uint16_t define_value, uint16_t null_value);
|
|
7928
|
+
void HandleRepeatLevels(ColumnWriterState &state_p, ColumnWriterState *parent, idx_t count, idx_t max_repeat);
|
|
7767
7929
|
|
|
7768
7930
|
void CompressPage(BufferedSerializer &temp_writer, size_t &compressed_size, data_ptr_t &compressed_data,
|
|
7769
7931
|
unique_ptr<data_t[]> &compressed_buf);
|
|
7770
|
-
|
|
7771
|
-
void SetParquetStatistics(StandardColumnWriterState &state, duckdb_parquet::format::ColumnChunk &column);
|
|
7772
7932
|
};
|
|
7773
7933
|
|
|
7774
7934
|
} // namespace duckdb
|
|
@@ -7781,6 +7941,7 @@ class FileOpener;
|
|
|
7781
7941
|
|
|
7782
7942
|
class ParquetWriter {
|
|
7783
7943
|
friend class ColumnWriter;
|
|
7944
|
+
friend class BasicColumnWriter;
|
|
7784
7945
|
friend class ListColumnWriter;
|
|
7785
7946
|
friend class StructColumnWriter;
|
|
7786
7947
|
|