duckdb 0.5.1-dev271.0 → 0.5.1-dev282.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +13 -6
- package/src/duckdb.hpp +6 -2
- package/src/parquet-amalgamation.cpp +20142 -20140
package/package.json
CHANGED
package/src/duckdb.cpp
CHANGED
|
@@ -18941,7 +18941,8 @@ namespace duckdb {
|
|
|
18941
18941
|
|
|
18942
18942
|
static unordered_map<column_t, string> GetKnownColumnValues(string &filename,
|
|
18943
18943
|
unordered_map<string, column_t> &column_map,
|
|
18944
|
-
|
|
18944
|
+
duckdb_re2::RE2 &compiled_regex, bool filename_col,
|
|
18945
|
+
bool hive_partition_cols) {
|
|
18945
18946
|
unordered_map<column_t, string> result;
|
|
18946
18947
|
|
|
18947
18948
|
if (filename_col) {
|
|
@@ -18952,7 +18953,7 @@ static unordered_map<column_t, string> GetKnownColumnValues(string &filename,
|
|
|
18952
18953
|
}
|
|
18953
18954
|
|
|
18954
18955
|
if (hive_partition_cols) {
|
|
18955
|
-
auto partitions = HivePartitioning::Parse(filename);
|
|
18956
|
+
auto partitions = HivePartitioning::Parse(filename, compiled_regex);
|
|
18956
18957
|
for (auto &partition : partitions) {
|
|
18957
18958
|
auto lookup_column_id = column_map.find(partition.first);
|
|
18958
18959
|
if (lookup_column_id != column_map.end()) {
|
|
@@ -18990,10 +18991,10 @@ static void ConvertKnownColRefToConstants(unique_ptr<Expression> &expr,
|
|
|
18990
18991
|
// - s3://bucket/var1=value1/bla/bla/var2=value2
|
|
18991
18992
|
// - http(s)://domain(:port)/lala/kasdl/var1=value1/?not-a-var=not-a-value
|
|
18992
18993
|
// - folder/folder/folder/../var1=value1/etc/.//var2=value2
|
|
18993
|
-
|
|
18994
|
-
std::map<string, string> result;
|
|
18994
|
+
const string HivePartitioning::REGEX_STRING = "[\\/\\\\]([^\\/\\?\\\\]+)=([^\\/\\n\\?\\\\]+)";
|
|
18995
18995
|
|
|
18996
|
-
|
|
18996
|
+
std::map<string, string> HivePartitioning::Parse(string &filename, duckdb_re2::RE2 &regex) {
|
|
18997
|
+
std::map<string, string> result;
|
|
18997
18998
|
duckdb_re2::StringPiece input(filename); // Wrap a StringPiece around it
|
|
18998
18999
|
|
|
18999
19000
|
string var;
|
|
@@ -19004,6 +19005,11 @@ std::map<string, string> HivePartitioning::Parse(string &filename) {
|
|
|
19004
19005
|
return result;
|
|
19005
19006
|
}
|
|
19006
19007
|
|
|
19008
|
+
std::map<string, string> HivePartitioning::Parse(string &filename) {
|
|
19009
|
+
duckdb_re2::RE2 regex(REGEX_STRING);
|
|
19010
|
+
return Parse(filename, regex);
|
|
19011
|
+
}
|
|
19012
|
+
|
|
19007
19013
|
// TODO: this can still be improved by removing the parts of filter expressions that are true for all remaining files.
|
|
19008
19014
|
// currently, only expressions that cannot be evaluated during pushdown are removed.
|
|
19009
19015
|
void HivePartitioning::ApplyFiltersToFileList(vector<string> &files, vector<unique_ptr<Expression>> &filters,
|
|
@@ -19011,6 +19017,7 @@ void HivePartitioning::ApplyFiltersToFileList(vector<string> &files, vector<uniq
|
|
|
19011
19017
|
bool hive_enabled, bool filename_enabled) {
|
|
19012
19018
|
vector<string> pruned_files;
|
|
19013
19019
|
vector<unique_ptr<Expression>> pruned_filters;
|
|
19020
|
+
duckdb_re2::RE2 regex(REGEX_STRING);
|
|
19014
19021
|
|
|
19015
19022
|
if ((!filename_enabled && !hive_enabled) || filters.empty()) {
|
|
19016
19023
|
return;
|
|
@@ -19019,7 +19026,7 @@ void HivePartitioning::ApplyFiltersToFileList(vector<string> &files, vector<uniq
|
|
|
19019
19026
|
for (idx_t i = 0; i < files.size(); i++) {
|
|
19020
19027
|
auto &file = files[i];
|
|
19021
19028
|
bool should_prune_file = false;
|
|
19022
|
-
auto known_values = GetKnownColumnValues(file, column_map, filename_enabled, hive_enabled);
|
|
19029
|
+
auto known_values = GetKnownColumnValues(file, column_map, regex, filename_enabled, hive_enabled);
|
|
19023
19030
|
|
|
19024
19031
|
FilterCombiner combiner;
|
|
19025
19032
|
for (auto &filter : filters) {
|
package/src/duckdb.hpp
CHANGED
|
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
|
|
|
11
11
|
#pragma once
|
|
12
12
|
#define DUCKDB_AMALGAMATION 1
|
|
13
13
|
#define DUCKDB_AMALGAMATION_EXTENDED 1
|
|
14
|
-
#define DUCKDB_SOURCE_ID "
|
|
15
|
-
#define DUCKDB_VERSION "v0.5.1-dev271"
|
|
14
|
+
#define DUCKDB_SOURCE_ID "3d23491fa"
|
|
15
|
+
#define DUCKDB_VERSION "v0.5.1-dev282"
|
|
16
16
|
//===----------------------------------------------------------------------===//
|
|
17
17
|
// DuckDB
|
|
18
18
|
//
|
|
@@ -26998,12 +26998,16 @@ class HivePartitioning {
|
|
|
26998
26998
|
public:
|
|
26999
26999
|
//! Parse a filename that follows the hive partitioning scheme
|
|
27000
27000
|
DUCKDB_API static std::map<string, string> Parse(string &filename);
|
|
27001
|
+
DUCKDB_API static std::map<string, string> Parse(string &filename, duckdb_re2::RE2 &regex);
|
|
27001
27002
|
//! Prunes a list of filenames based on a set of filters, can be used by TableFunctions in the
|
|
27002
27003
|
//! pushdown_complex_filter function to skip files with filename-based filters. Also removes the filters that always
|
|
27003
27004
|
//! evaluate to true.
|
|
27004
27005
|
DUCKDB_API static void ApplyFiltersToFileList(vector<string> &files, vector<unique_ptr<Expression>> &filters,
|
|
27005
27006
|
unordered_map<string, column_t> &column_map, idx_t table_index,
|
|
27006
27007
|
bool hive_enabled, bool filename_enabled);
|
|
27008
|
+
|
|
27009
|
+
//! The (uncompiled) regex pattern string used to match hive partitions; compile it into a duckdb_re2::RE2 before use
|
|
27010
|
+
DUCKDB_API static const string REGEX_STRING;
|
|
27007
27011
|
};
|
|
27008
27012
|
|
|
27009
27013
|
} // namespace duckdb
|