npm - duckdb - Versions diffs - 0.6.2-dev1206.0 → 0.6.2-dev1218.0 - Mend

duckdb 0.6.2-dev1206.0 → 0.6.2-dev1218.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/package.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.6.2-dev1206.0",
+  "version": "0.6.2-dev1218.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {

package/src/duckdb/extension/parquet/column_reader.cpp CHANGED Viewed

@@ -184,7 +184,6 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
 	dict_decoder.reset();
 	defined_decoder.reset();
 	block.reset();
 	PageHeader page_hdr;
 	page_hdr.read(protocol);
@@ -204,6 +203,10 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
 	default:
 		break; // ignore INDEX page type and any other custom extensions
 	}
+	ResetPage();
+}
+void ColumnReader::ResetPage() {
 }
 void ColumnReader::PreparePageV2(PageHeader &page_hdr) {

package/src/duckdb/extension/parquet/include/boolean_column_reader.hpp CHANGED Viewed

@@ -31,6 +31,10 @@ public:
 		TemplatedColumnReader<bool, BooleanParquetValueConversion>::InitializeRead(row_group_idx_p, columns,
 		                                                                           protocol_p);
 	}
+	void ResetPage() override {
+		byte_pos = 0;
+	}
 };
 struct BooleanParquetValueConversion {

package/src/duckdb/extension/parquet/include/column_reader.hpp CHANGED Viewed

@@ -134,6 +134,8 @@ protected:
 	idx_t pending_skips = 0;
+	virtual void ResetPage();
 private:
 	void AllocateBlock(idx_t size);
 	void AllocateCompressed(idx_t size);

package/src/duckdb/extension/parquet/include/parquet_reader.hpp CHANGED Viewed

@@ -109,6 +109,8 @@ public:
 	vector<string> names;
 	shared_ptr<ParquetFileMetadataCache> metadata;
 	ParquetOptions parquet_options;
+	//! maps hive partition names to string columns
+	unique_ptr<std::map<string, string>> hive_map;
 	//! when reading multiple parquet files (with union by name option)
 	//! TableFunction might return more cols than any single parquet file. Even all parquet files have same

package/src/duckdb/extension/parquet/parquet_reader.cpp CHANGED Viewed

@@ -344,6 +344,17 @@ unique_ptr<ColumnReader> ParquetReader::CreateReaderRecursive(const FileMetaData
 			                                     std::move(element_reader));
 		}
+		// if this is a hive partition col, we should not read it at all but instead do a constant reader.
+		if (parquet_options.hive_partitioning && hive_map && depth == 1) {
+			auto lookup = hive_map->find(s_ele.name);
+			if (lookup != hive_map->end()) {
+				Value val = Value(lookup->second);
+				return make_unique<GeneratedConstantColumnReader>(*this, LogicalType::VARCHAR, SchemaElement(),
+				                                                  next_file_idx++, max_define, max_repeat, val);
+				;
+			}
+		}
 		// TODO check return value of derive type or should we only do this on read()
 		return ColumnReader::CreateReader(*this, DeriveLogicalType(s_ele), s_ele, next_file_idx++, max_define,
 		                                  max_repeat);
@@ -381,9 +392,7 @@ unique_ptr<ColumnReader> ParquetReader::CreateReader(const duckdb_parquet::forma
 	}
 	if (parquet_options.hive_partitioning) {
-		auto res = HivePartitioning::Parse(file_name);
-		for (auto &partition : res) {
+		for (auto &partition : *hive_map) {
 			Value val = Value(partition.second);
 			root_struct_reader.child_readers.push_back(make_unique<GeneratedConstantColumnReader>(
 			    *this, LogicalType::VARCHAR, SchemaElement(), next_file_idx, 0, 0, val));
@@ -439,10 +448,16 @@ void ParquetReader::InitializeSchema(const vector<string> &expected_names, const
 	// Add generated constant column for filename
 	if (parquet_options.hive_partitioning) {
-		auto partitions = HivePartitioning::Parse(file_name);
-		for (auto &part : partitions) {
-			return_types.emplace_back(LogicalType::VARCHAR);
-			names.emplace_back(part.first);
+		for (auto &part : *hive_map) {
+			// We need to lookup the hive col in the cols of the file to avoid duplicating columns that are both
+			// in the file and the hive path
+			auto lookup =
+			    std::find_if(child_types.begin(), child_types.end(),
+			                 [&part](const std::pair<std::string, LogicalType> &x) { return x.first == part.first; });
+			if (lookup == child_types.end()) {
+				return_types.emplace_back(LogicalType::VARCHAR);
+				names.emplace_back(part.first);
+			}
 		}
 	}
@@ -541,6 +556,11 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, const
 			ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
 		}
 	}
+	if (parquet_options.hive_partitioning) {
+		hive_map = make_unique<std::map<string, string>>(HivePartitioning::Parse(file_name));
+	}
 	InitializeSchema(expected_names, expected_types_p, column_ids, initial_filename_p);
 }

package/src/duckdb/src/function/table/version/pragma_version.cpp CHANGED Viewed

@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.6.2-dev1206"
+#define DUCKDB_VERSION "0.6.2-dev1218"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "7067a9ae38"
+#define DUCKDB_SOURCE_ID "e2dfc274b0"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"