duckdb 0.6.2-dev1206.0 → 0.6.2-dev1218.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.6.2-dev1206.0",
5
+ "version": "0.6.2-dev1218.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -184,7 +184,6 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
184
184
  dict_decoder.reset();
185
185
  defined_decoder.reset();
186
186
  block.reset();
187
-
188
187
  PageHeader page_hdr;
189
188
  page_hdr.read(protocol);
190
189
 
@@ -204,6 +203,10 @@ void ColumnReader::PrepareRead(parquet_filter_t &filter) {
204
203
  default:
205
204
  break; // ignore INDEX page type and any other custom extensions
206
205
  }
206
+ ResetPage();
207
+ }
208
+
209
+ void ColumnReader::ResetPage() {
207
210
  }
208
211
 
209
212
  void ColumnReader::PreparePageV2(PageHeader &page_hdr) {
@@ -31,6 +31,10 @@ public:
31
31
  TemplatedColumnReader<bool, BooleanParquetValueConversion>::InitializeRead(row_group_idx_p, columns,
32
32
  protocol_p);
33
33
  }
34
+
35
+ void ResetPage() override {
36
+ byte_pos = 0;
37
+ }
34
38
  };
35
39
 
36
40
  struct BooleanParquetValueConversion {
@@ -134,6 +134,8 @@ protected:
134
134
 
135
135
  idx_t pending_skips = 0;
136
136
 
137
+ virtual void ResetPage();
138
+
137
139
  private:
138
140
  void AllocateBlock(idx_t size);
139
141
  void AllocateCompressed(idx_t size);
@@ -109,6 +109,8 @@ public:
109
109
  vector<string> names;
110
110
  shared_ptr<ParquetFileMetadataCache> metadata;
111
111
  ParquetOptions parquet_options;
112
+ //! maps hive partition names to string columns
113
+ unique_ptr<std::map<string, string>> hive_map;
112
114
 
113
115
  //! when reading multiple parquet files (with union by name option)
114
116
  //! TableFunction might return more cols than any single parquet file. Even all parquet files have same
@@ -344,6 +344,17 @@ unique_ptr<ColumnReader> ParquetReader::CreateReaderRecursive(const FileMetaData
344
344
  std::move(element_reader));
345
345
  }
346
346
 
347
+ // if this is a hive partition col, we should not read it at all but instead do a constant reader.
348
+ if (parquet_options.hive_partitioning && hive_map && depth == 1) {
349
+ auto lookup = hive_map->find(s_ele.name);
350
+ if (lookup != hive_map->end()) {
351
+ Value val = Value(lookup->second);
352
+ return make_unique<GeneratedConstantColumnReader>(*this, LogicalType::VARCHAR, SchemaElement(),
353
+ next_file_idx++, max_define, max_repeat, val);
354
+ ;
355
+ }
356
+ }
357
+
347
358
  // TODO check return value of derive type or should we only do this on read()
348
359
  return ColumnReader::CreateReader(*this, DeriveLogicalType(s_ele), s_ele, next_file_idx++, max_define,
349
360
  max_repeat);
@@ -381,9 +392,7 @@ unique_ptr<ColumnReader> ParquetReader::CreateReader(const duckdb_parquet::forma
381
392
  }
382
393
 
383
394
  if (parquet_options.hive_partitioning) {
384
- auto res = HivePartitioning::Parse(file_name);
385
-
386
- for (auto &partition : res) {
395
+ for (auto &partition : *hive_map) {
387
396
  Value val = Value(partition.second);
388
397
  root_struct_reader.child_readers.push_back(make_unique<GeneratedConstantColumnReader>(
389
398
  *this, LogicalType::VARCHAR, SchemaElement(), next_file_idx, 0, 0, val));
@@ -439,10 +448,16 @@ void ParquetReader::InitializeSchema(const vector<string> &expected_names, const
439
448
 
440
449
  // Add generated constant column for filename
441
450
  if (parquet_options.hive_partitioning) {
442
- auto partitions = HivePartitioning::Parse(file_name);
443
- for (auto &part : partitions) {
444
- return_types.emplace_back(LogicalType::VARCHAR);
445
- names.emplace_back(part.first);
451
+ for (auto &part : *hive_map) {
452
+ // We need to lookup the hive col in the cols of the file to avoid duplicating columns that are both
453
+ // in the file and the hive path
454
+ auto lookup =
455
+ std::find_if(child_types.begin(), child_types.end(),
456
+ [&part](const std::pair<std::string, LogicalType> &x) { return x.first == part.first; });
457
+ if (lookup == child_types.end()) {
458
+ return_types.emplace_back(LogicalType::VARCHAR);
459
+ names.emplace_back(part.first);
460
+ }
446
461
  }
447
462
  }
448
463
 
@@ -541,6 +556,11 @@ ParquetReader::ParquetReader(ClientContext &context_p, string file_name_p, const
541
556
  ObjectCache::GetObjectCache(context_p).Put(file_name, metadata);
542
557
  }
543
558
  }
559
+
560
+ if (parquet_options.hive_partitioning) {
561
+ hive_map = make_unique<std::map<string, string>>(HivePartitioning::Parse(file_name));
562
+ }
563
+
544
564
  InitializeSchema(expected_names, expected_types_p, column_ids, initial_filename_p);
545
565
  }
546
566
 
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.6.2-dev1206"
2
+ #define DUCKDB_VERSION "0.6.2-dev1218"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "7067a9ae38"
5
+ #define DUCKDB_SOURCE_ID "e2dfc274b0"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"