duckdb 0.8.2-dev4376.0 → 0.8.2-dev4424.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "duckdb",
  "main": "./lib/duckdb.js",
  "types": "./lib/duckdb.d.ts",
- "version": "0.8.2-dev4376.0",
+ "version": "0.8.2-dev4424.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
@@ -492,7 +492,7 @@ int32_t Date::ExtractDayOfTheYear(date_t date) {

  int64_t Date::ExtractJulianDay(date_t date) {
  // Julian Day 0 is (-4713, 11, 24) in the proleptic Gregorian calendar.
- static const auto JULIAN_EPOCH = -2440588;
+ static const int64_t JULIAN_EPOCH = -2440588;
  return date.days - JULIAN_EPOCH;
  }

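The only change in this hunk is the type of the epoch constant. A minimal standalone sketch (not DuckDB code; `date_t`/`date.days` is simplified to a plain `int32_t`) of why pinning the constant to `int64_t` matters: with `auto`, the literal deduces to `int`, so the subtraction happens in 32-bit arithmetic before being widened to the `int64_t` return type, which can overflow for day values near the `int32` limit.

```cpp
// Minimal sketch, assuming date.days is an int32_t day count since 1970-01-01.
#include <cassert>
#include <cstdint>

int64_t ExtractJulianDay(int32_t days) {
	// Julian Day of the Unix epoch (1970-01-01) is 2440588.
	static const int64_t JULIAN_EPOCH = -2440588;
	// The subtraction is now performed in 64-bit arithmetic.
	return days - JULIAN_EPOCH;
}

int main() {
	assert(ExtractJulianDay(0) == 2440588);  // Unix epoch
	assert(ExtractJulianDay(-2440588) == 0); // Julian Day 0
	return 0;
}
```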
@@ -148,9 +148,6 @@ void FixedSizeBuffer::Pin() {

  uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {

- // this function calls Get() on the buffer, so the buffer must already be in memory
- D_ASSERT(InMemory());
-
  // get the bitmask data
  auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
  ValidityMask mask(bitmask_ptr);
@@ -200,7 +197,7 @@ uint32_t FixedSizeBuffer::GetOffset(const idx_t bitmask_count) {

  uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {

- // this function calls Get() on the buffer, so the buffer must already be in memory
+ // this function calls Get() on the buffer
  D_ASSERT(InMemory());

  // finds the maximum zero bit in a bitmask, and adds one to it,
@@ -259,17 +256,13 @@ uint32_t FixedSizeBuffer::GetMaxOffset(const idx_t available_segments) {
  }

  // there are no allocations in this buffer
- // FIXME: put this line back in and then fix the missing vacuum bug in
- // FIXME: test_index_large_aborted_append.test with force_restart
- // FIXME: test if we still have non-dirty buffer to serialize after fixing this
- // throw InternalException("tried to serialize empty buffer");
- return 0;
+ throw InternalException("tried to serialize empty buffer");
  }

  void FixedSizeBuffer::SetUninitializedRegions(PartialBlockForIndex &p_block_for_index, const idx_t segment_size,
  const idx_t offset, const idx_t bitmask_offset) {

- // this function calls Get() on the buffer, so the buffer must already be in memory
+ // this function calls Get() on the buffer
  D_ASSERT(InMemory());

  auto bitmask_ptr = reinterpret_cast<validity_t *>(Get());
@@ -89,17 +89,19 @@ bool ParallelCSVReader::SetPosition() {
  position_buffer++;
  }
  if (position_buffer > end_buffer) {
+ VerifyLineLength(position_buffer, buffer->batch_index);
  return false;
  }
  SkipEmptyLines();
  if (verification_positions.beginning_of_first_line == 0) {
  verification_positions.beginning_of_first_line = position_buffer;
  }
-
+ VerifyLineLength(position_buffer, buffer->batch_index);
  verification_positions.end_of_last_line = position_buffer;
  return true;
  }
  }
+ VerifyLineLength(position_buffer, buffer->batch_index);
  return false;
  }
  SkipEmptyLines();
@@ -143,12 +145,13 @@ bool ParallelCSVReader::SetPosition() {
  break;
  }

- if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[position_buffer - 1])) {
+ auto pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;
+ if (position_buffer >= end_buffer && !StringUtil::CharacterIsNewline((*buffer)[pos_check])) {
  break;
  }

  if (position_buffer > end_buffer && options.dialect_options.new_line == NewLineIdentifier::CARRY_ON &&
- (*buffer)[position_buffer - 1] == '\n') {
+ (*buffer)[pos_check] == '\n') {
  break;
  }
  idx_t position_set = position_buffer;
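The new `pos_check` guard looks minor, but it protects against unsigned wrap-around: `position_buffer` is an unsigned `idx_t`, so `position_buffer - 1` at zero wraps to a huge index instead of going negative. A small standalone sketch (assuming `idx_t` is `uint64_t`, as in DuckDB) of the hazard and the fix:

```cpp
#include <cstdint>
#include <iostream>

using idx_t = uint64_t;

int main() {
	idx_t position_buffer = 0;

	// Wraps around: 0 - 1 == 18446744073709551615 for an unsigned 64-bit value,
	// so using it as a buffer index would read far out of bounds.
	idx_t wrapped = position_buffer - 1;

	// The pattern introduced by the patch: clamp the probe index at 0.
	idx_t pos_check = position_buffer == 0 ? position_buffer : position_buffer - 1;

	std::cout << wrapped << " vs " << pos_check << std::endl;
	return 0;
}
```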
@@ -300,7 +300,7 @@ public:
  const CSVReaderOptions &options, idx_t system_threads_p, const vector<string> &files_path_p,
  bool force_parallelism_p, vector<column_t> column_ids_p)
  : buffer_manager(std::move(buffer_manager_p)), system_threads(system_threads_p),
- buffer_size(options.buffer_size), force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
+ force_parallelism(force_parallelism_p), column_ids(std::move(column_ids_p)),
  line_info(main_mutex, batch_to_tuple_end, tuple_start, tuple_end) {
  current_file_path = files_path_p[0];
  CSVFileHandle *file_handle_ptr;
@@ -316,16 +316,6 @@ public:
  first_file_size = file_size;
  on_disk_file = file_handle_ptr->OnDiskFile();
  bytes_read = 0;
- if (buffer_size < file_size || file_size == 0) {
- bytes_per_local_state = buffer_size / ParallelCSVGlobalState::MaxThreads();
- } else {
- bytes_per_local_state = file_size / MaxThreads();
- }
- if (bytes_per_local_state == 0) {
- // In practice, I think this won't happen, it only happens because we are mocking up test scenarios
- // this boy needs to be at least one.
- bytes_per_local_state = 1;
- }
  running_threads = MaxThreads();

  // Initialize all the book-keeping variables
@@ -368,8 +358,6 @@ public:

  void UpdateLinesRead(CSVBufferRead &buffer_read, idx_t file_idx);

- void IncrementThread();
-
  void DecrementThread();

  bool Finished();
@@ -402,16 +390,12 @@ private:
  mutex main_mutex;
  //! Byte set from for last thread
  idx_t next_byte = 0;
- //! How many bytes we should execute per local state
- idx_t bytes_per_local_state;
  //! Size of first file
  idx_t first_file_size = 0;
  //! Whether or not this is an on-disk file
  bool on_disk_file = true;
  //! Basically max number of threads in DuckDB
  idx_t system_threads;
- //! Size of the buffers
- idx_t buffer_size;
  //! Current batch index
  idx_t batch_index = 0;
  idx_t local_batch_index = 0;
@@ -454,11 +438,6 @@ idx_t ParallelCSVGlobalState::MaxThreads() const {
  return system_threads;
  }

- void ParallelCSVGlobalState::IncrementThread() {
- lock_guard<mutex> parallel_lock(main_mutex);
- running_threads++;
- }
-
  void ParallelCSVGlobalState::DecrementThread() {
  lock_guard<mutex> parallel_lock(main_mutex);
  D_ASSERT(running_threads > 0);
@@ -572,6 +551,7 @@ bool ParallelCSVGlobalState::Next(ClientContext &context, const ReadCSVData &bin
  }
  // set up the current buffer
  line_info.current_batches[file_index - 1].insert(local_batch_index);
+ idx_t bytes_per_local_state = current_buffer->actual_size / MaxThreads() + 1;
  auto result = make_uniq<CSVBufferRead>(
  buffer_manager->GetBuffer(cur_buffer_idx), buffer_manager->GetBuffer(cur_buffer_idx + 1), next_byte,
  next_byte + bytes_per_local_state, batch_index++, local_batch_index++, &line_info);
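Rather than computing a fixed per-thread byte budget once in the constructor (the block removed above), each call to `Next` now derives it from the size of the buffer actually being handed out. A small sketch (plain C++, not DuckDB code) of the new arithmetic; the `+ 1` keeps the budget at least one byte, which is what the removed `bytes_per_local_state = 1` fallback used to guarantee:

```cpp
#include <cstdint>
#include <iostream>

using idx_t = uint64_t;

// Per-thread byte budget for scanning one CSV buffer.
idx_t BytesPerLocalState(idx_t buffer_actual_size, idx_t max_threads) {
	return buffer_actual_size / max_threads + 1;
}

int main() {
	std::cout << BytesPerLocalState(32 * 1024 * 1024, 8) << std::endl; // 4194305
	std::cout << BytesPerLocalState(3, 8) << std::endl;                // 1, never 0
	return 0;
}
```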
@@ -1135,6 +1115,9 @@ unique_ptr<TableRef> ReadCSVReplacement(ClientContext &context, const string &ta
  if (StringUtil::EndsWith(lower_name, ".gz")) {
  lower_name = lower_name.substr(0, lower_name.size() - 3);
  } else if (StringUtil::EndsWith(lower_name, ".zst")) {
+ if (!Catalog::TryAutoLoad(context, "parquet")) {
+ throw MissingExtensionException("parquet extension is required for reading zst compressed file");
+ }
  lower_name = lower_name.substr(0, lower_name.size() - 4);
  }
  if (!StringUtil::EndsWith(lower_name, ".csv") && !StringUtil::Contains(lower_name, ".csv?") &&
@@ -1,8 +1,8 @@
  #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.8.2-dev4376"
+ #define DUCKDB_VERSION "0.8.2-dev4424"
  #endif
  #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "312b995450"
+ #define DUCKDB_SOURCE_ID "b78b24ad26"
  #endif
  #include "duckdb/function/table/system_functions.hpp"
  #include "duckdb/main/database.hpp"
@@ -40,7 +40,7 @@ public:
  vector<string> names;

  public:
- DUCKDB_API void ThrowError(const string &prepended_message = "") const;
+ [[noreturn]] DUCKDB_API void ThrowError(const string &prepended_message = "") const;
  DUCKDB_API void SetError(PreservedError error);
  DUCKDB_API bool HasError() const;
  DUCKDB_API const ExceptionType &GetErrorType() const;
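Marking the always-throwing `ThrowError` as `[[noreturn]]` tells the compiler that control never comes back from the call, which improves diagnostics in callers (for example, no spurious "control reaches end of non-void function" warnings). A generic sketch of the effect, not the DuckDB class itself:

```cpp
#include <stdexcept>
#include <string>

// Always throws; [[noreturn]] documents and enforces that it never returns normally.
[[noreturn]] void ThrowError(const std::string &message) {
	throw std::runtime_error(message);
}

int ParsePositive(int value) {
	if (value > 0) {
		return value;
	}
	ThrowError("value must be positive");
	// No return statement needed here: the compiler knows the call above never returns.
}
```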
@@ -363,63 +363,64 @@ void CheckpointWriter::WriteIndex(IndexCatalogEntry &index_catalog, Serializer &

  void CheckpointReader::ReadIndex(ClientContext &context, Deserializer &deserializer) {

- // Deserialize the index metadata
- auto info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
- auto &index_info = info->Cast<CreateIndexInfo>();
-
- // Create the index in the catalog
- auto &schema_catalog = catalog.GetSchema(context, info->schema);
- auto &table_catalog =
- catalog.GetEntry(context, CatalogType::TABLE_ENTRY, info->schema, index_info.table).Cast<DuckTableEntry>();
- auto &index_catalog = schema_catalog.CreateIndex(context, index_info, table_catalog)->Cast<DuckIndexEntry>();
- index_catalog.info = table_catalog.GetStorage().info;
-
- // We deserialize the index lazily, i.e., we do not need to load any node information
+ // deserialize the index create info
+ auto create_info = deserializer.ReadProperty<unique_ptr<CreateInfo>>(100, "index");
+ auto &info = create_info->Cast<CreateIndexInfo>();
+
+ // create the index in the catalog
+ auto &schema = catalog.GetSchema(context, create_info->schema);
+ auto &table =
+ catalog.GetEntry(context, CatalogType::TABLE_ENTRY, create_info->schema, info.table).Cast<DuckTableEntry>();
+
+ auto &index = schema.CreateIndex(context, info, table)->Cast<DuckIndexEntry>();
+
+ index.info = table.GetStorage().info;
+ // insert the parsed expressions into the stored index so that we correctly (de)serialize it during consecutive
+ // checkpoints
+ for (auto &parsed_expr : info.parsed_expressions) {
+ index.parsed_expressions.push_back(parsed_expr->Copy());
+ }
+
+ // we deserialize the index lazily, i.e., we do not need to load any node information
  // except the root block pointer
- auto index_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");
+ auto root_block_pointer = deserializer.ReadProperty<BlockPointer>(101, "root_block_pointer");

- // obtain the expressions of the ART from the index metadata
- vector<unique_ptr<Expression>> unbound_expressions;
+ // obtain the parsed expressions of the ART from the index metadata
  vector<unique_ptr<ParsedExpression>> parsed_expressions;
- for (auto &p_exp : index_info.parsed_expressions) {
- parsed_expressions.push_back(p_exp->Copy());
+ for (auto &parsed_expr : info.parsed_expressions) {
+ parsed_expressions.push_back(parsed_expr->Copy());
  }
+ D_ASSERT(!parsed_expressions.empty());

- // bind the parsed expressions
- // add the table to the bind context
+ // add the table to the bind context to bind the parsed expressions
  auto binder = Binder::CreateBinder(context);
  vector<LogicalType> column_types;
  vector<string> column_names;
- for (auto &col : table_catalog.GetColumns().Logical()) {
+ for (auto &col : table.GetColumns().Logical()) {
  column_types.push_back(col.Type());
  column_names.push_back(col.Name());
  }
+
+ // create a binder to bind the parsed expressions
  vector<column_t> column_ids;
- binder->bind_context.AddBaseTable(0, index_info.table, column_names, column_types, column_ids, &table_catalog);
+ binder->bind_context.AddBaseTable(0, info.table, column_names, column_types, column_ids, &table);
  IndexBinder idx_binder(*binder, context);
+
+ // bind the parsed expressions to create unbound expressions
+ vector<unique_ptr<Expression>> unbound_expressions;
  unbound_expressions.reserve(parsed_expressions.size());
  for (auto &expr : parsed_expressions) {
  unbound_expressions.push_back(idx_binder.Bind(expr));
  }

- if (parsed_expressions.empty()) {
- // this is a PK/FK index: we create the necessary bound column ref expressions
- unbound_expressions.reserve(index_info.column_ids.size());
- for (idx_t key_nr = 0; key_nr < index_info.column_ids.size(); key_nr++) {
- auto &col = table_catalog.GetColumn(LogicalIndex(index_info.column_ids[key_nr]));
- unbound_expressions.push_back(
- make_uniq<BoundColumnRefExpression>(col.GetName(), col.GetType(), ColumnBinding(0, key_nr)));
- }
- }
-
  // create the index and add it to the storage
- switch (index_info.index_type) {
+ switch (info.index_type) {
  case IndexType::ART: {
- auto &storage = table_catalog.GetStorage();
- auto art = make_uniq<ART>(index_info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
- index_info.constraint_type, storage.db, nullptr, index_block_pointer);
+ auto &storage = table.GetStorage();
+ auto art = make_uniq<ART>(info.column_ids, TableIOManager::Get(storage), std::move(unbound_expressions),
+ info.constraint_type, storage.db, nullptr, root_block_pointer);

- index_catalog.index = art.get();
+ index.index = art.get();
  storage.info->indexes.AddIndex(std::move(art));
  } break;
  default:
@@ -159,7 +159,7 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
  AppendToIndexes(transaction, *row_groups, table.info->indexes, table.GetTypes(), append_state.current_row);
  }
  if (error) {
- // need to revert the append
+ // need to revert all appended row ids
  row_t current_row = append_state.row_start;
  // remove the data from the indexes, if there are any indexes
  row_groups->Scan(transaction, [&](DataChunk &chunk) -> bool {
@@ -184,6 +184,13 @@ void LocalTableStorage::AppendToIndexes(DuckTransaction &transaction, TableAppen
  if (append_to_table) {
  table.RevertAppendInternal(append_state.row_start, append_count);
  }
+
+ // we need to vacuum the indexes to remove any buffers that are now empty
+ // due to reverting the appends
+ table.info->indexes.Scan([&](Index &index) {
+ index.Vacuum();
+ return false;
+ });
  error.Throw();
  }
  }
@@ -652,7 +652,16 @@ describe('prepare', function() {
  });
  it("should aggregate kurtosis(num)", function (done) {
  db.all("SELECT kurtosis(num) as kurtosis FROM foo", function (err: null | Error, res: TableData) {
- assert.equal(res[0].kurtosis, -1.1999999999999997);
+ // The `num` column of table `foo` contains each integer from 0 to 999,999 exactly once.
+ // This is a uniform distribution. The excess kurtosis for a uniform distribution is exactly -1.2.
+ // See https://en.wikipedia.org/wiki/Kurtosis#Other_well-known_distributions
+ const expected = -1.2;
+
+ // The calculated value can differ from the exact answer by small amounts on different platforms due
+ // to floating-point errors. This tolerance was determined experimentally.
+ const tolerance = Number.EPSILON * 10;
+
+ assert.ok(Math.abs(res[0].kurtosis - expected) < tolerance);
  done(err);
  });
  });
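For reference, the -1.2 expected value cited in the new test comment follows from the standard moments of a uniform distribution (the test column holds a large evenly spaced range, which the continuous uniform closely approximates). A quick derivation for the continuous uniform on [a, b]:

```latex
\mu_4 = \frac{(b-a)^4}{80}, \qquad
\sigma^2 = \frac{(b-a)^2}{12} \;\Rightarrow\; \sigma^4 = \frac{(b-a)^4}{144},
\qquad
\gamma_2 = \frac{\mu_4}{\sigma^4} - 3 = \frac{144}{80} - 3 = -\frac{6}{5} = -1.2
```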
@@ -90,7 +90,7 @@ const correct_answer_map: Record<string, any[]> = {
  date_array: [
  [],
  [
- new Date(1970, 0, 1),
+ new Date(Date.UTC(1970, 0, 1)),
  null,
  new Date("0001-01-01T00:00:00.000Z"),
  new Date("9999-12-31T00:00:00.000Z"),
@@ -100,7 +100,7 @@ const correct_answer_map: Record<string, any[]> = {
  timestamp_array: [
  [],
  [
- new Date(1970, 0, 1),
+ new Date(Date.UTC(1970, 0, 1)),
  null,
  new Date("0001-01-01T00:00:00.000Z"),
  new Date("9999-12-31T23:59:59.999Z"),
@@ -111,7 +111,7 @@ const correct_answer_map: Record<string, any[]> = {
  timestamptz_array: [
  [],
  [
- new Date(1970, 0, 1),
+ new Date(Date.UTC(1970, 0, 1)),
  null,
  new Date("0001-01-01T00:00:00.000Z"),
  new Date("9999-12-31T23:59:59.999Z"),
@@ -171,7 +171,7 @@ const correct_answer_map: Record<string, any[]> = {
  ],

  timestamp: [
- new Date("1990-01-01T00:00"),
+ new Date(Date.UTC(1990, 0, 1)),
  new Date("9999-12-31T23:59:59.000Z"),
  null,
  ],
  ],