duckdb 0.4.1-dev112.0 → 0.4.1-dev129.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
- "version": "0.4.1-dev112.0",
4
+ "version": "0.4.1-dev129.0",
5
5
  "description": "DuckDB node.js API",
6
6
  "gypfile": true,
7
7
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -176123,6 +176123,7 @@ private:
176123
176123
 
176124
176124
 
176125
176125
 
176126
+
176126
176127
  namespace duckdb {
176127
176128
 
176128
176129
  WriteOverflowStringsToDisk::WriteOverflowStringsToDisk(DatabaseInstance &db)
@@ -176143,19 +176144,29 @@ void WriteOverflowStringsToDisk::WriteString(string_t string, block_id_t &result
176143
176144
  handle = buffer_manager.Allocate(Storage::BLOCK_SIZE);
176144
176145
  }
176145
176146
  // first write the length of the string
176146
- if (block_id == INVALID_BLOCK || offset + sizeof(uint32_t) >= STRING_SPACE) {
176147
+ if (block_id == INVALID_BLOCK || offset + 2 * sizeof(uint32_t) >= STRING_SPACE) {
176147
176148
  AllocateNewBlock(block_manager.GetFreeBlockId());
176148
176149
  }
176149
176150
  result_block = block_id;
176150
176151
  result_offset = offset;
176151
176152
 
176152
- // write the length field
176153
- auto string_length = string.GetSize();
176154
- Store<uint32_t>(string_length, handle->node->buffer + offset);
176155
- offset += sizeof(uint32_t);
176153
+ // GZIP the string
176154
+ auto uncompressed_size = string.GetSize();
176155
+ MiniZStream s;
176156
+ size_t compressed_size = 0;
176157
+ compressed_size = s.MaxCompressedLength(uncompressed_size);
176158
+ auto compressed_buf = unique_ptr<data_t[]>(new data_t[compressed_size]);
176159
+ s.Compress((const char *)string.GetDataUnsafe(), uncompressed_size, (char *)compressed_buf.get(), &compressed_size);
176160
+ string_t compressed_string((const char *)compressed_buf.get(), compressed_size);
176161
+
176162
+ // store sizes
176163
+ Store<uint32_t>(compressed_size, handle->node->buffer + offset);
176164
+ Store<uint32_t>(uncompressed_size, handle->node->buffer + offset + sizeof(uint32_t));
176165
+
176156
176166
  // now write the remainder of the string
176157
- auto strptr = string.GetDataUnsafe();
176158
- uint32_t remaining = string_length;
176167
+ offset += 2 * sizeof(uint32_t);
176168
+ auto strptr = compressed_string.GetDataUnsafe();
176169
+ uint32_t remaining = compressed_size;
176159
176170
  while (remaining > 0) {
176160
176171
  uint32_t to_write = MinValue<uint32_t>(remaining, STRING_SPACE - offset);
176161
176172
  if (to_write > 0) {
@@ -178838,7 +178849,7 @@ public:
178838
178849
  //! Base size of big string marker (block id + offset)
178839
178850
  static constexpr idx_t BIG_STRING_MARKER_BASE_SIZE = sizeof(block_id_t) + sizeof(int32_t);
178840
178851
  //! The marker size of the big string
178841
- static constexpr idx_t BIG_STRING_MARKER_SIZE = BIG_STRING_MARKER_BASE_SIZE + sizeof(uint16_t);
178852
+ static constexpr idx_t BIG_STRING_MARKER_SIZE = BIG_STRING_MARKER_BASE_SIZE;
178842
178853
  //! The size below which the segment is compacted on flushing
178843
178854
  static constexpr size_t COMPACTION_FLUSH_LIMIT = (size_t)Storage::BLOCK_SIZE / 5 * 4;
178844
178855
 
@@ -178877,11 +178888,16 @@ public:
178877
178888
  return i;
178878
178889
  }
178879
178890
  remaining_space -= sizeof(int32_t);
178891
+ auto dictionary = GetDictionary(segment, *handle);
178880
178892
  if (!data.validity.RowIsValid(source_idx)) {
178881
- // null value is stored as -1
178882
- result_data[target_idx] = 0;
178893
+ // null value is stored as a copy of the last value, this is done to be able to efficiently do the
178894
+ // string_length calculation
178895
+ if (target_idx > 0) {
178896
+ result_data[target_idx] = result_data[target_idx - 1];
178897
+ } else {
178898
+ result_data[target_idx] = 0;
178899
+ }
178883
178900
  } else {
178884
- auto dictionary = GetDictionary(segment, *handle);
178885
178901
  auto end = handle->node->buffer + dictionary.end;
178886
178902
 
178887
178903
  dictionary.Verify();
@@ -178905,7 +178921,7 @@ public:
178905
178921
  // Unknown string, continue
178906
178922
  // non-null value, check if we can fit it within the block
178907
178923
  idx_t string_length = source_data[source_idx].GetSize();
178908
- idx_t dictionary_length = string_length + sizeof(uint16_t);
178924
+ idx_t dictionary_length = string_length;
178909
178925
 
178910
178926
  // determine whether or not we have space in the block for this string
178911
178927
  bool use_overflow_block = false;
@@ -178938,16 +178954,17 @@ public:
178938
178954
  // string fits in block, append to dictionary and increment dictionary position
178939
178955
  D_ASSERT(string_length < NumericLimits<uint16_t>::Maximum());
178940
178956
  dictionary.size += required_space;
178941
- auto dict_pos = end - dictionary.size; // first write the length as u16
178942
- Store<uint16_t>(string_length, dict_pos);
178957
+ auto dict_pos = end - dictionary.size;
178943
178958
  // now write the actual string data into the dictionary
178944
- memcpy(dict_pos + sizeof(uint16_t), source_data[source_idx].GetDataUnsafe(), string_length);
178959
+ memcpy(dict_pos, source_data[source_idx].GetDataUnsafe(), string_length);
178945
178960
  }
178946
178961
  D_ASSERT(RemainingSpace(segment, *handle) <= Storage::BLOCK_SIZE);
178947
178962
  // place the dictionary offset into the set of vectors
178948
178963
  dictionary.Verify();
178949
178964
 
178950
- result_data[target_idx] = dictionary.size;
178965
+ // note: for overflow strings we write negative value
178966
+ result_data[target_idx] = use_overflow_block ? -1 * dictionary.size : dictionary.size;
178967
+
178951
178968
  if (DUPLICATE_ELIMINATE) {
178952
178969
  seen_strings->insert({source_data[source_idx].GetString(), dictionary.size});
178953
178970
  }
@@ -178973,17 +178990,18 @@ public:
178973
178990
  static void WriteString(ColumnSegment &segment, string_t string, block_id_t &result_block, int32_t &result_offset);
178974
178991
  static void WriteStringMemory(ColumnSegment &segment, string_t string, block_id_t &result_block,
178975
178992
  int32_t &result_offset);
178976
- static string_t ReadString(ColumnSegment &segment, Vector &result, block_id_t block, int32_t offset);
178977
- static string_t ReadString(data_ptr_t target, int32_t offset);
178993
+ static string_t ReadOverflowString(ColumnSegment &segment, Vector &result, block_id_t block, int32_t offset);
178994
+ static string_t ReadString(data_ptr_t target, int32_t offset, uint32_t string_length);
178995
+ static string_t ReadStringWithLength(data_ptr_t target, int32_t offset);
178978
178996
  static void WriteStringMarker(data_ptr_t target, block_id_t block_id, int32_t offset);
178979
178997
  static void ReadStringMarker(data_ptr_t target, block_id_t &block_id, int32_t &offset);
178980
178998
 
178981
178999
  static string_location_t FetchStringLocation(StringDictionaryContainer dict, data_ptr_t baseptr,
178982
179000
  int32_t dict_offset);
178983
179001
  static string_t FetchStringFromDict(ColumnSegment &segment, StringDictionaryContainer dict, Vector &result,
178984
- data_ptr_t baseptr, int32_t dict_offset);
179002
+ data_ptr_t baseptr, int32_t dict_offset, uint32_t string_length);
178985
179003
  static string_t FetchString(ColumnSegment &segment, StringDictionaryContainer dict, Vector &result,
178986
- data_ptr_t baseptr, string_location_t location);
179004
+ data_ptr_t baseptr, string_location_t location, uint32_t string_length);
178987
179005
  };
178988
179006
  } // namespace duckdb
178989
179007
 
@@ -180464,6 +180482,8 @@ bool RLEFun::TypeIsSupported(PhysicalType type) {
180464
180482
  } // namespace duckdb
180465
180483
 
180466
180484
 
180485
+
180486
+
180467
180487
  namespace duckdb {
180468
180488
 
180469
180489
  //===--------------------------------------------------------------------===//
@@ -180541,8 +180561,14 @@ void UncompressedStringStorage::StringScanPartial(ColumnSegment &segment, Column
180541
180561
  auto base_data = (int32_t *)(baseptr + DICTIONARY_HEADER_SIZE);
180542
180562
  auto result_data = FlatVector::GetData<string_t>(result);
180543
180563
 
180564
+ int32_t previous_offset = start > 0 ? base_data[start - 1] : 0;
180565
+
180544
180566
  for (idx_t i = 0; i < scan_count; i++) {
180545
- result_data[result_offset + i] = FetchStringFromDict(segment, dict, result, baseptr, base_data[start + i]);
180567
+ // std::abs used since offsets can be negative to indicate big strings
180568
+ uint32_t string_length = std::abs(base_data[start + i]) - std::abs(previous_offset);
180569
+ result_data[result_offset + i] =
180570
+ FetchStringFromDict(segment, dict, result, baseptr, base_data[start + i], string_length);
180571
+ previous_offset = base_data[start + i];
180546
180572
  }
180547
180573
  }
180548
180574
 
@@ -180577,7 +180603,15 @@ void UncompressedStringStorage::StringFetchRow(ColumnSegment &segment, ColumnFet
180577
180603
  auto base_data = (int32_t *)(baseptr + DICTIONARY_HEADER_SIZE);
180578
180604
  auto result_data = FlatVector::GetData<string_t>(result);
180579
180605
 
180580
- result_data[result_idx] = FetchStringFromDict(segment, dict, result, baseptr, base_data[row_id]);
180606
+ auto dict_offset = base_data[row_id];
180607
+ uint32_t string_length;
180608
+ if ((idx_t)row_id == 0) {
180609
+ // edge case where this is the first string in the dict
180610
+ string_length = std::abs(dict_offset);
180611
+ } else {
180612
+ string_length = std::abs(dict_offset) - std::abs(base_data[row_id - 1]);
180613
+ }
180614
+ result_data[result_idx] = FetchStringFromDict(segment, dict, result, baseptr, dict_offset, string_length);
180581
180615
  }
180582
180616
 
180583
180617
  //===--------------------------------------------------------------------===//
@@ -180711,8 +180745,8 @@ void UncompressedStringStorage::WriteStringMemory(ColumnSegment &segment, string
180711
180745
  state.head->offset += total_length;
180712
180746
  }
180713
180747
 
180714
- string_t UncompressedStringStorage::ReadString(ColumnSegment &segment, Vector &result, block_id_t block,
180715
- int32_t offset) {
180748
+ string_t UncompressedStringStorage::ReadOverflowString(ColumnSegment &segment, Vector &result, block_id_t block,
180749
+ int32_t offset) {
180716
180750
  D_ASSERT(block != INVALID_BLOCK);
180717
180751
  D_ASSERT(offset < Storage::BLOCK_SIZE);
180718
180752
 
@@ -180724,37 +180758,52 @@ string_t UncompressedStringStorage::ReadString(ColumnSegment &segment, Vector &r
180724
180758
  auto block_handle = buffer_manager.RegisterBlock(block);
180725
180759
  auto handle = buffer_manager.Pin(block_handle);
180726
180760
 
180727
- uint32_t length = Load<uint32_t>(handle->node->buffer + offset);
180728
- uint32_t remaining = length;
180729
- offset += sizeof(uint32_t);
180730
-
180731
- // allocate a buffer to store the string
180732
- auto alloc_size = MaxValue<idx_t>(Storage::BLOCK_SIZE, length + sizeof(uint32_t));
180733
- auto target_handle = buffer_manager.Allocate(alloc_size);
180734
- auto target_ptr = target_handle->node->buffer;
180735
- // write the length in this block as well
180736
- Store<uint32_t>(length, target_ptr);
180737
- target_ptr += sizeof(uint32_t);
180738
- // now append the string to the single buffer
180739
- while (remaining > 0) {
180740
- idx_t to_write = MinValue<idx_t>(remaining, Storage::BLOCK_SIZE - sizeof(block_id_t) - offset);
180741
- memcpy(target_ptr, handle->node->buffer + offset, to_write);
180761
+ // read header
180762
+ uint32_t compressed_size = Load<uint32_t>(handle->node->buffer + offset);
180763
+ uint32_t uncompressed_size = Load<uint32_t>(handle->node->buffer + offset + sizeof(uint32_t));
180764
+ uint32_t remaining = compressed_size;
180765
+ offset += 2 * sizeof(uint32_t);
180742
180766
 
180743
- remaining -= to_write;
180744
- offset += to_write;
180745
- target_ptr += to_write;
180746
- if (remaining > 0) {
180747
- // read the next block
180748
- block_id_t next_block = Load<block_id_t>(handle->node->buffer + offset);
180749
- block_handle = buffer_manager.RegisterBlock(next_block);
180750
- handle = buffer_manager.Pin(block_handle);
180751
- offset = 0;
180767
+ data_ptr_t decompression_ptr;
180768
+ std::unique_ptr<data_t[]> decompression_buffer;
180769
+
180770
+ // If string is in single block we decompress straight from it, else we copy first
180771
+ if (remaining <= Storage::BLOCK_SIZE - sizeof(block_id_t) - offset) {
180772
+ decompression_ptr = handle->node->buffer + offset;
180773
+ } else {
180774
+ decompression_buffer = std::unique_ptr<data_t[]>(new data_t[compressed_size]);
180775
+ auto target_ptr = decompression_buffer.get();
180776
+
180777
+ // now append the string to the single buffer
180778
+ while (remaining > 0) {
180779
+ idx_t to_write = MinValue<idx_t>(remaining, Storage::BLOCK_SIZE - sizeof(block_id_t) - offset);
180780
+ memcpy(target_ptr, handle->node->buffer + offset, to_write);
180781
+
180782
+ remaining -= to_write;
180783
+ offset += to_write;
180784
+ target_ptr += to_write;
180785
+ if (remaining > 0) {
180786
+ // read the next block
180787
+ block_id_t next_block = Load<block_id_t>(handle->node->buffer + offset);
180788
+ block_handle = buffer_manager.RegisterBlock(next_block);
180789
+ handle = buffer_manager.Pin(block_handle);
180790
+ offset = 0;
180791
+ }
180752
180792
  }
180793
+ decompression_ptr = decompression_buffer.get();
180753
180794
  }
180754
180795
 
180755
- auto final_buffer = target_handle->node->buffer;
180756
- StringVector::AddHandle(result, move(target_handle));
180757
- return ReadString(final_buffer, 0);
180796
+ // overflow strings on disk are gzipped, decompress here
180797
+ auto decompressed_target_handle =
180798
+ buffer_manager.Allocate(MaxValue<idx_t>(Storage::BLOCK_SIZE, uncompressed_size));
180799
+ auto decompressed_target_ptr = decompressed_target_handle->node->buffer;
180800
+ MiniZStream s;
180801
+ s.Decompress((const char *)decompression_ptr, compressed_size, (char *)decompressed_target_ptr,
180802
+ uncompressed_size);
180803
+
180804
+ auto final_buffer = decompressed_target_handle->node->buffer;
180805
+ StringVector::AddHandle(result, move(decompressed_target_handle));
180806
+ return ReadString(final_buffer, 0, uncompressed_size);
180758
180807
  } else {
180759
180808
  // read the overflow string from memory
180760
180809
  // first pin the handle, if it is not pinned yet
@@ -180763,11 +180812,17 @@ string_t UncompressedStringStorage::ReadString(ColumnSegment &segment, Vector &r
180763
180812
  auto handle = buffer_manager.Pin(entry->second->block);
180764
180813
  auto final_buffer = handle->node->buffer;
180765
180814
  StringVector::AddHandle(result, move(handle));
180766
- return ReadString(final_buffer, offset);
180815
+ return ReadStringWithLength(final_buffer, offset);
180767
180816
  }
180768
180817
  }
180769
180818
 
180770
- string_t UncompressedStringStorage::ReadString(data_ptr_t target, int32_t offset) {
180819
+ string_t UncompressedStringStorage::ReadString(data_ptr_t target, int32_t offset, uint32_t string_length) {
180820
+ auto ptr = target + offset;
180821
+ auto str_ptr = (char *)(ptr);
180822
+ return string_t(str_ptr, string_length);
180823
+ }
180824
+
180825
+ string_t UncompressedStringStorage::ReadStringWithLength(data_ptr_t target, int32_t offset) {
180771
180826
  auto ptr = target + offset;
180772
180827
  auto str_length = Load<uint32_t>(ptr);
180773
180828
  auto str_ptr = (char *)(ptr + sizeof(uint32_t));
@@ -180775,16 +180830,12 @@ string_t UncompressedStringStorage::ReadString(data_ptr_t target, int32_t offset
180775
180830
  }
180776
180831
 
180777
180832
  void UncompressedStringStorage::WriteStringMarker(data_ptr_t target, block_id_t block_id, int32_t offset) {
180778
- uint16_t length = BIG_STRING_MARKER;
180779
- memcpy(target, &length, sizeof(uint16_t));
180780
- target += sizeof(uint16_t);
180781
180833
  memcpy(target, &block_id, sizeof(block_id_t));
180782
180834
  target += sizeof(block_id_t);
180783
180835
  memcpy(target, &offset, sizeof(int32_t));
180784
180836
  }
180785
180837
 
180786
180838
  void UncompressedStringStorage::ReadStringMarker(data_ptr_t target, block_id_t &block_id, int32_t &offset) {
180787
- target += sizeof(uint16_t);
180788
180839
  memcpy(&block_id, target, sizeof(block_id_t));
180789
180840
  target += sizeof(block_id_t);
180790
180841
  memcpy(&offset, target, sizeof(int32_t));
@@ -180792,37 +180843,31 @@ void UncompressedStringStorage::ReadStringMarker(data_ptr_t target, block_id_t &
180792
180843
 
180793
180844
  string_location_t UncompressedStringStorage::FetchStringLocation(StringDictionaryContainer dict, data_ptr_t baseptr,
180794
180845
  int32_t dict_offset) {
180795
- D_ASSERT(dict_offset >= 0 && dict_offset <= Storage::BLOCK_SIZE);
180796
- if (dict_offset == 0) {
180797
- return string_location_t(INVALID_BLOCK, 0);
180798
- }
180799
- // look up result in dictionary
180800
- auto dict_end = baseptr + dict.end;
180801
- auto dict_pos = dict_end - dict_offset;
180802
- auto string_length = Load<uint16_t>(dict_pos);
180803
- string_location_t result;
180804
- if (string_length == BIG_STRING_MARKER) {
180805
- ReadStringMarker(dict_pos, result.block_id, result.offset);
180846
+ D_ASSERT(dict_offset >= -1 * Storage::BLOCK_SIZE && dict_offset <= Storage::BLOCK_SIZE);
180847
+ if (dict_offset < 0) {
180848
+ string_location_t result;
180849
+ ReadStringMarker(baseptr + dict.end - (-1 * dict_offset), result.block_id, result.offset);
180850
+ return result;
180806
180851
  } else {
180807
- result.block_id = INVALID_BLOCK;
180808
- result.offset = dict_offset;
180852
+ return string_location_t(INVALID_BLOCK, dict_offset);
180809
180853
  }
180810
- return result;
180811
180854
  }
180812
180855
 
180813
180856
  string_t UncompressedStringStorage::FetchStringFromDict(ColumnSegment &segment, StringDictionaryContainer dict,
180814
- Vector &result, data_ptr_t baseptr, int32_t dict_offset) {
180857
+ Vector &result, data_ptr_t baseptr, int32_t dict_offset,
180858
+ uint32_t string_length) {
180815
180859
  // fetch base data
180816
180860
  D_ASSERT(dict_offset <= Storage::BLOCK_SIZE);
180817
180861
  string_location_t location = FetchStringLocation(dict, baseptr, dict_offset);
180818
- return FetchString(segment, dict, result, baseptr, location);
180862
+ return FetchString(segment, dict, result, baseptr, location, string_length);
180819
180863
  }
180820
180864
 
180821
180865
  string_t UncompressedStringStorage::FetchString(ColumnSegment &segment, StringDictionaryContainer dict, Vector &result,
180822
- data_ptr_t baseptr, string_location_t location) {
180866
+ data_ptr_t baseptr, string_location_t location,
180867
+ uint32_t string_length) {
180823
180868
  if (location.block_id != INVALID_BLOCK) {
180824
180869
  // big string marker: read from separate block
180825
- return ReadString(segment, result, location.block_id, location.offset);
180870
+ return ReadOverflowString(segment, result, location.block_id, location.offset);
180826
180871
  } else {
180827
180872
  if (location.offset == 0) {
180828
180873
  return string_t(nullptr, 0);
@@ -180830,9 +180875,8 @@ string_t UncompressedStringStorage::FetchString(ColumnSegment &segment, StringDi
180830
180875
  // normal string: read string from this block
180831
180876
  auto dict_end = baseptr + dict.end;
180832
180877
  auto dict_pos = dict_end - location.offset;
180833
- auto string_length = Load<uint16_t>(dict_pos);
180834
180878
 
180835
- auto str_ptr = (char *)(dict_pos + sizeof(uint16_t));
180879
+ auto str_ptr = (char *)(dict_pos);
180836
180880
  return string_t(str_ptr, string_length);
180837
180881
  }
180838
180882
  }
@@ -185085,7 +185129,7 @@ string ValidityStatistics::ToString() const {
185085
185129
 
185086
185130
  namespace duckdb {
185087
185131
 
185088
- const uint64_t VERSION_NUMBER = 33;
185132
+ const uint64_t VERSION_NUMBER = 35;
185089
185133
 
185090
185134
  } // namespace duckdb
185091
185135
 
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
11
11
  #pragma once
12
12
  #define DUCKDB_AMALGAMATION 1
13
13
  #define DUCKDB_AMALGAMATION_EXTENDED 1
14
- #define DUCKDB_SOURCE_ID "fa0442d60"
15
- #define DUCKDB_VERSION "v0.4.1-dev112"
14
+ #define DUCKDB_SOURCE_ID "9789bcbb0"
15
+ #define DUCKDB_VERSION "v0.4.1-dev129"
16
16
  //===----------------------------------------------------------------------===//
17
17
  // DuckDB
18
18
  //