lbug 0.12.3-dev.15 → 0.12.3-dev.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/lbug-source/CMakeLists.txt +1 -1
  2. package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
  3. package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
  4. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
  5. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
  6. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
  7. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
  8. package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
  9. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
  10. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
  11. package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
  12. package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
  13. package/lbug-source/scripts/antlr4/hash.md5 +1 -1
  14. package/lbug-source/src/antlr4/Cypher.g4 +1 -1
  15. package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
  16. package/lbug-source/src/catalog/catalog.cpp +5 -4
  17. package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
  18. package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
  19. package/lbug-source/src/function/function_collection.cpp +2 -1
  20. package/lbug-source/src/function/table/CMakeLists.txt +1 -0
  21. package/lbug-source/src/function/table/disk_size_info.cpp +322 -0
  22. package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
  23. package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
  24. package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
  25. package/lbug-source/src/include/common/constants.h +1 -0
  26. package/lbug-source/src/include/function/table/simple_table_function.h +6 -0
  27. package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
  28. package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
  29. package/lbug-source/src/include/storage/storage_manager.h +1 -0
  30. package/lbug-source/src/include/storage/table/node_table.h +6 -1
  31. package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
  32. package/lbug-source/src/include/storage/table/parquet_rel_table.h +99 -0
  33. package/lbug-source/src/include/storage/table/rel_table.h +2 -2
  34. package/lbug-source/src/include/transaction/transaction.h +2 -0
  35. package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
  36. package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
  37. package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
  38. package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
  39. package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
  40. package/lbug-source/src/storage/storage_manager.cpp +40 -6
  41. package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
  42. package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
  43. package/lbug-source/src/storage/table/parquet_rel_table.cpp +470 -0
  44. package/lbug-source/test/include/test_runner/test_group.h +11 -1
  45. package/lbug-source/test/runner/e2e_test.cpp +7 -1
  46. package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +43 -0
  47. package/lbug-source/test/test_helper/test_helper.cpp +24 -0
  48. package/lbug-source/test/test_runner/test_parser.cpp +3 -0
  49. package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
  50. package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
  51. package/package.json +1 -1
  52. package/prebuilt/lbugjs-darwin-arm64.node +0 -0
  53. package/prebuilt/lbugjs-linux-arm64.node +0 -0
  54. package/prebuilt/lbugjs-linux-x64.node +0 -0
  55. package/prebuilt/lbugjs-win32-x64.node +0 -0
@@ -34,10 +34,10 @@ public:
34
34
  RelGroupCatalogEntry() = default;
35
35
  RelGroupCatalogEntry(std::string tableName, common::RelMultiplicity srcMultiplicity,
36
36
  common::RelMultiplicity dstMultiplicity, common::ExtendDirection storageDirection,
37
- std::vector<RelTableCatalogInfo> relTableInfos)
37
+ std::vector<RelTableCatalogInfo> relTableInfos, std::string storage = "")
38
38
  : TableCatalogEntry{type_, std::move(tableName)}, srcMultiplicity{srcMultiplicity},
39
39
  dstMultiplicity{dstMultiplicity}, storageDirection{storageDirection},
40
- relTableInfos{std::move(relTableInfos)} {
40
+ relTableInfos{std::move(relTableInfos)}, storage{std::move(storage)} {
41
41
  propertyCollection =
42
42
  PropertyDefinitionCollection{1}; // Skip NBR_NODE_ID column as the first one.
43
43
  }
@@ -53,6 +53,7 @@ public:
53
53
  }
54
54
 
55
55
  common::ExtendDirection getStorageDirection() const { return storageDirection; }
56
+ const std::string& getStorage() const { return storage; }
56
57
 
57
58
  common::idx_t getNumRelTables() const { return relTableInfos.size(); }
58
59
  const std::vector<RelTableCatalogInfo>& getRelEntryInfos() const { return relTableInfos; }
@@ -97,6 +98,7 @@ private:
97
98
  // TODO(Guodong): Avoid using extend direction for storage direction
98
99
  common::ExtendDirection storageDirection = common::ExtendDirection::BOTH;
99
100
  std::vector<RelTableCatalogInfo> relTableInfos;
101
+ std::string storage;
100
102
  };
101
103
 
102
104
  } // namespace catalog
@@ -83,6 +83,7 @@ struct StorageConstants {
83
83
 
84
84
  struct TableOptionConstants {
85
85
  static constexpr char REL_STORAGE_DIRECTION_OPTION[] = "STORAGE_DIRECTION";
86
+ static constexpr char REL_STORAGE_OPTION[] = "STORAGE";
86
87
  };
87
88
 
88
89
  // Hash Index Configurations
@@ -134,6 +134,12 @@ struct FileInfoFunction final {
134
134
  static function_set getFunctionSet();
135
135
  };
136
136
 
137
+ struct DiskSizeInfoFunction final {
138
+ static constexpr const char* name = "DISK_SIZE_INFO";
139
+
140
+ static function_set getFunctionSet();
141
+ };
142
+
137
143
  struct ShowAttachedDatabasesFunction final {
138
144
  static constexpr const char* name = "SHOW_ATTACHED_DATABASES";
139
145
 
@@ -36,8 +36,10 @@ struct CreateTableInfo {
36
36
 
37
37
  struct ExtraCreateNodeTableInfo final : ExtraCreateTableInfo {
38
38
  std::string pKName;
39
+ options_t options;
39
40
 
40
- explicit ExtraCreateNodeTableInfo(std::string pKName) : pKName{std::move(pKName)} {}
41
+ explicit ExtraCreateNodeTableInfo(std::string pKName, options_t options = {})
42
+ : pKName{std::move(pKName)}, options{std::move(options)} {}
41
43
  };
42
44
 
43
45
  struct ExtraCreateRelTableGroupInfo final : ExtraCreateTableInfo {
@@ -24,7 +24,7 @@ public:
24
24
  void initialize(const transaction::Transaction* transaction, storage::NodeTable* table,
25
25
  ScanNodeTableProgressSharedState& progressSharedState);
26
26
 
27
- void nextMorsel(storage::NodeTableScanState& scanState,
27
+ void nextMorsel(storage::TableScanState& scanState,
28
28
  ScanNodeTableProgressSharedState& progressSharedState);
29
29
 
30
30
  common::SemiMask* getSemiMask() const { return semiMask.get(); }
@@ -116,7 +116,7 @@ private:
116
116
 
117
117
  private:
118
118
  common::idx_t currentTableIdx;
119
- std::unique_ptr<storage::NodeTableScanState> scanState;
119
+ std::unique_ptr<storage::TableScanState> scanState;
120
120
  std::vector<ScanNodeTableInfo> tableInfos;
121
121
  std::vector<std::shared_ptr<ScanNodeTableSharedState>> sharedStates;
122
122
  std::shared_ptr<ScanNodeTableProgressSharedState> progressSharedState;
@@ -97,6 +97,7 @@ private:
97
97
  bool enableCompression;
98
98
  bool inMemory;
99
99
  std::vector<IndexType> registeredIndexTypes;
100
+ std::unordered_map<common::table_id_t, std::string> tableNameCache;
100
101
  };
101
102
 
102
103
  } // namespace storage
@@ -107,7 +107,7 @@ private:
107
107
 
108
108
  class StorageManager;
109
109
 
110
- class LBUG_API NodeTable final : public Table {
110
+ class LBUG_API NodeTable : public Table {
111
111
  public:
112
112
  NodeTable(const StorageManager* storageManager,
113
113
  const catalog::NodeTableCatalogEntry* nodeTableEntry, MemoryManager* mm);
@@ -119,6 +119,11 @@ public:
119
119
  void initScanState(transaction::Transaction* transaction, TableScanState& scanState,
120
120
  common::table_id_t tableID, common::offset_t startOffset) const;
121
121
 
122
+ // Virtual method for operator-level scan coordination initialization
123
+ // Called once per scan operation (not per scan state)
124
+ virtual void initializeScanCoordination(
125
+ [[maybe_unused]] const transaction::Transaction* transaction) {}
126
+
122
127
  bool scanInternal(transaction::Transaction* transaction, TableScanState& scanState) override;
123
128
  template<bool lock = true>
124
129
  bool lookup(const transaction::Transaction* transaction, const TableScanState& scanState) const;
@@ -0,0 +1,103 @@
1
+ #pragma once
2
+
3
+ #include <mutex>
4
+ #include <vector>
5
+
6
+ #include "catalog/catalog_entry/node_table_catalog_entry.h"
7
+ #include "common/exception/runtime.h"
8
+ #include "common/types/internal_id_util.h"
9
+ #include "common/types/value/value.h"
10
+ #include "processor/operator/persistent/reader/parquet/parquet_reader.h"
11
+ #include "storage/table/node_table.h"
12
+
13
+ namespace lbug {
14
+ namespace storage {
15
+
16
+ struct ParquetNodeTableScanState final : NodeTableScanState {
17
+ std::unique_ptr<processor::ParquetReader> parquetReader;
18
+ std::unique_ptr<processor::ParquetReaderScanState> parquetScanState;
19
+ bool initialized = false;
20
+ bool scanCompleted = false; // Track if this scan state has finished reading
21
+ bool dataRead = false;
22
+ std::vector<std::vector<std::unique_ptr<common::Value>>> allData;
23
+ size_t totalRows = 0;
24
+ size_t nextRowToDistribute = 0;
25
+ uint64_t lastQueryId = 0; // Track the last query ID to detect new queries
26
+
27
+ ParquetNodeTableScanState([[maybe_unused]] MemoryManager& mm, common::ValueVector* nodeIDVector,
28
+ std::vector<common::ValueVector*> outputVectors,
29
+ std::shared_ptr<common::DataChunkState> outChunkState)
30
+ : NodeTableScanState{nodeIDVector, std::move(outputVectors), std::move(outChunkState)} {
31
+ parquetScanState = std::make_unique<processor::ParquetReaderScanState>();
32
+ }
33
+ };
34
+
35
+ // Shared state to coordinate row group assignment across parallel scan states
36
+ struct ParquetNodeTableSharedState {
37
+ std::mutex mtx;
38
+ common::node_group_idx_t currentRowGroupIdx = 0;
39
+ common::node_group_idx_t numRowGroups = 0;
40
+
41
+ void reset(common::node_group_idx_t totalRowGroups) {
42
+ std::lock_guard<std::mutex> lock(mtx);
43
+ currentRowGroupIdx = 0;
44
+ numRowGroups = totalRowGroups;
45
+ }
46
+
47
+ bool getNextRowGroup(common::node_group_idx_t& assignedRowGroupIdx) {
48
+ std::lock_guard<std::mutex> lock(mtx);
49
+ if (currentRowGroupIdx < numRowGroups) {
50
+ assignedRowGroupIdx = currentRowGroupIdx++;
51
+ return true;
52
+ }
53
+ return false;
54
+ }
55
+ };
56
+
57
+ class ParquetNodeTable final : public NodeTable {
58
+ public:
59
+ ParquetNodeTable(const StorageManager* storageManager,
60
+ const catalog::NodeTableCatalogEntry* nodeTableEntry, MemoryManager* memoryManager);
61
+
62
+ void initScanState(transaction::Transaction* transaction, TableScanState& scanState,
63
+ bool resetCachedBoundNodeSelVec = true) const override;
64
+
65
+ // Override to reset shared state for row group coordination at the start of each scan operation
66
+ void initializeScanCoordination(const transaction::Transaction* transaction) override;
67
+
68
+ bool scanInternal(transaction::Transaction* transaction, TableScanState& scanState) override;
69
+
70
+ // For parquet-backed tables, we don't support modifications
71
+ void insert([[maybe_unused]] transaction::Transaction* transaction,
72
+ [[maybe_unused]] TableInsertState& insertState) override {
73
+ throw common::RuntimeException("Cannot insert into parquet-backed node table");
74
+ }
75
+ void update([[maybe_unused]] transaction::Transaction* transaction,
76
+ [[maybe_unused]] TableUpdateState& updateState) override {
77
+ throw common::RuntimeException("Cannot update parquet-backed node table");
78
+ }
79
+ bool delete_([[maybe_unused]] transaction::Transaction* transaction,
80
+ [[maybe_unused]] TableDeleteState& deleteState) override {
81
+ throw common::RuntimeException("Cannot delete from parquet-backed node table");
82
+ return false;
83
+ }
84
+
85
+ common::row_idx_t getNumTotalRows(const transaction::Transaction* transaction) override;
86
+
87
+ const std::string& getParquetFilePath() const { return parquetFilePath; }
88
+
89
+ // Note: Cannot override getNumCommittedNodeGroups since it's not virtual in base class
90
+ // Will need a different approach
91
+
92
+ private:
93
+ std::string parquetFilePath;
94
+ const catalog::NodeTableCatalogEntry* nodeTableCatalogEntry;
95
+ mutable std::unique_ptr<ParquetNodeTableSharedState> sharedState;
96
+
97
+ void initializeParquetReader(transaction::Transaction* transaction) const;
98
+ void initParquetScanForRowGroup(transaction::Transaction* transaction,
99
+ ParquetNodeTableScanState& scanState) const;
100
+ };
101
+
102
+ } // namespace storage
103
+ } // namespace lbug
@@ -0,0 +1,99 @@
1
+ #pragma once
2
+
3
+ #include "catalog/catalog_entry/rel_group_catalog_entry.h"
4
+ #include "common/exception/runtime.h"
5
+ #include "common/types/internal_id_util.h"
6
+ #include "processor/operator/persistent/reader/parquet/parquet_reader.h"
7
+ #include "storage/table/rel_table.h"
8
+ #include "transaction/transaction.h"
9
+
10
+ namespace lbug {
11
+ namespace storage {
12
+
13
+ struct ParquetRelTableScanState final : RelTableScanState {
14
+ std::unique_ptr<processor::ParquetReaderScanState> parquetScanState;
15
+ // For CSR format: store matching rows for current bound node
16
+ size_t nextRowToProcess = 0;
17
+
18
+ // Row group range for morsel-driven parallelism
19
+ uint64_t startRowGroup = 0;
20
+ uint64_t endRowGroup = 0;
21
+ uint64_t currentRowGroup = 0;
22
+
23
+ // Per-scan-state readers for thread safety
24
+ std::unique_ptr<processor::ParquetReader> nodeMappingReader;
25
+ std::unique_ptr<processor::ParquetReader> indicesReader;
26
+ std::unique_ptr<processor::ParquetReader> indptrReader;
27
+
28
+ ParquetRelTableScanState(MemoryManager& mm, common::ValueVector* nodeIDVector,
29
+ std::vector<common::ValueVector*> outputVectors,
30
+ std::shared_ptr<common::DataChunkState> outChunkState)
31
+ : RelTableScanState{mm, nodeIDVector, std::move(outputVectors), std::move(outChunkState)} {
32
+ parquetScanState = std::make_unique<processor::ParquetReaderScanState>();
33
+ }
34
+
35
+ void setToTable(const transaction::Transaction* transaction, Table* table_,
36
+ std::vector<common::column_id_t> columnIDs_,
37
+ std::vector<ColumnPredicateSet> columnPredicateSets_,
38
+ common::RelDataDirection direction_) override;
39
+ };
40
+
41
+ class ParquetRelTable final : public RelTable {
42
+ public:
43
+ ParquetRelTable(catalog::RelGroupCatalogEntry* relGroupEntry, common::table_id_t fromTableID,
44
+ common::table_id_t toTableID, const StorageManager* storageManager,
45
+ MemoryManager* memoryManager, std::string fromNodeTableName);
46
+
47
+ void initScanState(transaction::Transaction* transaction, TableScanState& scanState,
48
+ bool resetCachedBoundNodeSelVec = true) const override;
49
+
50
+ bool scanInternal(transaction::Transaction* transaction, TableScanState& scanState) override;
51
+
52
+ // For parquet-backed tables, we don't support modifications
53
+ void insert([[maybe_unused]] transaction::Transaction* transaction,
54
+ [[maybe_unused]] TableInsertState& insertState) override {
55
+ throw common::RuntimeException("Cannot insert into parquet-backed rel table");
56
+ }
57
+ void update([[maybe_unused]] transaction::Transaction* transaction,
58
+ [[maybe_unused]] TableUpdateState& updateState) override {
59
+ throw common::RuntimeException("Cannot update parquet-backed rel table");
60
+ }
61
+ bool delete_([[maybe_unused]] transaction::Transaction* transaction,
62
+ [[maybe_unused]] TableDeleteState& deleteState) override {
63
+ throw common::RuntimeException("Cannot delete from parquet-backed rel table");
64
+ return false;
65
+ }
66
+
67
+ common::row_idx_t getNumTotalRows(const transaction::Transaction* transaction) override;
68
+
69
+ private:
70
+ catalog::RelGroupCatalogEntry* relGroupEntry; // Store reference to table schema
71
+ std::string nodeMappingFilePath;
72
+ std::string indicesFilePath;
73
+ std::string indptrFilePath;
74
+ mutable std::unique_ptr<processor::ParquetReader> nodeMappingReader;
75
+ mutable std::unique_ptr<processor::ParquetReader> indicesReader;
76
+ mutable std::unique_ptr<processor::ParquetReader> indptrReader;
77
+ mutable std::mutex parquetReaderMutex;
78
+ mutable std::mutex indptrDataMutex;
79
+ mutable std::vector<common::offset_t> indptrData; // Cached indptr data for CSR format
80
+ mutable common::internal_id_map_t<common::offset_t>
81
+ nodeMapping; // Maps node IDs to CSR node IDs
82
+ mutable std::unordered_map<common::offset_t, common::offset_t>
83
+ csrToNodeTableIdMap; // Reverse mapping: CSR node ID to node table ID
84
+
85
+ void initializeParquetReaders(transaction::Transaction* transaction) const;
86
+ void initializeIndptrReader(transaction::Transaction* transaction) const;
87
+ void loadIndptrData(transaction::Transaction* transaction) const;
88
+ void loadNodeMappingData(transaction::Transaction* transaction) const;
89
+ bool scanInternalByRowGroups(transaction::Transaction* transaction,
90
+ ParquetRelTableScanState& parquetRelScanState);
91
+ bool scanRowGroupForBoundNodes(transaction::Transaction* transaction,
92
+ ParquetRelTableScanState& parquetRelScanState,
93
+ const std::vector<uint64_t>& rowGroupsToProcess,
94
+ const std::unordered_set<common::offset_t>& boundNodeOffsets);
95
+ common::offset_t findSourceNodeForRow(common::offset_t globalRowIdx) const;
96
+ };
97
+
98
+ } // namespace storage
99
+ } // namespace lbug
@@ -48,7 +48,7 @@ struct RelTableScanState : TableScanState {
48
48
  nodeGroupScanState = std::make_unique<CSRNodeGroupScanState>();
49
49
  }
50
50
 
51
- void setToTable(const transaction::Transaction* transaction, Table* table_,
51
+ virtual void setToTable(const transaction::Transaction* transaction, Table* table_,
52
52
  std::vector<common::column_id_t> columnIDs_,
53
53
  std::vector<ColumnPredicateSet> columnPredicateSets_,
54
54
  common::RelDataDirection direction_) override;
@@ -138,7 +138,7 @@ struct LBUG_API RelTableDeleteState final : TableDeleteState {
138
138
  relIDVector{relIDVector}, detachDeleteDirection{detachDeleteDirection} {}
139
139
  };
140
140
 
141
- class LBUG_API RelTable final : public Table {
141
+ class LBUG_API RelTable : public Table {
142
142
  public:
143
143
  using rel_multiplicity_constraint_throw_func_t =
144
144
  std::function<void(const std::string&, common::offset_t, common::RelDataDirection)>;
@@ -130,6 +130,8 @@ public:
130
130
  return getMinUncommittedNodeOffset(tableID) + localRowIdx;
131
131
  }
132
132
 
133
+ main::ClientContext* getClientContext() const { return clientContext; }
134
+
133
135
  void pushCreateDropCatalogEntry(catalog::CatalogSet& catalogSet,
134
136
  catalog::CatalogEntry& catalogEntry, bool isInternal, bool skipLoggingToWAL = false);
135
137
  void pushAlterCatalogEntry(catalog::CatalogSet& catalogSet, catalog::CatalogEntry& catalogEntry,
@@ -81,7 +81,12 @@ std::unique_ptr<Statement> Transformer::transformCreateNodeTable(
81
81
  } else {
82
82
  createTableInfo.propertyDefinitions =
83
83
  transformPropertyDefinitions(*ctx.kU_PropertyDefinitions());
84
- createTableInfo.extraInfo = std::make_unique<ExtraCreateNodeTableInfo>(getPKName(ctx));
84
+ options_t options;
85
+ if (ctx.kU_Options()) {
86
+ options = transformOptions(*ctx.kU_Options());
87
+ }
88
+ createTableInfo.extraInfo =
89
+ std::make_unique<ExtraCreateNodeTableInfo>(getPKName(ctx), std::move(options));
85
90
  return std::make_unique<CreateTable>(std::move(createTableInfo));
86
91
  }
87
92
  }
@@ -340,6 +340,10 @@ std::unique_ptr<ColumnReader> ParquetReader::createReader() {
340
340
  throw CopyException{"Root element of Parquet file must be a struct"};
341
341
  }
342
342
  // LCOV_EXCL_STOP
343
+ // Clear existing column metadata before populating (in case createReader is called multiple
344
+ // times)
345
+ columnNames.clear();
346
+ columnTypes.clear();
343
347
  for (auto& field : StructType::getFields(rootReader->getDataType())) {
344
348
  columnNames.push_back(field.getName());
345
349
  columnTypes.push_back(field.getType().copy());
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include "processor/execution_context.h"
4
4
  #include "storage/local_storage/local_storage.h"
5
+ #include "storage/table/parquet_rel_table.h"
5
6
 
6
7
  using namespace lbug::common;
7
8
  using namespace lbug::storage;
@@ -54,8 +55,29 @@ void ScanMultiRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionCo
54
55
  auto clientContext = context->clientContext;
55
56
  boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
56
57
  auto nbrNodeIDVector = outVectors[0];
57
- scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
58
- boundNodeIDVector, outVectors, nbrNodeIDVector->state);
58
+
59
+ // Check if any table in any scanner is a ParquetRelTable
60
+ bool hasParquetTable = false;
61
+ for (auto& [_, scanner] : scanners) {
62
+ for (auto& relInfo : scanner.relInfos) {
63
+ if (dynamic_cast<storage::ParquetRelTable*>(relInfo.table) != nullptr) {
64
+ hasParquetTable = true;
65
+ break;
66
+ }
67
+ }
68
+ if (hasParquetTable)
69
+ break;
70
+ }
71
+
72
+ // Create appropriate scan state type
73
+ if (hasParquetTable) {
74
+ scanState =
75
+ std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
76
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
77
+ } else {
78
+ scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
79
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
80
+ }
59
81
  for (auto& [_, scanner] : scanners) {
60
82
  for (auto& relInfo : scanner.relInfos) {
61
83
  if (directionInfo.directionPos.isValid()) {
@@ -2,8 +2,10 @@
2
2
 
3
3
  #include "binder/expression/expression_util.h"
4
4
  #include "processor/execution_context.h"
5
+ #include "storage/buffer_manager/memory_manager.h"
5
6
  #include "storage/local_storage/local_node_table.h"
6
7
  #include "storage/local_storage/local_storage.h"
8
+ #include "storage/table/parquet_node_table.h"
7
9
 
8
10
  using namespace lbug::common;
9
11
  using namespace lbug::storage;
@@ -35,7 +37,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
35
37
  this->table = table;
36
38
  this->currentCommittedGroupIdx = 0;
37
39
  this->currentUnCommittedGroupIdx = 0;
38
- this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
40
+
41
+ // Initialize table-specific scan coordination (e.g., for ParquetNodeTable)
42
+ table->initializeScanCoordination(transaction);
43
+
44
+ if (const auto parquetTable = dynamic_cast<ParquetNodeTable*>(table)) {
45
+ // For parquet tables, set numCommittedNodeGroups to number of row groups
46
+ std::vector<bool> columnSkips;
47
+ try {
48
+ auto tempReader = std::make_unique<processor::ParquetReader>(
49
+ parquetTable->getParquetFilePath(), columnSkips, transaction->getClientContext());
50
+ this->numCommittedNodeGroups = tempReader->getNumRowsGroups();
51
+ } catch (const std::exception& e) {
52
+ this->numCommittedNodeGroups = 1;
53
+ }
54
+ } else {
55
+ this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
56
+ }
39
57
  if (transaction->isWriteTransaction()) {
40
58
  if (const auto localTable =
41
59
  transaction->getLocalStorage()->getLocalTable(this->table->getTableID())) {
@@ -46,21 +64,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
46
64
  progressSharedState.numGroups += numCommittedNodeGroups;
47
65
  }
48
66
 
49
- void ScanNodeTableSharedState::nextMorsel(NodeTableScanState& scanState,
67
+ void ScanNodeTableSharedState::nextMorsel(TableScanState& scanState,
50
68
  ScanNodeTableProgressSharedState& progressSharedState) {
51
69
  std::unique_lock lck{mtx};
70
+ // Cast to NodeTableScanState since we know this is for node tables
71
+ auto& nodeScanState = scanState.cast<NodeTableScanState>();
52
72
  if (currentCommittedGroupIdx < numCommittedNodeGroups) {
53
- scanState.nodeGroupIdx = currentCommittedGroupIdx++;
73
+ nodeScanState.nodeGroupIdx = currentCommittedGroupIdx++;
54
74
  progressSharedState.numGroupsScanned++;
55
- scanState.source = TableScanSource::COMMITTED;
75
+ nodeScanState.source = TableScanSource::COMMITTED;
56
76
  return;
57
77
  }
58
78
  if (currentUnCommittedGroupIdx < numUnCommittedNodeGroups) {
59
- scanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
60
- scanState.source = TableScanSource::UNCOMMITTED;
79
+ nodeScanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
80
+ nodeScanState.source = TableScanSource::UNCOMMITTED;
61
81
  return;
62
82
  }
63
- scanState.source = TableScanSource::NONE;
83
+ nodeScanState.source = TableScanSource::NONE;
64
84
  }
65
85
 
66
86
  table_id_map_t<SemiMask*> ScanNodeTable::getSemiMasks() const {
@@ -82,7 +102,18 @@ void ScanNodeTableInfo::initScanState(TableScanState& scanState,
82
102
  void ScanNodeTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) {
83
103
  ScanTable::initLocalStateInternal(resultSet, context);
84
104
  auto nodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
85
- scanState = std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
105
+
106
+ // Check if the first table is a ParquetNodeTable and create appropriate scan state
107
+ auto* parquetTable = dynamic_cast<ParquetNodeTable*>(tableInfos[0].table);
108
+ if (parquetTable) {
109
+ scanState = std::make_unique<ParquetNodeTableScanState>(
110
+ *MemoryManager::Get(*context->clientContext), nodeIDVector, outVectors,
111
+ nodeIDVector->state);
112
+ } else {
113
+ scanState =
114
+ std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
115
+ }
116
+
86
117
  currentTableIdx = 0;
87
118
  initCurrentTable(context);
88
119
  }
@@ -91,6 +122,11 @@ void ScanNodeTable::initCurrentTable(ExecutionContext* context) {
91
122
  auto& currentInfo = tableInfos[currentTableIdx];
92
123
  currentInfo.initScanState(*scanState, outVectors, context->clientContext);
93
124
  scanState->semiMask = sharedStates[currentTableIdx]->getSemiMask();
125
+ // Call table->initScanState for ParquetNodeTable
126
+ if (dynamic_cast<ParquetNodeTable*>(tableInfos[currentTableIdx].table)) {
127
+ auto transaction = transaction::Transaction::Get(*context->clientContext);
128
+ tableInfos[currentTableIdx].table->initScanState(transaction, *scanState);
129
+ }
94
130
  }
95
131
 
96
132
  void ScanNodeTable::initGlobalStateInternal(ExecutionContext* context) {
@@ -2,7 +2,9 @@
2
2
 
3
3
  #include "binder/expression/expression_util.h"
4
4
  #include "processor/execution_context.h"
5
+ #include "storage/buffer_manager/memory_manager.h"
5
6
  #include "storage/local_storage/local_rel_table.h"
7
+ #include "storage/table/parquet_rel_table.h"
6
8
 
7
9
  using namespace lbug::common;
8
10
  using namespace lbug::storage;
@@ -66,8 +68,16 @@ void ScanRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext
66
68
  auto clientContext = context->clientContext;
67
69
  auto boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
68
70
  auto nbrNodeIDVector = outVectors[0];
69
- scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
70
- boundNodeIDVector, outVectors, nbrNodeIDVector->state);
71
+ // Check if this is a ParquetRelTable and create appropriate scan state
72
+ auto* parquetTable = dynamic_cast<storage::ParquetRelTable*>(tableInfo.table);
73
+ if (parquetTable) {
74
+ scanState =
75
+ std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
76
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
77
+ } else {
78
+ scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
79
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
80
+ }
71
81
  tableInfo.initScanState(*scanState, outVectors, clientContext);
72
82
  }
73
83
 
@@ -13,6 +13,8 @@
13
13
  #include "storage/buffer_manager/memory_manager.h"
14
14
  #include "storage/checkpointer.h"
15
15
  #include "storage/table/node_table.h"
16
+ #include "storage/table/parquet_node_table.h"
17
+ #include "storage/table/parquet_rel_table.h"
16
18
  #include "storage/table/rel_table.h"
17
19
  #include "storage/wal/wal_replayer.h"
18
20
  #include "transaction/transaction.h"
@@ -77,15 +79,31 @@ void StorageManager::recover(main::ClientContext& clientContext, bool throwOnWal
77
79
  }
78
80
 
79
81
  void StorageManager::createNodeTable(NodeTableCatalogEntry* entry) {
80
- tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
82
+ tableNameCache[entry->getTableID()] = entry->getName();
83
+ if (!entry->getStorage().empty()) {
84
+ // Create parquet-backed node table
85
+ tables[entry->getTableID()] =
86
+ std::make_unique<ParquetNodeTable>(this, entry, &memoryManager);
87
+ } else {
88
+ // Create regular node table
89
+ tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
90
+ }
81
91
  }
82
92
 
83
93
  // TODO(Guodong): This API is added since storageManager doesn't provide an API to add a single
84
94
  // rel table. We may have to refactor the existing StorageManager::createTable(TableCatalogEntry*
85
95
  // entry).
86
96
  void StorageManager::addRelTable(RelGroupCatalogEntry* entry, const RelTableCatalogInfo& info) {
87
- tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
88
- info.nodePair.dstTableID, this, &memoryManager);
97
+ if (!entry->getStorage().empty()) {
98
+ // Create parquet-backed rel table
99
+ std::string fromNodeTableName = tableNameCache.at(info.nodePair.srcTableID);
100
+ tables[info.oid] = std::make_unique<ParquetRelTable>(entry, info.nodePair.srcTableID,
101
+ info.nodePair.dstTableID, this, &memoryManager, fromNodeTableName);
102
+ } else {
103
+ // Create regular rel table
104
+ tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
105
+ info.nodePair.dstTableID, this, &memoryManager);
106
+ }
89
107
  }
90
108
 
91
109
  void StorageManager::createRelTableGroup(RelGroupCatalogEntry* entry) {
@@ -257,7 +275,14 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
257
275
  KU_ASSERT(!tables.contains(tableID));
258
276
  auto tableEntry = catalog->getTableCatalogEntry(&DUMMY_TRANSACTION, tableID)
259
277
  ->ptrCast<NodeTableCatalogEntry>();
260
- tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
278
+ tableNameCache[tableID] = tableEntry->getName();
279
+ if (!tableEntry->getStorage().empty()) {
280
+ // Create parquet-backed node table
281
+ tables[tableID] = std::make_unique<ParquetNodeTable>(this, tableEntry, &memoryManager);
282
+ } else {
283
+ // Create regular node table
284
+ tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
285
+ }
261
286
  tables[tableID]->deserialize(context, this, deSer);
262
287
  }
263
288
  deSer.validateDebuggingInfo(key, "num_rel_groups");
@@ -279,8 +304,17 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
279
304
  for (auto k = 0u; k < numInnerRelTables; k++) {
280
305
  RelTableCatalogInfo info = RelTableCatalogInfo::deserialize(deSer);
281
306
  KU_ASSERT(!tables.contains(info.oid));
282
- tables[info.oid] = std::make_unique<RelTable>(relGroupEntry, info.nodePair.srcTableID,
283
- info.nodePair.dstTableID, this, &memoryManager);
307
+ if (!relGroupEntry->getStorage().empty()) {
308
+ // Create parquet-backed rel table
309
+ std::string fromNodeTableName = tableNameCache.at(info.nodePair.srcTableID);
310
+ tables[info.oid] =
311
+ std::make_unique<ParquetRelTable>(relGroupEntry, info.nodePair.srcTableID,
312
+ info.nodePair.dstTableID, this, &memoryManager, fromNodeTableName);
313
+ } else {
314
+ // Create regular rel table
315
+ tables[info.oid] = std::make_unique<RelTable>(relGroupEntry,
316
+ info.nodePair.srcTableID, info.nodePair.dstTableID, this, &memoryManager);
317
+ }
284
318
  tables.at(info.oid)->deserialize(context, this, deSer);
285
319
  }
286
320
  }
@@ -22,6 +22,8 @@ add_library(lbug_storage_store
22
22
  node_group_collection.cpp
23
23
  node_table.cpp
24
24
  null_column.cpp
25
+ parquet_node_table.cpp
26
+ parquet_rel_table.cpp
25
27
  rel_table.cpp
26
28
  rel_table_data.cpp
27
29
  string_chunk_data.cpp