lbug 0.12.3-dev.8 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134):
  1. package/lbug-source/.github/workflows/ci-workflow.yml +9 -2
  2. package/lbug-source/CMakeLists.txt +16 -7
  3. package/lbug-source/Makefile +15 -4
  4. package/lbug-source/benchmark/serializer.py +24 -3
  5. package/lbug-source/dataset/demo-db/csv/copy.cypher +4 -4
  6. package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
  7. package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
  8. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
  9. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
  10. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
  11. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
  12. package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
  13. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
  14. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
  15. package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
  16. package/lbug-source/dataset/demo-db/parquet/copy.cypher +4 -4
  17. package/lbug-source/extension/duckdb/src/catalog/duckdb_catalog.cpp +1 -1
  18. package/lbug-source/extension/duckdb/src/catalog/duckdb_table_catalog_entry.cpp +43 -4
  19. package/lbug-source/extension/duckdb/src/connector/duckdb_result_converter.cpp +6 -0
  20. package/lbug-source/extension/duckdb/src/connector/duckdb_secret_manager.cpp +1 -1
  21. package/lbug-source/extension/duckdb/src/function/duckdb_scan.cpp +49 -4
  22. package/lbug-source/extension/duckdb/src/include/catalog/duckdb_table_catalog_entry.h +6 -1
  23. package/lbug-source/extension/duckdb/src/include/function/duckdb_scan.h +2 -0
  24. package/lbug-source/extension/duckdb/test/test_files/duckdb.test +28 -0
  25. package/lbug-source/extension/extension_config.cmake +3 -2
  26. package/lbug-source/extension/httpfs/test/test_files/http.test +1 -0
  27. package/lbug-source/scripts/antlr4/Cypher.g4 +4 -4
  28. package/lbug-source/scripts/antlr4/hash.md5 +1 -1
  29. package/lbug-source/scripts/generate_binary_demo.sh +1 -1
  30. package/lbug-source/src/antlr4/Cypher.g4 +4 -4
  31. package/lbug-source/src/binder/bind/bind_ddl.cpp +97 -15
  32. package/lbug-source/src/binder/bind/bind_graph_pattern.cpp +30 -3
  33. package/lbug-source/src/catalog/catalog.cpp +6 -4
  34. package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
  35. package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +46 -7
  36. package/lbug-source/src/catalog/catalog_set.cpp +1 -0
  37. package/lbug-source/src/function/function_collection.cpp +2 -1
  38. package/lbug-source/src/function/table/CMakeLists.txt +1 -0
  39. package/lbug-source/src/function/table/disk_size_info.cpp +322 -0
  40. package/lbug-source/src/function/table/show_connection.cpp +6 -1
  41. package/lbug-source/src/function/table/show_tables.cpp +10 -2
  42. package/lbug-source/src/function/table/table_function.cpp +11 -2
  43. package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +23 -6
  44. package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
  45. package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +21 -2
  46. package/lbug-source/src/include/catalog/catalog_entry/table_catalog_entry.h +7 -0
  47. package/lbug-source/src/include/common/constants.h +1 -0
  48. package/lbug-source/src/include/common/string_format.h +2 -2
  49. package/lbug-source/src/include/common/types/types.h +1 -0
  50. package/lbug-source/src/include/function/table/bind_data.h +12 -1
  51. package/lbug-source/src/include/function/table/simple_table_function.h +6 -0
  52. package/lbug-source/src/include/function/table/table_function.h +2 -0
  53. package/lbug-source/src/include/optimizer/count_rel_table_optimizer.h +49 -0
  54. package/lbug-source/src/include/optimizer/logical_operator_visitor.h +6 -0
  55. package/lbug-source/src/include/optimizer/order_by_push_down_optimizer.h +21 -0
  56. package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
  57. package/lbug-source/src/include/planner/operator/logical_operator.h +1 -0
  58. package/lbug-source/src/include/planner/operator/logical_table_function_call.h +14 -1
  59. package/lbug-source/src/include/planner/operator/scan/logical_count_rel_table.h +84 -0
  60. package/lbug-source/src/include/processor/operator/physical_operator.h +1 -0
  61. package/lbug-source/src/include/processor/operator/scan/count_rel_table.h +62 -0
  62. package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
  63. package/lbug-source/src/include/processor/plan_mapper.h +2 -0
  64. package/lbug-source/src/include/storage/storage_manager.h +1 -0
  65. package/lbug-source/src/include/storage/storage_version_info.h +1 -1
  66. package/lbug-source/src/include/storage/table/foreign_rel_table.h +56 -0
  67. package/lbug-source/src/include/storage/table/node_table.h +6 -1
  68. package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
  69. package/lbug-source/src/include/storage/table/parquet_rel_table.h +91 -0
  70. package/lbug-source/src/include/storage/table/rel_table.h +2 -2
  71. package/lbug-source/src/include/transaction/transaction.h +2 -0
  72. package/lbug-source/src/optimizer/CMakeLists.txt +3 -1
  73. package/lbug-source/src/optimizer/count_rel_table_optimizer.cpp +217 -0
  74. package/lbug-source/src/optimizer/limit_push_down_optimizer.cpp +12 -0
  75. package/lbug-source/src/optimizer/logical_operator_visitor.cpp +6 -0
  76. package/lbug-source/src/optimizer/optimizer.cpp +10 -0
  77. package/lbug-source/src/optimizer/order_by_push_down_optimizer.cpp +123 -0
  78. package/lbug-source/src/optimizer/projection_push_down_optimizer.cpp +5 -1
  79. package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
  80. package/lbug-source/src/parser/transform/transform_expression.cpp +1 -1
  81. package/lbug-source/src/parser/transform/transform_graph_pattern.cpp +6 -1
  82. package/lbug-source/src/parser/transformer.cpp +7 -1
  83. package/lbug-source/src/planner/join_order/cardinality_estimator.cpp +11 -2
  84. package/lbug-source/src/planner/operator/logical_operator.cpp +2 -0
  85. package/lbug-source/src/planner/operator/logical_table_function_call.cpp +4 -0
  86. package/lbug-source/src/planner/operator/scan/CMakeLists.txt +1 -0
  87. package/lbug-source/src/planner/operator/scan/logical_count_rel_table.cpp +24 -0
  88. package/lbug-source/src/planner/plan/plan_join_order.cpp +16 -1
  89. package/lbug-source/src/processor/map/CMakeLists.txt +1 -0
  90. package/lbug-source/src/processor/map/map_count_rel_table.cpp +55 -0
  91. package/lbug-source/src/processor/map/plan_mapper.cpp +3 -0
  92. package/lbug-source/src/processor/operator/index_lookup.cpp +31 -23
  93. package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
  94. package/lbug-source/src/processor/operator/physical_operator.cpp +2 -0
  95. package/lbug-source/src/processor/operator/scan/CMakeLists.txt +1 -0
  96. package/lbug-source/src/processor/operator/scan/count_rel_table.cpp +137 -0
  97. package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
  98. package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
  99. package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +18 -2
  100. package/lbug-source/src/storage/storage_manager.cpp +43 -6
  101. package/lbug-source/src/storage/table/CMakeLists.txt +3 -0
  102. package/lbug-source/src/storage/table/foreign_rel_table.cpp +63 -0
  103. package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
  104. package/lbug-source/src/storage/table/parquet_rel_table.cpp +388 -0
  105. package/lbug-source/test/common/string_format.cpp +9 -1
  106. package/lbug-source/test/copy/copy_test.cpp +4 -4
  107. package/lbug-source/test/graph_test/CMakeLists.txt +1 -1
  108. package/lbug-source/test/include/test_runner/test_group.h +11 -1
  109. package/lbug-source/test/optimizer/optimizer_test.cpp +46 -0
  110. package/lbug-source/test/runner/e2e_test.cpp +7 -1
  111. package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +77 -0
  112. package/lbug-source/test/test_helper/CMakeLists.txt +1 -1
  113. package/lbug-source/test/test_helper/test_helper.cpp +33 -1
  114. package/lbug-source/test/test_runner/CMakeLists.txt +1 -1
  115. package/lbug-source/test/test_runner/insert_by_row.cpp +6 -8
  116. package/lbug-source/test/test_runner/multi_copy_split.cpp +2 -4
  117. package/lbug-source/test/test_runner/test_parser.cpp +3 -0
  118. package/lbug-source/test/transaction/checkpoint_test.cpp +1 -1
  119. package/lbug-source/test/transaction/transaction_test.cpp +19 -15
  120. package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2805 -2708
  121. package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +7 -3
  122. package/lbug-source/tools/benchmark/count_rel_table.benchmark +5 -0
  123. package/lbug-source/tools/nodejs_api/package.json +4 -2
  124. package/lbug-source/tools/shell/embedded_shell.cpp +78 -3
  125. package/lbug-source/tools/shell/include/embedded_shell.h +2 -0
  126. package/lbug-source/tools/shell/linenoise.cpp +3 -3
  127. package/lbug-source/tools/shell/test/test_helper.py +1 -1
  128. package/lbug-source/tools/shell/test/test_shell_basics.py +12 -0
  129. package/lbug-source/tools/shell/test/test_shell_commands.py +19 -0
  130. package/package.json +9 -2
  131. package/prebuilt/lbugjs-darwin-arm64.node +0 -0
  132. package/prebuilt/lbugjs-linux-arm64.node +0 -0
  133. package/prebuilt/lbugjs-linux-x64.node +0 -0
  134. package/prebuilt/lbugjs-win32-x64.node +0 -0
@@ -0,0 +1,55 @@
1
+ #include "planner/operator/scan/logical_count_rel_table.h"
2
+ #include "processor/operator/scan/count_rel_table.h"
3
+ #include "processor/plan_mapper.h"
4
+ #include "storage/storage_manager.h"
5
+
6
+ using namespace lbug::common;
7
+ using namespace lbug::planner;
8
+ using namespace lbug::storage;
9
+
10
+ namespace lbug {
11
+ namespace processor {
12
+
13
+ std::unique_ptr<PhysicalOperator> PlanMapper::mapCountRelTable(
14
+ const LogicalOperator* logicalOperator) {
15
+ auto& logicalCountRelTable = logicalOperator->constCast<LogicalCountRelTable>();
16
+ auto outSchema = logicalCountRelTable.getSchema();
17
+
18
+ auto storageManager = StorageManager::Get(*clientContext);
19
+
20
+ // Get the node tables for scanning bound nodes
21
+ std::vector<NodeTable*> nodeTables;
22
+ for (auto tableID : logicalCountRelTable.getBoundNodeTableIDs()) {
23
+ nodeTables.push_back(storageManager->getTable(tableID)->ptrCast<NodeTable>());
24
+ }
25
+
26
+ // Get the rel tables
27
+ std::vector<RelTable*> relTables;
28
+ for (auto tableID : logicalCountRelTable.getRelTableIDs()) {
29
+ relTables.push_back(storageManager->getTable(tableID)->ptrCast<RelTable>());
30
+ }
31
+
32
+ // Determine rel data direction from extend direction
33
+ auto extendDirection = logicalCountRelTable.getDirection();
34
+ RelDataDirection relDirection;
35
+ if (extendDirection == ExtendDirection::FWD) {
36
+ relDirection = RelDataDirection::FWD;
37
+ } else if (extendDirection == ExtendDirection::BWD) {
38
+ relDirection = RelDataDirection::BWD;
39
+ } else {
40
+ // For BOTH, we'll scan FWD (shouldn't reach here as optimizer filters BOTH)
41
+ relDirection = RelDataDirection::FWD;
42
+ }
43
+
44
+ // Get the output position for the count expression
45
+ auto countOutputPos = getDataPos(*logicalCountRelTable.getCountExpr(), *outSchema);
46
+
47
+ auto printInfo = std::make_unique<CountRelTablePrintInfo>(
48
+ logicalCountRelTable.getRelGroupEntry()->getName());
49
+
50
+ return std::make_unique<CountRelTable>(std::move(nodeTables), std::move(relTables),
51
+ relDirection, countOutputPos, getOperatorID(), std::move(printInfo));
52
+ }
53
+
54
+ } // namespace processor
55
+ } // namespace lbug
@@ -62,6 +62,9 @@ std::unique_ptr<PhysicalOperator> PlanMapper::mapOperator(const LogicalOperator*
62
62
  case LogicalOperatorType::COPY_TO: {
63
63
  physicalOperator = mapCopyTo(logicalOperator);
64
64
  } break;
65
+ case LogicalOperatorType::COUNT_REL_TABLE: {
66
+ physicalOperator = mapCountRelTable(logicalOperator);
67
+ } break;
65
68
  case LogicalOperatorType::CREATE_MACRO: {
66
69
  physicalOperator = mapCreateMacro(logicalOperator);
67
70
  } break;
@@ -28,7 +28,6 @@ std::optional<WarningSourceData> getWarningSourceData(
28
28
  return ret;
29
29
  }
30
30
 
31
- // TODO(Guodong): Add short path for unfiltered case.
32
31
  bool checkNullKey(ValueVector* keyVector, offset_t vectorOffset,
33
32
  BatchInsertErrorHandler* errorHandler, const std::vector<ValueVector*>& warningDataVectors) {
34
33
  bool isNull = keyVector->isNull(vectorOffset);
@@ -71,27 +70,17 @@ struct OffsetVectorManager {
71
70
  offset_t insertOffset;
72
71
  };
73
72
 
74
- // TODO(Guodong): Add short path for unfiltered case.
75
73
  template<bool hasNoNullsGuarantee>
76
- void fillOffsetArraysFromVector(transaction::Transaction* transaction, const IndexLookupInfo& info,
77
- ValueVector* keyVector, ValueVector* resultVector,
78
- const std::vector<ValueVector*>& warningDataVectors, BatchInsertErrorHandler* errorHandler) {
79
- KU_ASSERT(resultVector->dataType.getPhysicalType() == PhysicalTypeID::INT64);
74
+ void fillOffsetArraysFromVectorInternal(transaction::Transaction* transaction,
75
+ const IndexLookupInfo& info, ValueVector* keyVector, ValueVector* resultVector,
76
+ const std::vector<ValueVector*>& warningDataVectors, BatchInsertErrorHandler* errorHandler,
77
+ const sel_t* selVector, sel_t numKeys) {
80
78
  TypeUtils::visit(
81
79
  keyVector->dataType.getPhysicalType(),
82
80
  [&]<IndexHashable T>(T) {
83
- auto numKeys = keyVector->state->getSelVector().getSelSize();
84
-
85
- // fetch all the selection pos at the start
86
- // since we may modify the selection vector in the middle of the lookup
87
- std::vector<sel_t> lookupPos(numKeys);
88
- for (idx_t i = 0; i < numKeys; ++i) {
89
- lookupPos[i] = (keyVector->state->getSelVector()[i]);
90
- }
91
-
92
81
  OffsetVectorManager resultManager{resultVector, errorHandler};
93
- for (auto i = 0u; i < numKeys; i++) {
94
- auto pos = lookupPos[i];
82
+ for (sel_t i = 0u; i < numKeys; i++) {
83
+ auto pos = selVector ? selVector[i] : i;
95
84
  if constexpr (!hasNoNullsGuarantee) {
96
85
  if (!checkNullKey(keyVector, pos, errorHandler, warningDataVectors)) {
97
86
  continue;
@@ -99,12 +88,9 @@ void fillOffsetArraysFromVector(transaction::Transaction* transaction, const Ind
99
88
  }
100
89
  offset_t lookupOffset = 0;
101
90
  if (!info.nodeTable->lookupPK(transaction, keyVector, pos, lookupOffset)) {
102
- TypeUtils::visit(keyVector->dataType, [&]<typename type>(type) {
103
- errorHandler->handleError(
104
- ExceptionMessage::nonExistentPKException(
105
- TypeUtils::toString(keyVector->getValue<type>(pos), keyVector)),
106
- getWarningSourceData(warningDataVectors, pos));
107
- });
91
+ errorHandler->handleError(ExceptionMessage::nonExistentPKException(
92
+ keyVector->getAsValue(pos)->toString()),
93
+ getWarningSourceData(warningDataVectors, pos));
108
94
  } else {
109
95
  resultManager.insertEntry(lookupOffset, pos);
110
96
  }
@@ -112,6 +98,28 @@ void fillOffsetArraysFromVector(transaction::Transaction* transaction, const Ind
112
98
  },
113
99
  [&](auto) { KU_UNREACHABLE; });
114
100
  }
101
+
102
+ template<bool hasNoNullsGuarantee>
103
+ void fillOffsetArraysFromVector(transaction::Transaction* transaction, const IndexLookupInfo& info,
104
+ ValueVector* keyVector, ValueVector* resultVector,
105
+ const std::vector<ValueVector*>& warningDataVectors, BatchInsertErrorHandler* errorHandler) {
106
+ KU_ASSERT(resultVector->dataType.getPhysicalType() == PhysicalTypeID::INT64);
107
+ auto& selVector = keyVector->state->getSelVector();
108
+ auto numKeys = selVector.getSelSize();
109
+ if (selVector.isUnfiltered()) {
110
+ // Fast path: selection vector is unfiltered - pass a null selection vector
111
+ fillOffsetArraysFromVectorInternal<hasNoNullsGuarantee>(transaction, info, keyVector,
112
+ resultVector, warningDataVectors, errorHandler, nullptr /* selVector */, numKeys);
113
+ } else {
114
+ // Filtered case: copy selection positions since we may modify the selection vector
115
+ std::vector<sel_t> lookupPos(numKeys);
116
+ for (idx_t i = 0; i < numKeys; ++i) {
117
+ lookupPos[i] = selVector[i];
118
+ }
119
+ fillOffsetArraysFromVectorInternal<hasNoNullsGuarantee>(transaction, info, keyVector,
120
+ resultVector, warningDataVectors, errorHandler, lookupPos.data(), numKeys);
121
+ }
122
+ }
115
123
  } // namespace
116
124
 
117
125
  std::string IndexLookupPrintInfo::toString() const {
@@ -340,6 +340,10 @@ std::unique_ptr<ColumnReader> ParquetReader::createReader() {
340
340
  throw CopyException{"Root element of Parquet file must be a struct"};
341
341
  }
342
342
  // LCOV_EXCL_STOP
343
+ // Clear existing column metadata before populating (in case createReader is called multiple
344
+ // times)
345
+ columnNames.clear();
346
+ columnTypes.clear();
343
347
  for (auto& field : StructType::getFields(rootReader->getDataType())) {
344
348
  columnNames.push_back(field.getName());
345
349
  columnTypes.push_back(field.getType().copy());
@@ -27,6 +27,8 @@ std::string PhysicalOperatorUtils::operatorTypeToString(PhysicalOperatorType ope
27
27
  return "BATCH_INSERT";
28
28
  case PhysicalOperatorType::COPY_TO:
29
29
  return "COPY_TO";
30
+ case PhysicalOperatorType::COUNT_REL_TABLE:
31
+ return "COUNT_REL_TABLE";
30
32
  case PhysicalOperatorType::CREATE_MACRO:
31
33
  return "CREATE_MACRO";
32
34
  case PhysicalOperatorType::CREATE_SEQUENCE:
@@ -1,5 +1,6 @@
1
1
  add_library(lbug_processor_operator_scan
2
2
  OBJECT
3
+ count_rel_table.cpp
3
4
  primary_key_scan_node_table.cpp
4
5
  scan_multi_rel_tables.cpp
5
6
  scan_node_table.cpp
@@ -0,0 +1,137 @@
1
+ #include "processor/operator/scan/count_rel_table.h"
2
+
3
+ #include "common/system_config.h"
4
+ #include "main/client_context.h"
5
+ #include "main/database.h"
6
+ #include "processor/execution_context.h"
7
+ #include "storage/buffer_manager/memory_manager.h"
8
+ #include "storage/local_storage/local_rel_table.h"
9
+ #include "storage/local_storage/local_storage.h"
10
+ #include "storage/table/column.h"
11
+ #include "storage/table/column_chunk_data.h"
12
+ #include "storage/table/csr_chunked_node_group.h"
13
+ #include "storage/table/csr_node_group.h"
14
+ #include "storage/table/rel_table_data.h"
15
+ #include "transaction/transaction.h"
16
+
17
+ using namespace lbug::common;
18
+ using namespace lbug::storage;
19
+ using namespace lbug::transaction;
20
+
21
+ namespace lbug {
22
+ namespace processor {
23
+
24
+ void CountRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* /*context*/) {
25
+ countVector = resultSet->getValueVector(countOutputPos).get();
26
+ hasExecuted = false;
27
+ totalCount = 0;
28
+ }
29
+
30
+ // Count rels by using CSR metadata, accounting for deletions and uncommitted data.
31
+ // This is more efficient than scanning through all edges.
32
+ bool CountRelTable::getNextTuplesInternal(ExecutionContext* context) {
33
+ if (hasExecuted) {
34
+ return false;
35
+ }
36
+
37
+ auto transaction = Transaction::Get(*context->clientContext);
38
+ auto* memoryManager = context->clientContext->getDatabase()->getMemoryManager();
39
+
40
+ for (auto* relTable : relTables) {
41
+ // Get the RelTableData for the specified direction
42
+ auto* relTableData = relTable->getDirectedTableData(direction);
43
+ auto numNodeGroups = relTableData->getNumNodeGroups();
44
+ auto* csrLengthColumn = relTableData->getCSRLengthColumn();
45
+
46
+ // For each node group in the rel table
47
+ for (node_group_idx_t nodeGroupIdx = 0; nodeGroupIdx < numNodeGroups; nodeGroupIdx++) {
48
+ auto* nodeGroup = relTableData->getNodeGroup(nodeGroupIdx);
49
+ if (!nodeGroup) {
50
+ continue;
51
+ }
52
+
53
+ auto& csrNodeGroup = nodeGroup->cast<CSRNodeGroup>();
54
+
55
+ // Count from persistent (checkpointed) data
56
+ if (auto* persistentGroup = csrNodeGroup.getPersistentChunkedGroup()) {
57
+ // Sum the actual relationship lengths from the CSR header instead of using
58
+ // getNumRows() which includes dummy rows added for CSR offset array gaps
59
+ auto& csrPersistentGroup = persistentGroup->cast<ChunkedCSRNodeGroup>();
60
+ auto& csrHeader = csrPersistentGroup.getCSRHeader();
61
+
62
+ // Get the number of nodes in this CSR header
63
+ auto numNodes = csrHeader.length->getNumValues();
64
+ if (numNodes == 0) {
65
+ continue;
66
+ }
67
+
68
+ // Create an in-memory chunk to scan the CSR length column into
69
+ auto lengthChunk =
70
+ ColumnChunkFactory::createColumnChunkData(*memoryManager, LogicalType::UINT64(),
71
+ false /*enableCompression*/, StorageConfig::NODE_GROUP_SIZE,
72
+ ResidencyState::IN_MEMORY, false /*initializeToZero*/);
73
+
74
+ // Initialize scan state and scan the length column from disk
75
+ ChunkState chunkState;
76
+ csrHeader.length->initializeScanState(chunkState, csrLengthColumn);
77
+ csrLengthColumn->scan(chunkState, lengthChunk.get(), 0 /*offsetInChunk*/, numNodes);
78
+
79
+ // Sum all the lengths
80
+ auto* lengthData = reinterpret_cast<const uint64_t*>(lengthChunk->getData());
81
+ row_idx_t groupRelCount = 0;
82
+ for (offset_t i = 0; i < numNodes; ++i) {
83
+ groupRelCount += lengthData[i];
84
+ }
85
+ totalCount += groupRelCount;
86
+
87
+ // Subtract deletions from persistent data
88
+ if (persistentGroup->hasVersionInfo()) {
89
+ auto numDeletions =
90
+ persistentGroup->getNumDeletions(transaction, 0, groupRelCount);
91
+ totalCount -= numDeletions;
92
+ }
93
+ }
94
+
95
+ // Count in-memory committed data (not yet checkpointed)
96
+ // This data is stored in chunkedGroups within the NodeGroup
97
+ auto numChunkedGroups = csrNodeGroup.getNumChunkedGroups();
98
+ for (node_group_idx_t i = 0; i < numChunkedGroups; i++) {
99
+ auto* chunkedGroup = csrNodeGroup.getChunkedNodeGroup(i);
100
+ if (chunkedGroup) {
101
+ auto numRows = chunkedGroup->getNumRows();
102
+ totalCount += numRows;
103
+ // Subtract deletions from in-memory committed data
104
+ if (chunkedGroup->hasVersionInfo()) {
105
+ auto numDeletions = chunkedGroup->getNumDeletions(transaction, 0, numRows);
106
+ totalCount -= numDeletions;
107
+ }
108
+ }
109
+ }
110
+ }
111
+
112
+ // Add uncommitted insertions from local storage
113
+ if (transaction->isWriteTransaction()) {
114
+ if (auto* localTable =
115
+ transaction->getLocalStorage()->getLocalTable(relTable->getTableID())) {
116
+ auto& localRelTable = localTable->cast<LocalRelTable>();
117
+ // Count entries in the CSR index for this direction.
118
+ // We can't use getNumTotalRows() because it includes deleted rows.
119
+ auto& csrIndex = localRelTable.getCSRIndex(direction);
120
+ for (const auto& [nodeOffset, rowIndices] : csrIndex) {
121
+ totalCount += rowIndices.size();
122
+ }
123
+ }
124
+ }
125
+ }
126
+
127
+ hasExecuted = true;
128
+
129
+ // Write the count to the output vector (single value)
130
+ countVector->state->getSelVectorUnsafe().setToUnfiltered(1);
131
+ countVector->setValue<int64_t>(0, static_cast<int64_t>(totalCount));
132
+
133
+ return true;
134
+ }
135
+
136
+ } // namespace processor
137
+ } // namespace lbug
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include "processor/execution_context.h"
4
4
  #include "storage/local_storage/local_storage.h"
5
+ #include "storage/table/parquet_rel_table.h"
5
6
 
6
7
  using namespace lbug::common;
7
8
  using namespace lbug::storage;
@@ -54,8 +55,29 @@ void ScanMultiRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionCo
54
55
  auto clientContext = context->clientContext;
55
56
  boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
56
57
  auto nbrNodeIDVector = outVectors[0];
57
- scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
58
- boundNodeIDVector, outVectors, nbrNodeIDVector->state);
58
+
59
+ // Check if any table in any scanner is a ParquetRelTable
60
+ bool hasParquetTable = false;
61
+ for (auto& [_, scanner] : scanners) {
62
+ for (auto& relInfo : scanner.relInfos) {
63
+ if (dynamic_cast<storage::ParquetRelTable*>(relInfo.table) != nullptr) {
64
+ hasParquetTable = true;
65
+ break;
66
+ }
67
+ }
68
+ if (hasParquetTable)
69
+ break;
70
+ }
71
+
72
+ // Create appropriate scan state type
73
+ if (hasParquetTable) {
74
+ scanState =
75
+ std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
76
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
77
+ } else {
78
+ scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
79
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
80
+ }
59
81
  for (auto& [_, scanner] : scanners) {
60
82
  for (auto& relInfo : scanner.relInfos) {
61
83
  if (directionInfo.directionPos.isValid()) {
@@ -2,8 +2,10 @@
2
2
 
3
3
  #include "binder/expression/expression_util.h"
4
4
  #include "processor/execution_context.h"
5
+ #include "storage/buffer_manager/memory_manager.h"
5
6
  #include "storage/local_storage/local_node_table.h"
6
7
  #include "storage/local_storage/local_storage.h"
8
+ #include "storage/table/parquet_node_table.h"
7
9
 
8
10
  using namespace lbug::common;
9
11
  using namespace lbug::storage;
@@ -35,7 +37,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
35
37
  this->table = table;
36
38
  this->currentCommittedGroupIdx = 0;
37
39
  this->currentUnCommittedGroupIdx = 0;
38
- this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
40
+
41
+ // Initialize table-specific scan coordination (e.g., for ParquetNodeTable)
42
+ table->initializeScanCoordination(transaction);
43
+
44
+ if (const auto parquetTable = dynamic_cast<ParquetNodeTable*>(table)) {
45
+ // For parquet tables, set numCommittedNodeGroups to number of row groups
46
+ std::vector<bool> columnSkips;
47
+ try {
48
+ auto tempReader = std::make_unique<processor::ParquetReader>(
49
+ parquetTable->getParquetFilePath(), columnSkips, transaction->getClientContext());
50
+ this->numCommittedNodeGroups = tempReader->getNumRowsGroups();
51
+ } catch (const std::exception& e) {
52
+ this->numCommittedNodeGroups = 1;
53
+ }
54
+ } else {
55
+ this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
56
+ }
39
57
  if (transaction->isWriteTransaction()) {
40
58
  if (const auto localTable =
41
59
  transaction->getLocalStorage()->getLocalTable(this->table->getTableID())) {
@@ -46,21 +64,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
46
64
  progressSharedState.numGroups += numCommittedNodeGroups;
47
65
  }
48
66
 
49
- void ScanNodeTableSharedState::nextMorsel(NodeTableScanState& scanState,
67
+ void ScanNodeTableSharedState::nextMorsel(TableScanState& scanState,
50
68
  ScanNodeTableProgressSharedState& progressSharedState) {
51
69
  std::unique_lock lck{mtx};
70
+ // Cast to NodeTableScanState since we know this is for node tables
71
+ auto& nodeScanState = scanState.cast<NodeTableScanState>();
52
72
  if (currentCommittedGroupIdx < numCommittedNodeGroups) {
53
- scanState.nodeGroupIdx = currentCommittedGroupIdx++;
73
+ nodeScanState.nodeGroupIdx = currentCommittedGroupIdx++;
54
74
  progressSharedState.numGroupsScanned++;
55
- scanState.source = TableScanSource::COMMITTED;
75
+ nodeScanState.source = TableScanSource::COMMITTED;
56
76
  return;
57
77
  }
58
78
  if (currentUnCommittedGroupIdx < numUnCommittedNodeGroups) {
59
- scanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
60
- scanState.source = TableScanSource::UNCOMMITTED;
79
+ nodeScanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
80
+ nodeScanState.source = TableScanSource::UNCOMMITTED;
61
81
  return;
62
82
  }
63
- scanState.source = TableScanSource::NONE;
83
+ nodeScanState.source = TableScanSource::NONE;
64
84
  }
65
85
 
66
86
  table_id_map_t<SemiMask*> ScanNodeTable::getSemiMasks() const {
@@ -82,7 +102,18 @@ void ScanNodeTableInfo::initScanState(TableScanState& scanState,
82
102
  void ScanNodeTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) {
83
103
  ScanTable::initLocalStateInternal(resultSet, context);
84
104
  auto nodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
85
- scanState = std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
105
+
106
+ // Check if the first table is a ParquetNodeTable and create appropriate scan state
107
+ auto* parquetTable = dynamic_cast<ParquetNodeTable*>(tableInfos[0].table);
108
+ if (parquetTable) {
109
+ scanState = std::make_unique<ParquetNodeTableScanState>(
110
+ *MemoryManager::Get(*context->clientContext), nodeIDVector, outVectors,
111
+ nodeIDVector->state);
112
+ } else {
113
+ scanState =
114
+ std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
115
+ }
116
+
86
117
  currentTableIdx = 0;
87
118
  initCurrentTable(context);
88
119
  }
@@ -91,6 +122,11 @@ void ScanNodeTable::initCurrentTable(ExecutionContext* context) {
91
122
  auto& currentInfo = tableInfos[currentTableIdx];
92
123
  currentInfo.initScanState(*scanState, outVectors, context->clientContext);
93
124
  scanState->semiMask = sharedStates[currentTableIdx]->getSemiMask();
125
+ // Call table->initScanState for ParquetNodeTable
126
+ if (dynamic_cast<ParquetNodeTable*>(tableInfos[currentTableIdx].table)) {
127
+ auto transaction = transaction::Transaction::Get(*context->clientContext);
128
+ tableInfos[currentTableIdx].table->initScanState(transaction, *scanState);
129
+ }
94
130
  }
95
131
 
96
132
  void ScanNodeTable::initGlobalStateInternal(ExecutionContext* context) {
@@ -2,7 +2,10 @@
2
2
 
3
3
  #include "binder/expression/expression_util.h"
4
4
  #include "processor/execution_context.h"
5
+ #include "storage/buffer_manager/memory_manager.h"
5
6
  #include "storage/local_storage/local_rel_table.h"
7
+ #include "storage/table/foreign_rel_table.h"
8
+ #include "storage/table/parquet_rel_table.h"
6
9
 
7
10
  using namespace lbug::common;
8
11
  using namespace lbug::storage;
@@ -66,8 +69,21 @@ void ScanRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext
66
69
  auto clientContext = context->clientContext;
67
70
  auto boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
68
71
  auto nbrNodeIDVector = outVectors[0];
69
- scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
70
- boundNodeIDVector, outVectors, nbrNodeIDVector->state);
72
+ // Check if this is a ParquetRelTable or ForeignRelTable and create appropriate scan state
73
+ auto* parquetTable = dynamic_cast<storage::ParquetRelTable*>(tableInfo.table);
74
+ auto* foreignTable = dynamic_cast<storage::ForeignRelTable*>(tableInfo.table);
75
+ if (parquetTable) {
76
+ scanState =
77
+ std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
78
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
79
+ } else if (foreignTable) {
80
+ scanState =
81
+ std::make_unique<storage::ForeignRelTableScanState>(*MemoryManager::Get(*clientContext),
82
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
83
+ } else {
84
+ scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
85
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
86
+ }
71
87
  tableInfo.initScanState(*scanState, outVectors, clientContext);
72
88
  }
73
89
 
@@ -12,7 +12,10 @@
12
12
  #include "storage/buffer_manager/buffer_manager.h"
13
13
  #include "storage/buffer_manager/memory_manager.h"
14
14
  #include "storage/checkpointer.h"
15
+ #include "storage/table/foreign_rel_table.h"
15
16
  #include "storage/table/node_table.h"
17
+ #include "storage/table/parquet_node_table.h"
18
+ #include "storage/table/parquet_rel_table.h"
16
19
  #include "storage/table/rel_table.h"
17
20
  #include "storage/wal/wal_replayer.h"
18
21
  #include "transaction/transaction.h"
@@ -77,15 +80,35 @@ void StorageManager::recover(main::ClientContext& clientContext, bool throwOnWal
77
80
  }
78
81
 
79
82
  void StorageManager::createNodeTable(NodeTableCatalogEntry* entry) { // Registers the storage object for one node table, keyed by the entry's table ID.
80
- tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
83
+ tableNameCache[entry->getTableID()] = entry->getName(); // record the table name under its ID before the table object is created
84
+ if (!entry->getStorage().empty()) { // a non-empty storage value on the catalog entry selects the parquet-backed implementation
85
+ // Parquet-backed node table (constructed with the same arguments as the regular one).
86
+ tables[entry->getTableID()] =
87
+ std::make_unique<ParquetNodeTable>(this, entry, &memoryManager);
88
+ } else {
89
+ // Regular node table: the only path that existed before this change.
90
+ tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
91
+ }
81
92
  }
82
93
 
83
94
  // TODO(Guodong): This API is added since storageManager doesn't provide an API to add a single
84
95
  // rel table. We may have to refactor the existing StorageManager::createTable(TableCatalogEntry*
85
96
  // entry).
86
97
  void StorageManager::addRelTable(RelGroupCatalogEntry* entry, const RelTableCatalogInfo& info) { // Registers one rel table under info.oid; the concrete table type is chosen from the rel group entry.
87
- tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
88
- info.nodePair.dstTableID, this, &memoryManager);
98
+ if (entry->getScanFunction().has_value()) { // checked first, so a scan function takes precedence over a storage value
99
+ // Foreign-backed rel table: receives a copy of the scan function plus the entry's scan bind data.
100
+ tables[info.oid] = std::make_unique<ForeignRelTable>(entry, info.nodePair.srcTableID,
101
+ info.nodePair.dstTableID, this, &memoryManager, *entry->getScanFunction(),
102
+ std::move(entry->getScanBindData().value())); // NOTE(review): if getScanBindData() returns a reference into the entry, this move empties the entry's bind data for any later addRelTable call on the same group - confirm the accessor's return type
103
+ } else if (!entry->getStorage().empty()) { // non-empty storage value selects the parquet-backed implementation
104
+ // Parquet-backed rel table (same constructor arguments as the regular one).
105
+ tables[info.oid] = std::make_unique<ParquetRelTable>(entry, info.nodePair.srcTableID,
106
+ info.nodePair.dstTableID, this, &memoryManager);
107
+ } else {
108
+ // Regular rel table: the only path that existed before this change.
109
+ tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
110
+ info.nodePair.dstTableID, this, &memoryManager);
111
+ }
89
112
  }
90
113
 
91
114
  void StorageManager::createRelTableGroup(RelGroupCatalogEntry* entry) {
@@ -257,7 +280,14 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
257
280
  KU_ASSERT(!tables.contains(tableID));
258
281
  auto tableEntry = catalog->getTableCatalogEntry(&DUMMY_TRANSACTION, tableID)
259
282
  ->ptrCast<NodeTableCatalogEntry>();
260
- tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
283
+ tableNameCache[tableID] = tableEntry->getName();
284
+ if (!tableEntry->getStorage().empty()) {
285
+ // Create parquet-backed node table
286
+ tables[tableID] = std::make_unique<ParquetNodeTable>(this, tableEntry, &memoryManager);
287
+ } else {
288
+ // Create regular node table
289
+ tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
290
+ }
261
291
  tables[tableID]->deserialize(context, this, deSer);
262
292
  }
263
293
  deSer.validateDebuggingInfo(key, "num_rel_groups");
@@ -279,8 +309,15 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
279
309
  for (auto k = 0u; k < numInnerRelTables; k++) {
280
310
  RelTableCatalogInfo info = RelTableCatalogInfo::deserialize(deSer);
281
311
  KU_ASSERT(!tables.contains(info.oid));
282
- tables[info.oid] = std::make_unique<RelTable>(relGroupEntry, info.nodePair.srcTableID,
283
- info.nodePair.dstTableID, this, &memoryManager);
312
+ if (!relGroupEntry->getStorage().empty()) {
313
+ // Create parquet-backed rel table
314
+ tables[info.oid] = std::make_unique<ParquetRelTable>(relGroupEntry,
315
+ info.nodePair.srcTableID, info.nodePair.dstTableID, this, &memoryManager);
316
+ } else {
317
+ // Create regular rel table
318
+ tables[info.oid] = std::make_unique<RelTable>(relGroupEntry,
319
+ info.nodePair.srcTableID, info.nodePair.dstTableID, this, &memoryManager);
320
+ }
284
321
  tables.at(info.oid)->deserialize(context, this, deSer);
285
322
  }
286
323
  }
@@ -13,6 +13,7 @@ add_library(lbug_storage_store
13
13
  compression_flush_buffer.cpp
14
14
  dictionary_chunk.cpp
15
15
  dictionary_column.cpp
16
+ foreign_rel_table.cpp
16
17
  in_mem_chunked_node_group_collection.cpp
17
18
  in_memory_exception_chunk.cpp
18
19
  lazy_segment_scanner.cpp
@@ -22,6 +23,8 @@ add_library(lbug_storage_store
22
23
  node_group_collection.cpp
23
24
  node_table.cpp
24
25
  null_column.cpp
26
+ parquet_node_table.cpp
27
+ parquet_rel_table.cpp
25
28
  rel_table.cpp
26
29
  rel_table_data.cpp
27
30
  string_chunk_data.cpp
@@ -0,0 +1,63 @@
1
+ #include "storage/table/foreign_rel_table.h"
2
+
3
+ #include "function/table/table_function.h"
4
+ #include "processor/operator/scan/scan_rel_table.h"
5
+ #include "storage/storage_manager.h"
6
+ #include "transaction/transaction.h"
7
+
8
+ namespace lbug {
9
+ namespace storage {
10
+
11
+ ForeignRelTableScanState::ForeignRelTableScanState(MemoryManager& mm, // Builds the base rel scan state, then mirrors its output vectors into dataChunk.
12
+ common::ValueVector* nodeIDVector, std::vector<common::ValueVector*> outputVectors,
13
+ std::shared_ptr<common::DataChunkState> outChunkState)
14
+ : RelTableScanState{mm, nodeIDVector, std::move(outputVectors), std::move(outChunkState)} { // both by-value parameters are moved into the base here, so the body reads the members via this->
15
+ dataChunk.valueVectors.resize(this->outputVectors.size()); // one slot per output vector
16
+ for (size_t i = 0; i < this->outputVectors.size(); ++i) {
17
+ dataChunk.valueVectors[i] = std::shared_ptr<common::ValueVector>(this->outputVectors[i], // wrap the raw pointer without taking ownership
18
+ [](common::ValueVector*) {}); // no-op deleter: this shared_ptr never frees the vector, so the pointee must be kept alive by its real owner
19
+ }
20
+ dataChunk.state = this->outState; // dataChunk uses the same state object as this->outState
21
+ }
22
+
23
+ ForeignRelTable::ForeignRelTable(catalog::RelGroupCatalogEntry* relGroupEntry, // Constructs the RelTable base and stores the scan function and bind data used by this table's scans.
24
+ common::table_id_t fromTableID, common::table_id_t toTableID,
25
+ const StorageManager* storageManager, MemoryManager* memoryManager,
26
+ function::TableFunction scanFunction, std::shared_ptr<function::TableFuncBindData> scanBindData) // both taken by value so they can be moved into the members below
27
+ : RelTable{relGroupEntry, fromTableID, toTableID, storageManager, memoryManager}, // first five arguments are forwarded unchanged to the base
28
+ scanFunction{std::move(scanFunction)}, scanBindData{std::move(scanBindData)} {} // inside the braces the names refer to the parameters, which are moved into the same-named members
29
+
30
+ void ForeignRelTable::initScanState([[maybe_unused]] transaction::Transaction* transaction, // Creates fresh shared and local scan-function state on the given scan state; transaction is unused.
30
+ TableScanState& scanState, [[maybe_unused]] bool resetCachedBoundNodeSelVec) const {
31
+ // Foreign tables skip the base-class node group initialization, so the base call
32
+ // below is intentionally left disabled:
33
+ // RelTable::initScanState(transaction, scanState, resetCachedBoundNodeSelVec);
34
+ auto& foreignRelScanState = static_cast<ForeignRelTableScanState&>(scanState); // NOTE(review): unchecked downcast - scanState must really be a ForeignRelTableScanState
35
+ function::TableFuncInitSharedStateInput sharedInput{scanBindData.get(), nullptr /* context */}; // NOTE(review): context is null - confirm the scan function's init callback never dereferences it
36
+ foreignRelScanState.sharedState = scanFunction.initSharedStateFunc(sharedInput); // replaces any shared state left from a previous call
37
+ function::TableFuncInitLocalStateInput localInput{*foreignRelScanState.sharedState, // local state is derived from the shared state created just above
38
+ *scanBindData, nullptr /* clientContext */}; // NOTE(review): client context is null here as well
39
+ foreignRelScanState.localState = scanFunction.initLocalStateFunc(localInput); // replaces any local state left from a previous call
40
+ }
41
+
42
+ bool ForeignRelTable::scanInternal([[maybe_unused]] transaction::Transaction* transaction, // Runs the scan function once; returns true when it reports at least one tuple. transaction is unused.
43
+ TableScanState& scanState) {
44
+ auto& foreignRelScanState = static_cast<ForeignRelTableScanState&>(scanState); // NOTE(review): unchecked downcast - scanState must really be a ForeignRelTableScanState
45
+ function::TableFuncInput input{scanBindData.get(), foreignRelScanState.localState.get(), // uses the states stored on the scan state; presumably set by initScanState - confirm it always runs first
46
+ foreignRelScanState.sharedState.get(), nullptr /* clientContext */}; // NOTE(review): client context is null - confirm the scan function tolerates it
47
+ common::DataChunk dc; // temporary chunk that shares the scan state's vectors and state
48
+ dc.valueVectors = foreignRelScanState.dataChunk.valueVectors; // copies the vector of shared_ptrs, not the underlying value vectors
49
+ dc.state = foreignRelScanState.dataChunk.state;
50
+ function::TableFuncOutput output{std::move(dc)};
51
+ auto numTuples = scanFunction.tableFunc(input, output); // NOTE(review): nothing in this function reads the bound node IDs - presumably filtering by source node happens elsewhere; confirm
52
+ return numTuples > 0; // false once the scan function reports no tuples
53
+ }
54
+
55
+ common::row_idx_t ForeignRelTable::getNumTotalRows( // Placeholder row count for foreign rel tables: always 0.
56
+ [[maybe_unused]] const transaction::Transaction* transaction) {
57
+ // Placeholder: no row-count query is issued to the foreign source yet.
58
+ // NOTE(review): callers relying on this count will always see 0 rows - confirm that is acceptable.
59
+ return 0;
60
+ }
61
+
62
+ } // namespace storage
63
+ } // namespace lbug