lbug 0.12.3-dev.2 → 0.12.3-dev.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104):
  1. package/README.md +2 -6
  2. package/lbug-source/.github/workflows/ci-workflow.yml +9 -2
  3. package/lbug-source/CMakeLists.txt +15 -6
  4. package/lbug-source/Makefile +1 -2
  5. package/lbug-source/README.md +2 -6
  6. package/lbug-source/benchmark/serializer.py +24 -3
  7. package/lbug-source/dataset/demo-db/csv/copy.cypher +4 -4
  8. package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
  9. package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
  10. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
  11. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
  12. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
  13. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
  14. package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
  15. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
  16. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
  17. package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
  18. package/lbug-source/dataset/demo-db/parquet/copy.cypher +4 -4
  19. package/lbug-source/extension/httpfs/test/test_files/http.test +1 -0
  20. package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
  21. package/lbug-source/scripts/antlr4/hash.md5 +1 -1
  22. package/lbug-source/scripts/generate_binary_demo.sh +1 -1
  23. package/lbug-source/src/antlr4/Cypher.g4 +1 -1
  24. package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
  25. package/lbug-source/src/catalog/catalog.cpp +5 -4
  26. package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
  27. package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
  28. package/lbug-source/src/function/function_collection.cpp +2 -1
  29. package/lbug-source/src/function/table/CMakeLists.txt +1 -0
  30. package/lbug-source/src/function/table/disk_size_info.cpp +322 -0
  31. package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
  32. package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
  33. package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
  34. package/lbug-source/src/include/common/constants.h +1 -0
  35. package/lbug-source/src/include/function/table/simple_table_function.h +6 -0
  36. package/lbug-source/src/include/optimizer/count_rel_table_optimizer.h +49 -0
  37. package/lbug-source/src/include/optimizer/logical_operator_visitor.h +6 -0
  38. package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
  39. package/lbug-source/src/include/planner/operator/logical_operator.h +1 -0
  40. package/lbug-source/src/include/planner/operator/scan/logical_count_rel_table.h +84 -0
  41. package/lbug-source/src/include/processor/operator/physical_operator.h +1 -0
  42. package/lbug-source/src/include/processor/operator/scan/count_rel_table.h +62 -0
  43. package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
  44. package/lbug-source/src/include/processor/plan_mapper.h +2 -0
  45. package/lbug-source/src/include/storage/storage_manager.h +1 -0
  46. package/lbug-source/src/include/storage/storage_version_info.h +1 -7
  47. package/lbug-source/src/include/storage/table/node_table.h +6 -1
  48. package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
  49. package/lbug-source/src/include/storage/table/parquet_rel_table.h +91 -0
  50. package/lbug-source/src/include/storage/table/rel_table.h +2 -2
  51. package/lbug-source/src/include/transaction/transaction.h +2 -0
  52. package/lbug-source/src/main/query_result/materialized_query_result.cpp +2 -2
  53. package/lbug-source/src/optimizer/CMakeLists.txt +1 -0
  54. package/lbug-source/src/optimizer/count_rel_table_optimizer.cpp +217 -0
  55. package/lbug-source/src/optimizer/logical_operator_visitor.cpp +6 -0
  56. package/lbug-source/src/optimizer/optimizer.cpp +6 -0
  57. package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
  58. package/lbug-source/src/planner/operator/logical_operator.cpp +2 -0
  59. package/lbug-source/src/planner/operator/scan/CMakeLists.txt +1 -0
  60. package/lbug-source/src/planner/operator/scan/logical_count_rel_table.cpp +24 -0
  61. package/lbug-source/src/processor/map/CMakeLists.txt +1 -0
  62. package/lbug-source/src/processor/map/map_count_rel_table.cpp +55 -0
  63. package/lbug-source/src/processor/map/plan_mapper.cpp +3 -0
  64. package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
  65. package/lbug-source/src/processor/operator/physical_operator.cpp +2 -0
  66. package/lbug-source/src/processor/operator/scan/CMakeLists.txt +1 -0
  67. package/lbug-source/src/processor/operator/scan/count_rel_table.cpp +137 -0
  68. package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
  69. package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
  70. package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
  71. package/lbug-source/src/storage/storage_manager.cpp +37 -6
  72. package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
  73. package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
  74. package/lbug-source/src/storage/table/parquet_rel_table.cpp +388 -0
  75. package/lbug-source/test/api/api_test.cpp +18 -0
  76. package/lbug-source/test/common/string_format.cpp +9 -1
  77. package/lbug-source/test/copy/copy_test.cpp +4 -4
  78. package/lbug-source/test/graph_test/CMakeLists.txt +1 -1
  79. package/lbug-source/test/include/test_runner/test_group.h +11 -1
  80. package/lbug-source/test/optimizer/optimizer_test.cpp +46 -0
  81. package/lbug-source/test/runner/e2e_test.cpp +7 -1
  82. package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +77 -0
  83. package/lbug-source/test/test_helper/CMakeLists.txt +1 -1
  84. package/lbug-source/test/test_helper/test_helper.cpp +33 -1
  85. package/lbug-source/test/test_runner/CMakeLists.txt +1 -1
  86. package/lbug-source/test/test_runner/insert_by_row.cpp +6 -8
  87. package/lbug-source/test/test_runner/multi_copy_split.cpp +2 -4
  88. package/lbug-source/test/test_runner/test_parser.cpp +3 -0
  89. package/lbug-source/test/transaction/checkpoint_test.cpp +1 -1
  90. package/lbug-source/test/transaction/transaction_test.cpp +19 -15
  91. package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
  92. package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
  93. package/lbug-source/tools/benchmark/count_rel_table.benchmark +5 -0
  94. package/lbug-source/tools/shell/embedded_shell.cpp +78 -3
  95. package/lbug-source/tools/shell/include/embedded_shell.h +2 -0
  96. package/lbug-source/tools/shell/linenoise.cpp +3 -3
  97. package/lbug-source/tools/shell/test/test_helper.py +1 -1
  98. package/lbug-source/tools/shell/test/test_shell_basics.py +12 -0
  99. package/lbug-source/tools/shell/test/test_shell_commands.py +19 -0
  100. package/package.json +1 -1
  101. package/prebuilt/lbugjs-darwin-arm64.node +0 -0
  102. package/prebuilt/lbugjs-linux-arm64.node +0 -0
  103. package/prebuilt/lbugjs-linux-x64.node +0 -0
  104. package/prebuilt/lbugjs-win32-x64.node +0 -0
@@ -0,0 +1,388 @@
1
+ #include "storage/table/parquet_rel_table.h"
2
+
3
+ #include <thread>
4
+
5
+ #include "catalog/catalog_entry/rel_group_catalog_entry.h"
6
+ #include "common/data_chunk/sel_vector.h"
7
+ #include "common/exception/runtime.h"
8
+ #include "common/file_system/virtual_file_system.h"
9
+ #include "main/client_context.h"
10
+ #include "processor/operator/persistent/reader/parquet/parquet_reader.h"
11
+ #include "storage/storage_manager.h"
12
+ #include "transaction/transaction.h"
13
+
14
+ using namespace lbug::catalog;
15
+ using namespace lbug::common;
16
+ using namespace lbug::processor;
17
+ using namespace lbug::transaction;
18
+
19
+ namespace lbug {
20
+ namespace storage {
21
+
22
+ void ParquetRelTableScanState::setToTable(const Transaction* transaction, Table* table_,
23
+ std::vector<column_id_t> columnIDs_, std::vector<ColumnPredicateSet> columnPredicateSets_,
24
+ RelDataDirection direction_) {
25
+ // Call base class implementation but skip local table setup
26
+ TableScanState::setToTable(transaction, table_, std::move(columnIDs_),
27
+ std::move(columnPredicateSets_));
28
+ columns.resize(columnIDs.size());
29
+ direction = direction_;
30
+ for (size_t i = 0; i < columnIDs.size(); ++i) {
31
+ auto columnID = columnIDs[i];
32
+ if (columnID == INVALID_COLUMN_ID || columnID == ROW_IDX_COLUMN_ID) {
33
+ columns[i] = nullptr;
34
+ } else {
35
+ columns[i] = table->cast<RelTable>().getColumn(columnID, direction);
36
+ }
37
+ }
38
+ csrOffsetColumn = table->cast<RelTable>().getCSROffsetColumn(direction);
39
+ csrLengthColumn = table->cast<RelTable>().getCSRLengthColumn(direction);
40
+ nodeGroupIdx = INVALID_NODE_GROUP_IDX;
41
+ // ParquetRelTable does not support local storage, so we skip the local table initialization
42
+ }
43
+
44
+ ParquetRelTable::ParquetRelTable(RelGroupCatalogEntry* relGroupEntry, table_id_t fromTableID,
45
+ table_id_t toTableID, const StorageManager* storageManager, MemoryManager* memoryManager)
46
+ : RelTable{relGroupEntry, fromTableID, toTableID, storageManager, memoryManager},
47
+ relGroupEntry{relGroupEntry} {
48
+ std::string storage = relGroupEntry->getStorage();
49
+ if (storage.empty()) {
50
+ throw RuntimeException("Parquet file path is empty for parquet-backed rel table");
51
+ }
52
+
53
+ // Get the relationship name for multi-table directory support
54
+ std::string relName = relGroupEntry->getName();
55
+
56
+ // New prefix format with relationship name: "prefix" which expands to:
57
+ // prefix_indices_{relName}.parquet, prefix_indptr_{relName}.parquet,
58
+ // prefix_metadata_{relName}.parquet
59
+ std::string prefix = storage;
60
+ indicesFilePath = prefix + "_indices_" + relName + ".parquet";
61
+ indptrFilePath = prefix + "_indptr_" + relName + ".parquet";
62
+ }
63
+
64
+ void ParquetRelTable::initScanState(Transaction* transaction, TableScanState& scanState,
65
+ bool resetCachedBoundNodeSelVec) const {
66
+ // For parquet tables, we create our own scan state
67
+ auto& relScanState = scanState.cast<RelTableScanState>();
68
+ relScanState.source = TableScanSource::COMMITTED;
69
+ relScanState.nodeGroup = nullptr;
70
+ relScanState.nodeGroupIdx = INVALID_NODE_GROUP_IDX;
71
+
72
+ // Initialize ParquetReaders for this scan state (per-thread)
73
+ auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
74
+
75
+ // Initialize readers if not already done for this scan state
76
+ if (!parquetRelScanState.indicesReader) {
77
+ std::vector<bool> columnSkips; // Read all columns
78
+ auto context = transaction->getClientContext();
79
+ parquetRelScanState.indicesReader =
80
+ std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
81
+ }
82
+ if (!indptrFilePath.empty() && !parquetRelScanState.indptrReader) {
83
+ std::vector<bool> columnSkips; // Read all columns
84
+ auto context = transaction->getClientContext();
85
+ parquetRelScanState.indptrReader =
86
+ std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
87
+ }
88
+
89
+ // Load shared indptr data - thread-safe to read
90
+ if (!indptrFilePath.empty()) {
91
+ loadIndptrData(transaction);
92
+ }
93
+
94
+ // For morsel-driven parallelism, each scan state maintains its own bound node processing state
95
+ // No shared state needed between threads
96
+ if (resetCachedBoundNodeSelVec) {
97
+ // Copy the cached bound node selection vector from the scan state
98
+ if (relScanState.nodeIDVector->state->getSelVector().isUnfiltered()) {
99
+ relScanState.cachedBoundNodeSelVector.setToUnfiltered();
100
+ } else {
101
+ relScanState.cachedBoundNodeSelVector.setToFiltered();
102
+ memcpy(relScanState.cachedBoundNodeSelVector.getMutableBuffer().data(),
103
+ relScanState.nodeIDVector->state->getSelVector().getMutableBuffer().data(),
104
+ relScanState.nodeIDVector->state->getSelVector().getSelSize() * sizeof(sel_t));
105
+ }
106
+ relScanState.cachedBoundNodeSelVector.setSelSize(
107
+ relScanState.nodeIDVector->state->getSelVector().getSelSize());
108
+ }
109
+
110
+ // Initialize row group ranges for morsel-driven parallelism
111
+ // For now, assign all row groups to this scan state (will be partitioned by the scan operator)
112
+ parquetRelScanState.startRowGroup = 0;
113
+ parquetRelScanState.endRowGroup = parquetRelScanState.indicesReader ?
114
+ parquetRelScanState.indicesReader->getNumRowsGroups() :
115
+ 0;
116
+ parquetRelScanState.currentRowGroup = parquetRelScanState.startRowGroup;
117
+ parquetRelScanState.nextRowToProcess = 0;
118
+ }
119
+
120
+ void ParquetRelTable::initializeParquetReaders(Transaction* transaction) const {
121
+ if (!indicesReader) {
122
+ std::lock_guard lock(parquetReaderMutex);
123
+ if (!indicesReader) {
124
+ std::vector<bool> columnSkips; // Read all columns
125
+ auto context = transaction->getClientContext();
126
+ indicesReader = std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
127
+ }
128
+ }
129
+ }
130
+
131
+ void ParquetRelTable::initializeIndptrReader(Transaction* transaction) const {
132
+ if (!indptrFilePath.empty() && !indptrReader) {
133
+ std::lock_guard lock(parquetReaderMutex);
134
+ if (!indptrReader) {
135
+ std::vector<bool> columnSkips; // Read all columns
136
+ auto context = transaction->getClientContext();
137
+ indptrReader = std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
138
+ }
139
+ }
140
+ }
141
+
142
+ void ParquetRelTable::loadIndptrData(Transaction* transaction) const {
143
+ if (indptrData.empty() && !indptrFilePath.empty()) {
144
+ std::lock_guard lock(indptrDataMutex);
145
+ if (indptrData.empty()) {
146
+ initializeIndptrReader(transaction);
147
+ if (!indptrReader)
148
+ return;
149
+
150
+ // Initialize scan to populate column types
151
+ auto context = transaction->getClientContext();
152
+ auto vfs = VirtualFileSystem::GetUnsafe(*context);
153
+ std::vector<uint64_t> groupsToRead;
154
+ for (uint64_t i = 0; i < indptrReader->getNumRowsGroups(); ++i) {
155
+ groupsToRead.push_back(i);
156
+ }
157
+
158
+ ParquetReaderScanState scanState;
159
+ indptrReader->initializeScan(scanState, groupsToRead, vfs);
160
+
161
+ // Check if the indptr file has any columns after scan initialization
162
+ auto numColumns = indptrReader->getNumColumns();
163
+ if (numColumns == 0) {
164
+ throw RuntimeException("Indptr parquet file has no columns");
165
+ }
166
+
167
+ // Validate column type for indptr
168
+ const auto& indptrType = indptrReader->getColumnType(0);
169
+ if (!LogicalTypeUtils::isIntegral(indptrType.getLogicalTypeID())) {
170
+ throw RuntimeException(
171
+ "Indptr parquet file column must be integer type (column 0)");
172
+ }
173
+
174
+ // Read the indptr column
175
+ DataChunk dataChunk(1);
176
+
177
+ // Now get the column type after scan is initialized
178
+ const auto& columnTypeRef = indptrReader->getColumnType(0);
179
+ auto columnType = columnTypeRef.copy();
180
+ auto vector = std::make_shared<ValueVector>(std::move(columnType));
181
+ dataChunk.insert(0, vector);
182
+
183
+ // Read all indptr values
184
+ while (indptrReader->scanInternal(scanState, dataChunk)) {
185
+ auto selSize = dataChunk.state->getSelVector().getSelSize();
186
+ for (size_t i = 0; i < selSize; ++i) {
187
+ auto value = dataChunk.getValueVector(0).getValue<common::offset_t>(i);
188
+ indptrData.push_back(value);
189
+ }
190
+ }
191
+ }
192
+ }
193
+ }
194
+
195
+ bool ParquetRelTable::scanInternal(Transaction* transaction, TableScanState& scanState) {
196
+ auto& relScanState = scanState.cast<RelTableScanState>();
197
+
198
+ // Get the ParquetRelTableScanState
199
+ auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
200
+
201
+ // Load shared indptr data - thread-safe to read
202
+ if (!indptrFilePath.empty()) {
203
+ loadIndptrData(transaction);
204
+ }
205
+
206
+ // True morsel-driven parallelism: each scan state processes its assigned row groups
207
+ // Process all row groups assigned to this scan state, collecting relationships for bound nodes
208
+ return scanInternalByRowGroups(transaction, parquetRelScanState);
209
+ }
210
+
211
+ bool ParquetRelTable::scanInternalByRowGroups(Transaction* transaction,
212
+ ParquetRelTableScanState& parquetRelScanState) {
213
+ // True morsel-driven parallelism: process assigned row groups and collect relationships for
214
+ // bound nodes
215
+
216
+ // Check if we have any row groups left to process
217
+ if (parquetRelScanState.currentRowGroup >= parquetRelScanState.endRowGroup) {
218
+ // No more row groups to process
219
+ auto newSelVector = std::make_shared<SelectionVector>(0);
220
+ parquetRelScanState.outState->setSelVector(newSelVector);
221
+ return false;
222
+ }
223
+
224
+ // Process the current row group
225
+ std::vector<uint64_t> rowGroupsToProcess = {parquetRelScanState.currentRowGroup};
226
+
227
+ // Create a set of bound node IDs for fast lookup
228
+ std::unordered_set<common::offset_t> boundNodeOffsets;
229
+ for (size_t i = 0; i < parquetRelScanState.cachedBoundNodeSelVector.getSelSize(); ++i) {
230
+ common::sel_t boundNodeIdx = parquetRelScanState.cachedBoundNodeSelVector[i];
231
+ const auto boundNodeID = parquetRelScanState.nodeIDVector->getValue<nodeID_t>(boundNodeIdx);
232
+ boundNodeOffsets.insert(boundNodeID.offset);
233
+ }
234
+
235
+ // Scan the current row group and collect relationships for bound nodes
236
+ bool hasData = scanRowGroupForBoundNodes(transaction, parquetRelScanState, rowGroupsToProcess,
237
+ boundNodeOffsets);
238
+
239
+ // Move to next row group for next call
240
+ parquetRelScanState.currentRowGroup++;
241
+
242
+ return hasData;
243
+ }
244
+
245
+ common::offset_t ParquetRelTable::findSourceNodeForRow(common::offset_t globalRowIdx) const {
246
+ // Binary search in indptrData to find which source node this row belongs to
247
+ // indptrData[i] gives the starting row index for source node i
248
+ // indptrData[i+1] gives the ending row index for source node i
249
+
250
+ if (indptrData.empty()) {
251
+ return common::INVALID_OFFSET;
252
+ }
253
+
254
+ // Binary search to find the source node
255
+ size_t left = 0;
256
+ size_t right = indptrData.size() - 2; // -2 because we compare with i+1
257
+
258
+ while (left <= right) {
259
+ size_t mid = left + (right - left) / 2;
260
+ if (globalRowIdx >= indptrData[mid] && globalRowIdx < indptrData[mid + 1]) {
261
+ return mid; // Found the source node
262
+ } else if (globalRowIdx < indptrData[mid]) {
263
+ if (mid == 0)
264
+ break;
265
+ right = mid - 1;
266
+ } else {
267
+ left = mid + 1;
268
+ }
269
+ }
270
+
271
+ return common::INVALID_OFFSET; // Row not found in any range
272
+ }
273
+
274
+ bool ParquetRelTable::scanRowGroupForBoundNodes(Transaction* transaction,
275
+ ParquetRelTableScanState& parquetRelScanState, const std::vector<uint64_t>& rowGroupsToProcess,
276
+ const std::unordered_set<common::offset_t>& boundNodeOffsets) {
277
+
278
+ // Initialize readers if needed
279
+ initializeParquetReaders(transaction);
280
+
281
+ if (!parquetRelScanState.indicesReader) {
282
+ return false;
283
+ }
284
+
285
+ // Initialize scan state for the assigned row groups
286
+ auto context = transaction->getClientContext();
287
+ auto vfs = VirtualFileSystem::GetUnsafe(*context);
288
+ parquetRelScanState.indicesReader->initializeScan(*parquetRelScanState.parquetScanState,
289
+ rowGroupsToProcess, vfs);
290
+
291
+ // Create DataChunk matching the indices parquet file schema
292
+ auto numIndicesColumns = parquetRelScanState.indicesReader->getNumColumns();
293
+ DataChunk indicesChunk(numIndicesColumns);
294
+
295
+ // Insert value vectors for all columns in the parquet file
296
+ for (uint32_t colIdx = 0; colIdx < numIndicesColumns; ++colIdx) {
297
+ const auto& columnTypeRef = parquetRelScanState.indicesReader->getColumnType(colIdx);
298
+ auto columnType = columnTypeRef.copy();
299
+ auto vector = std::make_shared<ValueVector>(std::move(columnType));
300
+ indicesChunk.insert(colIdx, vector);
301
+ }
302
+
303
+ // Scan the row groups and collect relationships for bound nodes
304
+ uint64_t totalRowsCollected = 0;
305
+ const uint64_t maxRowsPerCall = DEFAULT_VECTOR_CAPACITY;
306
+ uint64_t currentGlobalRowIdx = 0;
307
+
308
+ // Calculate the starting global row index for the first row group
309
+ if (!rowGroupsToProcess.empty()) {
310
+ auto metadata = parquetRelScanState.indicesReader->getMetadata();
311
+ for (uint64_t rgIdx = 0; rgIdx < rowGroupsToProcess[0]; ++rgIdx) {
312
+ currentGlobalRowIdx += metadata->row_groups[rgIdx].num_rows;
313
+ }
314
+ }
315
+
316
+ while (totalRowsCollected < maxRowsPerCall &&
317
+ parquetRelScanState.indicesReader->scanInternal(*parquetRelScanState.parquetScanState,
318
+ indicesChunk)) {
319
+
320
+ auto selSize = indicesChunk.state->getSelVector().getSelSize();
321
+
322
+ for (size_t i = 0; i < selSize && totalRowsCollected < maxRowsPerCall;
323
+ ++i, ++currentGlobalRowIdx) {
324
+ // Find which source node this row belongs to
325
+ common::offset_t sourceNodeOffset = findSourceNodeForRow(currentGlobalRowIdx);
326
+ if (sourceNodeOffset == common::INVALID_OFFSET) {
327
+ continue; // Invalid row
328
+ }
329
+
330
+ // Check if this source node is in our bound nodes
331
+ if (boundNodeOffsets.find(sourceNodeOffset) == boundNodeOffsets.end()) {
332
+ continue; // Not a bound node, skip
333
+ }
334
+
335
+ // This row belongs to a bound node, collect the relationship
336
+
337
+ // Column 0 in indices file is the target/destination node ID
338
+ // Read as offset_t and convert to INTERNAL_ID
339
+ auto dstOffset = indicesChunk.getValueVector(0).getValue<common::offset_t>(i);
340
+ auto dstNodeID = internalID_t(dstOffset, getToNodeTableID());
341
+
342
+ // outputVectors[0] is the neighbor node ID (destination), if requested
343
+ if (!parquetRelScanState.outputVectors.empty()) {
344
+ parquetRelScanState.outputVectors[0]->setValue(totalRowsCollected, dstNodeID);
345
+ }
346
+
347
+ // If there are additional columns (e.g., weight), copy them to subsequent output
348
+ // vectors These are property columns and should have matching types
349
+ for (uint32_t colIdx = 1;
350
+ colIdx < numIndicesColumns && colIdx < parquetRelScanState.outputVectors.size();
351
+ ++colIdx) {
352
+ parquetRelScanState.outputVectors[colIdx]->copyFromVectorData(totalRowsCollected,
353
+ &indicesChunk.getValueVector(colIdx), i);
354
+ }
355
+
356
+ totalRowsCollected++;
357
+ }
358
+ }
359
+
360
+ // Set up the output state
361
+ if (totalRowsCollected > 0) {
362
+ auto selVector = std::make_shared<SelectionVector>(totalRowsCollected);
363
+ selVector->setToFiltered(totalRowsCollected);
364
+ for (uint64_t i = 0; i < totalRowsCollected; ++i) {
365
+ (*selVector)[i] = i;
366
+ }
367
+ parquetRelScanState.outState->setSelVector(selVector);
368
+
369
+ return true;
370
+ } else {
371
+ // No data found
372
+ auto selVector = std::make_shared<SelectionVector>(0);
373
+ parquetRelScanState.outState->setSelVector(selVector);
374
+ return false;
375
+ }
376
+ }
377
+
378
+ row_idx_t ParquetRelTable::getNumTotalRows(const transaction::Transaction* transaction) {
379
+ initializeParquetReaders(const_cast<transaction::Transaction*>(transaction));
380
+ if (!indicesReader) {
381
+ return 0;
382
+ }
383
+ auto metadata = indicesReader->getMetadata();
384
+ return metadata ? metadata->num_rows : 0;
385
+ }
386
+
387
+ } // namespace storage
388
+ } // namespace lbug
@@ -488,6 +488,24 @@ TEST_F(ApiTest, CloseDatabaseBeforeQueryResultWithMultipleStatements) {
488
488
  inMemoryDatabase.reset();
489
489
  }
490
490
 
491
+ TEST_F(ApiTest, ToStringDoesNotModifyIterator) {
492
+ auto result = conn->query("MATCH (p:person) RETURN p.ID ORDER BY p.ID");
493
+ ASSERT_TRUE(result->isSuccess());
494
+ ASSERT_TRUE(result->hasNext()); // Iterator should be at the beginning
495
+ auto str = result->toString(); // This should not modify the iterator
496
+ ASSERT_TRUE(result->hasNext()); // Iterator should still be at the beginning
497
+ // Now iterate through the results
498
+ std::vector<int64_t> ids;
499
+ while (result->hasNext()) {
500
+ auto tuple = result->getNext();
501
+ ids.push_back(tuple->getValue(0)->getValue<int64_t>());
502
+ }
503
+ ASSERT_EQ(ids.size(), 8u); // Assuming 8 persons in the test data
504
+ ASSERT_EQ(ids[0], 0);
505
+ ASSERT_EQ(ids[1], 2);
506
+ // etc., but just check the count for now
507
+ }
508
+
491
509
  TEST_F(ApiTest, EmptyDBFile) {
492
510
  if (inMemMode) {
493
511
  GTEST_SKIP();
@@ -10,6 +10,7 @@ TEST(StringFormat, Basic) {
10
10
  "Some formatted data: a and 423");
11
11
  }
12
12
 
13
+ #if !USE_STD_FORMAT
13
14
  TEST(StringFormat, Escape) {
14
15
  ASSERT_EQ(stringFormat("Escape this {{}} but not this {{ }}"),
15
16
  "Escape this {} but not this {{ }}");
@@ -29,6 +30,7 @@ TEST(StringFormat, TooManyArguments) {
29
30
  TEST(StringFormat, TooFewArguments) {
30
31
  ASSERT_THROW(stringFormat("Format with arguments {}"), InternalException);
31
32
  }
33
+ #endif
32
34
 
33
35
  TEST(StringFormat, Format8BitTypes) {
34
36
  enum TestEnum : uint8_t {
@@ -38,7 +40,9 @@ TEST(StringFormat, Format8BitTypes) {
38
40
  char literal_character = 'a';
39
41
  TestEnum enum_val = TestEnum::NO;
40
42
  int8_t signed_int8 = 4;
41
- ASSERT_EQ(stringFormat("{} {} {}", literal_character, enum_val, signed_int8), "a 1 4");
43
+ ASSERT_EQ(
44
+ stringFormat("{} {} {}", literal_character, static_cast<uint8_t>(enum_val), signed_int8),
45
+ "a 1 4");
42
46
  }
43
47
 
44
48
  TEST(StringFormat, FormatString) {
@@ -64,5 +68,9 @@ TEST(StringFormat, FormatIntegers) {
64
68
  TEST(StringFormat, FormatFloats) {
65
69
  float a = 2.3;
66
70
  double b = 5.4;
71
+ #if USE_STD_FORMAT
72
+ ASSERT_EQ(stringFormat("{} {}", a, b), "2.3 5.4");
73
+ #else
67
74
  ASSERT_EQ(stringFormat("{} {}", a, b), "2.300000 5.400000");
75
+ #endif
68
76
  }
@@ -280,7 +280,7 @@ TEST_F(CopyTest, NodeInsertBMExceptionDuringCommitRecovery) {
280
280
  .executeFunc =
281
281
  [](main::Connection* conn, int) {
282
282
  const auto queryString = common::stringFormat(
283
- "UNWIND RANGE(1,{}) AS i CREATE (a:account {ID:i})", numValues);
283
+ "UNWIND RANGE(1,{}) AS i CREATE (a:account {{ID:i}})", numValues);
284
284
  return conn->query(queryString);
285
285
  },
286
286
  .earlyExitOnFailureFunc = [](main::QueryResult*) { return false; },
@@ -300,7 +300,7 @@ TEST_F(CopyTest, RelInsertBMExceptionDuringCommitRecovery) {
300
300
  conn->query("CREATE NODE TABLE account(ID INT64, PRIMARY KEY(ID))");
301
301
  conn->query("CREATE REL TABLE follows(FROM account TO account);");
302
302
  const auto queryString = common::stringFormat(
303
- "UNWIND RANGE(1,{}) AS i CREATE (a:account {ID:i})", numNodes);
303
+ "UNWIND RANGE(1,{}) AS i CREATE (a:account {{ID:i}})", numNodes);
304
304
  ASSERT_TRUE(conn->query(queryString)->isSuccess());
305
305
  failureFrequency = 32;
306
306
  },
@@ -309,7 +309,7 @@ TEST_F(CopyTest, RelInsertBMExceptionDuringCommitRecovery) {
309
309
  return conn->query(common::stringFormat(
310
310
  "UNWIND RANGE(1,{}) AS i MATCH (a:account), (b:account) WHERE a.ID = i AND "
311
311
  "b.ID = i + 1 CREATE (a)-[f:follows]->(b)",
312
- numNodes));
312
+ numNodes - 1));
313
313
  },
314
314
  .earlyExitOnFailureFunc = [](main::QueryResult*) { return false; },
315
315
  .checkFunc =
@@ -407,7 +407,7 @@ TEST_F(CopyTest, NodeInsertBMExceptionDuringCheckpointRecovery) {
407
407
  .executeFunc =
408
408
  [](main::Connection* conn, int) {
409
409
  return conn->query(common::stringFormat(
410
- "UNWIND RANGE(1,{}) AS i CREATE (a:account {ID:i})", numValues));
410
+ "UNWIND RANGE(1,{}) AS i CREATE (a:account {{ID:i}})", numValues));
411
411
  },
412
412
  .earlyExitOnFailureFunc = [](main::QueryResult*) { return true; },
413
413
  .checkFunc =
@@ -8,7 +8,7 @@ target_include_directories(
8
8
  PUBLIC
9
9
  ../include/
10
10
  )
11
- target_link_libraries(graph_test PUBLIC GTEST_LIB lbug)
11
+ target_link_libraries(graph_test PUBLIC GTEST_LIB lbug_shared)
12
12
 
13
13
  add_library(
14
14
  api_graph_test
@@ -107,7 +107,17 @@ struct TestGroup {
107
107
  std::unordered_map<std::string, std::set<std::string>> testCasesConnNames;
108
108
  bool testFwdOnly;
109
109
 
110
- enum class DatasetType { CSV, PARQUET, NPY, CSV_TO_PARQUET, TURTLE, LBUG, JSON, CSV_TO_JSON };
110
+ enum class DatasetType {
111
+ CSV,
112
+ PARQUET,
113
+ NPY,
114
+ CSV_TO_PARQUET,
115
+ TURTLE,
116
+ LBUG,
117
+ JSON,
118
+ CSV_TO_JSON,
119
+ GRAPH_STD
120
+ };
111
121
  DatasetType datasetType;
112
122
 
113
123
  bool isValid() const { return !group.empty() && !dataset.empty(); }
@@ -1,5 +1,6 @@
1
1
  #include "graph_test/private_graph_test.h"
2
2
  #include "planner/operator/logical_plan_util.h"
3
+ #include "planner/operator/scan/logical_count_rel_table.h"
3
4
  #include "test_runner/test_runner.h"
4
5
 
5
6
  namespace lbug {
@@ -17,6 +18,19 @@ public:
17
18
  std::unique_ptr<planner::LogicalPlan> getRoot(const std::string& query) {
18
19
  return TestRunner::getLogicalPlan(query, *conn);
19
20
  }
21
+
22
+ // Helper to check if a specific operator type exists in the plan
23
+ static bool hasOperatorType(planner::LogicalOperator* op, planner::LogicalOperatorType type) {
24
+ if (op->getOperatorType() == type) {
25
+ return true;
26
+ }
27
+ for (auto i = 0u; i < op->getNumChildren(); ++i) {
28
+ if (hasOperatorType(op->getChild(i).get(), type)) {
29
+ return true;
30
+ }
31
+ }
32
+ return false;
33
+ }
20
34
  };
21
35
 
22
36
  TEST_F(OptimizerTest, JoinHint) {
@@ -211,5 +225,37 @@ TEST_F(OptimizerTest, SubqueryHint) {
211
225
  ASSERT_STREQ(getEncodedPlan(q6).c_str(), "Filter()HJ(a._ID){S(a)}{E(a)Filter()S(b)}");
212
226
  }
213
227
 
228
+ TEST_F(OptimizerTest, CountRelTableOptimizer) {
229
+ // Test that COUNT(*) over a single rel table is optimized to COUNT_REL_TABLE
230
+ auto q1 = "MATCH (a:person)-[e:knows]->(b:person) RETURN COUNT(*);";
231
+ auto plan1 = getRoot(q1);
232
+ ASSERT_TRUE(hasOperatorType(plan1->getLastOperator().get(),
233
+ planner::LogicalOperatorType::COUNT_REL_TABLE));
234
+ // Verify the query returns the correct result
235
+ auto result1 = conn->query(q1);
236
+ ASSERT_TRUE(result1->isSuccess());
237
+ ASSERT_EQ(result1->getNumTuples(), 1);
238
+ auto tuple1 = result1->getNext();
239
+ ASSERT_EQ(tuple1->getValue(0)->getValue<int64_t>(), 14);
240
+
241
+ // Test that COUNT(*) with GROUP BY is NOT optimized (has keys)
242
+ auto q2 = "MATCH (a:person)-[e:knows]->(b:person) RETURN a.fName, COUNT(*);";
243
+ auto plan2 = getRoot(q2);
244
+ ASSERT_FALSE(hasOperatorType(plan2->getLastOperator().get(),
245
+ planner::LogicalOperatorType::COUNT_REL_TABLE));
246
+
247
+ // Test that COUNT(*) with WHERE clause is NOT optimized (has filter)
248
+ auto q3 = "MATCH (a:person)-[e:knows]->(b:person) WHERE a.ID > 0 RETURN COUNT(*);";
249
+ auto plan3 = getRoot(q3);
250
+ ASSERT_FALSE(hasOperatorType(plan3->getLastOperator().get(),
251
+ planner::LogicalOperatorType::COUNT_REL_TABLE));
252
+
253
+ // Test that COUNT(DISTINCT ...) is NOT optimized
254
+ auto q4 = "MATCH (a:person)-[e:knows]->(b:person) RETURN COUNT(DISTINCT a);";
255
+ auto plan4 = getRoot(q4);
256
+ ASSERT_FALSE(hasOperatorType(plan4->getLastOperator().get(),
257
+ planner::LogicalOperatorType::COUNT_REL_TABLE));
258
+ }
259
+
214
260
  } // namespace testing
215
261
  } // namespace lbug
@@ -38,7 +38,13 @@ public:
38
38
  }
39
39
  createDB(checkpointWaitTimeout);
40
40
  createConns(connNames);
41
- if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
41
+ if (datasetType == TestGroup::DatasetType::GRAPH_STD) {
42
+ // For GRAPH_STD, only run schema.cypher (which contains WITH storage = ... clauses)
43
+ // No copy.cypher needed as data is in external parquet files
44
+ lbug::main::Connection* connection =
45
+ conn ? conn.get() : (connMap.begin()->second).get();
46
+ TestHelper::executeScript(dataset + "/" + TestHelper::SCHEMA_FILE_NAME, *connection);
47
+ } else if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
42
48
  initGraph();
43
49
  } else if (generateBinaryDemo && TestHelper::E2E_OVERRIDE_IMPORT_DIR.empty()) {
44
50
  initGraph(TestHelper::appendLbugRootPath("dataset/demo-db/parquet/"));
@@ -0,0 +1,77 @@
1
+ -DATASET GRAPH-STD demo-db/graph-std
2
+
3
+ --
4
+
5
+ -CASE DemoDBGraphStdTest
6
+
7
+ -LOG MatchUserLivesInCity
8
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN u.name, u.age, c.name;
9
+ ---- 4
10
+ Adam|30|Waterloo
11
+ Karissa|40|Waterloo
12
+ Zhang|50|Kitchener
13
+ Noura|25|Guelph
14
+
15
+ -LOG MatchSingleNodeLabel
16
+ -STATEMENT MATCH (a:user) RETURN a.name, a.age;
17
+ ---- 4
18
+ Adam|30
19
+ Karissa|40
20
+ Zhang|50
21
+ Noura|25
22
+
23
+ -LOG MatchCityNodes
24
+ -STATEMENT MATCH (c:city) RETURN c.name, c.population;
25
+ ---- 3
26
+ Waterloo|150000
27
+ Kitchener|200000
28
+ Guelph|75000
29
+
30
+ -LOG MatchFollowsRel
31
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) RETURN a.name, b.name, e.since;
32
+ ---- 4
33
+ Adam|Karissa|2020
34
+ Adam|Zhang|2020
35
+ Karissa|Zhang|2021
36
+ Zhang|Noura|2022
37
+
38
+ -LOG MatchLivesInWithCityPopulation
39
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN u.name, c.name, c.population ORDER BY c.population DESC;
40
+ ---- 4
41
+ Zhang|Kitchener|200000
42
+ Adam|Waterloo|150000
43
+ Karissa|Waterloo|150000
44
+ Noura|Guelph|75000
45
+
46
+ -LOG MatchLivesInFilterByCity
47
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) WHERE c.name = 'Waterloo' RETURN u.name, u.age;
48
+ ---- 2
49
+ Adam|30
50
+ Karissa|40
51
+
52
+ -LOG MatchLivesInFilterByCityPopulation
53
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) WHERE c.population > 100000 RETURN u.name, c.name ORDER BY u.name;
54
+ ---- 3
55
+ Adam|Waterloo
56
+ Karissa|Waterloo
57
+ Zhang|Kitchener
58
+
59
+ -LOG CountUsersPerCity
60
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN c.name, COUNT(*) AS num_users ORDER BY num_users DESC;
61
+ ---- 3
62
+ Waterloo|2
63
+ Guelph|1
64
+ Kitchener|1
65
+
66
+ -LOG MatchFollowsWithDestinationAge
67
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) WHERE b.age > 30 RETURN a.name, b.name, b.age ORDER BY b.age DESC;
68
+ ---- 3
69
+ Adam|Zhang|50
70
+ Karissa|Zhang|50
71
+ Adam|Karissa|40
72
+
73
+ -LOG MatchFollowsFilterBySourceAndDest
74
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) WHERE a.age < 40 AND b.age >= 40 RETURN a.name, b.name;
75
+ ---- 2
76
+ Adam|Karissa
77
+ Adam|Zhang
@@ -8,7 +8,7 @@ target_include_directories(
8
8
  PUBLIC
9
9
  ../include/
10
10
  )
11
- target_link_libraries(test_helper PUBLIC lbug)
11
+ target_link_libraries(test_helper PUBLIC lbug_shared)
12
12
 
13
13
  add_library(
14
14
  api_test_helper