lbug 0.12.3-dev.16 → 0.12.3-dev.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/lbug-source/CMakeLists.txt +1 -1
  2. package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
  3. package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
  4. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
  5. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
  6. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
  7. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
  8. package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
  9. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
  10. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
  11. package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
  12. package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
  13. package/lbug-source/scripts/antlr4/hash.md5 +1 -1
  14. package/lbug-source/src/antlr4/Cypher.g4 +1 -1
  15. package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
  16. package/lbug-source/src/catalog/catalog.cpp +5 -4
  17. package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
  18. package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
  19. package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
  20. package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
  21. package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
  22. package/lbug-source/src/include/common/constants.h +1 -0
  23. package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
  24. package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
  25. package/lbug-source/src/include/storage/storage_manager.h +1 -0
  26. package/lbug-source/src/include/storage/table/node_table.h +6 -1
  27. package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
  28. package/lbug-source/src/include/storage/table/parquet_rel_table.h +99 -0
  29. package/lbug-source/src/include/storage/table/rel_table.h +2 -2
  30. package/lbug-source/src/include/transaction/transaction.h +2 -0
  31. package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
  32. package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
  33. package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
  34. package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
  35. package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
  36. package/lbug-source/src/storage/storage_manager.cpp +40 -6
  37. package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
  38. package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
  39. package/lbug-source/src/storage/table/parquet_rel_table.cpp +470 -0
  40. package/lbug-source/test/include/test_runner/test_group.h +11 -1
  41. package/lbug-source/test/runner/e2e_test.cpp +7 -1
  42. package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +43 -0
  43. package/lbug-source/test/test_helper/test_helper.cpp +24 -0
  44. package/lbug-source/test/test_runner/test_parser.cpp +3 -0
  45. package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
  46. package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
  47. package/package.json +1 -1
  48. package/prebuilt/lbugjs-darwin-arm64.node +0 -0
  49. package/prebuilt/lbugjs-linux-arm64.node +0 -0
  50. package/prebuilt/lbugjs-linux-x64.node +0 -0
  51. package/prebuilt/lbugjs-win32-x64.node +0 -0
@@ -0,0 +1,470 @@
1
+ #include "storage/table/parquet_rel_table.h"
2
+
3
+ #include <thread>
4
+
5
+ #include "catalog/catalog_entry/rel_group_catalog_entry.h"
6
+ #include "common/data_chunk/sel_vector.h"
7
+ #include "common/exception/runtime.h"
8
+ #include "common/file_system/virtual_file_system.h"
9
+ #include "main/client_context.h"
10
+ #include "processor/operator/persistent/reader/parquet/parquet_reader.h"
11
+ #include "storage/storage_manager.h"
12
+ #include "transaction/transaction.h"
13
+
14
+ using namespace lbug::catalog;
15
+ using namespace lbug::common;
16
+ using namespace lbug::processor;
17
+ using namespace lbug::transaction;
18
+
19
+ namespace lbug {
20
+ namespace storage {
21
+
22
+ void ParquetRelTableScanState::setToTable(const Transaction* transaction, Table* table_,
23
+ std::vector<column_id_t> columnIDs_, std::vector<ColumnPredicateSet> columnPredicateSets_,
24
+ RelDataDirection direction_) {
25
+ // Call base class implementation but skip local table setup
26
+ TableScanState::setToTable(transaction, table_, std::move(columnIDs_),
27
+ std::move(columnPredicateSets_));
28
+ columns.resize(columnIDs.size());
29
+ direction = direction_;
30
+ for (size_t i = 0; i < columnIDs.size(); ++i) {
31
+ auto columnID = columnIDs[i];
32
+ if (columnID == INVALID_COLUMN_ID || columnID == ROW_IDX_COLUMN_ID) {
33
+ columns[i] = nullptr;
34
+ } else {
35
+ columns[i] = table->cast<RelTable>().getColumn(columnID, direction);
36
+ }
37
+ }
38
+ csrOffsetColumn = table->cast<RelTable>().getCSROffsetColumn(direction);
39
+ csrLengthColumn = table->cast<RelTable>().getCSRLengthColumn(direction);
40
+ nodeGroupIdx = INVALID_NODE_GROUP_IDX;
41
+ // ParquetRelTable does not support local storage, so we skip the local table initialization
42
+ }
43
+
44
+ ParquetRelTable::ParquetRelTable(RelGroupCatalogEntry* relGroupEntry, table_id_t fromTableID,
45
+ table_id_t toTableID, const StorageManager* storageManager, MemoryManager* memoryManager,
46
+ std::string fromNodeTableName)
47
+ : RelTable{relGroupEntry, fromTableID, toTableID, storageManager, memoryManager},
48
+ relGroupEntry{relGroupEntry} {
49
+ std::string storage = relGroupEntry->getStorage();
50
+ if (storage.empty()) {
51
+ throw RuntimeException("Parquet file path is empty for parquet-backed rel table");
52
+ }
53
+
54
+ // Get the relationship name for multi-table directory support
55
+ std::string relName = relGroupEntry->getName();
56
+
57
+ // New prefix format with relationship name: "prefix" which expands to:
58
+ // prefix_indices_{relName}.parquet, prefix_indptr_{relName}.parquet,
59
+ // prefix_metadata_{relName}.parquet
60
+ std::string prefix = storage;
61
+ nodeMappingFilePath = prefix + "_mapping_" + fromNodeTableName + ".parquet";
62
+ indicesFilePath = prefix + "_indices_" + relName + ".parquet";
63
+ indptrFilePath = prefix + "_indptr_" + relName + ".parquet";
64
+ }
65
+
66
// Prepares a per-thread scan state for this parquet-backed table: marks the scan as
// committed-only (no node groups), lazily opens the per-scan-state parquet readers,
// loads the shared lookup tables, optionally snapshots the bound-node selection
// vector, and assigns the full row-group range to this state (the scan operator may
// re-partition it for morsel-driven parallelism).
void ParquetRelTable::initScanState(Transaction* transaction, TableScanState& scanState,
    bool resetCachedBoundNodeSelVec) const {
    // For parquet tables, we create our own scan state. Node-group fields are cleared
    // because data comes from parquet files, not from in-database node groups.
    auto& relScanState = scanState.cast<RelTableScanState>();
    relScanState.source = TableScanSource::COMMITTED;
    relScanState.nodeGroup = nullptr;
    relScanState.nodeGroupIdx = INVALID_NODE_GROUP_IDX;

    // Initialize ParquetReaders for this scan state (per-thread; these are distinct
    // from the table-level shared readers guarded by parquetReaderMutex).
    auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);

    // Each reader is created at most once per scan state.
    if (!parquetRelScanState.nodeMappingReader) {
        std::vector<bool> columnSkips; // Empty skip list: read all columns.
        auto context = transaction->getClientContext();
        parquetRelScanState.nodeMappingReader =
            std::make_unique<ParquetReader>(nodeMappingFilePath, columnSkips, context);
    }
    if (!parquetRelScanState.indicesReader) {
        std::vector<bool> columnSkips; // Empty skip list: read all columns.
        auto context = transaction->getClientContext();
        parquetRelScanState.indicesReader =
            std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
    }
    // The indptr file is optional; only open it when configured for this table.
    if (!indptrFilePath.empty() && !parquetRelScanState.indptrReader) {
        std::vector<bool> columnSkips; // Empty skip list: read all columns.
        auto context = transaction->getClientContext();
        parquetRelScanState.indptrReader =
            std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
    }

    // Load shared data (node mapping and indptr); both loads are idempotent and
    // internally synchronized, so concurrent scan states can call them safely.
    loadNodeMappingData(transaction);
    if (!indptrFilePath.empty()) {
        loadIndptrData(transaction);
    }

    // For morsel-driven parallelism, each scan state maintains its own bound-node
    // processing state; no shared state is needed between threads.
    if (resetCachedBoundNodeSelVec) {
        // Snapshot the current selection of the bound node ID vector so later scans
        // can re-derive which source nodes this state is responsible for.
        if (relScanState.nodeIDVector->state->getSelVector().isUnfiltered()) {
            relScanState.cachedBoundNodeSelVector.setToUnfiltered();
        } else {
            relScanState.cachedBoundNodeSelVector.setToFiltered();
            // Raw copy of the selected positions; sizes are set below.
            memcpy(relScanState.cachedBoundNodeSelVector.getMutableBuffer().data(),
                relScanState.nodeIDVector->state->getSelVector().getMutableBuffer().data(),
                relScanState.nodeIDVector->state->getSelVector().getSelSize() * sizeof(sel_t));
        }
        relScanState.cachedBoundNodeSelVector.setSelSize(
            relScanState.nodeIDVector->state->getSelVector().getSelSize());
    }

    // Initialize row-group range for morsel-driven parallelism. For now, assign all
    // row groups to this scan state (the scan operator may partition the range).
    parquetRelScanState.startRowGroup = 0;
    parquetRelScanState.endRowGroup = parquetRelScanState.indicesReader ?
                                          parquetRelScanState.indicesReader->getNumRowsGroups() :
                                          0;
    parquetRelScanState.currentRowGroup = parquetRelScanState.startRowGroup;
    parquetRelScanState.nextRowToProcess = 0;
}
128
+
129
+ void ParquetRelTable::initializeParquetReaders(Transaction* transaction) const {
130
+ if (!nodeMappingReader || !indicesReader) {
131
+ std::lock_guard lock(parquetReaderMutex);
132
+ if (!nodeMappingReader) {
133
+ std::vector<bool> columnSkips; // Read all columns
134
+ auto context = transaction->getClientContext();
135
+ nodeMappingReader =
136
+ std::make_unique<ParquetReader>(nodeMappingFilePath, columnSkips, context);
137
+ }
138
+ if (!indicesReader) {
139
+ std::vector<bool> columnSkips; // Read all columns
140
+ auto context = transaction->getClientContext();
141
+ indicesReader = std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
142
+ }
143
+ }
144
+ }
145
+
146
+ void ParquetRelTable::initializeIndptrReader(Transaction* transaction) const {
147
+ if (!indptrFilePath.empty() && !indptrReader) {
148
+ std::lock_guard lock(parquetReaderMutex);
149
+ if (!indptrReader) {
150
+ std::vector<bool> columnSkips; // Read all columns
151
+ auto context = transaction->getClientContext();
152
+ indptrReader = std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
153
+ }
154
+ }
155
+ }
156
+
157
// Loads the node-mapping parquet file into the in-memory maps: `nodeMapping`
// (internal node ID -> CSR node index) and `csrToNodeTableIdMap` (the reverse).
// Idempotent: the maps are populated at most once, under parquetReaderMutex.
// NOTE(review): the outer `nodeMapping.empty()` check reads the map without holding
// the mutex (double-checked-locking on a non-atomic container) — confirm whether a
// concurrent first load can race here.
void ParquetRelTable::loadNodeMappingData(Transaction* transaction) const {
    if (nodeMapping.empty() && !nodeMappingFilePath.empty()) {
        std::lock_guard lock(parquetReaderMutex);
        // Re-check under the lock: another thread may have finished loading.
        if (nodeMapping.empty()) {
            // Initialize node mapping reader if not already done.
            if (!nodeMappingReader) {
                std::vector<bool> columnSkips; // Empty skip list: read all columns.
                auto context = transaction->getClientContext();
                nodeMappingReader =
                    std::make_unique<ParquetReader>(nodeMappingFilePath, columnSkips, context);
            }

            // Initialize a scan over every row group; this also populates column types.
            auto context = transaction->getClientContext();
            auto vfs = VirtualFileSystem::GetUnsafe(*context);
            std::vector<uint64_t> groupsToRead;
            for (uint64_t i = 0; i < nodeMappingReader->getNumRowsGroups(); ++i) {
                groupsToRead.push_back(i);
            }

            ParquetReaderScanState scanState;
            nodeMappingReader->initializeScan(scanState, groupsToRead, vfs);

            // The mapping file must carry at least (csr node id, node table id).
            auto numColumns = nodeMappingReader->getNumColumns();
            if (numColumns < 2) {
                throw RuntimeException("Node mapping parquet file must have at least 2 columns");
            }

            // Validate column types for node mapping.
            const auto& csrNodeIdType = nodeMappingReader->getColumnType(0);
            const auto& nodeTableIdType = nodeMappingReader->getColumnType(1);
            if (!LogicalTypeUtils::isIntegral(csrNodeIdType.getLogicalTypeID()) ||
                !LogicalTypeUtils::isIntegral(nodeTableIdType.getLogicalTypeID())) {
                throw RuntimeException(
                    "Node mapping parquet file columns must be integer types (columns 0 and 1)");
            }

            // Chunk to receive the two mapping columns per scan batch.
            DataChunk dataChunk(2);

            // Build one value vector per column, typed from the parquet schema.
            for (uint32_t i = 0; i < 2 && i < numColumns; ++i) {
                const auto& columnTypeRef = nodeMappingReader->getColumnType(i);
                auto columnType = columnTypeRef.copy();
                auto vector = std::make_shared<ValueVector>(std::move(columnType));
                dataChunk.insert(i, vector);
            }

            // Drain the file, recording forward and reverse mappings for every row.
            while (nodeMappingReader->scanInternal(scanState, dataChunk)) {
                auto selSize = dataChunk.state->getSelVector().getSelSize();
                for (size_t i = 0; i < selSize; ++i) {
                    auto csrNodeId = dataChunk.getValueVector(0).getValue<common::offset_t>(i);
                    auto nodeTableId = dataChunk.getValueVector(1).getValue<common::offset_t>(i);
                    nodeMapping[common::internalID_t(nodeTableId, getFromNodeTableID())] =
                        csrNodeId;
                    // Also create reverse mapping for destination node lookups.
                    csrToNodeTableIdMap[csrNodeId] = nodeTableId;
                }
            }
        }
    }
}
221
+
222
// Loads the CSR indptr column into the in-memory `indptrData` vector, where entry i
// is the first indices-file row belonging to source node i. Idempotent: loaded at
// most once, under indptrDataMutex.
// NOTE(review): the outer `indptrData.empty()` check reads the vector without holding
// the mutex (same double-checked-locking pattern as loadNodeMappingData) — confirm
// whether a concurrent first load can race here.
void ParquetRelTable::loadIndptrData(Transaction* transaction) const {
    if (indptrData.empty() && !indptrFilePath.empty()) {
        std::lock_guard lock(indptrDataMutex);
        // Re-check under the lock: another thread may have finished loading.
        if (indptrData.empty()) {
            initializeIndptrReader(transaction);
            if (!indptrReader)
                return;

            // Initialize a scan over every row group; this also populates column types.
            auto context = transaction->getClientContext();
            auto vfs = VirtualFileSystem::GetUnsafe(*context);
            std::vector<uint64_t> groupsToRead;
            for (uint64_t i = 0; i < indptrReader->getNumRowsGroups(); ++i) {
                groupsToRead.push_back(i);
            }

            ParquetReaderScanState scanState;
            indptrReader->initializeScan(scanState, groupsToRead, vfs);

            // The indptr file must carry at least one (offsets) column.
            auto numColumns = indptrReader->getNumColumns();
            if (numColumns == 0) {
                throw RuntimeException("Indptr parquet file has no columns");
            }

            // Validate column type for indptr.
            const auto& indptrType = indptrReader->getColumnType(0);
            if (!LogicalTypeUtils::isIntegral(indptrType.getLogicalTypeID())) {
                throw RuntimeException(
                    "Indptr parquet file column must be integer type (column 0)");
            }

            // Chunk to receive the single indptr column per scan batch.
            DataChunk dataChunk(1);

            // Column type is only available after scan initialization.
            const auto& columnTypeRef = indptrReader->getColumnType(0);
            auto columnType = columnTypeRef.copy();
            auto vector = std::make_shared<ValueVector>(std::move(columnType));
            dataChunk.insert(0, vector);

            // Drain the file, appending every offset in row order.
            while (indptrReader->scanInternal(scanState, dataChunk)) {
                auto selSize = dataChunk.state->getSelVector().getSelSize();
                for (size_t i = 0; i < selSize; ++i) {
                    auto value = dataChunk.getValueVector(0).getValue<common::offset_t>(i);
                    indptrData.push_back(value);
                }
            }
        }
    }
}
274
+
275
+ bool ParquetRelTable::scanInternal(Transaction* transaction, TableScanState& scanState) {
276
+ auto& relScanState = scanState.cast<RelTableScanState>();
277
+
278
+ // Get the ParquetRelTableScanState
279
+ auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
280
+
281
+ // Readers are now initialized per scan state in initScanState
282
+ // Load shared data (node mapping and indptr) - these are thread-safe to read
283
+ loadNodeMappingData(transaction);
284
+ if (!indptrFilePath.empty()) {
285
+ loadIndptrData(transaction);
286
+ }
287
+
288
+ // True morsel-driven parallelism: each scan state processes its assigned row groups
289
+ // Process all row groups assigned to this scan state, collecting relationships for bound nodes
290
+ return scanInternalByRowGroups(transaction, parquetRelScanState);
291
+ }
292
+
293
+ bool ParquetRelTable::scanInternalByRowGroups(Transaction* transaction,
294
+ ParquetRelTableScanState& parquetRelScanState) {
295
+ // True morsel-driven parallelism: process assigned row groups and collect relationships for
296
+ // bound nodes
297
+
298
+ // Check if we have any row groups left to process
299
+ if (parquetRelScanState.currentRowGroup >= parquetRelScanState.endRowGroup) {
300
+ // No more row groups to process
301
+ auto newSelVector = std::make_shared<SelectionVector>(0);
302
+ parquetRelScanState.outState->setSelVector(newSelVector);
303
+ return false;
304
+ }
305
+
306
+ // Process the current row group
307
+ std::vector<uint64_t> rowGroupsToProcess = {parquetRelScanState.currentRowGroup};
308
+
309
+ // Create a set of bound node IDs for fast lookup
310
+ std::unordered_set<common::offset_t> boundNodeOffsets;
311
+ for (size_t i = 0; i < parquetRelScanState.cachedBoundNodeSelVector.getSelSize(); ++i) {
312
+ common::sel_t boundNodeIdx = parquetRelScanState.cachedBoundNodeSelVector[i];
313
+ const auto boundNodeID = parquetRelScanState.nodeIDVector->getValue<nodeID_t>(boundNodeIdx);
314
+ boundNodeOffsets.insert(boundNodeID.offset);
315
+ }
316
+
317
+ // Scan the current row group and collect relationships for bound nodes
318
+ bool hasData = scanRowGroupForBoundNodes(transaction, parquetRelScanState, rowGroupsToProcess,
319
+ boundNodeOffsets);
320
+
321
+ // Move to next row group for next call
322
+ parquetRelScanState.currentRowGroup++;
323
+
324
+ return hasData;
325
+ }
326
+
327
+ common::offset_t ParquetRelTable::findSourceNodeForRow(common::offset_t globalRowIdx) const {
328
+ // Binary search in indptrData to find which source node this row belongs to
329
+ // indptrData[i] gives the starting row index for source node i
330
+ // indptrData[i+1] gives the ending row index for source node i
331
+
332
+ if (indptrData.empty()) {
333
+ return common::INVALID_OFFSET;
334
+ }
335
+
336
+ // Binary search to find the source node
337
+ size_t left = 0;
338
+ size_t right = indptrData.size() - 2; // -2 because we compare with i+1
339
+
340
+ while (left <= right) {
341
+ size_t mid = left + (right - left) / 2;
342
+ if (globalRowIdx >= indptrData[mid] && globalRowIdx < indptrData[mid + 1]) {
343
+ return mid; // Found the source node
344
+ } else if (globalRowIdx < indptrData[mid]) {
345
+ if (mid == 0)
346
+ break;
347
+ right = mid - 1;
348
+ } else {
349
+ left = mid + 1;
350
+ }
351
+ }
352
+
353
+ return common::INVALID_OFFSET; // Row not found in any range
354
+ }
355
+
356
// Scans the given indices-file row groups, emitting into the scan state's output
// vectors every relationship whose source node (resolved via the CSR indptr array)
// appears in `boundNodeOffsets`. Output column 0 receives the destination node ID;
// further indices-file columns are copied positionally into further output vectors.
// Returns true iff at least one matching row was collected.
//
// NOTE(review): at most DEFAULT_VECTOR_CAPACITY rows are collected per call, but the
// parquet scan position is not resumed across calls (the caller advances to the next
// row group) — matches beyond the cap within one row group appear to be dropped;
// confirm against the intended semantics.
// NOTE(review): property copy assumes output vector i corresponds to indices-file
// column i for i >= 1 — verify against how the scan operator builds outputVectors.
bool ParquetRelTable::scanRowGroupForBoundNodes(Transaction* transaction,
    ParquetRelTableScanState& parquetRelScanState, const std::vector<uint64_t>& rowGroupsToProcess,
    const std::unordered_set<common::offset_t>& boundNodeOffsets) {

    // Ensure table-level readers exist (the scan below, however, uses the
    // per-scan-state indicesReader).
    initializeParquetReaders(transaction);

    if (!parquetRelScanState.indicesReader) {
        return false;
    }

    // Position the per-scan-state parquet scan on the assigned row groups.
    auto context = transaction->getClientContext();
    auto vfs = VirtualFileSystem::GetUnsafe(*context);
    parquetRelScanState.indicesReader->initializeScan(*parquetRelScanState.parquetScanState,
        rowGroupsToProcess, vfs);

    // Create a DataChunk matching the indices parquet file schema.
    auto numIndicesColumns = parquetRelScanState.indicesReader->getNumColumns();
    DataChunk indicesChunk(numIndicesColumns);

    // Insert one value vector per column, typed from the parquet schema.
    for (uint32_t colIdx = 0; colIdx < numIndicesColumns; ++colIdx) {
        const auto& columnTypeRef = parquetRelScanState.indicesReader->getColumnType(colIdx);
        auto columnType = columnTypeRef.copy();
        auto vector = std::make_shared<ValueVector>(std::move(columnType));
        indicesChunk.insert(colIdx, vector);
    }

    // Scan the row groups and collect relationships for bound nodes, capped at one
    // output vector's worth of rows per call.
    uint64_t totalRowsCollected = 0;
    const uint64_t maxRowsPerCall = DEFAULT_VECTOR_CAPACITY;
    uint64_t currentGlobalRowIdx = 0;

    // Compute the global (file-wide) row index where the first assigned row group
    // starts, by summing the row counts of all preceding row groups.
    if (!rowGroupsToProcess.empty()) {
        auto metadata = parquetRelScanState.indicesReader->getMetadata();
        for (uint64_t rgIdx = 0; rgIdx < rowGroupsToProcess[0]; ++rgIdx) {
            currentGlobalRowIdx += metadata->row_groups[rgIdx].num_rows;
        }
    }

    while (totalRowsCollected < maxRowsPerCall &&
           parquetRelScanState.indicesReader->scanInternal(*parquetRelScanState.parquetScanState,
               indicesChunk)) {

        auto selSize = indicesChunk.state->getSelVector().getSelSize();

        for (size_t i = 0; i < selSize && totalRowsCollected < maxRowsPerCall;
             ++i, ++currentGlobalRowIdx) {
            // Resolve which source node this global row belongs to via the indptr CSR.
            common::offset_t sourceNodeOffset = findSourceNodeForRow(currentGlobalRowIdx);
            if (sourceNodeOffset == common::INVALID_OFFSET) {
                continue; // Row not covered by any indptr range.
            }

            // Only rows whose source node is bound in this scan are emitted.
            if (boundNodeOffsets.find(sourceNodeOffset) == boundNodeOffsets.end()) {
                continue; // Not a bound node, skip.
            }

            // This row belongs to a bound node: collect the relationship.

            // Column 0 of the indices file is the destination node offset; wrap it in
            // an internal ID using this table's "to" node table.
            auto dstOffset = indicesChunk.getValueVector(0).getValue<common::offset_t>(i);
            auto dstNodeID = internalID_t(dstOffset, getToNodeTableID());

            // outputVectors[0] is the neighbor node ID (destination), if requested.
            if (!parquetRelScanState.outputVectors.empty()) {
                parquetRelScanState.outputVectors[0]->setValue(totalRowsCollected, dstNodeID);
            }

            // Copy any additional indices-file columns (rel properties, e.g. weight)
            // into the matching output vectors by position.
            for (uint32_t colIdx = 1;
                 colIdx < numIndicesColumns && colIdx < parquetRelScanState.outputVectors.size();
                 ++colIdx) {
                parquetRelScanState.outputVectors[colIdx]->copyFromVectorData(totalRowsCollected,
                    &indicesChunk.getValueVector(colIdx), i);
            }

            totalRowsCollected++;
        }
    }

    // Publish a filtered selection covering exactly the collected rows (or an empty
    // selection when nothing matched).
    if (totalRowsCollected > 0) {
        auto selVector = std::make_shared<SelectionVector>(totalRowsCollected);
        selVector->setToFiltered(totalRowsCollected);
        for (uint64_t i = 0; i < totalRowsCollected; ++i) {
            (*selVector)[i] = i;
        }
        parquetRelScanState.outState->setSelVector(selVector);

        return true;
    } else {
        // No data found.
        auto selVector = std::make_shared<SelectionVector>(0);
        parquetRelScanState.outState->setSelVector(selVector);
        return false;
    }
}
459
+
460
+ row_idx_t ParquetRelTable::getNumTotalRows(const transaction::Transaction* transaction) {
461
+ initializeParquetReaders(const_cast<transaction::Transaction*>(transaction));
462
+ if (!indicesReader) {
463
+ return 0;
464
+ }
465
+ auto metadata = indicesReader->getMetadata();
466
+ return metadata ? metadata->num_rows : 0;
467
+ }
468
+
469
+ } // namespace storage
470
+ } // namespace lbug
@@ -107,7 +107,17 @@ struct TestGroup {
107
107
  std::unordered_map<std::string, std::set<std::string>> testCasesConnNames;
108
108
  bool testFwdOnly;
109
109
 
110
- enum class DatasetType { CSV, PARQUET, NPY, CSV_TO_PARQUET, TURTLE, LBUG, JSON, CSV_TO_JSON };
110
+ enum class DatasetType {
111
+ CSV,
112
+ PARQUET,
113
+ NPY,
114
+ CSV_TO_PARQUET,
115
+ TURTLE,
116
+ LBUG,
117
+ JSON,
118
+ CSV_TO_JSON,
119
+ GRAPH_STD
120
+ };
111
121
  DatasetType datasetType;
112
122
 
113
123
  bool isValid() const { return !group.empty() && !dataset.empty(); }
@@ -38,7 +38,13 @@ public:
38
38
  }
39
39
  createDB(checkpointWaitTimeout);
40
40
  createConns(connNames);
41
- if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
41
+ if (datasetType == TestGroup::DatasetType::GRAPH_STD) {
42
+ // For GRAPH_STD, only run schema.cypher (which contains WITH storage = ... clauses)
43
+ // No copy.cypher needed as data is in external parquet files
44
+ lbug::main::Connection* connection =
45
+ conn ? conn.get() : (connMap.begin()->second).get();
46
+ TestHelper::executeScript(dataset + "/" + TestHelper::SCHEMA_FILE_NAME, *connection);
47
+ } else if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
42
48
  initGraph();
43
49
  } else if (generateBinaryDemo && TestHelper::E2E_OVERRIDE_IMPORT_DIR.empty()) {
44
50
  initGraph(TestHelper::appendLbugRootPath("dataset/demo-db/parquet/"));
@@ -0,0 +1,43 @@
1
+ -DATASET GRAPH-STD demo-db/graph-std
2
+
3
+ --
4
+
5
+ -CASE DemoDBGraphStdTest
6
+
7
+ -LOG MatchUserLivesInCity
8
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN u.name, u.age, c.name;
9
+ ---- 7
10
+ Adam|30|Guelph
11
+ Adam|30|Guelph
12
+ Karissa|40|Waterloo
13
+ Noura|25|Kitchener
14
+ Noura|25|Waterloo
15
+ Noura|25|Waterloo
16
+ Zhang|50|Kitchener
17
+
18
+ -LOG MatchSingleNodeLabel
19
+ -STATEMENT MATCH (a:user) RETURN a.name, a.age;
20
+ ---- 4
21
+ Adam|30
22
+ Karissa|40
23
+ Zhang|50
24
+ Noura|25
25
+
26
+ -LOG MatchCityNodes
27
+ -STATEMENT MATCH (c:city) RETURN c.name, c.population;
28
+ ---- 3
29
+ Waterloo|150000
30
+ Kitchener|200000
31
+ Guelph|75000
32
+
33
+ -LOG MatchFollowsRel
34
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) RETURN a.name, b.name, e.since;
35
+ ---- 8
36
+ Adam|Zhang|2022
37
+ Karissa|Noura|2020
38
+ Karissa|Zhang|2020
39
+ Noura|Karissa|2020
40
+ Noura|Zhang|2021
41
+ Zhang|Adam|2022
42
+ Zhang|Karissa|2020
43
+ Zhang|Noura|2021
@@ -71,6 +71,30 @@ void TestHelper::executeScript(const std::string& cypherScript, Connection& conn
71
71
  auto fullPath = appendLbugRootPath(csvFilePath);
72
72
  line.replace(line.find(csvFilePath), csvFilePath.length(), fullPath);
73
73
  }
74
+ // Also handle storage = 'path' for parquet tables
75
+ std::vector<std::string> storagePaths;
76
+ size_t storageIndex = 0;
77
+ while (true) {
78
+ size_t start = line.find("storage = '", storageIndex);
79
+ if (start == std::string::npos) {
80
+ break;
81
+ }
82
+ start += 11; // length of "storage = '"
83
+ size_t end = line.find("'", start);
84
+ if (end == std::string::npos) {
85
+ break;
86
+ }
87
+ std::string storagePath = line.substr(start, end - start);
88
+ storagePaths.push_back(storagePath);
89
+ storageIndex = end + 1;
90
+ }
91
+ for (auto& storagePath : storagePaths) {
92
+ auto fullPath = appendLbugRootPath(storagePath);
93
+ size_t pos = line.find(storagePath);
94
+ if (pos != std::string::npos) {
95
+ line.replace(pos, storagePath.length(), fullPath);
96
+ }
97
+ }
74
98
  #ifdef __STATIC_LINK_EXTENSION_TEST__
75
99
  if (line.starts_with("load extension")) {
76
100
  continue;
@@ -87,6 +87,9 @@ void TestParser::extractDataset() {
87
87
  testGroup->datasetType = TestGroup::DatasetType::JSON;
88
88
  testGroup->dataset = currentToken.params[2];
89
89
  }
90
+ } else if (datasetType == "GRAPH-STD") {
91
+ testGroup->datasetType = TestGroup::DatasetType::GRAPH_STD;
92
+ testGroup->dataset = currentToken.params[2];
90
93
  } else {
91
94
  throw TestException(
92
95
  "Invalid dataset type `" + currentToken.params[1] + "` [" + path + ":" + line + "].");