lbug 0.12.3-dev.16 → 0.12.3-dev.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/lbug-source/CMakeLists.txt +1 -1
  2. package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
  3. package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
  4. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
  5. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
  6. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
  7. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
  8. package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
  9. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
  10. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
  11. package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
  12. package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
  13. package/lbug-source/scripts/antlr4/hash.md5 +1 -1
  14. package/lbug-source/src/antlr4/Cypher.g4 +1 -1
  15. package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
  16. package/lbug-source/src/catalog/catalog.cpp +5 -4
  17. package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
  18. package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
  19. package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
  20. package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
  21. package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
  22. package/lbug-source/src/include/common/constants.h +1 -0
  23. package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
  24. package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
  25. package/lbug-source/src/include/storage/storage_manager.h +1 -0
  26. package/lbug-source/src/include/storage/table/node_table.h +6 -1
  27. package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
  28. package/lbug-source/src/include/storage/table/parquet_rel_table.h +99 -0
  29. package/lbug-source/src/include/storage/table/rel_table.h +2 -2
  30. package/lbug-source/src/include/transaction/transaction.h +2 -0
  31. package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
  32. package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
  33. package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
  34. package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
  35. package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
  36. package/lbug-source/src/storage/storage_manager.cpp +40 -6
  37. package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
  38. package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
  39. package/lbug-source/src/storage/table/parquet_rel_table.cpp +470 -0
  40. package/lbug-source/test/include/test_runner/test_group.h +11 -1
  41. package/lbug-source/test/runner/e2e_test.cpp +7 -1
  42. package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +43 -0
  43. package/lbug-source/test/test_helper/test_helper.cpp +24 -0
  44. package/lbug-source/test/test_runner/test_parser.cpp +3 -0
  45. package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
  46. package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
  47. package/package.json +1 -1
  48. package/prebuilt/lbugjs-darwin-arm64.node +0 -0
  49. package/prebuilt/lbugjs-linux-arm64.node +0 -0
  50. package/prebuilt/lbugjs-linux-x64.node +0 -0
  51. package/prebuilt/lbugjs-win32-x64.node +0 -0
@@ -81,7 +81,12 @@ std::unique_ptr<Statement> Transformer::transformCreateNodeTable(
81
81
  } else {
82
82
  createTableInfo.propertyDefinitions =
83
83
  transformPropertyDefinitions(*ctx.kU_PropertyDefinitions());
84
- createTableInfo.extraInfo = std::make_unique<ExtraCreateNodeTableInfo>(getPKName(ctx));
84
+ options_t options;
85
+ if (ctx.kU_Options()) {
86
+ options = transformOptions(*ctx.kU_Options());
87
+ }
88
+ createTableInfo.extraInfo =
89
+ std::make_unique<ExtraCreateNodeTableInfo>(getPKName(ctx), std::move(options));
85
90
  return std::make_unique<CreateTable>(std::move(createTableInfo));
86
91
  }
87
92
  }
@@ -340,6 +340,10 @@ std::unique_ptr<ColumnReader> ParquetReader::createReader() {
340
340
  throw CopyException{"Root element of Parquet file must be a struct"};
341
341
  }
342
342
  // LCOV_EXCL_STOP
343
+ // Clear existing column metadata before populating (in case createReader is called multiple
344
+ // times)
345
+ columnNames.clear();
346
+ columnTypes.clear();
343
347
  for (auto& field : StructType::getFields(rootReader->getDataType())) {
344
348
  columnNames.push_back(field.getName());
345
349
  columnTypes.push_back(field.getType().copy());
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include "processor/execution_context.h"
4
4
  #include "storage/local_storage/local_storage.h"
5
+ #include "storage/table/parquet_rel_table.h"
5
6
 
6
7
  using namespace lbug::common;
7
8
  using namespace lbug::storage;
@@ -54,8 +55,29 @@ void ScanMultiRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionCo
54
55
  auto clientContext = context->clientContext;
55
56
  boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
56
57
  auto nbrNodeIDVector = outVectors[0];
57
- scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
58
- boundNodeIDVector, outVectors, nbrNodeIDVector->state);
58
+
59
+ // Check if any table in any scanner is a ParquetRelTable
60
+ bool hasParquetTable = false;
61
+ for (auto& [_, scanner] : scanners) {
62
+ for (auto& relInfo : scanner.relInfos) {
63
+ if (dynamic_cast<storage::ParquetRelTable*>(relInfo.table) != nullptr) {
64
+ hasParquetTable = true;
65
+ break;
66
+ }
67
+ }
68
+ if (hasParquetTable)
69
+ break;
70
+ }
71
+
72
+ // Create appropriate scan state type
73
+ if (hasParquetTable) {
74
+ scanState =
75
+ std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
76
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
77
+ } else {
78
+ scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
79
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
80
+ }
59
81
  for (auto& [_, scanner] : scanners) {
60
82
  for (auto& relInfo : scanner.relInfos) {
61
83
  if (directionInfo.directionPos.isValid()) {
@@ -2,8 +2,10 @@
2
2
 
3
3
  #include "binder/expression/expression_util.h"
4
4
  #include "processor/execution_context.h"
5
+ #include "storage/buffer_manager/memory_manager.h"
5
6
  #include "storage/local_storage/local_node_table.h"
6
7
  #include "storage/local_storage/local_storage.h"
8
+ #include "storage/table/parquet_node_table.h"
7
9
 
8
10
  using namespace lbug::common;
9
11
  using namespace lbug::storage;
@@ -35,7 +37,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
35
37
  this->table = table;
36
38
  this->currentCommittedGroupIdx = 0;
37
39
  this->currentUnCommittedGroupIdx = 0;
38
- this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
40
+
41
+ // Initialize table-specific scan coordination (e.g., for ParquetNodeTable)
42
+ table->initializeScanCoordination(transaction);
43
+
44
+ if (const auto parquetTable = dynamic_cast<ParquetNodeTable*>(table)) {
45
+ // For parquet tables, set numCommittedNodeGroups to number of row groups
46
+ std::vector<bool> columnSkips;
47
+ try {
48
+ auto tempReader = std::make_unique<processor::ParquetReader>(
49
+ parquetTable->getParquetFilePath(), columnSkips, transaction->getClientContext());
50
+ this->numCommittedNodeGroups = tempReader->getNumRowsGroups();
51
+ } catch (const std::exception& e) {
52
+ this->numCommittedNodeGroups = 1;
53
+ }
54
+ } else {
55
+ this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
56
+ }
39
57
  if (transaction->isWriteTransaction()) {
40
58
  if (const auto localTable =
41
59
  transaction->getLocalStorage()->getLocalTable(this->table->getTableID())) {
@@ -46,21 +64,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
46
64
  progressSharedState.numGroups += numCommittedNodeGroups;
47
65
  }
48
66
 
49
- void ScanNodeTableSharedState::nextMorsel(NodeTableScanState& scanState,
67
+ void ScanNodeTableSharedState::nextMorsel(TableScanState& scanState,
50
68
  ScanNodeTableProgressSharedState& progressSharedState) {
51
69
  std::unique_lock lck{mtx};
70
+ // Cast to NodeTableScanState since we know this is for node tables
71
+ auto& nodeScanState = scanState.cast<NodeTableScanState>();
52
72
  if (currentCommittedGroupIdx < numCommittedNodeGroups) {
53
- scanState.nodeGroupIdx = currentCommittedGroupIdx++;
73
+ nodeScanState.nodeGroupIdx = currentCommittedGroupIdx++;
54
74
  progressSharedState.numGroupsScanned++;
55
- scanState.source = TableScanSource::COMMITTED;
75
+ nodeScanState.source = TableScanSource::COMMITTED;
56
76
  return;
57
77
  }
58
78
  if (currentUnCommittedGroupIdx < numUnCommittedNodeGroups) {
59
- scanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
60
- scanState.source = TableScanSource::UNCOMMITTED;
79
+ nodeScanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
80
+ nodeScanState.source = TableScanSource::UNCOMMITTED;
61
81
  return;
62
82
  }
63
- scanState.source = TableScanSource::NONE;
83
+ nodeScanState.source = TableScanSource::NONE;
64
84
  }
65
85
 
66
86
  table_id_map_t<SemiMask*> ScanNodeTable::getSemiMasks() const {
@@ -82,7 +102,18 @@ void ScanNodeTableInfo::initScanState(TableScanState& scanState,
82
102
  void ScanNodeTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) {
83
103
  ScanTable::initLocalStateInternal(resultSet, context);
84
104
  auto nodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
85
- scanState = std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
105
+
106
+ // Check if the first table is a ParquetNodeTable and create appropriate scan state
107
+ auto* parquetTable = dynamic_cast<ParquetNodeTable*>(tableInfos[0].table);
108
+ if (parquetTable) {
109
+ scanState = std::make_unique<ParquetNodeTableScanState>(
110
+ *MemoryManager::Get(*context->clientContext), nodeIDVector, outVectors,
111
+ nodeIDVector->state);
112
+ } else {
113
+ scanState =
114
+ std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
115
+ }
116
+
86
117
  currentTableIdx = 0;
87
118
  initCurrentTable(context);
88
119
  }
@@ -91,6 +122,11 @@ void ScanNodeTable::initCurrentTable(ExecutionContext* context) {
91
122
  auto& currentInfo = tableInfos[currentTableIdx];
92
123
  currentInfo.initScanState(*scanState, outVectors, context->clientContext);
93
124
  scanState->semiMask = sharedStates[currentTableIdx]->getSemiMask();
125
+ // Call table->initScanState for ParquetNodeTable
126
+ if (dynamic_cast<ParquetNodeTable*>(tableInfos[currentTableIdx].table)) {
127
+ auto transaction = transaction::Transaction::Get(*context->clientContext);
128
+ tableInfos[currentTableIdx].table->initScanState(transaction, *scanState);
129
+ }
94
130
  }
95
131
 
96
132
  void ScanNodeTable::initGlobalStateInternal(ExecutionContext* context) {
@@ -2,7 +2,9 @@
2
2
 
3
3
  #include "binder/expression/expression_util.h"
4
4
  #include "processor/execution_context.h"
5
+ #include "storage/buffer_manager/memory_manager.h"
5
6
  #include "storage/local_storage/local_rel_table.h"
7
+ #include "storage/table/parquet_rel_table.h"
6
8
 
7
9
  using namespace lbug::common;
8
10
  using namespace lbug::storage;
@@ -66,8 +68,16 @@ void ScanRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext
66
68
  auto clientContext = context->clientContext;
67
69
  auto boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
68
70
  auto nbrNodeIDVector = outVectors[0];
69
- scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
70
- boundNodeIDVector, outVectors, nbrNodeIDVector->state);
71
+ // Check if this is a ParquetRelTable and create appropriate scan state
72
+ auto* parquetTable = dynamic_cast<storage::ParquetRelTable*>(tableInfo.table);
73
+ if (parquetTable) {
74
+ scanState =
75
+ std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
76
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
77
+ } else {
78
+ scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
79
+ boundNodeIDVector, outVectors, nbrNodeIDVector->state);
80
+ }
71
81
  tableInfo.initScanState(*scanState, outVectors, clientContext);
72
82
  }
73
83
 
@@ -13,6 +13,8 @@
13
13
  #include "storage/buffer_manager/memory_manager.h"
14
14
  #include "storage/checkpointer.h"
15
15
  #include "storage/table/node_table.h"
16
+ #include "storage/table/parquet_node_table.h"
17
+ #include "storage/table/parquet_rel_table.h"
16
18
  #include "storage/table/rel_table.h"
17
19
  #include "storage/wal/wal_replayer.h"
18
20
  #include "transaction/transaction.h"
@@ -77,15 +79,31 @@ void StorageManager::recover(main::ClientContext& clientContext, bool throwOnWal
77
79
  }
78
80
 
79
81
  void StorageManager::createNodeTable(NodeTableCatalogEntry* entry) {
80
- tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
82
+ tableNameCache[entry->getTableID()] = entry->getName();
83
+ if (!entry->getStorage().empty()) {
84
+ // Create parquet-backed node table
85
+ tables[entry->getTableID()] =
86
+ std::make_unique<ParquetNodeTable>(this, entry, &memoryManager);
87
+ } else {
88
+ // Create regular node table
89
+ tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
90
+ }
81
91
  }
82
92
 
83
93
  // TODO(Guodong): This API is added since storageManager doesn't provide an API to add a single
84
94
  // rel table. We may have to refactor the existing StorageManager::createTable(TableCatalogEntry*
85
95
  // entry).
86
96
  void StorageManager::addRelTable(RelGroupCatalogEntry* entry, const RelTableCatalogInfo& info) {
87
- tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
88
- info.nodePair.dstTableID, this, &memoryManager);
97
+ if (!entry->getStorage().empty()) {
98
+ // Create parquet-backed rel table
99
+ std::string fromNodeTableName = tableNameCache.at(info.nodePair.srcTableID);
100
+ tables[info.oid] = std::make_unique<ParquetRelTable>(entry, info.nodePair.srcTableID,
101
+ info.nodePair.dstTableID, this, &memoryManager, fromNodeTableName);
102
+ } else {
103
+ // Create regular rel table
104
+ tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
105
+ info.nodePair.dstTableID, this, &memoryManager);
106
+ }
89
107
  }
90
108
 
91
109
  void StorageManager::createRelTableGroup(RelGroupCatalogEntry* entry) {
@@ -257,7 +275,14 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
257
275
  KU_ASSERT(!tables.contains(tableID));
258
276
  auto tableEntry = catalog->getTableCatalogEntry(&DUMMY_TRANSACTION, tableID)
259
277
  ->ptrCast<NodeTableCatalogEntry>();
260
- tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
278
+ tableNameCache[tableID] = tableEntry->getName();
279
+ if (!tableEntry->getStorage().empty()) {
280
+ // Create parquet-backed node table
281
+ tables[tableID] = std::make_unique<ParquetNodeTable>(this, tableEntry, &memoryManager);
282
+ } else {
283
+ // Create regular node table
284
+ tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
285
+ }
261
286
  tables[tableID]->deserialize(context, this, deSer);
262
287
  }
263
288
  deSer.validateDebuggingInfo(key, "num_rel_groups");
@@ -279,8 +304,17 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
279
304
  for (auto k = 0u; k < numInnerRelTables; k++) {
280
305
  RelTableCatalogInfo info = RelTableCatalogInfo::deserialize(deSer);
281
306
  KU_ASSERT(!tables.contains(info.oid));
282
- tables[info.oid] = std::make_unique<RelTable>(relGroupEntry, info.nodePair.srcTableID,
283
- info.nodePair.dstTableID, this, &memoryManager);
307
+ if (!relGroupEntry->getStorage().empty()) {
308
+ // Create parquet-backed rel table
309
+ std::string fromNodeTableName = tableNameCache.at(info.nodePair.srcTableID);
310
+ tables[info.oid] =
311
+ std::make_unique<ParquetRelTable>(relGroupEntry, info.nodePair.srcTableID,
312
+ info.nodePair.dstTableID, this, &memoryManager, fromNodeTableName);
313
+ } else {
314
+ // Create regular rel table
315
+ tables[info.oid] = std::make_unique<RelTable>(relGroupEntry,
316
+ info.nodePair.srcTableID, info.nodePair.dstTableID, this, &memoryManager);
317
+ }
284
318
  tables.at(info.oid)->deserialize(context, this, deSer);
285
319
  }
286
320
  }
@@ -22,6 +22,8 @@ add_library(lbug_storage_store
22
22
  node_group_collection.cpp
23
23
  node_table.cpp
24
24
  null_column.cpp
25
+ parquet_node_table.cpp
26
+ parquet_rel_table.cpp
25
27
  rel_table.cpp
26
28
  rel_table_data.cpp
27
29
  string_chunk_data.cpp
@@ -0,0 +1,338 @@
1
+ #include "storage/table/parquet_node_table.h"
2
+
3
+ #include <mutex>
4
+
5
+ #include "catalog/catalog_entry/node_table_catalog_entry.h"
6
+ #include "common/data_chunk/sel_vector.h"
7
+ #include "common/exception/runtime.h"
8
+ #include "common/file_system/virtual_file_system.h"
9
+ #include "common/types/value/value.h"
10
+ #include "main/client_context.h"
11
+ #include "processor/operator/persistent/reader/parquet/parquet_reader.h"
12
+ #include "storage/buffer_manager/memory_manager.h"
13
+ #include "storage/storage_manager.h"
14
+ #include "storage/storage_utils.h"
15
+ #include "storage/table/column.h"
16
+ #include "transaction/transaction.h"
17
+
18
+ using namespace lbug::catalog;
19
+ using namespace lbug::common;
20
+ using namespace lbug::processor;
21
+ using namespace lbug::transaction;
22
+
23
+ namespace lbug {
24
+ namespace storage {
25
+
26
+ ParquetNodeTable::ParquetNodeTable(const StorageManager* storageManager,
27
+ const NodeTableCatalogEntry* nodeTableEntry, MemoryManager* memoryManager)
28
+ : NodeTable{storageManager, nodeTableEntry, memoryManager},
29
+ nodeTableCatalogEntry{nodeTableEntry} {
30
+ std::string prefix = nodeTableEntry->getStorage();
31
+ if (prefix.empty()) {
32
+ throw RuntimeException("Parquet file prefix is empty for parquet-backed node table");
33
+ }
34
+
35
+ // Get the table name for multi-table directory support
36
+ std::string tableName = nodeTableEntry->getName();
37
+
38
+ // For node tables with multi-table support:
39
+ // prefix_nodes_{tableName}.parquet (e.g., demo_nodes_city.parquet)
40
+ parquetFilePath = prefix + "_nodes_" + tableName + ".parquet";
41
+ sharedState = std::make_unique<ParquetNodeTableSharedState>();
42
+ }
43
+
44
+ void ParquetNodeTable::initScanState(Transaction* transaction, TableScanState& scanState,
45
+ [[maybe_unused]] bool resetCachedBoundNodeSelVec) const {
46
+ // Set up the scan state similar to how NodeTable does it
47
+ auto& nodeScanState = scanState.cast<NodeTableScanState>();
48
+ nodeScanState.source = TableScanSource::COMMITTED;
49
+
50
+ // Note: Don't set nodeGroupIdx here - it's set by the morsel-driven parallelism system
51
+
52
+ auto& parquetNodeScanState = static_cast<ParquetNodeTableScanState&>(nodeScanState);
53
+
54
+ // Reset scan state for each scan to allow multiple scans of the same table in one query
55
+ parquetNodeScanState.dataRead = false;
56
+ parquetNodeScanState.allData.clear();
57
+ parquetNodeScanState.totalRows = 0;
58
+ parquetNodeScanState.nextRowToDistribute = 0;
59
+
60
+ // Reset scan completion flag for this scan state
61
+ parquetNodeScanState.scanCompleted = false;
62
+
63
+ // Each scan state gets its own parquet reader for thread safety
64
+ if (!parquetNodeScanState.initialized) {
65
+ auto context = transaction->getClientContext();
66
+ if (!context) {
67
+ throw RuntimeException("Invalid client context for parquet scan state initialization");
68
+ }
69
+
70
+ std::vector<bool> columnSkips;
71
+ try {
72
+ parquetNodeScanState.parquetReader =
73
+ std::make_unique<ParquetReader>(parquetFilePath, columnSkips, context);
74
+ parquetNodeScanState.initialized = true;
75
+ } catch (const std::exception& e) {
76
+ throw RuntimeException("Failed to initialize parquet reader for file '" +
77
+ parquetFilePath + "': " + e.what());
78
+ }
79
+ }
80
+
81
+ // Set nodeGroupIdx to invalid initially - will be assigned by getNextRowGroup
82
+ parquetNodeScanState.nodeGroupIdx = INVALID_NODE_GROUP_IDX;
83
+
84
+ // Initialize scan state for the current row group (assigned via shared state)
85
+ initParquetScanForRowGroup(transaction, parquetNodeScanState);
86
+ }
87
+
88
+ void ParquetNodeTable::initializeScanCoordination(const Transaction* transaction) {
89
+ // Reset shared state at the start of each scan operation
90
+ // This is called once per scan operation by the ScanNodeTable operator
91
+ // Create a temporary reader to get the number of row groups
92
+ auto context = transaction->getClientContext();
93
+ if (!context) {
94
+ return;
95
+ }
96
+
97
+ std::vector<bool> columnSkips;
98
+ try {
99
+ auto tempReader = std::make_unique<ParquetReader>(parquetFilePath, columnSkips, context);
100
+ auto numRowGroups = tempReader->getNumRowsGroups();
101
+ sharedState->reset(numRowGroups);
102
+ } catch (const std::exception& e) {
103
+ // If we can't read the file, set to 1 row group as fallback
104
+ sharedState->reset(1);
105
+ }
106
+ }
107
+
108
+ void ParquetNodeTable::initParquetScanForRowGroup(Transaction* transaction,
109
+ ParquetNodeTableScanState& scanState) const {
110
+ auto context = transaction->getClientContext();
111
+ if (!context) {
112
+ return;
113
+ }
114
+
115
+ auto vfs = VirtualFileSystem::GetUnsafe(*context);
116
+ if (!vfs) {
117
+ return;
118
+ }
119
+
120
+ // Defensive check: ensure parquet reader exists
121
+ if (!scanState.parquetReader) {
122
+ return;
123
+ }
124
+
125
+ // Defensive check: ensure parquet scan state exists
126
+ if (!scanState.parquetScanState) {
127
+ return;
128
+ }
129
+
130
+ std::vector<uint64_t> groupsToRead;
131
+
132
+ // Use shared state to get the next available row group for this scan state
133
+ if (scanState.nodeGroupIdx == INVALID_NODE_GROUP_IDX) {
134
+ common::node_group_idx_t assignedRowGroup;
135
+ if (sharedState->getNextRowGroup(assignedRowGroup)) {
136
+ scanState.nodeGroupIdx = assignedRowGroup;
137
+ groupsToRead.push_back(assignedRowGroup);
138
+ } else {
139
+ // No more row groups available - mark scan as completed
140
+ scanState.scanCompleted = true;
141
+ // Still need to initialize the scan state with empty groups so reader is in valid state
142
+ scanState.parquetReader->initializeScan(*scanState.parquetScanState, groupsToRead, vfs);
143
+ return;
144
+ }
145
+ } else {
146
+ // Row group already assigned (e.g., by external morsel system or re-initialization)
147
+ groupsToRead.push_back(scanState.nodeGroupIdx);
148
+ }
149
+
150
+ // Re-initialize scan for the specific row groups
151
+ // Note: initializeScan can be called multiple times; the first call populates column metadata
152
+ scanState.parquetReader->initializeScan(*scanState.parquetScanState, groupsToRead, vfs);
153
+ }
154
+
155
+ bool ParquetNodeTable::scanInternal(Transaction* transaction, TableScanState& scanState) {
156
+ auto& parquetScanState = static_cast<ParquetNodeTableScanState&>(scanState);
157
+
158
+ // Check if this particular scan state has already completed
159
+ if (parquetScanState.scanCompleted) {
160
+ return false;
161
+ }
162
+
163
+ scanState.resetOutVectors();
164
+
165
+ // Read all data once into scan state
166
+ if (!parquetScanState.dataRead) {
167
+ // Only the first thread reads the parquet data
168
+ if (!parquetScanState.initialized) {
169
+ return false;
170
+ }
171
+
172
+ // Create a data chunk for reading parquet data
173
+ auto numColumns = parquetScanState.parquetReader->getNumColumns();
174
+
175
+ // Defensive check: ensure parquet file has at least one column
176
+ if (numColumns == 0) {
177
+ throw RuntimeException("Parquet file '" + parquetFilePath + "' has no columns");
178
+ }
179
+
180
+ DataChunk parquetDataChunk(numColumns, scanState.outState);
181
+
182
+ // Create vectors with parquet types
183
+ // Defensive check: ensure parquet file has enough columns for what we expect
184
+ // Always create the data chunk to match the exact number of parquet columns
185
+ // to prevent crashes in the parquet reader when accessing result vectors
186
+ for (uint32_t i = 0; i < numColumns; ++i) {
187
+ const auto& parquetColumnType = parquetScanState.parquetReader->getColumnType(i);
188
+ auto columnType = parquetColumnType.copy();
189
+ auto vector = std::make_shared<ValueVector>(std::move(columnType),
190
+ MemoryManager::Get(*transaction->getClientContext()), scanState.outState);
191
+ parquetDataChunk.insert(i, vector);
192
+ }
193
+
194
+ // Read from parquet
195
+ parquetScanState.parquetReader->scan(*parquetScanState.parquetScanState, parquetDataChunk);
196
+
197
+ auto selSize = parquetDataChunk.state->getSelVector().getSelSize();
198
+ if (selSize > 0) {
199
+ parquetScanState.allData.resize(selSize);
200
+ for (size_t row = 0; row < selSize; ++row) {
201
+ parquetScanState.allData[row].resize(
202
+ scanState.outputVectors
203
+ .size()); // Use output vector count, not parquet column count
204
+
205
+ // Map parquet columns to correct output vector positions by name
206
+ // Defensive check: ensure we don't access more columns than available in the chunk
207
+ auto maxParquetCol = std::min(static_cast<size_t>(numColumns),
208
+ static_cast<size_t>(parquetDataChunk.getNumValueVectors()));
209
+
210
+ for (size_t parquetCol = 0; parquetCol < maxParquetCol; ++parquetCol) {
211
+ // Defensive check: ensure the column index is valid for the data chunk
212
+ if (parquetCol >= parquetDataChunk.getNumValueVectors()) {
213
+ continue;
214
+ }
215
+
216
+ auto& srcVector = parquetDataChunk.getValueVectorMutable(parquetCol);
217
+
218
+ // Get parquet column name and find its corresponding column ID
219
+ std::string parquetColumnName =
220
+ parquetScanState.parquetReader->getColumnName(parquetCol);
221
+ auto nodeTableEntry = this->nodeTableCatalogEntry;
222
+
223
+ // Check if the column exists first before calling getColumnID
224
+ if (!nodeTableEntry->containsProperty(parquetColumnName)) {
225
+ // Column doesn't exist in table schema, skip it
226
+ continue;
227
+ }
228
+
229
+ // Find the column ID for this property name
230
+ column_id_t parquetColumnID = nodeTableEntry->getColumnID(parquetColumnName);
231
+
232
+ // Find which output vector position corresponds to this column ID
233
+ size_t outputCol = INVALID_COLUMN_ID;
234
+ for (size_t outCol = 0; outCol < scanState.columnIDs.size(); ++outCol) {
235
+ if (scanState.columnIDs[outCol] == parquetColumnID) {
236
+ outputCol = outCol;
237
+ break;
238
+ }
239
+ }
240
+
241
+ // Only copy data if we found a matching output position
242
+ if (outputCol != INVALID_COLUMN_ID &&
243
+ outputCol < parquetScanState.allData[row].size()) {
244
+ // Defensive check: ensure the row index is valid for the source vector
245
+ if (row >= srcVector.state->getSelVector().getSelSize()) {
246
+ continue;
247
+ }
248
+
249
+ if (srcVector.isNull(row)) {
250
+ parquetScanState.allData[row][outputCol] =
251
+ std::make_unique<Value>(Value::createNullValue());
252
+ } else {
253
+ parquetScanState.allData[row][outputCol] =
254
+ std::make_unique<Value>(*srcVector.getAsValue(row));
255
+ }
256
+ }
257
+ }
258
+ }
259
+ parquetScanState.totalRows = selSize;
260
+ }
261
+ parquetScanState.dataRead = true;
262
+ }
263
+
264
+ // Now distribute one row to this scan state
265
+ if (parquetScanState.nextRowToDistribute >= parquetScanState.totalRows) {
266
+ parquetScanState.scanCompleted = true;
267
+ return false; // No more rows to distribute
268
+ }
269
+
270
+ size_t rowIndex = parquetScanState.nextRowToDistribute++;
271
+
272
+ // Copy one row to output vectors
273
+ // Defensive checks: ensure valid row index and handle empty data gracefully
274
+ if (rowIndex >= parquetScanState.allData.size()) {
275
+ parquetScanState.scanCompleted = true;
276
+ return false;
277
+ }
278
+
279
+ auto numColumns =
280
+ std::min(scanState.outputVectors.size(), parquetScanState.allData[rowIndex].size());
281
+ for (size_t col = 0; col < numColumns; ++col) {
282
+ // Defensive check: ensure output vector exists
283
+ if (col >= scanState.outputVectors.size() || !scanState.outputVectors[col]) {
284
+ continue;
285
+ }
286
+
287
+ auto& dstVector = *scanState.outputVectors[col];
288
+
289
+ // Defensive check: ensure value exists for this column
290
+ if (col >= parquetScanState.allData[rowIndex].size() ||
291
+ !parquetScanState.allData[rowIndex][col]) {
292
+ dstVector.setNull(0, true);
293
+ continue;
294
+ }
295
+
296
+ auto& value = *parquetScanState.allData[rowIndex][col];
297
+
298
+ if (value.isNull()) {
299
+ dstVector.setNull(0, true);
300
+ } else {
301
+ dstVector.copyFromValue(0, value);
302
+ }
303
+ }
304
+
305
+ // Set node ID for this row
306
+ auto tableID = this->getTableID();
307
+ auto& nodeID = scanState.nodeIDVector->getValue<nodeID_t>(0);
308
+ nodeID.tableID = tableID;
309
+ nodeID.offset = rowIndex; // Use the actual row index from parquet
310
+
311
+ scanState.outState->getSelVectorUnsafe().setSelSize(1); // Return exactly one row
312
+ return true;
313
+ }
314
+
315
+ row_idx_t ParquetNodeTable::getNumTotalRows(const transaction::Transaction* transaction) {
316
+ // Create a temporary reader to get metadata
317
+ auto context = transaction->getClientContext();
318
+ if (!context) {
319
+ return 0;
320
+ }
321
+
322
+ std::vector<bool> columnSkips;
323
+
324
+ try {
325
+ auto tempReader = std::make_unique<ParquetReader>(parquetFilePath, columnSkips, context);
326
+ if (!tempReader) {
327
+ return 0;
328
+ }
329
+ auto metadata = tempReader->getMetadata();
330
+ return metadata ? metadata->num_rows : 0;
331
+ } catch (const std::exception& e) {
332
+ // If parquet file is corrupted or invalid, return 0 instead of crashing
333
+ return 0;
334
+ }
335
+ }
336
+
337
+ } // namespace storage
338
+ } // namespace lbug