lbug 0.12.3-dev.16 → 0.12.3-dev.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lbug-source/CMakeLists.txt +1 -1
- package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
- package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
- package/lbug-source/scripts/antlr4/hash.md5 +1 -1
- package/lbug-source/src/antlr4/Cypher.g4 +1 -1
- package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
- package/lbug-source/src/catalog/catalog.cpp +5 -4
- package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
- package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
- package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
- package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
- package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
- package/lbug-source/src/include/common/constants.h +1 -0
- package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
- package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
- package/lbug-source/src/include/storage/storage_manager.h +1 -0
- package/lbug-source/src/include/storage/table/node_table.h +6 -1
- package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
- package/lbug-source/src/include/storage/table/parquet_rel_table.h +99 -0
- package/lbug-source/src/include/storage/table/rel_table.h +2 -2
- package/lbug-source/src/include/transaction/transaction.h +2 -0
- package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
- package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
- package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
- package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
- package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
- package/lbug-source/src/storage/storage_manager.cpp +40 -6
- package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
- package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
- package/lbug-source/src/storage/table/parquet_rel_table.cpp +470 -0
- package/lbug-source/test/include/test_runner/test_group.h +11 -1
- package/lbug-source/test/runner/e2e_test.cpp +7 -1
- package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +43 -0
- package/lbug-source/test/test_helper/test_helper.cpp +24 -0
- package/lbug-source/test/test_runner/test_parser.cpp +3 -0
- package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
- package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
- package/package.json +1 -1
- package/prebuilt/lbugjs-darwin-arm64.node +0 -0
- package/prebuilt/lbugjs-linux-arm64.node +0 -0
- package/prebuilt/lbugjs-linux-x64.node +0 -0
- package/prebuilt/lbugjs-win32-x64.node +0 -0
|
@@ -81,7 +81,12 @@ std::unique_ptr<Statement> Transformer::transformCreateNodeTable(
|
|
|
81
81
|
} else {
|
|
82
82
|
createTableInfo.propertyDefinitions =
|
|
83
83
|
transformPropertyDefinitions(*ctx.kU_PropertyDefinitions());
|
|
84
|
-
|
|
84
|
+
options_t options;
|
|
85
|
+
if (ctx.kU_Options()) {
|
|
86
|
+
options = transformOptions(*ctx.kU_Options());
|
|
87
|
+
}
|
|
88
|
+
createTableInfo.extraInfo =
|
|
89
|
+
std::make_unique<ExtraCreateNodeTableInfo>(getPKName(ctx), std::move(options));
|
|
85
90
|
return std::make_unique<CreateTable>(std::move(createTableInfo));
|
|
86
91
|
}
|
|
87
92
|
}
|
|
@@ -340,6 +340,10 @@ std::unique_ptr<ColumnReader> ParquetReader::createReader() {
|
|
|
340
340
|
throw CopyException{"Root element of Parquet file must be a struct"};
|
|
341
341
|
}
|
|
342
342
|
// LCOV_EXCL_STOP
|
|
343
|
+
// Clear existing column metadata before populating (in case createReader is called multiple
|
|
344
|
+
// times)
|
|
345
|
+
columnNames.clear();
|
|
346
|
+
columnTypes.clear();
|
|
343
347
|
for (auto& field : StructType::getFields(rootReader->getDataType())) {
|
|
344
348
|
columnNames.push_back(field.getName());
|
|
345
349
|
columnTypes.push_back(field.getType().copy());
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
#include "processor/execution_context.h"
|
|
4
4
|
#include "storage/local_storage/local_storage.h"
|
|
5
|
+
#include "storage/table/parquet_rel_table.h"
|
|
5
6
|
|
|
6
7
|
using namespace lbug::common;
|
|
7
8
|
using namespace lbug::storage;
|
|
@@ -54,8 +55,29 @@ void ScanMultiRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionCo
|
|
|
54
55
|
auto clientContext = context->clientContext;
|
|
55
56
|
boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
|
|
56
57
|
auto nbrNodeIDVector = outVectors[0];
|
|
57
|
-
|
|
58
|
-
|
|
58
|
+
|
|
59
|
+
// Check if any table in any scanner is a ParquetRelTable
|
|
60
|
+
bool hasParquetTable = false;
|
|
61
|
+
for (auto& [_, scanner] : scanners) {
|
|
62
|
+
for (auto& relInfo : scanner.relInfos) {
|
|
63
|
+
if (dynamic_cast<storage::ParquetRelTable*>(relInfo.table) != nullptr) {
|
|
64
|
+
hasParquetTable = true;
|
|
65
|
+
break;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
if (hasParquetTable)
|
|
69
|
+
break;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Create appropriate scan state type
|
|
73
|
+
if (hasParquetTable) {
|
|
74
|
+
scanState =
|
|
75
|
+
std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
|
|
76
|
+
boundNodeIDVector, outVectors, nbrNodeIDVector->state);
|
|
77
|
+
} else {
|
|
78
|
+
scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
|
|
79
|
+
boundNodeIDVector, outVectors, nbrNodeIDVector->state);
|
|
80
|
+
}
|
|
59
81
|
for (auto& [_, scanner] : scanners) {
|
|
60
82
|
for (auto& relInfo : scanner.relInfos) {
|
|
61
83
|
if (directionInfo.directionPos.isValid()) {
|
|
@@ -2,8 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
#include "binder/expression/expression_util.h"
|
|
4
4
|
#include "processor/execution_context.h"
|
|
5
|
+
#include "storage/buffer_manager/memory_manager.h"
|
|
5
6
|
#include "storage/local_storage/local_node_table.h"
|
|
6
7
|
#include "storage/local_storage/local_storage.h"
|
|
8
|
+
#include "storage/table/parquet_node_table.h"
|
|
7
9
|
|
|
8
10
|
using namespace lbug::common;
|
|
9
11
|
using namespace lbug::storage;
|
|
@@ -35,7 +37,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
|
|
|
35
37
|
this->table = table;
|
|
36
38
|
this->currentCommittedGroupIdx = 0;
|
|
37
39
|
this->currentUnCommittedGroupIdx = 0;
|
|
38
|
-
|
|
40
|
+
|
|
41
|
+
// Initialize table-specific scan coordination (e.g., for ParquetNodeTable)
|
|
42
|
+
table->initializeScanCoordination(transaction);
|
|
43
|
+
|
|
44
|
+
if (const auto parquetTable = dynamic_cast<ParquetNodeTable*>(table)) {
|
|
45
|
+
// For parquet tables, set numCommittedNodeGroups to number of row groups
|
|
46
|
+
std::vector<bool> columnSkips;
|
|
47
|
+
try {
|
|
48
|
+
auto tempReader = std::make_unique<processor::ParquetReader>(
|
|
49
|
+
parquetTable->getParquetFilePath(), columnSkips, transaction->getClientContext());
|
|
50
|
+
this->numCommittedNodeGroups = tempReader->getNumRowsGroups();
|
|
51
|
+
} catch (const std::exception& e) {
|
|
52
|
+
this->numCommittedNodeGroups = 1;
|
|
53
|
+
}
|
|
54
|
+
} else {
|
|
55
|
+
this->numCommittedNodeGroups = table->getNumCommittedNodeGroups();
|
|
56
|
+
}
|
|
39
57
|
if (transaction->isWriteTransaction()) {
|
|
40
58
|
if (const auto localTable =
|
|
41
59
|
transaction->getLocalStorage()->getLocalTable(this->table->getTableID())) {
|
|
@@ -46,21 +64,23 @@ void ScanNodeTableSharedState::initialize(const transaction::Transaction* transa
|
|
|
46
64
|
progressSharedState.numGroups += numCommittedNodeGroups;
|
|
47
65
|
}
|
|
48
66
|
|
|
49
|
-
void ScanNodeTableSharedState::nextMorsel(
|
|
67
|
+
void ScanNodeTableSharedState::nextMorsel(TableScanState& scanState,
|
|
50
68
|
ScanNodeTableProgressSharedState& progressSharedState) {
|
|
51
69
|
std::unique_lock lck{mtx};
|
|
70
|
+
// Cast to NodeTableScanState since we know this is for node tables
|
|
71
|
+
auto& nodeScanState = scanState.cast<NodeTableScanState>();
|
|
52
72
|
if (currentCommittedGroupIdx < numCommittedNodeGroups) {
|
|
53
|
-
|
|
73
|
+
nodeScanState.nodeGroupIdx = currentCommittedGroupIdx++;
|
|
54
74
|
progressSharedState.numGroupsScanned++;
|
|
55
|
-
|
|
75
|
+
nodeScanState.source = TableScanSource::COMMITTED;
|
|
56
76
|
return;
|
|
57
77
|
}
|
|
58
78
|
if (currentUnCommittedGroupIdx < numUnCommittedNodeGroups) {
|
|
59
|
-
|
|
60
|
-
|
|
79
|
+
nodeScanState.nodeGroupIdx = currentUnCommittedGroupIdx++;
|
|
80
|
+
nodeScanState.source = TableScanSource::UNCOMMITTED;
|
|
61
81
|
return;
|
|
62
82
|
}
|
|
63
|
-
|
|
83
|
+
nodeScanState.source = TableScanSource::NONE;
|
|
64
84
|
}
|
|
65
85
|
|
|
66
86
|
table_id_map_t<SemiMask*> ScanNodeTable::getSemiMasks() const {
|
|
@@ -82,7 +102,18 @@ void ScanNodeTableInfo::initScanState(TableScanState& scanState,
|
|
|
82
102
|
// Builds the per-thread scan state and positions the operator on its first table.
//
// A ParquetNodeTableScanState is created when ANY of the scanned tables is a
// ParquetNodeTable, not just the first one: initCurrentTable() forwards the shared
// scanState to ParquetNodeTable::initScanState(), which downcasts it to
// ParquetNodeTableScanState without a runtime check. Deciding on tableInfos[0] alone
// would hand a plain NodeTableScanState to a parquet table that appears later in the
// list. Iterating also avoids indexing into an empty tableInfos.
void ScanNodeTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) {
    ScanTable::initLocalStateInternal(resultSet, context);
    auto nodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();

    // Check whether any scanned table is parquet-backed.
    bool hasParquetTable = false;
    for (const auto& tableInfo : tableInfos) {
        if (dynamic_cast<ParquetNodeTable*>(tableInfo.table) != nullptr) {
            hasParquetTable = true;
            break;
        }
    }

    // Create the matching scan state type.
    if (hasParquetTable) {
        scanState = std::make_unique<ParquetNodeTableScanState>(
            *MemoryManager::Get(*context->clientContext), nodeIDVector, outVectors,
            nodeIDVector->state);
    } else {
        scanState =
            std::make_unique<NodeTableScanState>(nodeIDVector, outVectors, nodeIDVector->state);
    }

    currentTableIdx = 0;
    initCurrentTable(context);
}
|
|
@@ -91,6 +122,11 @@ void ScanNodeTable::initCurrentTable(ExecutionContext* context) {
|
|
|
91
122
|
auto& currentInfo = tableInfos[currentTableIdx];
|
|
92
123
|
currentInfo.initScanState(*scanState, outVectors, context->clientContext);
|
|
93
124
|
scanState->semiMask = sharedStates[currentTableIdx]->getSemiMask();
|
|
125
|
+
// Call table->initScanState for ParquetNodeTable
|
|
126
|
+
if (dynamic_cast<ParquetNodeTable*>(tableInfos[currentTableIdx].table)) {
|
|
127
|
+
auto transaction = transaction::Transaction::Get(*context->clientContext);
|
|
128
|
+
tableInfos[currentTableIdx].table->initScanState(transaction, *scanState);
|
|
129
|
+
}
|
|
94
130
|
}
|
|
95
131
|
|
|
96
132
|
void ScanNodeTable::initGlobalStateInternal(ExecutionContext* context) {
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
#include "binder/expression/expression_util.h"
|
|
4
4
|
#include "processor/execution_context.h"
|
|
5
|
+
#include "storage/buffer_manager/memory_manager.h"
|
|
5
6
|
#include "storage/local_storage/local_rel_table.h"
|
|
7
|
+
#include "storage/table/parquet_rel_table.h"
|
|
6
8
|
|
|
7
9
|
using namespace lbug::common;
|
|
8
10
|
using namespace lbug::storage;
|
|
@@ -66,8 +68,16 @@ void ScanRelTable::initLocalStateInternal(ResultSet* resultSet, ExecutionContext
|
|
|
66
68
|
auto clientContext = context->clientContext;
|
|
67
69
|
auto boundNodeIDVector = resultSet->getValueVector(opInfo.nodeIDPos).get();
|
|
68
70
|
auto nbrNodeIDVector = outVectors[0];
|
|
69
|
-
|
|
70
|
-
|
|
71
|
+
// Check if this is a ParquetRelTable and create appropriate scan state
|
|
72
|
+
auto* parquetTable = dynamic_cast<storage::ParquetRelTable*>(tableInfo.table);
|
|
73
|
+
if (parquetTable) {
|
|
74
|
+
scanState =
|
|
75
|
+
std::make_unique<storage::ParquetRelTableScanState>(*MemoryManager::Get(*clientContext),
|
|
76
|
+
boundNodeIDVector, outVectors, nbrNodeIDVector->state);
|
|
77
|
+
} else {
|
|
78
|
+
scanState = std::make_unique<RelTableScanState>(*MemoryManager::Get(*clientContext),
|
|
79
|
+
boundNodeIDVector, outVectors, nbrNodeIDVector->state);
|
|
80
|
+
}
|
|
71
81
|
tableInfo.initScanState(*scanState, outVectors, clientContext);
|
|
72
82
|
}
|
|
73
83
|
|
|
@@ -13,6 +13,8 @@
|
|
|
13
13
|
#include "storage/buffer_manager/memory_manager.h"
|
|
14
14
|
#include "storage/checkpointer.h"
|
|
15
15
|
#include "storage/table/node_table.h"
|
|
16
|
+
#include "storage/table/parquet_node_table.h"
|
|
17
|
+
#include "storage/table/parquet_rel_table.h"
|
|
16
18
|
#include "storage/table/rel_table.h"
|
|
17
19
|
#include "storage/wal/wal_replayer.h"
|
|
18
20
|
#include "transaction/transaction.h"
|
|
@@ -77,15 +79,31 @@ void StorageManager::recover(main::ClientContext& clientContext, bool throwOnWal
|
|
|
77
79
|
}
|
|
78
80
|
|
|
79
81
|
// Registers storage for a newly created node table.
// The table name is remembered in tableNameCache under the table ID. An entry with a
// non-empty storage string gets a ParquetNodeTable; every other entry gets a regular
// NodeTable.
void StorageManager::createNodeTable(NodeTableCatalogEntry* entry) {
    tableNameCache[entry->getTableID()] = entry->getName();
    const bool isParquetBacked = !entry->getStorage().empty();
    if (isParquetBacked) {
        tables[entry->getTableID()] =
            std::make_unique<ParquetNodeTable>(this, entry, &memoryManager);
        return;
    }
    tables[entry->getTableID()] = std::make_unique<NodeTable>(this, entry, &memoryManager);
}
|
|
82
92
|
|
|
83
93
|
// TODO(Guodong): This API is added since storageManager doesn't provide an API to add a single
|
|
84
94
|
// rel table. We may have to refactor the existing StorageManager::createTable(TableCatalogEntry*
|
|
85
95
|
// entry).
|
|
86
96
|
// Registers storage for a single rel table of a rel group, keyed by info.oid.
// A rel group with an empty storage string gets a regular RelTable. Otherwise a
// ParquetRelTable is created, which additionally receives the name of the source node
// table. That name is looked up with tableNameCache.at(), so an uncached source table
// ID raises std::out_of_range.
void StorageManager::addRelTable(RelGroupCatalogEntry* entry, const RelTableCatalogInfo& info) {
    if (entry->getStorage().empty()) {
        // Regular rel table.
        tables[info.oid] = std::make_unique<RelTable>(entry, info.nodePair.srcTableID,
            info.nodePair.dstTableID, this, &memoryManager);
        return;
    }
    // Parquet-backed rel table.
    std::string srcNodeTableName = tableNameCache.at(info.nodePair.srcTableID);
    tables[info.oid] = std::make_unique<ParquetRelTable>(entry, info.nodePair.srcTableID,
        info.nodePair.dstTableID, this, &memoryManager, srcNodeTableName);
}
|
|
90
108
|
|
|
91
109
|
void StorageManager::createRelTableGroup(RelGroupCatalogEntry* entry) {
|
|
@@ -257,7 +275,14 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
|
|
|
257
275
|
KU_ASSERT(!tables.contains(tableID));
|
|
258
276
|
auto tableEntry = catalog->getTableCatalogEntry(&DUMMY_TRANSACTION, tableID)
|
|
259
277
|
->ptrCast<NodeTableCatalogEntry>();
|
|
260
|
-
|
|
278
|
+
tableNameCache[tableID] = tableEntry->getName();
|
|
279
|
+
if (!tableEntry->getStorage().empty()) {
|
|
280
|
+
// Create parquet-backed node table
|
|
281
|
+
tables[tableID] = std::make_unique<ParquetNodeTable>(this, tableEntry, &memoryManager);
|
|
282
|
+
} else {
|
|
283
|
+
// Create regular node table
|
|
284
|
+
tables[tableID] = std::make_unique<NodeTable>(this, tableEntry, &memoryManager);
|
|
285
|
+
}
|
|
261
286
|
tables[tableID]->deserialize(context, this, deSer);
|
|
262
287
|
}
|
|
263
288
|
deSer.validateDebuggingInfo(key, "num_rel_groups");
|
|
@@ -279,8 +304,17 @@ void StorageManager::deserialize(main::ClientContext* context, const Catalog* ca
|
|
|
279
304
|
for (auto k = 0u; k < numInnerRelTables; k++) {
|
|
280
305
|
RelTableCatalogInfo info = RelTableCatalogInfo::deserialize(deSer);
|
|
281
306
|
KU_ASSERT(!tables.contains(info.oid));
|
|
282
|
-
|
|
283
|
-
|
|
307
|
+
if (!relGroupEntry->getStorage().empty()) {
|
|
308
|
+
// Create parquet-backed rel table
|
|
309
|
+
std::string fromNodeTableName = tableNameCache.at(info.nodePair.srcTableID);
|
|
310
|
+
tables[info.oid] =
|
|
311
|
+
std::make_unique<ParquetRelTable>(relGroupEntry, info.nodePair.srcTableID,
|
|
312
|
+
info.nodePair.dstTableID, this, &memoryManager, fromNodeTableName);
|
|
313
|
+
} else {
|
|
314
|
+
// Create regular rel table
|
|
315
|
+
tables[info.oid] = std::make_unique<RelTable>(relGroupEntry,
|
|
316
|
+
info.nodePair.srcTableID, info.nodePair.dstTableID, this, &memoryManager);
|
|
317
|
+
}
|
|
284
318
|
tables.at(info.oid)->deserialize(context, this, deSer);
|
|
285
319
|
}
|
|
286
320
|
}
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
#include "storage/table/parquet_node_table.h"
|
|
2
|
+
|
|
3
|
+
#include <mutex>
|
|
4
|
+
|
|
5
|
+
#include "catalog/catalog_entry/node_table_catalog_entry.h"
|
|
6
|
+
#include "common/data_chunk/sel_vector.h"
|
|
7
|
+
#include "common/exception/runtime.h"
|
|
8
|
+
#include "common/file_system/virtual_file_system.h"
|
|
9
|
+
#include "common/types/value/value.h"
|
|
10
|
+
#include "main/client_context.h"
|
|
11
|
+
#include "processor/operator/persistent/reader/parquet/parquet_reader.h"
|
|
12
|
+
#include "storage/buffer_manager/memory_manager.h"
|
|
13
|
+
#include "storage/storage_manager.h"
|
|
14
|
+
#include "storage/storage_utils.h"
|
|
15
|
+
#include "storage/table/column.h"
|
|
16
|
+
#include "transaction/transaction.h"
|
|
17
|
+
|
|
18
|
+
using namespace lbug::catalog;
|
|
19
|
+
using namespace lbug::common;
|
|
20
|
+
using namespace lbug::processor;
|
|
21
|
+
using namespace lbug::transaction;
|
|
22
|
+
|
|
23
|
+
namespace lbug {
|
|
24
|
+
namespace storage {
|
|
25
|
+
|
|
26
|
+
ParquetNodeTable::ParquetNodeTable(const StorageManager* storageManager,
|
|
27
|
+
const NodeTableCatalogEntry* nodeTableEntry, MemoryManager* memoryManager)
|
|
28
|
+
: NodeTable{storageManager, nodeTableEntry, memoryManager},
|
|
29
|
+
nodeTableCatalogEntry{nodeTableEntry} {
|
|
30
|
+
std::string prefix = nodeTableEntry->getStorage();
|
|
31
|
+
if (prefix.empty()) {
|
|
32
|
+
throw RuntimeException("Parquet file prefix is empty for parquet-backed node table");
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Get the table name for multi-table directory support
|
|
36
|
+
std::string tableName = nodeTableEntry->getName();
|
|
37
|
+
|
|
38
|
+
// For node tables with multi-table support:
|
|
39
|
+
// prefix_nodes_{tableName}.parquet (e.g., demo_nodes_city.parquet)
|
|
40
|
+
parquetFilePath = prefix + "_nodes_" + tableName + ".parquet";
|
|
41
|
+
sharedState = std::make_unique<ParquetNodeTableSharedState>();
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
void ParquetNodeTable::initScanState(Transaction* transaction, TableScanState& scanState,
|
|
45
|
+
[[maybe_unused]] bool resetCachedBoundNodeSelVec) const {
|
|
46
|
+
// Set up the scan state similar to how NodeTable does it
|
|
47
|
+
auto& nodeScanState = scanState.cast<NodeTableScanState>();
|
|
48
|
+
nodeScanState.source = TableScanSource::COMMITTED;
|
|
49
|
+
|
|
50
|
+
// Note: Don't set nodeGroupIdx here - it's set by the morsel-driven parallelism system
|
|
51
|
+
|
|
52
|
+
auto& parquetNodeScanState = static_cast<ParquetNodeTableScanState&>(nodeScanState);
|
|
53
|
+
|
|
54
|
+
// Reset scan state for each scan to allow multiple scans of the same table in one query
|
|
55
|
+
parquetNodeScanState.dataRead = false;
|
|
56
|
+
parquetNodeScanState.allData.clear();
|
|
57
|
+
parquetNodeScanState.totalRows = 0;
|
|
58
|
+
parquetNodeScanState.nextRowToDistribute = 0;
|
|
59
|
+
|
|
60
|
+
// Reset scan completion flag for this scan state
|
|
61
|
+
parquetNodeScanState.scanCompleted = false;
|
|
62
|
+
|
|
63
|
+
// Each scan state gets its own parquet reader for thread safety
|
|
64
|
+
if (!parquetNodeScanState.initialized) {
|
|
65
|
+
auto context = transaction->getClientContext();
|
|
66
|
+
if (!context) {
|
|
67
|
+
throw RuntimeException("Invalid client context for parquet scan state initialization");
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
std::vector<bool> columnSkips;
|
|
71
|
+
try {
|
|
72
|
+
parquetNodeScanState.parquetReader =
|
|
73
|
+
std::make_unique<ParquetReader>(parquetFilePath, columnSkips, context);
|
|
74
|
+
parquetNodeScanState.initialized = true;
|
|
75
|
+
} catch (const std::exception& e) {
|
|
76
|
+
throw RuntimeException("Failed to initialize parquet reader for file '" +
|
|
77
|
+
parquetFilePath + "': " + e.what());
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Set nodeGroupIdx to invalid initially - will be assigned by getNextRowGroup
|
|
82
|
+
parquetNodeScanState.nodeGroupIdx = INVALID_NODE_GROUP_IDX;
|
|
83
|
+
|
|
84
|
+
// Initialize scan state for the current row group (assigned via shared state)
|
|
85
|
+
initParquetScanForRowGroup(transaction, parquetNodeScanState);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
void ParquetNodeTable::initializeScanCoordination(const Transaction* transaction) {
|
|
89
|
+
// Reset shared state at the start of each scan operation
|
|
90
|
+
// This is called once per scan operation by the ScanNodeTable operator
|
|
91
|
+
// Create a temporary reader to get the number of row groups
|
|
92
|
+
auto context = transaction->getClientContext();
|
|
93
|
+
if (!context) {
|
|
94
|
+
return;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
std::vector<bool> columnSkips;
|
|
98
|
+
try {
|
|
99
|
+
auto tempReader = std::make_unique<ParquetReader>(parquetFilePath, columnSkips, context);
|
|
100
|
+
auto numRowGroups = tempReader->getNumRowsGroups();
|
|
101
|
+
sharedState->reset(numRowGroups);
|
|
102
|
+
} catch (const std::exception& e) {
|
|
103
|
+
// If we can't read the file, set to 1 row group as fallback
|
|
104
|
+
sharedState->reset(1);
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// Points the scan state's parquet reader at the row group it should read.
// - If nodeGroupIdx is still INVALID_NODE_GROUP_IDX, the next row group is requested
//   from sharedState. When none is left, the scan state is flagged as completed and
//   the reader is initialized with an empty group list.
// - Otherwise the already assigned nodeGroupIdx is (re)used.
// Returns silently, without touching the reader, when the client context, the virtual
// file system, the reader or the reader scan state is missing.
void ParquetNodeTable::initParquetScanForRowGroup(Transaction* transaction,
    ParquetNodeTableScanState& scanState) const {
    auto clientContext = transaction->getClientContext();
    if (!clientContext) {
        return;
    }
    auto vfs = VirtualFileSystem::GetUnsafe(*clientContext);
    // Defensive guards: all three are needed by initializeScan() below.
    if (!vfs || !scanState.parquetReader || !scanState.parquetScanState) {
        return;
    }

    std::vector<uint64_t> groupsToRead;
    if (scanState.nodeGroupIdx == INVALID_NODE_GROUP_IDX) {
        common::node_group_idx_t nextGroup = INVALID_NODE_GROUP_IDX;
        if (!sharedState->getNextRowGroup(nextGroup)) {
            // Nothing left to hand out. The reader is still initialized (with no groups)
            // so that it is left in a valid state.
            scanState.scanCompleted = true;
            scanState.parquetReader->initializeScan(*scanState.parquetScanState, groupsToRead,
                vfs);
            return;
        }
        scanState.nodeGroupIdx = nextGroup;
    }
    // Either freshly assigned above or assigned before this call.
    groupsToRead.push_back(scanState.nodeGroupIdx);

    scanState.parquetReader->initializeScan(*scanState.parquetScanState, groupsToRead, vfs);
}
|
|
154
|
+
|
|
155
|
+
// Emits one row per call from the parquet data buffered in the scan state.
//
// On the first call after initScanState() the reader is asked for a chunk, and every
// row of that chunk is materialized as Values into parquetScanState.allData, laid out
// by OUTPUT column position. Parquet columns are matched to output positions by
// property name -> column ID -> index in scanState.columnIDs; parquet columns that are
// unknown to the catalog entry or not requested are ignored. Subsequent calls copy the
// next buffered row into position 0 of the output vectors and set the node ID.
//
// Returns true when a row was produced, false when the scan state is completed,
// uninitialized, or out of buffered rows.
// Throws RuntimeException when the parquet file reports zero columns.
//
// `scanState` must be a ParquetNodeTableScanState (unchecked downcast).
bool ParquetNodeTable::scanInternal(Transaction* transaction, TableScanState& scanState) {
    auto& parquetScanState = static_cast<ParquetNodeTableScanState&>(scanState);

    if (parquetScanState.scanCompleted) {
        return false;
    }

    scanState.resetOutVectors();

    if (!parquetScanState.dataRead) {
        if (!parquetScanState.initialized) {
            return false;
        }

        auto numParquetColumns = parquetScanState.parquetReader->getNumColumns();
        if (numParquetColumns == 0) {
            throw RuntimeException("Parquet file '" + parquetFilePath + "' has no columns");
        }

        // The chunk must have exactly one vector per parquet column, typed as in the file,
        // because the reader writes into the result vectors by parquet column index.
        DataChunk parquetDataChunk(numParquetColumns, scanState.outState);
        for (uint32_t i = 0; i < numParquetColumns; ++i) {
            const auto& parquetColumnType = parquetScanState.parquetReader->getColumnType(i);
            auto columnType = parquetColumnType.copy();
            auto vector = std::make_shared<ValueVector>(std::move(columnType),
                MemoryManager::Get(*transaction->getClientContext()), scanState.outState);
            parquetDataChunk.insert(i, vector);
        }

        // NOTE(review): scan() is invoked once, so only the rows delivered by this single
        // call are buffered. If a row group can span several scan() calls, the remaining
        // rows are never read -- confirm against ParquetReader::scan.
        parquetScanState.parquetReader->scan(*parquetScanState.parquetScanState, parquetDataChunk);

        auto selSize = parquetDataChunk.state->getSelVector().getSelSize();
        if (selSize > 0) {
            parquetScanState.allData.resize(selSize);

            // Resolve the parquet column -> output position mapping once; it does not
            // depend on the row being copied.
            struct ColumnMapping {
                size_t parquetCol;
                size_t outputCol;
            };
            const size_t numOutputSlots = scanState.outputVectors.size();
            const auto maxParquetCol = std::min(static_cast<size_t>(numParquetColumns),
                static_cast<size_t>(parquetDataChunk.getNumValueVectors()));
            std::vector<ColumnMapping> columnMappings;
            columnMappings.reserve(maxParquetCol);
            for (size_t parquetCol = 0; parquetCol < maxParquetCol; ++parquetCol) {
                std::string parquetColumnName =
                    parquetScanState.parquetReader->getColumnName(parquetCol);
                // Check existence first; columns absent from the table schema are skipped.
                if (!nodeTableCatalogEntry->containsProperty(parquetColumnName)) {
                    continue;
                }
                column_id_t parquetColumnID = nodeTableCatalogEntry->getColumnID(parquetColumnName);
                // First output position that asks for this column ID, if any.
                for (size_t outCol = 0; outCol < scanState.columnIDs.size(); ++outCol) {
                    if (scanState.columnIDs[outCol] == parquetColumnID) {
                        if (outCol < numOutputSlots) {
                            columnMappings.push_back({parquetCol, outCol});
                        }
                        break;
                    }
                }
            }

            for (size_t row = 0; row < selSize; ++row) {
                auto& rowValues = parquetScanState.allData[row];
                // One slot per output vector, not per parquet column.
                rowValues.resize(numOutputSlots);

                for (const auto& mapping : columnMappings) {
                    auto& srcVector = parquetDataChunk.getValueVectorMutable(mapping.parquetCol);
                    // Defensive: skip rows the source vector does not cover.
                    if (row >= srcVector.state->getSelVector().getSelSize()) {
                        continue;
                    }
                    if (srcVector.isNull(row)) {
                        rowValues[mapping.outputCol] =
                            std::make_unique<Value>(Value::createNullValue());
                    } else {
                        rowValues[mapping.outputCol] =
                            std::make_unique<Value>(*srcVector.getAsValue(row));
                    }
                }
            }
            parquetScanState.totalRows = selSize;
        }
        parquetScanState.dataRead = true;
    }

    // Hand out the next buffered row, if any.
    if (parquetScanState.nextRowToDistribute >= parquetScanState.totalRows) {
        parquetScanState.scanCompleted = true;
        return false;
    }

    size_t rowIndex = parquetScanState.nextRowToDistribute++;

    // Defensive: totalRows and allData should agree, but never index past the buffer.
    if (rowIndex >= parquetScanState.allData.size()) {
        parquetScanState.scanCompleted = true;
        return false;
    }

    auto& bufferedRow = parquetScanState.allData[rowIndex];
    auto numCopiedColumns = std::min(scanState.outputVectors.size(), bufferedRow.size());
    for (size_t col = 0; col < numCopiedColumns; ++col) {
        if (!scanState.outputVectors[col]) {
            continue;
        }
        auto& dstVector = *scanState.outputVectors[col];

        // A missing slot (no matching parquet column) is reported as NULL.
        if (!bufferedRow[col] || bufferedRow[col]->isNull()) {
            dstVector.setNull(0, true);
        } else {
            dstVector.copyFromValue(0, *bufferedRow[col]);
        }
    }

    // NOTE(review): the offset is the row's index inside the buffered chunk, so rows
    // from different row groups presumably receive overlapping offsets -- confirm
    // whether a per-row-group base offset is required.
    auto tableID = this->getTableID();
    auto& nodeID = scanState.nodeIDVector->getValue<nodeID_t>(0);
    nodeID.tableID = tableID;
    nodeID.offset = rowIndex;

    scanState.outState->getSelVectorUnsafe().setSelSize(1); // exactly one row per call
    return true;
}
|
|
314
|
+
|
|
315
|
+
// Returns the row count recorded in the parquet file's metadata.
// A temporary ParquetReader is opened just to read the metadata. The result is 0 when
// the transaction has no client context, when the reader exposes no metadata, or when
// opening/reading the file throws (e.g. missing or corrupted file).
row_idx_t ParquetNodeTable::getNumTotalRows(const transaction::Transaction* transaction) {
    auto context = transaction->getClientContext();
    if (!context) {
        return 0;
    }

    std::vector<bool> columnSkips;

    try {
        // std::make_unique never yields a null pointer (it throws on failure), so no null
        // check is needed on the reader itself.
        auto tempReader = std::make_unique<ParquetReader>(parquetFilePath, columnSkips, context);
        auto metadata = tempReader->getMetadata();
        return metadata ? metadata->num_rows : 0;
    } catch (const std::exception&) {
        // Deliberate best-effort: report an empty table instead of propagating the error.
        return 0;
    }
}
|
|
336
|
+
|
|
337
|
+
} // namespace storage
|
|
338
|
+
} // namespace lbug
|