lbug 0.12.3-dev.16 → 0.12.3-dev.17
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- package/lbug-source/CMakeLists.txt +1 -1
- package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
- package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
- package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
- package/lbug-source/scripts/antlr4/hash.md5 +1 -1
- package/lbug-source/src/antlr4/Cypher.g4 +1 -1
- package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
- package/lbug-source/src/catalog/catalog.cpp +5 -4
- package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
- package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
- package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
- package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
- package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
- package/lbug-source/src/include/common/constants.h +1 -0
- package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
- package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
- package/lbug-source/src/include/storage/storage_manager.h +1 -0
- package/lbug-source/src/include/storage/table/node_table.h +6 -1
- package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
- package/lbug-source/src/include/storage/table/parquet_rel_table.h +99 -0
- package/lbug-source/src/include/storage/table/rel_table.h +2 -2
- package/lbug-source/src/include/transaction/transaction.h +2 -0
- package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
- package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
- package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
- package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
- package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
- package/lbug-source/src/storage/storage_manager.cpp +40 -6
- package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
- package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
- package/lbug-source/src/storage/table/parquet_rel_table.cpp +470 -0
- package/lbug-source/test/include/test_runner/test_group.h +11 -1
- package/lbug-source/test/runner/e2e_test.cpp +7 -1
- package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +43 -0
- package/lbug-source/test/test_helper/test_helper.cpp +24 -0
- package/lbug-source/test/test_runner/test_parser.cpp +3 -0
- package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
- package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
- package/package.json +1 -1
- package/prebuilt/lbugjs-darwin-arm64.node +0 -0
- package/prebuilt/lbugjs-linux-arm64.node +0 -0
- package/prebuilt/lbugjs-linux-x64.node +0 -0
- package/prebuilt/lbugjs-win32-x64.node +0 -0
package/lbug-source/src/storage/table/parquet_rel_table.cpp
@@ -0,0 +1,470 @@
+#include "storage/table/parquet_rel_table.h"
+
+#include <thread>
+
+#include "catalog/catalog_entry/rel_group_catalog_entry.h"
+#include "common/data_chunk/sel_vector.h"
+#include "common/exception/runtime.h"
+#include "common/file_system/virtual_file_system.h"
+#include "main/client_context.h"
+#include "processor/operator/persistent/reader/parquet/parquet_reader.h"
+#include "storage/storage_manager.h"
+#include "transaction/transaction.h"
+
+using namespace lbug::catalog;
+using namespace lbug::common;
+using namespace lbug::processor;
+using namespace lbug::transaction;
+
+namespace lbug {
+namespace storage {
+
+void ParquetRelTableScanState::setToTable(const Transaction* transaction, Table* table_,
+    std::vector<column_id_t> columnIDs_, std::vector<ColumnPredicateSet> columnPredicateSets_,
+    RelDataDirection direction_) {
+    // Call base class implementation but skip local table setup
+    TableScanState::setToTable(transaction, table_, std::move(columnIDs_),
+        std::move(columnPredicateSets_));
+    columns.resize(columnIDs.size());
+    direction = direction_;
+    for (size_t i = 0; i < columnIDs.size(); ++i) {
+        auto columnID = columnIDs[i];
+        if (columnID == INVALID_COLUMN_ID || columnID == ROW_IDX_COLUMN_ID) {
+            columns[i] = nullptr;
+        } else {
+            columns[i] = table->cast<RelTable>().getColumn(columnID, direction);
+        }
+    }
+    csrOffsetColumn = table->cast<RelTable>().getCSROffsetColumn(direction);
+    csrLengthColumn = table->cast<RelTable>().getCSRLengthColumn(direction);
+    nodeGroupIdx = INVALID_NODE_GROUP_IDX;
+    // ParquetRelTable does not support local storage, so we skip the local table initialization
+}
+
+ParquetRelTable::ParquetRelTable(RelGroupCatalogEntry* relGroupEntry, table_id_t fromTableID,
+    table_id_t toTableID, const StorageManager* storageManager, MemoryManager* memoryManager,
+    std::string fromNodeTableName)
+    : RelTable{relGroupEntry, fromTableID, toTableID, storageManager, memoryManager},
+      relGroupEntry{relGroupEntry} {
+    std::string storage = relGroupEntry->getStorage();
+    if (storage.empty()) {
+        throw RuntimeException("Parquet file path is empty for parquet-backed rel table");
+    }
+
+    // Get the relationship name for multi-table directory support
+    std::string relName = relGroupEntry->getName();
+
+    // New prefix format with relationship name: "prefix" which expands to:
+    // prefix_indices_{relName}.parquet, prefix_indptr_{relName}.parquet,
+    // prefix_metadata_{relName}.parquet
+    std::string prefix = storage;
+    nodeMappingFilePath = prefix + "_mapping_" + fromNodeTableName + ".parquet";
+    indicesFilePath = prefix + "_indices_" + relName + ".parquet";
+    indptrFilePath = prefix + "_indptr_" + relName + ".parquet";
+}
+
+void ParquetRelTable::initScanState(Transaction* transaction, TableScanState& scanState,
+    bool resetCachedBoundNodeSelVec) const {
+    // For parquet tables, we create our own scan state
+    auto& relScanState = scanState.cast<RelTableScanState>();
+    relScanState.source = TableScanSource::COMMITTED;
+    relScanState.nodeGroup = nullptr;
+    relScanState.nodeGroupIdx = INVALID_NODE_GROUP_IDX;
+
+    // Initialize ParquetReaders for this scan state (per-thread)
+    auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
+
+    // Initialize readers if not already done for this scan state
+    if (!parquetRelScanState.nodeMappingReader) {
+        std::vector<bool> columnSkips; // Read all columns
+        auto context = transaction->getClientContext();
+        parquetRelScanState.nodeMappingReader =
+            std::make_unique<ParquetReader>(nodeMappingFilePath, columnSkips, context);
+    }
+    if (!parquetRelScanState.indicesReader) {
+        std::vector<bool> columnSkips; // Read all columns
+        auto context = transaction->getClientContext();
+        parquetRelScanState.indicesReader =
+            std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
+    }
+    if (!indptrFilePath.empty() && !parquetRelScanState.indptrReader) {
+        std::vector<bool> columnSkips; // Read all columns
+        auto context = transaction->getClientContext();
+        parquetRelScanState.indptrReader =
+            std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
+    }
+
+    // Load shared data (node mapping and indptr) - these are thread-safe to read
+    loadNodeMappingData(transaction);
+    if (!indptrFilePath.empty()) {
+        loadIndptrData(transaction);
+    }
+
+    // For morsel-driven parallelism, each scan state maintains its own bound node processing state
+    // No shared state needed between threads
+    if (resetCachedBoundNodeSelVec) {
+        // Copy the cached bound node selection vector from the scan state
+        if (relScanState.nodeIDVector->state->getSelVector().isUnfiltered()) {
+            relScanState.cachedBoundNodeSelVector.setToUnfiltered();
+        } else {
+            relScanState.cachedBoundNodeSelVector.setToFiltered();
+            memcpy(relScanState.cachedBoundNodeSelVector.getMutableBuffer().data(),
+                relScanState.nodeIDVector->state->getSelVector().getMutableBuffer().data(),
+                relScanState.nodeIDVector->state->getSelVector().getSelSize() * sizeof(sel_t));
+        }
+        relScanState.cachedBoundNodeSelVector.setSelSize(
+            relScanState.nodeIDVector->state->getSelVector().getSelSize());
+    }
+
+    // Initialize row group ranges for morsel-driven parallelism
+    // For now, assign all row groups to this scan state (will be partitioned by the scan operator)
+    parquetRelScanState.startRowGroup = 0;
+    parquetRelScanState.endRowGroup = parquetRelScanState.indicesReader ?
+        parquetRelScanState.indicesReader->getNumRowsGroups() :
+        0;
+    parquetRelScanState.currentRowGroup = parquetRelScanState.startRowGroup;
+    parquetRelScanState.nextRowToProcess = 0;
+}
+
+void ParquetRelTable::initializeParquetReaders(Transaction* transaction) const {
+    if (!nodeMappingReader || !indicesReader) {
+        std::lock_guard lock(parquetReaderMutex);
+        if (!nodeMappingReader) {
+            std::vector<bool> columnSkips; // Read all columns
+            auto context = transaction->getClientContext();
+            nodeMappingReader =
+                std::make_unique<ParquetReader>(nodeMappingFilePath, columnSkips, context);
+        }
+        if (!indicesReader) {
+            std::vector<bool> columnSkips; // Read all columns
+            auto context = transaction->getClientContext();
+            indicesReader = std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
+        }
+    }
+}
+
+void ParquetRelTable::initializeIndptrReader(Transaction* transaction) const {
+    if (!indptrFilePath.empty() && !indptrReader) {
+        std::lock_guard lock(parquetReaderMutex);
+        if (!indptrReader) {
+            std::vector<bool> columnSkips; // Read all columns
+            auto context = transaction->getClientContext();
+            indptrReader = std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
+        }
+    }
+}
+
+void ParquetRelTable::loadNodeMappingData(Transaction* transaction) const {
+    if (nodeMapping.empty() && !nodeMappingFilePath.empty()) {
+        std::lock_guard lock(parquetReaderMutex);
+        if (nodeMapping.empty()) {
+            // Initialize node mapping reader if not already done
+            if (!nodeMappingReader) {
+                std::vector<bool> columnSkips; // Read all columns
+                auto context = transaction->getClientContext();
+                nodeMappingReader =
+                    std::make_unique<ParquetReader>(nodeMappingFilePath, columnSkips, context);
+            }
+
+            // Initialize scan to populate column types
+            auto context = transaction->getClientContext();
+            auto vfs = VirtualFileSystem::GetUnsafe(*context);
+            std::vector<uint64_t> groupsToRead;
+            for (uint64_t i = 0; i < nodeMappingReader->getNumRowsGroups(); ++i) {
+                groupsToRead.push_back(i);
+            }
+
+            ParquetReaderScanState scanState;
+            nodeMappingReader->initializeScan(scanState, groupsToRead, vfs);
+
+            // Check if the node mapping file has columns
+            auto numColumns = nodeMappingReader->getNumColumns();
+            if (numColumns < 2) {
+                throw RuntimeException("Node mapping parquet file must have at least 2 columns");
+            }
+
+            // Validate column types for node mapping
+            const auto& csrNodeIdType = nodeMappingReader->getColumnType(0);
+            const auto& nodeTableIdType = nodeMappingReader->getColumnType(1);
+            if (!LogicalTypeUtils::isIntegral(csrNodeIdType.getLogicalTypeID()) ||
+                !LogicalTypeUtils::isIntegral(nodeTableIdType.getLogicalTypeID())) {
+                throw RuntimeException(
+                    "Node mapping parquet file columns must be integer types (columns 0 and 1)");
+            }
+
+            // Read the node mapping data
+            DataChunk dataChunk(2);
+
+            // Get column types
+            for (uint32_t i = 0; i < 2 && i < numColumns; ++i) {
+                const auto& columnTypeRef = nodeMappingReader->getColumnType(i);
+                auto columnType = columnTypeRef.copy();
+                auto vector = std::make_shared<ValueVector>(std::move(columnType));
+                dataChunk.insert(i, vector);
+            }
+
+            // Read all node mapping values
+            while (nodeMappingReader->scanInternal(scanState, dataChunk)) {
+                auto selSize = dataChunk.state->getSelVector().getSelSize();
+                for (size_t i = 0; i < selSize; ++i) {
+                    auto csrNodeId = dataChunk.getValueVector(0).getValue<common::offset_t>(i);
+                    auto nodeTableId = dataChunk.getValueVector(1).getValue<common::offset_t>(i);
+                    nodeMapping[common::internalID_t(nodeTableId, getFromNodeTableID())] =
+                        csrNodeId;
+                    // Also create reverse mapping for destination node lookups
+                    csrToNodeTableIdMap[csrNodeId] = nodeTableId;
+                }
+            }
+        }
+    }
+}
+
+void ParquetRelTable::loadIndptrData(Transaction* transaction) const {
+    if (indptrData.empty() && !indptrFilePath.empty()) {
+        std::lock_guard lock(indptrDataMutex);
+        if (indptrData.empty()) {
+            initializeIndptrReader(transaction);
+            if (!indptrReader)
+                return;
+
+            // Initialize scan to populate column types
+            auto context = transaction->getClientContext();
+            auto vfs = VirtualFileSystem::GetUnsafe(*context);
+            std::vector<uint64_t> groupsToRead;
+            for (uint64_t i = 0; i < indptrReader->getNumRowsGroups(); ++i) {
+                groupsToRead.push_back(i);
+            }
+
+            ParquetReaderScanState scanState;
+            indptrReader->initializeScan(scanState, groupsToRead, vfs);
+
+            // Check if the indptr file has any columns after scan initialization
+            auto numColumns = indptrReader->getNumColumns();
+            if (numColumns == 0) {
+                throw RuntimeException("Indptr parquet file has no columns");
+            }
+
+            // Validate column type for indptr
+            const auto& indptrType = indptrReader->getColumnType(0);
+            if (!LogicalTypeUtils::isIntegral(indptrType.getLogicalTypeID())) {
+                throw RuntimeException(
+                    "Indptr parquet file column must be integer type (column 0)");
+            }
+
+            // Read the indptr column
+            DataChunk dataChunk(1);
+
+            // Now get the column type after scan is initialized
+            const auto& columnTypeRef = indptrReader->getColumnType(0);
+            auto columnType = columnTypeRef.copy();
+            auto vector = std::make_shared<ValueVector>(std::move(columnType));
+            dataChunk.insert(0, vector);
+
+            // Read all indptr values
+            while (indptrReader->scanInternal(scanState, dataChunk)) {
+                auto selSize = dataChunk.state->getSelVector().getSelSize();
+                for (size_t i = 0; i < selSize; ++i) {
+                    auto value = dataChunk.getValueVector(0).getValue<common::offset_t>(i);
+                    indptrData.push_back(value);
+                }
+            }
+        }
+    }
+}
+
+bool ParquetRelTable::scanInternal(Transaction* transaction, TableScanState& scanState) {
+    auto& relScanState = scanState.cast<RelTableScanState>();
+
+    // Get the ParquetRelTableScanState
+    auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
+
+    // Readers are now initialized per scan state in initScanState
+    // Load shared data (node mapping and indptr) - these are thread-safe to read
+    loadNodeMappingData(transaction);
+    if (!indptrFilePath.empty()) {
+        loadIndptrData(transaction);
+    }
+
+    // True morsel-driven parallelism: each scan state processes its assigned row groups
+    // Process all row groups assigned to this scan state, collecting relationships for bound nodes
+    return scanInternalByRowGroups(transaction, parquetRelScanState);
+}
+
+bool ParquetRelTable::scanInternalByRowGroups(Transaction* transaction,
+    ParquetRelTableScanState& parquetRelScanState) {
+    // True morsel-driven parallelism: process assigned row groups and collect relationships for
+    // bound nodes
+
+    // Check if we have any row groups left to process
+    if (parquetRelScanState.currentRowGroup >= parquetRelScanState.endRowGroup) {
+        // No more row groups to process
+        auto newSelVector = std::make_shared<SelectionVector>(0);
+        parquetRelScanState.outState->setSelVector(newSelVector);
+        return false;
+    }
+
+    // Process the current row group
+    std::vector<uint64_t> rowGroupsToProcess = {parquetRelScanState.currentRowGroup};
+
+    // Create a set of bound node IDs for fast lookup
+    std::unordered_set<common::offset_t> boundNodeOffsets;
+    for (size_t i = 0; i < parquetRelScanState.cachedBoundNodeSelVector.getSelSize(); ++i) {
+        common::sel_t boundNodeIdx = parquetRelScanState.cachedBoundNodeSelVector[i];
+        const auto boundNodeID = parquetRelScanState.nodeIDVector->getValue<nodeID_t>(boundNodeIdx);
+        boundNodeOffsets.insert(boundNodeID.offset);
+    }
+
+    // Scan the current row group and collect relationships for bound nodes
+    bool hasData = scanRowGroupForBoundNodes(transaction, parquetRelScanState, rowGroupsToProcess,
+        boundNodeOffsets);
+
+    // Move to next row group for next call
+    parquetRelScanState.currentRowGroup++;
+
+    return hasData;
+}
+
+common::offset_t ParquetRelTable::findSourceNodeForRow(common::offset_t globalRowIdx) const {
+    // Binary search in indptrData to find which source node this row belongs to
+    // indptrData[i] gives the starting row index for source node i
+    // indptrData[i+1] gives the ending row index for source node i
+
+    if (indptrData.empty()) {
+        return common::INVALID_OFFSET;
+    }
+
+    // Binary search to find the source node
+    size_t left = 0;
+    size_t right = indptrData.size() - 2; // -2 because we compare with i+1
+
+    while (left <= right) {
+        size_t mid = left + (right - left) / 2;
+        if (globalRowIdx >= indptrData[mid] && globalRowIdx < indptrData[mid + 1]) {
+            return mid; // Found the source node
+        } else if (globalRowIdx < indptrData[mid]) {
+            if (mid == 0)
+                break;
+            right = mid - 1;
+        } else {
+            left = mid + 1;
+        }
+    }
+
+    return common::INVALID_OFFSET; // Row not found in any range
+}
+
+bool ParquetRelTable::scanRowGroupForBoundNodes(Transaction* transaction,
+    ParquetRelTableScanState& parquetRelScanState, const std::vector<uint64_t>& rowGroupsToProcess,
+    const std::unordered_set<common::offset_t>& boundNodeOffsets) {
+
+    // Initialize readers if needed
+    initializeParquetReaders(transaction);
+
+    if (!parquetRelScanState.indicesReader) {
+        return false;
+    }
+
+    // Initialize scan state for the assigned row groups
+    auto context = transaction->getClientContext();
+    auto vfs = VirtualFileSystem::GetUnsafe(*context);
+    parquetRelScanState.indicesReader->initializeScan(*parquetRelScanState.parquetScanState,
+        rowGroupsToProcess, vfs);
+
+    // Create DataChunk matching the indices parquet file schema
+    auto numIndicesColumns = parquetRelScanState.indicesReader->getNumColumns();
+    DataChunk indicesChunk(numIndicesColumns);
+
+    // Insert value vectors for all columns in the parquet file
+    for (uint32_t colIdx = 0; colIdx < numIndicesColumns; ++colIdx) {
+        const auto& columnTypeRef = parquetRelScanState.indicesReader->getColumnType(colIdx);
+        auto columnType = columnTypeRef.copy();
+        auto vector = std::make_shared<ValueVector>(std::move(columnType));
+        indicesChunk.insert(colIdx, vector);
+    }
+
+    // Scan the row groups and collect relationships for bound nodes
+    uint64_t totalRowsCollected = 0;
+    const uint64_t maxRowsPerCall = DEFAULT_VECTOR_CAPACITY;
+    uint64_t currentGlobalRowIdx = 0;
+
+    // Calculate the starting global row index for the first row group
+    if (!rowGroupsToProcess.empty()) {
+        auto metadata = parquetRelScanState.indicesReader->getMetadata();
+        for (uint64_t rgIdx = 0; rgIdx < rowGroupsToProcess[0]; ++rgIdx) {
+            currentGlobalRowIdx += metadata->row_groups[rgIdx].num_rows;
+        }
+    }
+
+    while (totalRowsCollected < maxRowsPerCall &&
+           parquetRelScanState.indicesReader->scanInternal(*parquetRelScanState.parquetScanState,
+               indicesChunk)) {
+
+        auto selSize = indicesChunk.state->getSelVector().getSelSize();
+
+        for (size_t i = 0; i < selSize && totalRowsCollected < maxRowsPerCall;
+             ++i, ++currentGlobalRowIdx) {
+            // Find which source node this row belongs to
+            common::offset_t sourceNodeOffset = findSourceNodeForRow(currentGlobalRowIdx);
+            if (sourceNodeOffset == common::INVALID_OFFSET) {
+                continue; // Invalid row
+            }
+
+            // Check if this source node is in our bound nodes
+            if (boundNodeOffsets.find(sourceNodeOffset) == boundNodeOffsets.end()) {
+                continue; // Not a bound node, skip
+            }
+
+            // This row belongs to a bound node, collect the relationship
+
+            // Column 0 in indices file is the target/destination node ID
+            // Read as offset_t and convert to INTERNAL_ID
+            auto dstOffset = indicesChunk.getValueVector(0).getValue<common::offset_t>(i);
+            auto dstNodeID = internalID_t(dstOffset, getToNodeTableID());
+
+            // outputVectors[0] is the neighbor node ID (destination), if requested
+            if (!parquetRelScanState.outputVectors.empty()) {
+                parquetRelScanState.outputVectors[0]->setValue(totalRowsCollected, dstNodeID);
+            }
+
+            // If there are additional columns (e.g., weight), copy them to subsequent output
+            // vectors. These are property columns and should have matching types.
+            for (uint32_t colIdx = 1;
+                 colIdx < numIndicesColumns && colIdx < parquetRelScanState.outputVectors.size();
+                 ++colIdx) {
+                parquetRelScanState.outputVectors[colIdx]->copyFromVectorData(totalRowsCollected,
+                    &indicesChunk.getValueVector(colIdx), i);
+            }
+
+            totalRowsCollected++;
+        }
+    }
+
+    // Set up the output state
+    if (totalRowsCollected > 0) {
+        auto selVector = std::make_shared<SelectionVector>(totalRowsCollected);
+        selVector->setToFiltered(totalRowsCollected);
+        for (uint64_t i = 0; i < totalRowsCollected; ++i) {
+            (*selVector)[i] = i;
+        }
+        parquetRelScanState.outState->setSelVector(selVector);

+        return true;
+    } else {
+        // No data found
+        auto selVector = std::make_shared<SelectionVector>(0);
+        parquetRelScanState.outState->setSelVector(selVector);
+        return false;
+    }
+}
+
+row_idx_t ParquetRelTable::getNumTotalRows(const transaction::Transaction* transaction) {
+    initializeParquetReaders(const_cast<transaction::Transaction*>(transaction));
+    if (!indicesReader) {
+        return 0;
+    }
+    auto metadata = indicesReader->getMetadata();
+    return metadata ? metadata->num_rows : 0;
+}
+
+} // namespace storage
+} // namespace lbug
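The new table above reads edges from external parquet files in a CSR layout: for a storage prefix p and rel name r, p_indptr_r.parquet stores, for each CSR source node i, the half-open row range [indptr[i], indptr[i+1]) of its edges in p_indices_r.parquet, and findSourceNodeForRow inverts that mapping with a binary search. Below is a minimal, self-contained C++ sketch of that lookup; the data values and the simplified free-function signature are illustrative assumptions, not part of the package.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using offset_t = uint64_t;
constexpr offset_t INVALID_OFFSET = UINT64_MAX;

// Mirror of ParquetRelTable::findSourceNodeForRow: given a global row index
// into the indices file, binary-search indptr for the source node whose
// [indptr[i], indptr[i + 1]) range contains it.
offset_t findSourceNodeForRow(const std::vector<offset_t>& indptr, offset_t row) {
    if (indptr.size() < 2) {
        return INVALID_OFFSET;
    }
    size_t left = 0;
    size_t right = indptr.size() - 2; // -2 because we compare with i+1
    while (left <= right) {
        size_t mid = left + (right - left) / 2;
        if (row >= indptr[mid] && row < indptr[mid + 1]) {
            return mid; // found the owning source node
        } else if (row < indptr[mid]) {
            if (mid == 0) {
                break;
            }
            right = mid - 1;
        } else {
            left = mid + 1;
        }
    }
    return INVALID_OFFSET; // row not covered by any range
}

int main() {
    // Hypothetical CSR data: node 0 owns rows [0, 2), node 1 owns [2, 3),
    // node 2 owns [3, 3) (no edges), node 3 owns [3, 8).
    std::vector<offset_t> indptr = {0, 2, 3, 3, 8};
    std::vector<offset_t> indices = {1, 3, 2, 0, 1, 2, 1, 3}; // destination node offsets
    for (offset_t row = 0; row < indices.size(); ++row) {
        std::cout << "row " << row << ": src " << findSourceNodeForRow(indptr, row)
                  << " -> dst " << indices[row] << "\n";
    }
    return 0;
}

Since scanRowGroupForBoundNodes visits rows in increasing global order, the O(log n) per-row search could equally be a single advancing cursor over indptr; loading indptr fully into memory in loadIndptrData makes either approach cheap.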
package/lbug-source/test/include/test_runner/test_group.h
@@ -107,7 +107,17 @@ struct TestGroup {
     std::unordered_map<std::string, std::set<std::string>> testCasesConnNames;
     bool testFwdOnly;
 
-    enum class DatasetType {
+    enum class DatasetType {
+        CSV,
+        PARQUET,
+        NPY,
+        CSV_TO_PARQUET,
+        TURTLE,
+        LBUG,
+        JSON,
+        CSV_TO_JSON,
+        GRAPH_STD
+    };
     DatasetType datasetType;
 
     bool isValid() const { return !group.empty() && !dataset.empty(); }
package/lbug-source/test/runner/e2e_test.cpp
@@ -38,7 +38,13 @@ public:
         }
         createDB(checkpointWaitTimeout);
        createConns(connNames);
-        if (datasetType
+        if (datasetType == TestGroup::DatasetType::GRAPH_STD) {
+            // For GRAPH_STD, only run schema.cypher (which contains WITH storage = ... clauses)
+            // No copy.cypher needed as data is in external parquet files
+            lbug::main::Connection* connection =
+                conn ? conn.get() : (connMap.begin()->second).get();
+            TestHelper::executeScript(dataset + "/" + TestHelper::SCHEMA_FILE_NAME, *connection);
+        } else if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
             initGraph();
         } else if (generateBinaryDemo && TestHelper::E2E_OVERRIDE_IMPORT_DIR.empty()) {
             initGraph(TestHelper::appendLbugRootPath("dataset/demo-db/parquet/"));
package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test
@@ -0,0 +1,43 @@
+-DATASET GRAPH-STD demo-db/graph-std
+
+--
+
+-CASE DemoDBGraphStdTest
+
+-LOG MatchUserLivesInCity
+-STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN u.name, u.age, c.name;
+---- 7
+Adam|30|Guelph
+Adam|30|Guelph
+Karissa|40|Waterloo
+Noura|25|Kitchener
+Noura|25|Waterloo
+Noura|25|Waterloo
+Zhang|50|Kitchener
+
+-LOG MatchSingleNodeLabel
+-STATEMENT MATCH (a:user) RETURN a.name, a.age;
+---- 4
+Adam|30
+Karissa|40
+Zhang|50
+Noura|25
+
+-LOG MatchCityNodes
+-STATEMENT MATCH (c:city) RETURN c.name, c.population;
+---- 3
+Waterloo|150000
+Kitchener|200000
+Guelph|75000
+
+-LOG MatchFollowsRel
+-STATEMENT MATCH (a:user)-[e:follows]->(b:user) RETURN a.name, b.name, e.since;
+---- 8
+Adam|Zhang|2022
+Karissa|Noura|2020
+Karissa|Zhang|2020
+Noura|Karissa|2020
+Noura|Zhang|2021
+Zhang|Adam|2022
+Zhang|Karissa|2020
+Zhang|Noura|2021
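For reference, each block in this test format pairs a -STATEMENT with ---- N followed by exactly N expected rows, pipe-delimited. A hypothetical additional check in the same style (not part of this release; the count follows from the four user rows above) would read:

-LOG CountUsers
-STATEMENT MATCH (a:user) RETURN COUNT(*);
---- 1
4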
package/lbug-source/test/test_helper/test_helper.cpp
@@ -71,6 +71,30 @@ void TestHelper::executeScript(const std::string& cypherScript, Connection& conn
             auto fullPath = appendLbugRootPath(csvFilePath);
             line.replace(line.find(csvFilePath), csvFilePath.length(), fullPath);
         }
+        // Also handle storage = 'path' for parquet tables
+        std::vector<std::string> storagePaths;
+        size_t storageIndex = 0;
+        while (true) {
+            size_t start = line.find("storage = '", storageIndex);
+            if (start == std::string::npos) {
+                break;
+            }
+            start += 11; // length of "storage = '"
+            size_t end = line.find("'", start);
+            if (end == std::string::npos) {
+                break;
+            }
+            std::string storagePath = line.substr(start, end - start);
+            storagePaths.push_back(storagePath);
+            storageIndex = end + 1;
+        }
+        for (auto& storagePath : storagePaths) {
+            auto fullPath = appendLbugRootPath(storagePath);
+            size_t pos = line.find(storagePath);
+            if (pos != std::string::npos) {
+                line.replace(pos, storagePath.length(), fullPath);
+            }
+        }
 #ifdef __STATIC_LINK_EXTENSION_TEST__
         if (line.starts_with("load extension")) {
             continue;
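The block added to executeScript above rewrites relative paths inside storage = '...' clauses to absolute ones before each statement runs, mirroring the existing CSV path rewriting. The following standalone sketch exercises the same loop on a sample line; the DDL text and the appendLbugRootPath stand-in are assumptions for illustration (the real helper resolves against the lbug source root), while the parsing loop matches the diff.

#include <iostream>
#include <string>
#include <vector>

// Stand-in for TestHelper::appendLbugRootPath, which prefixes the source root.
std::string appendLbugRootPath(const std::string& path) {
    return "/path/to/lbug/" + path;
}

int main() {
    // Hypothetical schema.cypher line of the kind the GRAPH_STD flow executes.
    std::string line = "CREATE NODE TABLE user (name STRING, age INT64, PRIMARY KEY (name)) "
                       "WITH storage = 'dataset/demo-db/graph-std/demo';";
    // Collect every quoted value following "storage = '".
    std::vector<std::string> storagePaths;
    size_t storageIndex = 0;
    while (true) {
        size_t start = line.find("storage = '", storageIndex);
        if (start == std::string::npos) {
            break;
        }
        start += 11; // length of "storage = '"
        size_t end = line.find("'", start);
        if (end == std::string::npos) {
            break;
        }
        storagePaths.push_back(line.substr(start, end - start));
        storageIndex = end + 1;
    }
    // Replace each relative path with its root-anchored form.
    for (auto& storagePath : storagePaths) {
        auto fullPath = appendLbugRootPath(storagePath);
        size_t pos = line.find(storagePath);
        if (pos != std::string::npos) {
            line.replace(pos, storagePath.length(), fullPath);
        }
    }
    std::cout << line << "\n"; // ... WITH storage = '/path/to/lbug/dataset/demo-db/graph-std/demo';
    return 0;
}

One caveat the sketch inherits from the diff: if the same relative path occurs twice on one line, the second iteration's line.find locates the occurrence embedded in the already-rewritten absolute path, so that prefix gets rewritten again.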
package/lbug-source/test/test_runner/test_parser.cpp
@@ -87,6 +87,9 @@ void TestParser::extractDataset() {
             testGroup->datasetType = TestGroup::DatasetType::JSON;
             testGroup->dataset = currentToken.params[2];
         }
+    } else if (datasetType == "GRAPH-STD") {
+        testGroup->datasetType = TestGroup::DatasetType::GRAPH_STD;
+        testGroup->dataset = currentToken.params[2];
     } else {
         throw TestException(
             "Invalid dataset type `" + currentToken.params[1] + "` [" + path + ":" + line + "].");