lbug 0.12.3-dev.16 → 0.12.3-dev.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/lbug-source/CMakeLists.txt +1 -1
  2. package/lbug-source/dataset/demo-db/graph-std/demo_indices_follows.parquet +0 -0
  3. package/lbug-source/dataset/demo-db/graph-std/demo_indices_livesin.parquet +0 -0
  4. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_follows.parquet +0 -0
  5. package/lbug-source/dataset/demo-db/graph-std/demo_indptr_livesin.parquet +0 -0
  6. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_city.parquet +0 -0
  7. package/lbug-source/dataset/demo-db/graph-std/demo_mapping_user.parquet +0 -0
  8. package/lbug-source/dataset/demo-db/graph-std/demo_metadata.parquet +0 -0
  9. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_city.parquet +0 -0
  10. package/lbug-source/dataset/demo-db/graph-std/demo_nodes_user.parquet +0 -0
  11. package/lbug-source/dataset/demo-db/graph-std/schema.cypher +4 -0
  12. package/lbug-source/scripts/antlr4/Cypher.g4 +1 -1
  13. package/lbug-source/scripts/antlr4/hash.md5 +1 -1
  14. package/lbug-source/src/antlr4/Cypher.g4 +1 -1
  15. package/lbug-source/src/binder/bind/bind_ddl.cpp +23 -13
  16. package/lbug-source/src/catalog/catalog.cpp +5 -4
  17. package/lbug-source/src/catalog/catalog_entry/node_table_catalog_entry.cpp +8 -1
  18. package/lbug-source/src/catalog/catalog_entry/rel_group_catalog_entry.cpp +7 -0
  19. package/lbug-source/src/include/binder/ddl/bound_create_table_info.h +10 -6
  20. package/lbug-source/src/include/catalog/catalog_entry/node_table_catalog_entry.h +5 -3
  21. package/lbug-source/src/include/catalog/catalog_entry/rel_group_catalog_entry.h +4 -2
  22. package/lbug-source/src/include/common/constants.h +1 -0
  23. package/lbug-source/src/include/parser/ddl/create_table_info.h +3 -1
  24. package/lbug-source/src/include/processor/operator/scan/scan_node_table.h +2 -2
  25. package/lbug-source/src/include/storage/storage_manager.h +1 -0
  26. package/lbug-source/src/include/storage/table/node_table.h +6 -1
  27. package/lbug-source/src/include/storage/table/parquet_node_table.h +103 -0
  28. package/lbug-source/src/include/storage/table/parquet_rel_table.h +91 -0
  29. package/lbug-source/src/include/storage/table/rel_table.h +2 -2
  30. package/lbug-source/src/include/transaction/transaction.h +2 -0
  31. package/lbug-source/src/parser/transform/transform_ddl.cpp +6 -1
  32. package/lbug-source/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +4 -0
  33. package/lbug-source/src/processor/operator/scan/scan_multi_rel_tables.cpp +24 -2
  34. package/lbug-source/src/processor/operator/scan/scan_node_table.cpp +44 -8
  35. package/lbug-source/src/processor/operator/scan/scan_rel_table.cpp +12 -2
  36. package/lbug-source/src/storage/storage_manager.cpp +37 -6
  37. package/lbug-source/src/storage/table/CMakeLists.txt +2 -0
  38. package/lbug-source/src/storage/table/parquet_node_table.cpp +338 -0
  39. package/lbug-source/src/storage/table/parquet_rel_table.cpp +388 -0
  40. package/lbug-source/test/include/test_runner/test_group.h +11 -1
  41. package/lbug-source/test/runner/e2e_test.cpp +7 -1
  42. package/lbug-source/test/test_files/demo_db/demo_db_graph_std.test +77 -0
  43. package/lbug-source/test/test_helper/test_helper.cpp +24 -0
  44. package/lbug-source/test/test_runner/test_parser.cpp +3 -0
  45. package/lbug-source/third_party/antlr4_cypher/cypher_parser.cpp +2761 -2701
  46. package/lbug-source/third_party/antlr4_cypher/include/cypher_parser.h +2 -0
  47. package/package.json +1 -1
  48. package/prebuilt/lbugjs-darwin-arm64.node +0 -0
  49. package/prebuilt/lbugjs-linux-arm64.node +0 -0
  50. package/prebuilt/lbugjs-linux-x64.node +0 -0
  51. package/prebuilt/lbugjs-win32-x64.node +0 -0
@@ -0,0 +1,388 @@
+ #include "storage/table/parquet_rel_table.h"
+
+ #include <thread>
+
+ #include "catalog/catalog_entry/rel_group_catalog_entry.h"
+ #include "common/data_chunk/sel_vector.h"
+ #include "common/exception/runtime.h"
+ #include "common/file_system/virtual_file_system.h"
+ #include "main/client_context.h"
+ #include "processor/operator/persistent/reader/parquet/parquet_reader.h"
+ #include "storage/storage_manager.h"
+ #include "transaction/transaction.h"
+
+ using namespace lbug::catalog;
+ using namespace lbug::common;
+ using namespace lbug::processor;
+ using namespace lbug::transaction;
+
+ namespace lbug {
+ namespace storage {
+
+ void ParquetRelTableScanState::setToTable(const Transaction* transaction, Table* table_,
+     std::vector<column_id_t> columnIDs_, std::vector<ColumnPredicateSet> columnPredicateSets_,
+     RelDataDirection direction_) {
+     // Call base class implementation but skip local table setup
+     TableScanState::setToTable(transaction, table_, std::move(columnIDs_),
+         std::move(columnPredicateSets_));
+     columns.resize(columnIDs.size());
+     direction = direction_;
+     for (size_t i = 0; i < columnIDs.size(); ++i) {
+         auto columnID = columnIDs[i];
+         if (columnID == INVALID_COLUMN_ID || columnID == ROW_IDX_COLUMN_ID) {
+             columns[i] = nullptr;
+         } else {
+             columns[i] = table->cast<RelTable>().getColumn(columnID, direction);
+         }
+     }
+     csrOffsetColumn = table->cast<RelTable>().getCSROffsetColumn(direction);
+     csrLengthColumn = table->cast<RelTable>().getCSRLengthColumn(direction);
+     nodeGroupIdx = INVALID_NODE_GROUP_IDX;
+     // ParquetRelTable does not support local storage, so we skip the local table initialization
+ }
+
+ ParquetRelTable::ParquetRelTable(RelGroupCatalogEntry* relGroupEntry, table_id_t fromTableID,
+     table_id_t toTableID, const StorageManager* storageManager, MemoryManager* memoryManager)
+     : RelTable{relGroupEntry, fromTableID, toTableID, storageManager, memoryManager},
+       relGroupEntry{relGroupEntry} {
+     std::string storage = relGroupEntry->getStorage();
+     if (storage.empty()) {
+         throw RuntimeException("Parquet file path is empty for parquet-backed rel table");
+     }
+
+     // Get the relationship name for multi-table directory support
+     std::string relName = relGroupEntry->getName();
+
+     // New prefix format with relationship name: "prefix" which expands to:
+     // prefix_indices_{relName}.parquet, prefix_indptr_{relName}.parquet,
+     // prefix_metadata_{relName}.parquet
+     std::string prefix = storage;
+     indicesFilePath = prefix + "_indices_" + relName + ".parquet";
+     indptrFilePath = prefix + "_indptr_" + relName + ".parquet";
+ }
+
+ void ParquetRelTable::initScanState(Transaction* transaction, TableScanState& scanState,
+     bool resetCachedBoundNodeSelVec) const {
+     // For parquet tables, we create our own scan state
+     auto& relScanState = scanState.cast<RelTableScanState>();
+     relScanState.source = TableScanSource::COMMITTED;
+     relScanState.nodeGroup = nullptr;
+     relScanState.nodeGroupIdx = INVALID_NODE_GROUP_IDX;
+
+     // Initialize ParquetReaders for this scan state (per-thread)
+     auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
+
+     // Initialize readers if not already done for this scan state
+     if (!parquetRelScanState.indicesReader) {
+         std::vector<bool> columnSkips; // Read all columns
+         auto context = transaction->getClientContext();
+         parquetRelScanState.indicesReader =
+             std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
+     }
+     if (!indptrFilePath.empty() && !parquetRelScanState.indptrReader) {
+         std::vector<bool> columnSkips; // Read all columns
+         auto context = transaction->getClientContext();
+         parquetRelScanState.indptrReader =
+             std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
+     }
+
+     // Load shared indptr data - thread-safe to read
+     if (!indptrFilePath.empty()) {
+         loadIndptrData(transaction);
+     }
+
+     // For morsel-driven parallelism, each scan state maintains its own bound node processing state
+     // No shared state needed between threads
+     if (resetCachedBoundNodeSelVec) {
+         // Copy the cached bound node selection vector from the scan state
+         if (relScanState.nodeIDVector->state->getSelVector().isUnfiltered()) {
+             relScanState.cachedBoundNodeSelVector.setToUnfiltered();
+         } else {
+             relScanState.cachedBoundNodeSelVector.setToFiltered();
+             memcpy(relScanState.cachedBoundNodeSelVector.getMutableBuffer().data(),
+                 relScanState.nodeIDVector->state->getSelVector().getMutableBuffer().data(),
+                 relScanState.nodeIDVector->state->getSelVector().getSelSize() * sizeof(sel_t));
+         }
+         relScanState.cachedBoundNodeSelVector.setSelSize(
+             relScanState.nodeIDVector->state->getSelVector().getSelSize());
+     }
+
+     // Initialize row group ranges for morsel-driven parallelism
+     // For now, assign all row groups to this scan state (will be partitioned by the scan operator)
+     parquetRelScanState.startRowGroup = 0;
+     parquetRelScanState.endRowGroup = parquetRelScanState.indicesReader ?
+                                           parquetRelScanState.indicesReader->getNumRowsGroups() :
+                                           0;
+     parquetRelScanState.currentRowGroup = parquetRelScanState.startRowGroup;
+     parquetRelScanState.nextRowToProcess = 0;
+ }
+
+ void ParquetRelTable::initializeParquetReaders(Transaction* transaction) const {
+     if (!indicesReader) {
+         std::lock_guard lock(parquetReaderMutex);
+         if (!indicesReader) {
+             std::vector<bool> columnSkips; // Read all columns
+             auto context = transaction->getClientContext();
+             indicesReader = std::make_unique<ParquetReader>(indicesFilePath, columnSkips, context);
+         }
+     }
+ }
+
+ void ParquetRelTable::initializeIndptrReader(Transaction* transaction) const {
+     if (!indptrFilePath.empty() && !indptrReader) {
+         std::lock_guard lock(parquetReaderMutex);
+         if (!indptrReader) {
+             std::vector<bool> columnSkips; // Read all columns
+             auto context = transaction->getClientContext();
+             indptrReader = std::make_unique<ParquetReader>(indptrFilePath, columnSkips, context);
+         }
+     }
+ }
+
+ void ParquetRelTable::loadIndptrData(Transaction* transaction) const {
+     if (indptrData.empty() && !indptrFilePath.empty()) {
+         std::lock_guard lock(indptrDataMutex);
+         if (indptrData.empty()) {
+             initializeIndptrReader(transaction);
+             if (!indptrReader)
+                 return;
+
+             // Initialize scan to populate column types
+             auto context = transaction->getClientContext();
+             auto vfs = VirtualFileSystem::GetUnsafe(*context);
+             std::vector<uint64_t> groupsToRead;
+             for (uint64_t i = 0; i < indptrReader->getNumRowsGroups(); ++i) {
+                 groupsToRead.push_back(i);
+             }
+
+             ParquetReaderScanState scanState;
+             indptrReader->initializeScan(scanState, groupsToRead, vfs);
+
+             // Check if the indptr file has any columns after scan initialization
+             auto numColumns = indptrReader->getNumColumns();
+             if (numColumns == 0) {
+                 throw RuntimeException("Indptr parquet file has no columns");
+             }
+
+             // Validate column type for indptr
+             const auto& indptrType = indptrReader->getColumnType(0);
+             if (!LogicalTypeUtils::isIntegral(indptrType.getLogicalTypeID())) {
+                 throw RuntimeException(
+                     "Indptr parquet file column must be integer type (column 0)");
+             }
+
+             // Read the indptr column
+             DataChunk dataChunk(1);
+
+             // Now get the column type after scan is initialized
+             const auto& columnTypeRef = indptrReader->getColumnType(0);
+             auto columnType = columnTypeRef.copy();
+             auto vector = std::make_shared<ValueVector>(std::move(columnType));
+             dataChunk.insert(0, vector);
+
+             // Read all indptr values
+             while (indptrReader->scanInternal(scanState, dataChunk)) {
+                 auto selSize = dataChunk.state->getSelVector().getSelSize();
+                 for (size_t i = 0; i < selSize; ++i) {
+                     auto value = dataChunk.getValueVector(0).getValue<common::offset_t>(i);
+                     indptrData.push_back(value);
+                 }
+             }
+         }
+     }
+ }
+
+ bool ParquetRelTable::scanInternal(Transaction* transaction, TableScanState& scanState) {
+     auto& relScanState = scanState.cast<RelTableScanState>();
+
+     // Get the ParquetRelTableScanState
+     auto& parquetRelScanState = static_cast<ParquetRelTableScanState&>(relScanState);
+
+     // Load shared indptr data - thread-safe to read
+     if (!indptrFilePath.empty()) {
+         loadIndptrData(transaction);
+     }
+
+     // True morsel-driven parallelism: each scan state processes its assigned row groups
+     // Process all row groups assigned to this scan state, collecting relationships for bound nodes
+     return scanInternalByRowGroups(transaction, parquetRelScanState);
+ }
+
+ bool ParquetRelTable::scanInternalByRowGroups(Transaction* transaction,
+     ParquetRelTableScanState& parquetRelScanState) {
+     // True morsel-driven parallelism: process assigned row groups and collect relationships for
+     // bound nodes
+
+     // Check if we have any row groups left to process
+     if (parquetRelScanState.currentRowGroup >= parquetRelScanState.endRowGroup) {
+         // No more row groups to process
+         auto newSelVector = std::make_shared<SelectionVector>(0);
+         parquetRelScanState.outState->setSelVector(newSelVector);
+         return false;
+     }
+
+     // Process the current row group
+     std::vector<uint64_t> rowGroupsToProcess = {parquetRelScanState.currentRowGroup};
+
+     // Create a set of bound node IDs for fast lookup
+     std::unordered_set<common::offset_t> boundNodeOffsets;
+     for (size_t i = 0; i < parquetRelScanState.cachedBoundNodeSelVector.getSelSize(); ++i) {
+         common::sel_t boundNodeIdx = parquetRelScanState.cachedBoundNodeSelVector[i];
+         const auto boundNodeID = parquetRelScanState.nodeIDVector->getValue<nodeID_t>(boundNodeIdx);
+         boundNodeOffsets.insert(boundNodeID.offset);
+     }
+
+     // Scan the current row group and collect relationships for bound nodes
+     bool hasData = scanRowGroupForBoundNodes(transaction, parquetRelScanState, rowGroupsToProcess,
+         boundNodeOffsets);
+
+     // Move to next row group for next call
+     parquetRelScanState.currentRowGroup++;
+
+     return hasData;
+ }
+
+ common::offset_t ParquetRelTable::findSourceNodeForRow(common::offset_t globalRowIdx) const {
+     // Binary search in indptrData to find which source node this row belongs to
+     // indptrData[i] gives the starting row index for source node i
+     // indptrData[i+1] gives the ending row index for source node i
+
+     if (indptrData.empty()) {
+         return common::INVALID_OFFSET;
+     }
+
+     // Binary search to find the source node
+     size_t left = 0;
+     size_t right = indptrData.size() - 2; // -2 because we compare with i+1
+
+     while (left <= right) {
+         size_t mid = left + (right - left) / 2;
+         if (globalRowIdx >= indptrData[mid] && globalRowIdx < indptrData[mid + 1]) {
+             return mid; // Found the source node
+         } else if (globalRowIdx < indptrData[mid]) {
+             if (mid == 0)
+                 break;
+             right = mid - 1;
+         } else {
+             left = mid + 1;
+         }
+     }
+
+     return common::INVALID_OFFSET; // Row not found in any range
+ }
+
+ bool ParquetRelTable::scanRowGroupForBoundNodes(Transaction* transaction,
+     ParquetRelTableScanState& parquetRelScanState, const std::vector<uint64_t>& rowGroupsToProcess,
+     const std::unordered_set<common::offset_t>& boundNodeOffsets) {
+
+     // Initialize readers if needed
+     initializeParquetReaders(transaction);
+
+     if (!parquetRelScanState.indicesReader) {
+         return false;
+     }
+
+     // Initialize scan state for the assigned row groups
+     auto context = transaction->getClientContext();
+     auto vfs = VirtualFileSystem::GetUnsafe(*context);
+     parquetRelScanState.indicesReader->initializeScan(*parquetRelScanState.parquetScanState,
+         rowGroupsToProcess, vfs);
+
+     // Create DataChunk matching the indices parquet file schema
+     auto numIndicesColumns = parquetRelScanState.indicesReader->getNumColumns();
+     DataChunk indicesChunk(numIndicesColumns);
+
+     // Insert value vectors for all columns in the parquet file
+     for (uint32_t colIdx = 0; colIdx < numIndicesColumns; ++colIdx) {
+         const auto& columnTypeRef = parquetRelScanState.indicesReader->getColumnType(colIdx);
+         auto columnType = columnTypeRef.copy();
+         auto vector = std::make_shared<ValueVector>(std::move(columnType));
+         indicesChunk.insert(colIdx, vector);
+     }
+
+     // Scan the row groups and collect relationships for bound nodes
+     uint64_t totalRowsCollected = 0;
+     const uint64_t maxRowsPerCall = DEFAULT_VECTOR_CAPACITY;
+     uint64_t currentGlobalRowIdx = 0;
+
+     // Calculate the starting global row index for the first row group
+     if (!rowGroupsToProcess.empty()) {
+         auto metadata = parquetRelScanState.indicesReader->getMetadata();
+         for (uint64_t rgIdx = 0; rgIdx < rowGroupsToProcess[0]; ++rgIdx) {
+             currentGlobalRowIdx += metadata->row_groups[rgIdx].num_rows;
+         }
+     }
+
+     while (totalRowsCollected < maxRowsPerCall &&
+            parquetRelScanState.indicesReader->scanInternal(*parquetRelScanState.parquetScanState,
+                indicesChunk)) {
+
+         auto selSize = indicesChunk.state->getSelVector().getSelSize();
+
+         for (size_t i = 0; i < selSize && totalRowsCollected < maxRowsPerCall;
+              ++i, ++currentGlobalRowIdx) {
+             // Find which source node this row belongs to
+             common::offset_t sourceNodeOffset = findSourceNodeForRow(currentGlobalRowIdx);
+             if (sourceNodeOffset == common::INVALID_OFFSET) {
+                 continue; // Invalid row
+             }
+
+             // Check if this source node is in our bound nodes
+             if (boundNodeOffsets.find(sourceNodeOffset) == boundNodeOffsets.end()) {
+                 continue; // Not a bound node, skip
+             }
+
+             // This row belongs to a bound node, collect the relationship
+
+             // Column 0 in indices file is the target/destination node ID
+             // Read as offset_t and convert to INTERNAL_ID
+             auto dstOffset = indicesChunk.getValueVector(0).getValue<common::offset_t>(i);
+             auto dstNodeID = internalID_t(dstOffset, getToNodeTableID());
+
+             // outputVectors[0] is the neighbor node ID (destination), if requested
+             if (!parquetRelScanState.outputVectors.empty()) {
+                 parquetRelScanState.outputVectors[0]->setValue(totalRowsCollected, dstNodeID);
+             }
+
+             // If there are additional columns (e.g., weight), copy them to subsequent output
+             // vectors These are property columns and should have matching types
+             for (uint32_t colIdx = 1;
+                  colIdx < numIndicesColumns && colIdx < parquetRelScanState.outputVectors.size();
+                  ++colIdx) {
+                 parquetRelScanState.outputVectors[colIdx]->copyFromVectorData(totalRowsCollected,
+                     &indicesChunk.getValueVector(colIdx), i);
+             }
+
+             totalRowsCollected++;
+         }
+     }
+
+     // Set up the output state
+     if (totalRowsCollected > 0) {
+         auto selVector = std::make_shared<SelectionVector>(totalRowsCollected);
+         selVector->setToFiltered(totalRowsCollected);
+         for (uint64_t i = 0; i < totalRowsCollected; ++i) {
+             (*selVector)[i] = i;
+         }
+         parquetRelScanState.outState->setSelVector(selVector);
+
+         return true;
+     } else {
+         // No data found
+         auto selVector = std::make_shared<SelectionVector>(0);
+         parquetRelScanState.outState->setSelVector(selVector);
+         return false;
+     }
+ }
+
+ row_idx_t ParquetRelTable::getNumTotalRows(const transaction::Transaction* transaction) {
+     initializeParquetReaders(const_cast<transaction::Transaction*>(transaction));
+     if (!indicesReader) {
+         return 0;
+     }
+     auto metadata = indicesReader->getMetadata();
+     return metadata ? metadata->num_rows : 0;
+ }
+
+ } // namespace storage
+ } // namespace lbug
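
Note on the layout this file assumes: the parquet-backed rel table reads a CSR-style pair of files. The indptr parquet file stores, per source node offset, where that node's neighbor list begins in the indices parquet file, and findSourceNodeForRow inverts that mapping with a binary search over the boundaries. The following is a minimal standalone sketch of the convention, not part of the package; the values are illustrative only (loosely shaped like the demo follows data) and the variable names are made up.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // indptr[i] is the first row of the indices file that belongs to source node i;
    // indptr[i + 1] is one past its last row (the convention findSourceNodeForRow assumes).
    std::vector<uint64_t> indptr = {0, 2, 3, 4, 4}; // 4 source nodes, 4 edges in total
    std::vector<uint64_t> indices = {1, 2, 2, 3};   // destination node offsets (column 0 of indices)

    uint64_t src = 0; // enumerate the neighbors of source node 0
    for (uint64_t row = indptr[src]; row < indptr[src + 1]; ++row) {
        std::cout << src << " -> " << indices[row] << "\n"; // prints 0 -> 1 and 0 -> 2
    }
    return 0;
}

Scanning the indices file row group by row group and mapping each global row index back through these boundaries is what scanRowGroupForBoundNodes does above, which is why only the small indptr array is materialized in memory while the indices file is streamed.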
@@ -107,7 +107,17 @@ struct TestGroup {
      std::unordered_map<std::string, std::set<std::string>> testCasesConnNames;
      bool testFwdOnly;

-     enum class DatasetType { CSV, PARQUET, NPY, CSV_TO_PARQUET, TURTLE, LBUG, JSON, CSV_TO_JSON };
+     enum class DatasetType {
+         CSV,
+         PARQUET,
+         NPY,
+         CSV_TO_PARQUET,
+         TURTLE,
+         LBUG,
+         JSON,
+         CSV_TO_JSON,
+         GRAPH_STD
+     };
      DatasetType datasetType;

      bool isValid() const { return !group.empty() && !dataset.empty(); }
@@ -38,7 +38,13 @@ public:
          }
          createDB(checkpointWaitTimeout);
          createConns(connNames);
-         if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
+         if (datasetType == TestGroup::DatasetType::GRAPH_STD) {
+             // For GRAPH_STD, only run schema.cypher (which contains WITH storage = ... clauses)
+             // No copy.cypher needed as data is in external parquet files
+             lbug::main::Connection* connection =
+                 conn ? conn.get() : (connMap.begin()->second).get();
+             TestHelper::executeScript(dataset + "/" + TestHelper::SCHEMA_FILE_NAME, *connection);
+         } else if (datasetType != TestGroup::DatasetType::LBUG && dataset != "empty") {
              initGraph();
          } else if (generateBinaryDemo && TestHelper::E2E_OVERRIDE_IMPORT_DIR.empty()) {
              initGraph(TestHelper::appendLbugRootPath("dataset/demo-db/parquet/"));
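
For GRAPH-STD datasets the harness therefore executes only the schema script. The actual 4-line schema.cypher added under dataset/demo-db/graph-std/ is not reproduced in this diff, so the following is only a hedged sketch of what its statements could look like, assuming the `WITH storage = '...'` clause mentioned in the comment above and the `demo_*.parquet` file prefix visible in the file list; the real column definitions and exact syntax may differ.

// Hypothetical sketch only; not the shipped schema.cypher.
CREATE NODE TABLE user(name STRING, age INT64, PRIMARY KEY(name)) WITH storage = 'dataset/demo-db/graph-std/demo';
CREATE NODE TABLE city(name STRING, population INT64, PRIMARY KEY(name)) WITH storage = 'dataset/demo-db/graph-std/demo';
CREATE REL TABLE follows(FROM user TO user, since INT64) WITH storage = 'dataset/demo-db/graph-std/demo';
CREATE REL TABLE livesin(FROM user TO city) WITH storage = 'dataset/demo-db/graph-std/demo';

Under this reading, the 'demo' prefix is what ParquetRelTable expands into demo_indices_follows.parquet, demo_indptr_follows.parquet, and so on.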
@@ -0,0 +1,77 @@
+ -DATASET GRAPH-STD demo-db/graph-std
+
+ --
+
+ -CASE DemoDBGraphStdTest
+
+ -LOG MatchUserLivesInCity
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN u.name, u.age, c.name;
+ ---- 4
+ Adam|30|Waterloo
+ Karissa|40|Waterloo
+ Zhang|50|Kitchener
+ Noura|25|Guelph
+
+ -LOG MatchSingleNodeLabel
+ -STATEMENT MATCH (a:user) RETURN a.name, a.age;
+ ---- 4
+ Adam|30
+ Karissa|40
+ Zhang|50
+ Noura|25
+
+ -LOG MatchCityNodes
+ -STATEMENT MATCH (c:city) RETURN c.name, c.population;
+ ---- 3
+ Waterloo|150000
+ Kitchener|200000
+ Guelph|75000
+
+ -LOG MatchFollowsRel
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) RETURN a.name, b.name, e.since;
+ ---- 4
+ Adam|Karissa|2020
+ Adam|Zhang|2020
+ Karissa|Zhang|2021
+ Zhang|Noura|2022
+
+ -LOG MatchLivesInWithCityPopulation
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN u.name, c.name, c.population ORDER BY c.population DESC;
+ ---- 4
+ Zhang|Kitchener|200000
+ Adam|Waterloo|150000
+ Karissa|Waterloo|150000
+ Noura|Guelph|75000
+
+ -LOG MatchLivesInFilterByCity
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) WHERE c.name = 'Waterloo' RETURN u.name, u.age;
+ ---- 2
+ Adam|30
+ Karissa|40
+
+ -LOG MatchLivesInFilterByCityPopulation
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) WHERE c.population > 100000 RETURN u.name, c.name ORDER BY u.name;
+ ---- 3
+ Adam|Waterloo
+ Karissa|Waterloo
+ Zhang|Kitchener
+
+ -LOG CountUsersPerCity
+ -STATEMENT MATCH (u:user)-[l:livesin]->(c:city) RETURN c.name, COUNT(*) AS num_users ORDER BY num_users DESC;
+ ---- 3
+ Waterloo|2
+ Guelph|1
+ Kitchener|1
+
+ -LOG MatchFollowsWithDestinationAge
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) WHERE b.age > 30 RETURN a.name, b.name, b.age ORDER BY b.age DESC;
+ ---- 3
+ Adam|Zhang|50
+ Karissa|Zhang|50
+ Adam|Karissa|40
+
+ -LOG MatchFollowsFilterBySourceAndDest
+ -STATEMENT MATCH (a:user)-[e:follows]->(b:user) WHERE a.age < 40 AND b.age >= 40 RETURN a.name, b.name;
+ ---- 2
+ Adam|Karissa
+ Adam|Zhang
@@ -71,6 +71,30 @@ void TestHelper::executeScript(const std::string& cypherScript, Connection& conn
              auto fullPath = appendLbugRootPath(csvFilePath);
              line.replace(line.find(csvFilePath), csvFilePath.length(), fullPath);
          }
+         // Also handle storage = 'path' for parquet tables
+         std::vector<std::string> storagePaths;
+         size_t storageIndex = 0;
+         while (true) {
+             size_t start = line.find("storage = '", storageIndex);
+             if (start == std::string::npos) {
+                 break;
+             }
+             start += 11; // length of "storage = '"
+             size_t end = line.find("'", start);
+             if (end == std::string::npos) {
+                 break;
+             }
+             std::string storagePath = line.substr(start, end - start);
+             storagePaths.push_back(storagePath);
+             storageIndex = end + 1;
+         }
+         for (auto& storagePath : storagePaths) {
+             auto fullPath = appendLbugRootPath(storagePath);
+             size_t pos = line.find(storagePath);
+             if (pos != std::string::npos) {
+                 line.replace(pos, storagePath.length(), fullPath);
+             }
+         }
  #ifdef __STATIC_LINK_EXTENSION_TEST__
          if (line.starts_with("load extension")) {
              continue;
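
The added block mirrors the existing CSV-path handling just above it: every storage = '<relative path>' occurrence in a script line is collected and then replaced with an absolute path rooted at the lbug checkout, so parquet-backed tables resolve regardless of the test's working directory. A self-contained sketch of the same substitution follows; appendLbugRootPath is stubbed out and the input line is made up, so treat both as assumptions rather than package code.

#include <iostream>
#include <string>
#include <vector>

// Stand-in for TestHelper::appendLbugRootPath; the real helper prepends the repository root.
static std::string appendRootPath(const std::string& path) {
    return "/home/user/lbug/" + path;
}

int main() {
    std::string line =
        "CREATE REL TABLE follows(FROM user TO user) WITH storage = 'dataset/demo-db/graph-std/demo';";
    // Collect every quoted path that follows "storage = '".
    std::vector<std::string> storagePaths;
    size_t searchFrom = 0;
    while (true) {
        size_t start = line.find("storage = '", searchFrom);
        if (start == std::string::npos) {
            break;
        }
        start += 11; // skip past "storage = '"
        size_t end = line.find('\'', start);
        if (end == std::string::npos) {
            break;
        }
        storagePaths.push_back(line.substr(start, end - start));
        searchFrom = end + 1;
    }
    // Rewrite each collected relative path to an absolute one.
    for (const auto& storagePath : storagePaths) {
        size_t pos = line.find(storagePath);
        if (pos != std::string::npos) {
            line.replace(pos, storagePath.length(), appendRootPath(storagePath));
        }
    }
    std::cout << line << "\n"; // the storage path is now absolute
    return 0;
}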
@@ -87,6 +87,9 @@ void TestParser::extractDataset() {
              testGroup->datasetType = TestGroup::DatasetType::JSON;
              testGroup->dataset = currentToken.params[2];
          }
+     } else if (datasetType == "GRAPH-STD") {
+         testGroup->datasetType = TestGroup::DatasetType::GRAPH_STD;
+         testGroup->dataset = currentToken.params[2];
      } else {
          throw TestException(
              "Invalid dataset type `" + currentToken.params[1] + "` [" + path + ":" + line + "].");