faceberg-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/__init__.py +15 -0
- faceberg/bridge.py +586 -0
- faceberg/catalog.py +1491 -0
- faceberg/cli.py +483 -0
- faceberg/config.py +208 -0
- faceberg/convert.py +813 -0
- faceberg/pretty.py +224 -0
- faceberg/server.py +439 -0
- faceberg/shell.py +83 -0
- faceberg/spaces/Dockerfile +10 -0
- faceberg/spaces/README.md +85 -0
- faceberg/spaces/landing.html +799 -0
- faceberg/tests/__init__.py +0 -0
- faceberg/tests/conftest.py +229 -0
- faceberg/tests/test_bridge.py +825 -0
- faceberg/tests/test_catalog.py +1347 -0
- faceberg/tests/test_catalog_duckdb.py +341 -0
- faceberg/tests/test_catalog_pandas.py +290 -0
- faceberg/tests/test_cli.py +62 -0
- faceberg/tests/test_config.py +367 -0
- faceberg/tests/test_convert.py +422 -0
- faceberg/tests/test_pretty.py +366 -0
- faceberg/tests/test_server.py +343 -0
- faceberg/tests/test_server_playwright.py +524 -0
- faceberg-0.1.0.dist-info/METADATA +175 -0
- faceberg-0.1.0.dist-info/RECORD +29 -0
- faceberg-0.1.0.dist-info/WHEEL +4 -0
- faceberg-0.1.0.dist-info/entry_points.txt +2 -0
- faceberg-0.1.0.dist-info/licenses/LICENSE +201 -0
faceberg/tests/test_catalog_duckdb.py
@@ -0,0 +1,341 @@
"""Tests for reading catalogs using DuckDB.

These tests verify that DuckDB can properly read Iceberg tables
created by the Faceberg catalog from HuggingFace datasets.

DuckDB supports both file:// and hf:// URIs:
- file:// URIs work with local catalogs
- hf:// URIs work through the httpfs extension for remote catalogs

The tests automatically load the httpfs and iceberg extensions to enable
reading Iceberg tables with both local and remote storage.

Note: DuckDB's httpfs extension requires hf:// URLs in the format
hf://datasets/{org}/{dataset}/{file}. Datasets must have an organization/user
prefix (e.g., google-research-datasets/mbpp or glue/mrpc work, but rotten_tomatoes fails).
"""

import duckdb
import pytest


@pytest.fixture
def duckdb_conn():
    """Create a DuckDB connection for testing with required extensions."""
    conn = duckdb.connect()

    # Load httpfs extension for hf:// protocol support
    try:
        conn.execute("INSTALL httpfs")
        conn.execute("LOAD httpfs")
    except Exception as e:
        pytest.skip(f"Could not load httpfs extension: {e}")

    # Load iceberg extension for iceberg_scan support
    try:
        conn.execute("INSTALL iceberg")
        conn.execute("LOAD iceberg")
    except Exception as e:
        pytest.skip(f"Could not load iceberg extension: {e}")

    yield conn
    conn.close()


@pytest.fixture
def mbpp_metadata_path(session_mbpp):
    """Return path to MBPP table metadata for DuckDB.

    DuckDB supports both file:// and hf:// URIs through the httpfs extension.
    """
    # Construct path to v1.metadata.json directly from catalog URI
    return f"{session_mbpp.uri}/google-research-datasets/mbpp/metadata/v1.metadata.json"


# =============================================================================
# A. Basic Scanning Tests
# =============================================================================


def test_duckdb_iceberg_scan_basic(duckdb_conn, mbpp_metadata_path):
    """Test basic DuckDB iceberg_scan functionality."""
    # Use iceberg_scan to read the table
    result = duckdb_conn.execute(
        f"""
        SELECT COUNT(*) as cnt
        FROM iceberg_scan('{mbpp_metadata_path}')
        """
    ).fetchone()

    # Verify we got a count
    assert result is not None
    assert result[0] > 0


def test_duckdb_query_data(duckdb_conn, mbpp_metadata_path):
    """Test querying data with a WHERE clause."""

    # Query with WHERE clause on split column
    result = duckdb_conn.execute(
        f"""
        SELECT COUNT(*) as cnt, split
        FROM iceberg_scan('{mbpp_metadata_path}')
        WHERE split = 'train'
        GROUP BY split
        """
    ).fetchall()

    # Verify we got results
    assert len(result) > 0
    assert result[0][1] == "train"  # Split column value
    assert result[0][0] > 0  # Count


def test_duckdb_aggregation(duckdb_conn, mbpp_metadata_path):
    """Test aggregation queries (GROUP BY)."""

    # Run GROUP BY query on split column
    result = duckdb_conn.execute(
        f"""
        SELECT split, COUNT(*) as cnt
        FROM iceberg_scan('{mbpp_metadata_path}')
        GROUP BY split
        ORDER BY split
        """
    ).fetchall()

    # Verify we got multiple splits
    assert len(result) > 0

    # Verify each split has a count
    for row in result:
        split_name, count = row
        assert split_name in ["train", "test", "validation", "prompt"]
        assert count > 0


# =============================================================================
# B. Schema and Metadata Tests
# =============================================================================


def test_duckdb_read_schema(duckdb_conn, mbpp_metadata_path):
    """Test reading table schema via DuckDB."""

    # Use DESCRIBE to get schema information
    result = duckdb_conn.execute(
        f"""
        DESCRIBE SELECT * FROM iceberg_scan('{mbpp_metadata_path}') LIMIT 0
        """
    ).fetchall()

    # Verify we got column information
    assert len(result) > 0

    # Extract column names
    column_names = [row[0] for row in result]

    # Verify expected columns
    assert "split" in column_names
    assert "prompt" in column_names
    assert "code" in column_names


def test_duckdb_table_info(duckdb_conn, mbpp_metadata_path):
    """Test reading basic table information.

    Uses google-research-datasets/mbpp, which has an org prefix compatible with
    DuckDB's httpfs hf:// URL format requirements.
    """

    # Query to verify the table can be opened and scanned
    result = duckdb_conn.execute(
        f"""
        SELECT COUNT(*)
        FROM iceberg_scan('{mbpp_metadata_path}')
        """
    ).fetchone()

    # Verify we can read the table
    assert result is not None
    assert result[0] > 0  # MBPP dataset has data


# =============================================================================
# C. Partition Pruning Tests
# =============================================================================


def test_duckdb_partition_filter(duckdb_conn, mbpp_metadata_path):
    """Test partition pruning with WHERE clause."""

    # Query with and without filter to compare
    total_count = duckdb_conn.execute(
        f"""
        SELECT COUNT(*)
        FROM iceberg_scan('{mbpp_metadata_path}')
        """
    ).fetchone()[0]

    train_count = duckdb_conn.execute(
        f"""
        SELECT COUNT(*)
        FROM iceberg_scan('{mbpp_metadata_path}')
        WHERE split = 'train'
        """
    ).fetchone()[0]

    # Verify the filter selects a strict subset (train < total)
    assert train_count > 0
    assert train_count < total_count


def test_duckdb_partition_comparison(session_mbpp, duckdb_conn, mbpp_metadata_path):
    """Test that DuckDB partition filtering matches PyIceberg.

    Both DuckDB (via the httpfs extension) and PyIceberg (via HfFileIO) can read
    hf:// URIs, allowing direct comparison of query results.
    """
    # Get count from DuckDB
    duckdb_count = duckdb_conn.execute(
        f"""
        SELECT COUNT(*)
        FROM iceberg_scan('{mbpp_metadata_path}')
        WHERE split = 'train'
        """
    ).fetchone()[0]

    # Get count from PyIceberg
    table = session_mbpp.load_table("google-research-datasets.mbpp")
    scan = table.scan().filter("split = 'train'")
    arrow_table = scan.to_arrow()
    pyiceberg_count = arrow_table.num_rows

    # Verify counts match
    assert duckdb_count == pyiceberg_count


# =============================================================================
# D. REST Catalog Tests
# =============================================================================


@pytest.fixture(scope="session")
def duckdb_rest_conn(session_rest_server):
    """Create a DuckDB connection configured to use the REST catalog.

    Note: DuckDB REST catalog support is still evolving. As of DuckDB 1.4.3,
    the REST catalog configuration may not be fully supported. These tests
    are marked as expected to fail until DuckDB adds stable REST catalog support.
    """
    conn = duckdb.connect()

    # Load required extensions
    try:
        conn.execute("INSTALL httpfs")
        conn.execute("LOAD httpfs")
        conn.execute("INSTALL iceberg")
        conn.execute("LOAD iceberg")
    except Exception as e:
        pytest.skip(f"Could not load required extensions: {e}")

    # Attach REST catalog
    # Note: DuckDB REST catalog support requires specifying ENDPOINT in ATTACH;
    # AUTHORIZATION_TYPE 'none' disables authentication for the local test server
    conn.execute(f"""
        ATTACH 'warehouse' AS iceberg_catalog (
            TYPE ICEBERG,
            ENDPOINT '{session_rest_server}',
            AUTHORIZATION_TYPE 'none'
        )
    """)

    yield conn
    conn.close()


def test_duckdb_rest_list_tables(duckdb_rest_conn):
    """Test listing tables via REST catalog in DuckDB."""
    # List tables in the google-research-datasets namespace using SHOW TABLES
    result = duckdb_rest_conn.execute("""
        SHOW TABLES FROM "iceberg_catalog"."google-research-datasets"
    """).fetchall()

    # Verify we can list tables and mbpp is present
    assert len(result) > 0
    table_names = [row[0] for row in result]
    assert "mbpp" in table_names


def test_duckdb_rest_query_data(duckdb_rest_conn):
    """Test querying data via REST catalog in DuckDB."""
    # Query with WHERE clause
    result = duckdb_rest_conn.execute("""
        SELECT COUNT(*) as cnt, split
        FROM iceberg_catalog."google-research-datasets".mbpp
        WHERE split = 'train'
        GROUP BY split
    """).fetchall()

    # Verify we got results
    assert len(result) > 0
    assert result[0][1] == "train"
    assert result[0][0] > 0


def test_duckdb_rest_aggregation(duckdb_rest_conn):
    """Test aggregation queries via REST catalog."""
    # Run GROUP BY query
    result = duckdb_rest_conn.execute("""
        SELECT split, COUNT(*) as cnt
        FROM iceberg_catalog."google-research-datasets".mbpp
        GROUP BY split
        ORDER BY split
    """).fetchall()

    # Verify we got multiple splits
    assert len(result) > 0

    # Verify each split has a count
    for row in result:
        split_name, count = row
        assert split_name in ["train", "test", "validation", "prompt"]
        assert count > 0


def test_duckdb_rest_schema(duckdb_rest_conn):
    """Test reading schema via REST catalog in DuckDB."""
    # Use DESCRIBE to get schema
    result = duckdb_rest_conn.execute("""
        DESCRIBE SELECT * FROM iceberg_catalog."google-research-datasets".mbpp LIMIT 0
    """).fetchall()

    # Verify we got column information
    assert len(result) > 0

    # Extract column names
    column_names = [row[0] for row in result]

    # Verify expected columns
    assert "split" in column_names
    assert "prompt" in column_names
    assert "code" in column_names


def test_duckdb_rest_partition_filter(duckdb_rest_conn):
    """Test partition filtering via REST catalog."""
    # Query with and without filter
    total_count = duckdb_rest_conn.execute("""
        SELECT COUNT(*)
        FROM iceberg_catalog."google-research-datasets".mbpp
    """).fetchone()[0]

    train_count = duckdb_rest_conn.execute("""
        SELECT COUNT(*)
        FROM iceberg_catalog."google-research-datasets".mbpp
        WHERE split = 'train'
    """).fetchone()[0]

    # Verify partition pruning
    assert train_count > 0
    assert train_count < total_count
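Note: the tests above drive iceberg_scan through pytest fixtures. As a minimal standalone sketch of the same read path (the ./catalog path and v1.metadata.json filename below are illustrative, mirroring what the mbpp_metadata_path fixture builds from the catalog URI; they are not shipped defaults):

import duckdb

# Illustrative local catalog layout; mirrors the metadata path the
# mbpp_metadata_path fixture constructs from the catalog URI.
metadata = "./catalog/google-research-datasets/mbpp/metadata/v1.metadata.json"

conn = duckdb.connect()
conn.execute("INSTALL httpfs")   # only required when the metadata path uses hf://
conn.execute("LOAD httpfs")
conn.execute("INSTALL iceberg")
conn.execute("LOAD iceberg")

# Same query shape as test_duckdb_aggregation above.
rows = conn.execute(
    f"SELECT split, COUNT(*) AS cnt FROM iceberg_scan('{metadata}') GROUP BY split"
).fetchall()
print(rows)
conn.close()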
faceberg/tests/test_catalog_pandas.py
@@ -0,0 +1,290 @@
"""Tests for pandas.read_iceberg() integration with Faceberg catalogs.

These tests verify that pandas can read Iceberg tables created by Faceberg
using both catalog_properties and environment variable configuration.
"""

import os

import pandas as pd
import pytest


@pytest.fixture
def catalog_properties(session_mbpp):
    """Return catalog properties for pandas.read_iceberg()."""
    # Use the appropriate catalog implementation based on URI scheme
    if session_mbpp.uri.startswith("hf://"):
        catalog_impl = "faceberg.catalog.RemoteCatalog"
    else:
        catalog_impl = "faceberg.catalog.LocalCatalog"

    return {"py-catalog-impl": catalog_impl, "uri": session_mbpp.uri}


# =============================================================================
# A. Basic pandas.read_iceberg() Tests
# =============================================================================


def test_read_iceberg_with_catalog_properties(catalog_properties):
    """Test reading a table using pandas with catalog_properties."""
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test_catalog",  # Name doesn't matter when passing properties
        catalog_properties=catalog_properties,
        limit=10,
    )

    # Verify DataFrame was created
    assert isinstance(df, pd.DataFrame)
    assert len(df) == 10
    assert len(df.columns) > 0

    # Verify expected columns
    assert "split" in df.columns
    assert "prompt" in df.columns
    assert "code" in df.columns


def test_read_iceberg_with_env_vars(catalog_properties):
    """Test reading a table using pandas with environment variables.

    Note: catalog_properties is the recommended approach for programmatic usage.
    This test demonstrates that env vars can provide the URI, combined with
    catalog_properties for py-catalog-impl to bypass PyIceberg's URI inference.
    """
    catalog_uri = catalog_properties["uri"]

    # Set environment variables
    os.environ["PYICEBERG_CATALOG__TEST_CATALOG__PY_CATALOG_IMPL"] = "faceberg.catalog.LocalCatalog"
    os.environ["PYICEBERG_CATALOG__TEST_CATALOG__URI"] = catalog_uri

    try:
        # Pass py-catalog-impl and uri in catalog_properties
        # Env vars can also be used, but catalog_properties takes precedence
        df = pd.read_iceberg(
            table_identifier="google-research-datasets.mbpp",
            catalog_name="test_catalog",
            catalog_properties=catalog_properties,
            limit=10,
        )

        # Verify DataFrame was created
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert "split" in df.columns
    finally:
        # Clean up environment variables
        os.environ.pop("PYICEBERG_CATALOG__TEST_CATALOG__PY_CATALOG_IMPL", None)
        os.environ.pop("PYICEBERG_CATALOG__TEST_CATALOG__URI", None)


def test_read_iceberg_all_rows(catalog_properties):
    """Test reading all rows without a limit."""
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
    )

    # Verify we got data
    assert len(df) > 0
    assert "split" in df.columns

    # Verify multiple splits exist
    split_values = df["split"].unique()
    assert len(split_values) > 1


# =============================================================================
# B. Column Selection Tests
# =============================================================================


def test_read_iceberg_column_selection(catalog_properties):
    """Test reading specific columns - both multiple and single column."""

    # Test multiple columns
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        columns=["prompt", "code"],
        limit=5,
    )
    assert list(df.columns) == ["prompt", "code"]
    assert "split" not in df.columns
    assert len(df) == 5

    # Test single column
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        columns=["split"],
        limit=10,
    )
    assert list(df.columns) == ["split"]
    assert len(df) == 10


# =============================================================================
# C. Row Filtering Tests
# =============================================================================


def test_read_iceberg_row_filtering(catalog_properties):
    """Test various row filtering scenarios."""

    # Test filtering by partition column
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        row_filter="split = 'train'",
        limit=20,
    )
    assert len(df) == 20
    assert all(df["split"] == "train")

    # Test filtering with IN clause
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        row_filter="split IN ('train', 'test')",
        limit=30,
    )
    unique_splits = df["split"].unique()
    assert set(unique_splits).issubset({"train", "test"})
    assert "validation" not in unique_splits
    assert len(df) == 30

    # Test filtering by non-partition column (task_id exists in mbpp and is an integer)
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        row_filter="task_id = 602",
        limit=10,
    )
    assert len(df) <= 10  # May be less if task_id=602 doesn't have 10 rows
    if len(df) > 0:
        assert all(df["task_id"] == 602)


# =============================================================================
# D. Combined Filter and Column Selection Tests
# =============================================================================


def test_read_iceberg_filter_and_column_selection(catalog_properties):
    """Test combining row filters and column selection."""

    # Test basic filter with column selection
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        columns=["prompt", "code"],
        row_filter="split = 'train'",
        limit=5,
    )
    assert list(df.columns) == ["prompt", "code"]
    # Note: Some versions may optimize away rows if columns don't include filter columns
    assert len(df) <= 5  # May be less if the optimizer is aggressive

    # Test complex filtering with column selection
    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        columns=["prompt"],
        row_filter="split = 'train' AND task_id = 602",
        limit=3,
    )
    assert list(df.columns) == ["prompt"]
    # Note: Filter may be optimized differently when split/task_id are not in the projection
    assert len(df) <= 3


# =============================================================================
# E. Edge Cases and Error Handling
# =============================================================================


def test_read_iceberg_empty_result(catalog_properties):
    """Test reading with a filter that returns no rows."""

    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        row_filter="split = 'nonexistent'",
    )

    # Verify empty DataFrame with correct schema
    assert len(df) == 0
    assert "split" in df.columns
    assert "prompt" in df.columns
    assert "code" in df.columns


def test_read_iceberg_invalid_table(catalog_properties):
    """Test reading a non-existent table."""

    with pytest.raises(Exception):  # Will raise NoSuchTableError
        pd.read_iceberg(
            table_identifier="default.nonexistent_table",
            catalog_name="test",
            catalog_properties=catalog_properties,
        )


def test_read_iceberg_case_sensitive_false(catalog_properties):
    """Test case-insensitive column matching."""

    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        columns=["PROMPT", "CODE"],  # Uppercase column names
        case_sensitive=False,
        limit=5,
    )

    # Should still work with case-insensitive matching
    assert len(df) == 5
    assert len(df.columns) == 2


# =============================================================================
# F. Data Type Verification Tests
# =============================================================================


def test_read_iceberg_data_integrity(catalog_properties):
    """Test that data types and content are valid."""

    df = pd.read_iceberg(
        table_identifier="google-research-datasets.mbpp",
        catalog_name="test",
        catalog_properties=catalog_properties,
        limit=10,
    )

    # Verify data types (pandas 2.x uses StringDtype for strings)
    assert df["prompt"].dtype.name in ["object", "string", "str"]  # String type
    assert df["code"].dtype.name in ["object", "string", "str"]  # String type
    assert df["task_id"].dtype.name in ["int32", "int64"]  # Integer type
    assert df["split"].dtype.name in ["object", "string", "str"]  # String type

    # Verify prompt column contains actual text
    assert all(df["prompt"].str.len() > 0)

    # Verify code column contains actual code
    assert all(df["code"].str.len() > 0)

    # Verify split has valid values
    assert all(df["split"].isin(["train", "test", "validation", "prompt"]))
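Note: outside pytest, the catalog_properties pattern exercised above reduces to a short sketch. The ./catalog URI below is illustrative, and faceberg.catalog.LocalCatalog is the implementation the catalog_properties fixture selects for non-hf:// URIs; only arguments already used by the tests are shown:

import pandas as pd

# Illustrative local catalog URI; the properties mirror the catalog_properties fixture.
props = {"py-catalog-impl": "faceberg.catalog.LocalCatalog", "uri": "./catalog"}

df = pd.read_iceberg(
    table_identifier="google-research-datasets.mbpp",
    catalog_name="faceberg",  # arbitrary when explicit properties are passed
    catalog_properties=props,
    row_filter="split = 'train'",
    limit=5,
)
print(df.head())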
faceberg/tests/test_cli.py
@@ -0,0 +1,62 @@
"""Tests for CLI commands."""

from click.testing import CliRunner

from faceberg import config as cfg
from faceberg.cli import main


def test_list_command_with_tree_view(tmp_path):
    """Test list command uses CatalogTreeView for rich display."""
    # Create a local catalog
    catalog_dir = tmp_path / "test_catalog"
    catalog_dir.mkdir()

    # Create config with some tables
    config = cfg.Config()
    config["default"] = cfg.Namespace()
    config["default"]["imdb"] = cfg.Dataset(repo="stanfordnlp/imdb", config="plain_text")
    config["default"]["squad"] = cfg.Dataset(repo="squad", config="plain_text")
    config["analytics"] = cfg.Namespace()
    config["analytics"]["aggregated"] = cfg.Table(uri="")

    # Save config
    config.to_yaml(catalog_dir / "faceberg.yml")

    # Run list command
    runner = CliRunner()
    result = runner.invoke(main, [str(catalog_dir), "list"])

    # Verify command succeeded
    assert result.exit_code == 0

    # Verify output contains catalog name in rich format
    output = result.output

    # Verify namespaces are shown
    assert "default" in output
    assert "analytics" in output

    # Verify dataset nodes are shown with their icons
    assert "imdb" in output
    assert "squad" in output
    assert "aggregated" in output

    # Verify dataset metadata is shown (repo info)
    assert "stanfordnlp/imdb" in output


def test_list_command_empty_catalog(tmp_path):
    """Test list command with empty catalog."""
    catalog_dir = tmp_path / "empty_catalog"
    catalog_dir.mkdir()

    # Create empty config
    config = cfg.Config()
    config.to_yaml(catalog_dir / "faceberg.yml")

    runner = CliRunner()
    result = runner.invoke(main, [str(catalog_dir), "list"])

    # Command should succeed even with empty catalog
    assert result.exit_code == 0