faceberg 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg-0.1.2.dist-info/METADATA +149 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- faceberg-0.1.0.dist-info/METADATA +0 -175
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
faceberg/_version.py
ADDED
@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
+
+__commit_id__ = commit_id = None
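For reference, downstream code reads this generated module like any other attribute access; a minimal sketch using only the names defined above:

    from faceberg._version import __version__, version_tuple

    print(__version__)    # "0.1.2"
    print(version_tuple)  # (0, 1, 2)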
faceberg/catalog.py
CHANGED
@@ -4,7 +4,6 @@ import logging
 import os
 import shutil
 import tempfile
-import uuid
 from contextlib import contextmanager
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, Union
@@ -20,7 +19,7 @@ from pyiceberg.exceptions import (
     NoSuchTableError,
     TableAlreadyExistsError,
 )
-from pyiceberg.io import FileIO
+from pyiceberg.io import FileIO, load_file_io
 from pyiceberg.io.fsspec import FsspecFileIO
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionKey, PartitionSpec
 from pyiceberg.schema import Schema
@@ -34,8 +33,8 @@ from pyiceberg.typedef import EMPTY_DICT, Properties
 from uuid_utils import uuid7
 
 from . import config as cfg
-from .
-from .
+from .discover import discover_dataset
+from .iceberg import write_snapshot
 
 if TYPE_CHECKING:
     import pyarrow as pa
@@ -361,8 +360,6 @@ class BaseCatalog(Catalog):
         Returns:
            FileIO instance with authentication configured
        """
-        from pyiceberg.io import load_file_io
-
        # Start with catalog's persisted properties
        props = dict(self.properties)
        # Add runtime-only token if available
@@ -956,72 +953,82 @@ class BaseCatalog(Catalog):
                 identifier, state="in_progress", percent=0, stage="Discovering dataset"
             )
 
-            dataset_info =
+            dataset_info = discover_dataset(
                 repo_id=repo,
                 config=config,
                 token=self._hf_token,
             )
 
-            #
+            # Prepare schema with split column
             if progress_callback:
-                progress_callback(
+                progress_callback(
+                    identifier, state="in_progress", percent=10, stage="Converting schema"
+                )
+
+            if not dataset_info.files:
+                raise ValueError(f"No Parquet files found in dataset {repo}")
 
-            #
-
-
-
-
+            # Convert HuggingFace features to Arrow schema
+            arrow_schema = dataset_info.features.arrow_schema
+
+            # Build table properties
+            data_path = (
+                f"hf://datasets/{repo}/{dataset_info.data_dir}"
+                if dataset_info.data_dir
+                else f"hf://datasets/{repo}"
             )
 
-
+            properties = {
+                "format-version": "2",
+                "write.parquet.compression-codec": "snappy",
+                "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+                "write.data.path": data_path,
+                "hf.dataset.repo": repo,
+                "hf.dataset.config": config,
+                "hf.dataset.revision": dataset_info.revision,
+                "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+                "hf.write.split": "train",
+            }
+
+            # Write Iceberg metadata
             if progress_callback:
                 progress_callback(
-                    identifier, state="in_progress", percent=
+                    identifier, state="in_progress", percent=20, stage="Writing Iceberg metadata"
                 )
 
             with self._staging() as staging:
-                # Define table directory in the staging area
-                # Note: IcebergMetadataWriter will create the metadata subdirectory
-                table_dir = staging / identifier.path
-                table_dir.mkdir(parents=True, exist_ok=True)
-
                 # Create table URI for metadata
                 table_uri = self.uri / identifier.path
 
-                #
-
-                    table_path=table_dir,
-                    schema=table_info.schema,
-                    partition_spec=table_info.partition_spec,
-                    base_uri=table_uri,
-                )
+                # Load FileIO with HuggingFace support
+                io = self._load_file_io(location=str(table_uri))
 
-                #
-
-
-
-
-
-
-                    properties=
-
-
+                # Write snapshot metadata with split column
+                write_snapshot(
+                    files=dataset_info.files,
+                    schema=arrow_schema,
+                    current_metadata=None,
+                    output_dir=staging / identifier.path,
+                    base_uri=str(table_uri),
+                    properties=properties,
+                    include_split_column=True,
+                    io=io,
                 )
 
-                #
-                # Record all created files in the table directory
+                # Record all created files in the table metadata directory
                 if progress_callback:
                     progress_callback(identifier, state="in_progress", percent=90, stage="Finalizing")
 
-
+                metadata_dir = staging / identifier.path / "metadata"
+                for path in metadata_dir.rglob("*"):
                     if path.is_file():
                         staging.add(path.relative_to(staging.path))
 
                 # Register table in config if not already there
                 if identifier not in catalog_config:
                     catalog_config[identifier] = cfg.Dataset(
-                        repo=
-                        config=
+                        repo=repo,
+                        config=config,
                     )
                     # Save config since we added a dataset table
                     catalog_config.to_yaml(staging / "faceberg.yml")
@@ -1109,16 +1116,17 @@ class BaseCatalog(Catalog):
             "Please recreate the table to enable incremental sync."
         )
 
-        # Discover dataset at current revision
-
+        # Discover dataset at current revision
+        # Note: The new discover_dataset() doesn't support since_revision filtering yet
+        # So we discover all files and write_snapshot() will handle the diff
+        dataset_info = discover_dataset(
             repo_id=table_entry.repo,
             config=table_entry.config,
             token=self._hf_token,
-            since_revision=old_revision,
         )
 
-        # Check if already up to date (
-        if
+        # Check if already up to date (same revision)
+        if dataset_info.revision == old_revision:
             logger.info(f"Table {identifier} already at revision {old_revision}")
             if progress_callback:
                 progress_callback(
@@ -1126,43 +1134,51 @@ class BaseCatalog(Catalog):
                 )
             return table
 
-        #
-        #
-
-
-
+        # Use existing table schema - don't modify it
+        # The schema was already set correctly when the table was created
+
+        # Build updated properties
+        data_path = (
+            f"hf://datasets/{table_entry.repo}/{dataset_info.data_dir}"
+            if dataset_info.data_dir
+            else f"hf://datasets/{table_entry.repo}"
        )
 
-
-
-
-
+        properties = {
+            "format-version": "2",
+            "write.parquet.compression-codec": "snappy",
+            "write.py-location-provider.impl": "faceberg.catalog.HfLocationProvider",
+            "write.data.path": data_path,
+            "hf.dataset.repo": table_entry.repo,
+            "hf.dataset.config": table_entry.config,
+            "hf.dataset.revision": dataset_info.revision,
+            "hf.write.pattern": "{split}-{uuid}-iceberg.parquet",
+            "hf.write.split": "train",
+        }
 
-        # Append new snapshot with
+        # Append new snapshot with all files (write_snapshot will handle diffing)
         with self._staging() as staging:
-            # Create local metadata directory
-            metadata_dir = staging / identifier.path / "metadata"
-            metadata_dir.mkdir(parents=True, exist_ok=True)
-
             # Create table URI for metadata
-            table_uri = self.uri / identifier.path
-
-            # Create metadata writer
-            metadata_writer = IcebergMetadataWriter(
-                table_path=metadata_dir,
-                schema=table_info.schema,
-                partition_spec=table_info.partition_spec,
-                base_uri=table_uri,
-            )
+            table_uri = self.uri / identifier.path
 
-            #
-
-
+            # Load FileIO with HuggingFace support
+            io = self._load_file_io(location=str(table_uri))
+
+            # Write new snapshot (will diff against current_metadata)
+            # Schema and include_split_column parameters are ignored when current_metadata exists
+            # - it uses current_metadata.schema() and current_metadata.spec()
+            write_snapshot(
+                files=dataset_info.files,
+                schema=dataset_info.features.arrow_schema,  # Only used if creating new table
                current_metadata=table.metadata,
-
+                output_dir=staging / identifier.path,
+                base_uri=str(table_uri),
+                properties=properties,
+                io=io,
            )
 
-            # Record all files in the
+            # Record all files in the metadata directory (including new manifest/metadata files)
+            metadata_dir = staging / identifier.path / "metadata"
            for path in metadata_dir.rglob("*"):
                if path.is_file():
                    staging.add(path.relative_to(staging.path))
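The hunks above replace the old IcebergMetadataWriter path with a two-step flow: discover_dataset() resolves the dataset's files, schema, and revision, and write_snapshot() (defined in the new faceberg/iceberg.py, whose body is not shown in this section) emits the Iceberg metadata. A minimal sketch of that flow, reconstructed from the call sites above; the repo id, config name, output directory, and the plain load_file_io() call standing in for the catalog's _load_file_io() helper are illustrative assumptions, not faceberg's actual wiring:

    from pathlib import Path

    from pyiceberg.io import load_file_io

    from faceberg.discover import discover_dataset
    from faceberg.iceberg import write_snapshot

    repo = "user/my-dataset"  # hypothetical dataset repo
    info = discover_dataset(repo_id=repo, config="default", token=None)

    properties = {
        "format-version": "2",
        "hf.dataset.repo": repo,
        "hf.dataset.config": "default",
        "hf.dataset.revision": info.revision,
    }

    # First snapshot: no current metadata yet, so the split column is added to the schema.
    write_snapshot(
        files=info.files,
        schema=info.features.arrow_schema,
        current_metadata=None,
        output_dir=Path("staging") / "my_table",
        base_uri=f"hf://datasets/{repo}",
        properties=properties,
        include_split_column=True,
        io=load_file_io(properties),  # the catalog builds an hf://-aware FileIO via _load_file_io()
    )

Subsequent syncs pass the table's existing metadata as current_metadata instead of None, and write_snapshot() diffs the newly discovered files against it, as the second set of hunks shows.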
faceberg/discover.py
ADDED
@@ -0,0 +1,181 @@
+"""HuggingFace dataset discovery.
+
+This module discovers HuggingFace datasets and extracts metadata without
+any Iceberg-specific conversions. It provides the foundation for converting
+datasets to Iceberg tables.
+"""
+
+import os
+import tempfile
+from dataclasses import dataclass
+from typing import List, Optional
+
+from datasets import Features, load_dataset_builder
+from huggingface_hub import HfApi
+
+
+def dataset_builder_safe(
+    repo_id: str,
+    config: str,
+    token: Optional[str] = None,
+):
+    """Load dataset builder while avoiding picking up local files.
+
+    Changes to a temporary directory before loading to ensure the datasets
+    library doesn't pick up local files in the current directory.
+
+    Args:
+        repo_id: HuggingFace dataset repository ID
+        config: Configuration name
+        token: Optional HuggingFace API token
+
+    Returns:
+        Dataset builder object
+
+    Raises:
+        Exception: If loading fails
+    """
+    original_cwd = os.getcwd()
+
+    try:
+        # Change to a temporary directory to avoid dataset library picking up local files
+        with tempfile.TemporaryDirectory() as tmpdir:
+            os.chdir(tmpdir)
+            return load_dataset_builder(repo_id, config, token=token)
+    finally:
+        # Always restore the original directory
+        os.chdir(original_cwd)
+
+
+@dataclass
+class ParquetFile:
+    """A Parquet file discovered in a HuggingFace dataset.
+
+    Attributes:
+        uri: Full hf:// URI with revision (e.g., hf://datasets/repo@sha/file.parquet)
+        path: File path within the dataset (e.g., data/train-00000.parquet)
+        size: File size in bytes
+        blob_id: Git blob ID (oid) from HuggingFace
+        split: Optional split name (train, test, validation, etc.)
+    """
+
+    uri: str
+    path: str
+    size: int
+    blob_id: str
+    split: Optional[str] = None
+
+
+@dataclass
+class DatasetInfo:
+    """Complete information about a discovered HuggingFace dataset.
+
+    This represents the result of dataset discovery, containing all metadata
+    needed to understand the dataset structure without any Iceberg conversions.
+
+    Attributes:
+        repo_id: HuggingFace repository ID (e.g., "squad")
+        config: Configuration name
+        revision: Git revision SHA
+        features: HuggingFace Features object describing the schema
+        splits: List of split names (e.g., ["train", "test"])
+        data_dir: Common directory path containing data files
+        files: List of all discovered Parquet files
+    """
+
+    repo_id: str
+    config: str
+    revision: str
+    features: Features
+    splits: List[str]
+    data_dir: str
+    files: List[ParquetFile]
+
+
+def discover_dataset(
+    repo_id: str,
+    config: str,
+    token: Optional[str] = None,
+) -> DatasetInfo:
+    """Discover structure and files in a HuggingFace dataset.
+
+    Queries the HuggingFace Hub to gather dataset metadata, features, splits,
+    and Parquet file information without any Iceberg-specific conversions.
+
+    Args:
+        repo_id: HuggingFace dataset repository ID (e.g., "squad")
+        config: Configuration name to discover
+        token: HuggingFace API token (uses HF_TOKEN env var if not provided)
+
+    Returns:
+        DatasetInfo with all files for the latest revision
+
+    Raises:
+        ValueError: If dataset not found, config doesn't exist, or metadata inconsistent
+    """
+    # Step 1: Load dataset builder
+    try:
+        builder = dataset_builder_safe(repo_id, config=config, token=token)
+    except Exception as e:
+        raise ValueError(
+            f"Dataset {repo_id} config {config} not found or not accessible: {e}"
+        ) from e
+
+    revision = builder.hash
+    features = builder.info.features
+
+    # Step 2: Fetch file metadata from HuggingFace Hub
+    api = HfApi(token=token)
+    dataset_info = api.dataset_info(repo_id, revision=revision, files_metadata=True)
+    # Build mapping from URI to sibling metadata
+    file_metadata = {
+        f"hf://datasets/{repo_id}@{revision}/{s.rfilename}": s for s in dataset_info.siblings
+    }
+
+    # Step 3: Process data files
+    files = []
+    for split, file_uris in builder.config.data_files.items():
+        for uri in file_uris:
+            # Get metadata (strict - fail if not found)
+            if uri not in file_metadata:
+                raise ValueError(
+                    f"File {uri} from dataset builder not found in Hub API response. "
+                    f"This may indicate an inconsistent dataset state."
+                )
+
+            metadata = file_metadata[uri]
+
+            # Create ParquetFile
+            files.append(
+                ParquetFile(
+                    uri=uri,
+                    path=metadata.rfilename,
+                    size=metadata.size,
+                    blob_id=metadata.blob_id,
+                    split=split,
+                )
+            )
+
+    # Step 4: Extract common data directory
+    if files:
+        try:
+            file_dirs = [os.path.dirname(f.path) for f in files]
+            data_dir = os.path.commonpath(file_dirs) if file_dirs else ""
+        except ValueError as e:
+            file_paths = [f.path for f in files]
+            raise ValueError(
+                f"Unable to determine common data directory from files: {file_paths}"
+            ) from e
+    else:
+        data_dir = ""
+
+    # Step 5: Return DatasetInfo
+    return DatasetInfo(
+        repo_id=repo_id,
+        config=config,
+        revision=revision,
+        features=features,
+        splits=list(builder.config.data_files.keys()),
+        data_dir=data_dir,
+        files=files,
+    )
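A short usage sketch of the new discovery entry point; the attribute names come from the dataclasses above, while the dataset id and config are illustrative:

    from faceberg.discover import discover_dataset

    info = discover_dataset(repo_id="squad", config="plain_text")

    print(info.revision)   # pinned git revision SHA
    print(info.splits)     # e.g. ["train", "validation"]
    print(info.data_dir)   # common directory containing the Parquet files
    for f in info.files:
        print(f.split, f.size, f.uri)  # hf:// URIs resolved at that revision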