datafolio 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datafolio/__init__.py ADDED
@@ -0,0 +1,7 @@
"""datafolio: Lightweight wrapping of dataframes, models, and metadata to track analyses."""

# Package version string; also exported through __all__ below.
__version__ = "1.1.0"

from datafolio.folio import DataFolio

# Public API: the DataFolio entry point plus the version string.
__all__ = ["DataFolio", "__version__"]
datafolio/accessors.py ADDED
@@ -0,0 +1,338 @@
1
+ """Data accessors for convenient item access.
2
+
3
+ This module provides ItemProxy and DataAccessor classes that enable
4
+ autocomplete-friendly access to DataFolio items using both attribute
5
+ and dictionary-style syntax.
6
+ """
7
+
8
+ from typing import TYPE_CHECKING, Any, Dict, Optional
9
+
10
+ if TYPE_CHECKING:
11
+ from datafolio.folio import DataFolio
12
+
13
+
14
class ItemProxy:
    """Proxy giving property-based access to one DataFolio item.

    Returned by DataAccessor for both attribute-style and dict-style
    lookups; exposes the item's content, metadata, and lineage through
    autocomplete-friendly properties.
    """

    def __init__(self, folio: "DataFolio", name: str):
        """Bind this proxy to the item *name* inside *folio*.

        Args:
            folio: Parent DataFolio instance
            name: Name of the item
        """
        self._folio = folio
        self._name = name

    def _record(self) -> Dict[str, Any]:
        """Refresh the folio if stale and return the raw metadata record."""
        self._folio._refresh_if_needed()
        return self._folio._items[self._name]

    @property
    def content(self) -> Any:
        """Load and return the stored object for this item.

        The return type depends on the item type:
        tables -> DataFrame; numpy arrays -> ndarray; JSON data ->
        dict/list/scalar; timestamps -> UTC-aware datetime; models ->
        loaded model object; artifacts -> file path string (use with
        open()).

        Raises:
            ValueError: If the recorded item type is not recognized.

        Examples:
            >>> df = folio.data.results.content  # DataFrame
            >>> cfg = folio.data.config.content  # dict
            >>> with open(folio.data.plot.content, 'rb') as f:  # file path
            ...     img = f.read()
        """
        kind = self._record().get("item_type")
        # Map each item type onto the name of the folio getter that loads it.
        getters = {
            "referenced_table": "get_table",
            "included_table": "get_table",
            "numpy_array": "get_numpy",
            "json_data": "get_json",
            "timestamp": "get_timestamp",
            "model": "get_model",
            "pytorch_model": "get_model",
            "artifact": "get_artifact_path",
        }
        getter = getters.get(kind)
        if getter is None:
            raise ValueError(f"Unknown item type: {kind}")
        return getattr(self._folio, getter)(self._name)

    @property
    def description(self) -> Optional[str]:
        """Description string for this item, or None if not set."""
        return self._record().get("description")

    @property
    def type(self) -> str:
        """Item type string, e.g. 'included_table' or 'numpy_array'.

        Falls back to 'unknown' when no type is recorded.
        """
        return self._record().get("item_type", "unknown")

    @property
    def path(self) -> Optional[str]:
        """File path for this item, when one applies.

        Referenced tables yield their external path, artifacts their
        local artifact path; every other type yields None.

        Examples:
            >>> folio.data.external_data.path  # 's3://bucket/data.parquet'
            >>> folio.data.plot.path  # '/path/to/bundle/artifacts/plot.png'
        """
        record = self._record()
        kind = record.get("item_type")
        if kind == "referenced_table":
            return record.get("path")
        if kind == "artifact":
            return self._folio.get_artifact_path(self._name)
        return None

    @property
    def inputs(self) -> list[str]:
        """Names of the items this item was derived from (lineage)."""
        return self._folio.get_inputs(self._name)

    @property
    def dependents(self) -> list[str]:
        """Names of the items that use this item as an input (lineage)."""
        return self._folio.get_dependents(self._name)

    @property
    def metadata(self) -> Dict[str, Any]:
        """Shallow copy of the full metadata dict for this item."""
        return dict(self._record())

    def __repr__(self) -> str:
        """Summarize the item as name, type, and optional description."""
        record = self._record()
        kind = record.get("item_type", "unknown")
        desc = record.get("description", "")
        suffix = f": {desc}" if desc else ""
        return f"ItemProxy('{self._name}', type='{kind}'{suffix})"
166
+
167
+
168
class DataAccessor:
    """Accessor for autocomplete-friendly item access.

    Supports both attribute-style (folio.data.my_item) and
    dictionary-style (folio.data['my_item']) access.
    """

    def __init__(self, folio: "DataFolio"):
        """Initialize DataAccessor.

        Args:
            folio: Parent DataFolio instance
        """
        self._folio = folio

        # Mirror the folio's items as class-level properties so IDE
        # autocomplete (Jedi, etc.) can see them.
        self._setup_dynamic_attributes()

    def _setup_dynamic_attributes(self) -> None:
        """Set up dynamic attributes on the class for autocomplete support.

        Attributes are installed on the class itself (not just the
        instance), which makes them more visible to IDE autocomplete.
        """
        # Get current items
        self._folio._refresh_if_needed()

        cls = self.__class__

        # Track which attributes we've added. Check cls.__dict__ rather
        # than hasattr(): hasattr() sees inherited attributes, so a
        # subclass would silently share (and mutate) its parent's set.
        if "_dynamic_attrs" not in cls.__dict__:
            cls._dynamic_attrs = set()

        current_items = set(self._folio._items.keys())

        # Remove attributes for items that no longer exist. Only delete
        # names that live in this class's own __dict__; delattr() on an
        # inherited attribute would raise AttributeError.
        for attr in list(cls._dynamic_attrs):
            if attr not in current_items:
                if attr in cls.__dict__:
                    delattr(cls, attr)
                cls._dynamic_attrs.discard(attr)

        # Add new attributes as properties on the class
        for item_name in current_items:
            if item_name not in cls._dynamic_attrs:
                # Factory captures item_name in a closure via a parameter.
                # The annotation is a string (forward ref) so the property
                # can be created without ItemProxy eagerly in scope.
                def make_property(name: str):
                    def item_property(self) -> "ItemProxy":
                        return ItemProxy(self._folio, name)

                    return property(item_property)

                setattr(cls, item_name, make_property(item_name))
                cls._dynamic_attrs.add(item_name)

    def _sync_items(self) -> None:
        """Sync item attributes with current folio state.

        This method is called when items may have changed.
        """
        self._setup_dynamic_attributes()

    def __getattr__(self, name: str) -> "ItemProxy":
        """Get item by attribute access.

        Args:
            name: Item name

        Returns:
            ItemProxy for the item

        Raises:
            AttributeError: If item doesn't exist
        """
        if name.startswith("_"):
            # Never treat private/dunder names as items (also protects
            # copy/pickle protocols that probe for optional dunders).
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            )

        # Auto-refresh and re-sync items if needed
        self._folio._refresh_if_needed()
        self._sync_items()

        # Try again after syncing; object.__getattribute__ avoids recursion.
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            # 'from None': the internal lookup failure is an implementation
            # detail; chaining it would bury the helpful message below.
            raise AttributeError(
                f"Item '{name}' not found in DataFolio. "
                f"Available items: {', '.join(sorted(self._folio._items.keys()))}"
            ) from None

    def __getitem__(self, name: str) -> "ItemProxy":
        """Get item by dictionary access.

        Args:
            name: Item name

        Returns:
            ItemProxy for the item

        Raises:
            KeyError: If item doesn't exist
        """
        # Auto-refresh before accessing
        self._folio._refresh_if_needed()

        if name not in self._folio._items:
            raise KeyError(f"Item '{name}' not found in DataFolio")

        return ItemProxy(self._folio, name)

    def __dir__(self) -> list[str]:
        """Return list of item names for autocomplete.

        Returns:
            List combining standard attributes and item names
        """
        # Auto-refresh before accessing
        self._folio._refresh_if_needed()

        # Combine standard object attributes with item names
        standard_attrs = list(object.__dir__(self))
        item_names = list(self._folio._items.keys())
        return sorted(set(standard_attrs + item_names))

    def _ipython_key_completions_(self) -> list[str]:
        """Provide key completions for IPython/Jupyter.

        IPython calls this to autocomplete folio.data['<TAB>'] keys.

        Returns:
            Sorted list of all item names
        """
        # Auto-refresh before accessing
        self._folio._refresh_if_needed()

        return sorted(self._folio._items.keys())

    def __repr__(self) -> str:
        """Return string representation.

        Returns:
            String showing available items grouped by category
        """
        # Auto-refresh for consistency with every other accessor method.
        self._folio._refresh_if_needed()

        items = sorted(self._folio._items.keys())
        if not items:
            return "DataAccessor(no items)"

        # Group by type
        contents = self._folio.list_contents()
        lines = ["DataAccessor:"]

        for category, item_list in [
            ("Tables", contents["referenced_tables"] + contents["included_tables"]),
            ("Models", contents["models"]),
            ("Numpy Arrays", contents["numpy_arrays"]),
            ("JSON Data", contents["json_data"]),
            ("Artifacts", contents["artifacts"]),
        ]:
            if item_list:
                lines.append(f"  {category}: {', '.join(sorted(item_list))}")

        return "\n".join(lines)
@@ -0,0 +1,25 @@
1
+ """Base classes and utilities for datafolio handler system.
2
+
3
+ This module provides the foundation for the handler-based architecture:
4
+ - BaseHandler: Abstract base class for all data type handlers
5
+ - HandlerRegistry: Registry for managing handlers
6
+ - Convenience functions: register_handler, get_handler, detect_handler
7
+ """
8
+
9
+ from datafolio.base.handler import BaseHandler
10
+ from datafolio.base.registry import (
11
+ HandlerRegistry,
12
+ detect_handler,
13
+ get_handler,
14
+ get_registry,
15
+ register_handler,
16
+ )
17
+
18
+ __all__ = [
19
+ "BaseHandler",
20
+ "HandlerRegistry",
21
+ "register_handler",
22
+ "get_handler",
23
+ "detect_handler",
24
+ "get_registry",
25
+ ]
@@ -0,0 +1,220 @@
1
+ """Base handler interface for data type handlers.
2
+
3
+ This module defines the abstract base class that all data type handlers must implement.
4
+ Handlers are responsible for serialization, deserialization, and metadata management
5
+ for specific data types (e.g., pandas DataFrames, numpy arrays, PyTorch models).
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import TYPE_CHECKING, Any, Dict, Optional
10
+
11
+ from datafolio.storage.categories import StorageCategory, get_storage_category
12
+
13
+ if TYPE_CHECKING:
14
+ from datafolio.folio import DataFolio
15
+
16
+
17
class BaseHandler(ABC):
    """Abstract interface that every data type handler implements.

    A handler owns the full lifecycle of one item type: detecting data it
    can serialize (can_handle), persisting it (add), loading it back (get),
    and removing its files (delete). The storage location is derived
    automatically from the handler's item_type through the
    ITEM_TYPE_TO_CATEGORY mapping, so subclasses normally implement only
    item_type, can_handle, add, and get.

    Examples:
        A minimal concrete handler:
        >>> class MyHandler(BaseHandler):
        ...     @property
        ...     def item_type(self) -> str:
        ...         return "my_type"  # must exist in ITEM_TYPE_TO_CATEGORY
        ...
        ...     def can_handle(self, data: Any) -> bool:
        ...         return isinstance(data, MyDataType)
        ...
        ...     def add(self, folio, name, data, **kwargs):
        ...         ...  # write data, return metadata dict
        ...
        ...     def get(self, folio, name, **kwargs):
        ...         ...  # read data back
        ...
        ...     # get_storage_subdir() is derived automatically.
    """

    @property
    @abstractmethod
    def item_type(self) -> str:
        """Unique identifier for this item type.

        Must match the 'item_type' value stored in items.json.

        Returns:
            Item type string (e.g. 'included_table', 'numpy_array',
            'pytorch_model')

        Examples:
            >>> PandasHandler().item_type
            'included_table'
        """
        ...

    @abstractmethod
    def can_handle(self, data: Any) -> bool:
        """Report whether this handler can serialize *data*.

        Called by add_data() during auto-detection. Handlers that require
        explicit parameters (reference tables, models) should return False
        so they never win auto-detection.

        Args:
            data: Candidate object to check

        Returns:
            True if this handler supports this data type

        Examples:
            >>> PandasHandler().can_handle(pd.DataFrame())
            True
            >>> PandasHandler().can_handle(np.array([1, 2, 3]))
            False
        """
        ...

    @abstractmethod
    def add(
        self,
        folio: "DataFolio",
        name: str,
        data: Any,
        description: Optional[str] = None,
        inputs: Optional[list[str]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Persist *data* into the folio and build its metadata record.

        Implementations validate the data type, write the data to storage,
        and return the metadata dict that is stored in folio._items[name].
        The dict must include at least 'name', 'item_type', and
        'created_at', plus 'description' and 'inputs' when provided.

        Args:
            folio: DataFolio instance (for storage, bundle_dir, etc.)
            name: Item name
            data: Data to store
            description: Optional description
            inputs: Optional lineage inputs
            **kwargs: Handler-specific options

        Returns:
            Complete metadata dict for the item

        Raises:
            TypeError: If data is the wrong type for this handler

        Examples:
            >>> metadata = handler.add(folio, 'results', df, description='Results')
            >>> # {'name': 'results', 'item_type': 'included_table', ...}
        """
        ...

    @abstractmethod
    def get(self, folio: "DataFolio", name: str, **kwargs) -> Any:
        """Load and return the stored object named *name*.

        Args:
            folio: DataFolio instance
            name: Item name
            **kwargs: Handler-specific options (passed to the reader)

        Returns:
            The deserialized data object

        Raises:
            KeyError: If the item doesn't exist

        Examples:
            >>> df = handler.get(folio, 'results')
            >>> df = handler.get(folio, 'results', columns=['col1', 'col2'])
        """
        ...

    def delete(self, folio: "DataFolio", name: str) -> None:
        """Remove this item's data file from storage.

        Default implementation deletes
        {bundle_dir}/{storage subdir}/{filename} when the item's metadata
        records a 'filename'. Override for items that keep no local files
        (e.g. external references).

        Args:
            folio: DataFolio instance
            name: Item name
        """
        record = folio._items[name]
        if "filename" not in record:
            # Nothing stored locally (e.g. an external reference).
            return
        target = folio._storage.join_paths(
            folio._bundle_dir, self.get_storage_subdir(), record["filename"]
        )
        folio._storage.delete_file(target)

    def get_storage_category(self) -> StorageCategory:
        """Look up the storage category for this handler's item type.

        Resolved from ITEM_TYPE_TO_CATEGORY using item_type. Override only
        when the standard mapping is wrong for your type.

        Returns:
            StorageCategory enum value

        Examples:
            >>> PandasHandler().get_storage_category()
            <StorageCategory.TABLES: 'tables'>
            >>> NumpyHandler().get_storage_category()
            <StorageCategory.ARTIFACTS: 'artifacts'>
        """
        # Resolves the *module-level* function of the same name.
        return get_storage_category(self.item_type)

    def get_storage_subdir(self) -> str:
        """Return the bundle subdirectory name for this data type.

        Derived from get_storage_category(); override that method rather
        than this one to customize the storage location.

        Returns:
            Subdirectory name: 'tables', 'models', or 'artifacts'

        Examples:
            >>> PandasHandler().get_storage_subdir()
            'tables'
            >>> SklearnHandler().get_storage_subdir()
            'models'
        """
        return self.get_storage_category().directory