PyPI - hopeit.dataframes - Versions diffs - 0.26.3__tar.gz → 0.26.5__tar.gz - Mend

hopeit.dataframes 0.26.3.tar.gz → 0.26.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hopeit.dataframes
-Version: 0.26.3
+Version: 0.26.5
 Summary: Hopeit Engine Dataframes for Pandas
 Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
 License: Apache 2
@@ -24,8 +24,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Framework :: AsyncIO
 Description-Content-Type: text/markdown
-Requires-Dist: hopeit.engine>=0.26.3
-Requires-Dist: hopeit.fs-storage>=0.26.3
+Requires-Dist: hopeit.engine>=0.26.5
+Requires-Dist: hopeit.fs-storage>=0.26.5
 Provides-Extra: pandas
 Requires-Dist: pandas>=2.2.3; extra == "pandas"
 Requires-Dist: pyarrow>=19.0.1; extra == "pandas"

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "hopeit.dataframes"
-version = "0.26.3"
+version = "0.26.5"
 description = "Hopeit Engine Dataframes for Pandas"
 dynamic = ["readme"]
@@ -33,8 +33,8 @@ classifiers = [
 ]
 dependencies = [
-    "hopeit.engine>=0.26.3",
-    "hopeit.fs-storage>=0.26.3"
+    "hopeit.engine>=0.26.5",
+    "hopeit.fs-storage>=0.26.5"
 ]
 [project.optional-dependencies]

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/__init__.py RENAMED Viewed

@@ -72,8 +72,15 @@ print(Payload.to_json(my_json_response))
 from typing import Dict, Generic, Iterator, List, Type
-import numpy as np
-import pandas as pd
+try:
+    import numpy as np
+    import pandas as pd
+except ImportError:
+    # Supports using `@dataframe` annotation for dataobjects definitions
+    # without installing pandas and numpy. Useful for API-only projects.
+    import hopeit.dataframes.pandas.numpy_mock as np  # type: ignore[no-redef]
+    import hopeit.dataframes.pandas.pandas_mock as pd  # type: ignore[no-redef]
 from hopeit.dataframes.dataframe import DataFrameT, dataframe
 from hopeit.dataframes.serialization.dataset import Dataset
 from hopeit.dataframes.datablocks import DataBlocks

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/datablocks.py RENAMED Viewed

@@ -7,11 +7,14 @@ and saved as a single flat pandas DataFrame.
 from datetime import datetime
 from typing import AsyncGenerator, Generic, Optional, Type, TypeVar, get_args, get_origin
-import pandas as pd
+try:
+    import pandas as pd
+except ImportError:
+    import hopeit.dataframes.pandas.pandas_mock as pd  # type: ignore[no-redef]
 from hopeit.dataobjects import dataobject, dataclass, fields
 from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError
-from hopeit.dataframes.serialization.protocol import find_dataframe_type
 from hopeit.dataframes.setup.registry import get_dataset_storage
 DataBlockType = TypeVar("DataBlockType")
@@ -110,6 +113,7 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
         datablock: DataBlockType,
         *,
         select: Optional[list[str]] = None,
+        schema_validation: bool = True,
         database_key: Optional[str] = None,
     ) -> pd.DataFrame:
         """
@@ -124,37 +128,25 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
         Returns:
             pd.DataFrame: The resulting pandas DataFrame.
         """
-        keys = [
-            field_name
-            for field_name, field_info in fields(datablock).items()  # type: ignore[arg-type]
-            if get_origin(field_info.annotation) is Dataset
-            and (select is None or field_name in select)
-        ]
-        # Filter/validate selected field names using saved schema,
-        # generates a single field for every common/duplicated field in the datasets
-        field_names = list(
-            dict.fromkeys(
-                [
-                    field_name
-                    for key in keys
-                    for field_name in getattr(datablock, key).schema["properties"].keys()
-                ]
-            )
-        )
+        dataset_types = cls._get_dataset_types(type(datablock), select=select)
+        field_names = cls._get_field_names(dataset_types)
         # Load data from first dataset (datablock uses a single file for all datasets)
-        dataset: Dataset = getattr(datablock, keys[0])
+        dataset: Dataset = getattr(datablock, dataset_types[0][0])
         storage = await get_dataset_storage(database_key)
-        result_df = await DataBlocks._load_datablock_df(storage, dataset, field_names, database_key)
+        result_df = await DataBlocks._load_datablock_df(
+            storage, dataset, columns=None, database_key=database_key
+        )
         # Enfore datatypes and add missing optional fields using class schema (allows schema evolution)
-        cls._adapt_to_schema(datablock, keys, result_df)
+        if schema_validation:
+            cls._adapt_to_schema(dataset_types, result_df)
+            result_df = result_df[field_names]
-        # Adding constant value fields
+        # Adding constant value fields from serialized datablock
         for field_name, field_info in fields(datablock).items():  # type: ignore[arg-type]
             if get_origin(field_info.annotation) is not Dataset:
-                result_df[field_name] = getattr(datablock, field_name)  # type: ignore[index]
+                result_df.loc[:, field_name] = getattr(datablock, field_name)  # type: ignore[index]
         return result_df
@@ -206,6 +198,29 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
         return datatype(**blocks)
+    @staticmethod
+    def _get_dataset_types(
+        datatype: Type[DataBlockType], *, select: list[str] | None = None
+    ) -> list[tuple[str, DataFrameType]]:
+        return [
+            (field_name, get_args(field_info.annotation)[0])
+            for field_name, field_info in fields(datatype).items()  # type: ignore[type-var]
+            if get_origin(field_info.annotation) is Dataset
+            and (select is None or field_name in select)
+        ]
+    @staticmethod
+    def _get_field_names(dataset_types: list[tuple[str, DataFrameType]]) -> list[str]:
+        return list(
+            dict.fromkeys(
+                [
+                    field_name
+                    for _, dataset_type in dataset_types
+                    for field_name, _ in fields(dataset_type).items()  # type: ignore[arg-type]
+                ]
+            )
+        )
     @staticmethod
     def default(datatype: Type[DataBlockType]) -> DataBlockType:
         return datatype(**{field_name: [] for field_name in list(fields(datatype))})  # type: ignore[type-var]
@@ -216,6 +231,7 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
         datatype: Type[DataBlockType],
         query: DataBlockQuery,
         metadata: DataBlockMetadata | None = None,
+        schema_validation: bool = True,
         **kwargs,  # Non-Dataset field values for DataBlockType
     ) -> AsyncGenerator[pd.DataFrame, None]:
         if metadata is None:
@@ -223,6 +239,9 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
         storage = await get_dataset_storage(metadata.database_key)
+        dataset_types = cls._get_dataset_types(datatype, select=query.select)
+        field_names = cls._get_field_names(dataset_types)
         async for block_dataset in storage._get_batch(  # type: ignore[attr-defined]
             datatype,
             database_key=metadata.database_key,
@@ -231,29 +250,19 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
             group_key=metadata.group_key,
             collection=metadata.collection,
         ):
-            dataset_types = [
-                (field_name, get_args(field_info.annotation)[0])
-                for field_name, field_info in fields(datatype).items()  # type: ignore[type-var]
-                if get_origin(field_info.annotation) is Dataset
-                and (query.select is None or field_name in query.select)
-            ]
-            field_names = list(
-                dict.fromkeys(
-                    [
-                        field_name
-                        for _, dataset_type in dataset_types
-                        for field_name, _ in fields(dataset_type).items()
-                    ]
-                )
-            )
             result_df = await DataBlocks._load_datablock_df(
-                storage, block_dataset, field_names, metadata.database_key
+                storage, block_dataset, columns=None, database_key=metadata.database_key
             )
-            # Adding constant value fields
+            # Enfore datatypes and add missing optional fields using class schema (allows schema evolution)
+            if schema_validation:
+                cls._adapt_to_schema(dataset_types, result_df)
+                result_df = result_df[field_names]
+            # Adding constant value fields from kwargs
             for field_name, field_info in fields(datatype).items():  # type: ignore[type-var]
                 if get_origin(field_info.annotation) is not Dataset:
-                    result_df[field_name] = kwargs.get(field_name)
+                    result_df.loc[:, field_name] = kwargs.get(field_name)
             yield result_df
@@ -286,9 +295,10 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
             ) from e
     @classmethod
-    def _adapt_to_schema(cls, datablock: DataBlockType, keys: list[str], df: pd.DataFrame) -> None:
-        for key in keys:
-            datatype = find_dataframe_type(getattr(datablock, key).datatype)  # type: ignore[var-annotated]
-            valid_df = datatype._from_df(df)._df
+    def _adapt_to_schema(
+        cls, dataset_types: list[tuple[str, DataFrameType]], df: pd.DataFrame
+    ) -> None:
+        for _, datatype in dataset_types:
+            valid_df = datatype._from_df(df)._df  # type: ignore[attr-defined]
             for col in valid_df.columns:
                 df[col] = valid_df[col]

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/dataframe.py RENAMED Viewed

@@ -6,8 +6,13 @@ import dataclasses
 from datetime import date, datetime, timezone
 from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar, Union
-import numpy as np
-import pandas as pd
+try:
+    import numpy as np
+    import pandas as pd
+except ImportError:
+    import hopeit.dataframes.pandas.numpy_mock as np  # type: ignore[no-redef]
+    import hopeit.dataframes.pandas.pandas_mock as pd  # type: ignore[no-redef]
 from pydantic import create_model
 from pydantic.fields import FieldInfo

hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas/numpy_mock.py ADDED Viewed

@@ -0,0 +1,3 @@
+from typing import Any
+ndarray = Any

hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas/pandas_mock.py ADDED Viewed

@@ -0,0 +1,13 @@
+class DataFrame:
+    def __init__(*args, **kwargs):
+        pass
+class Series:
+    def __init__(*args, **kwargs):
+        pass
+class Timestamp:
+    def __init__(*args, **kwargs):
+        pass

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/dataset.py RENAMED Viewed

@@ -4,7 +4,12 @@ from datetime import datetime
 from typing import Any, Dict, Generic, Optional, Type, TypeVar
 from hopeit.dataobjects import dataclass, dataobject
-import pandas as pd
+try:
+    import pandas as pd
+except ImportError:
+    import hopeit.dataframes.pandas.pandas_mock as pd  # type: ignore[no-redef]
 from pydantic import TypeAdapter
 from hopeit.dataframes.setup.registry import get_dataset_storage

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/files.py RENAMED Viewed

@@ -8,16 +8,15 @@ from uuid import uuid4
 from pathlib import Path
 import aiofiles
-import pandas as pd
-import pyarrow
 from pydantic import TypeAdapter
 try:
+    import pandas as pd
     import pyarrow  # type: ignore  # noqa  # pylint: disable=unused-import
 except ImportError as e:
     raise ImportError(
-        "`pyarrow` needs to be installed to use `DatasetFileStorage`",
-        "Run `pip install hopeit.dataframes[pyarrow]`",
+        "`pandas` and `pyarrow` needs to be installed to use `DatasetFileStorage`",
+        "Run `pip install hopeit.dataframes[pandas]`",
     ) from e
 from hopeit.dataframes.dataframe import DataFrameMixin

hopeit_dataframes-0.26.5/src/hopeit/dataframes/setup/__init__.py ADDED Viewed

File without changes

hopeit_dataframes-0.26.5/src/hopeit/dataframes/setup/py.typed ADDED Viewed

File without changes

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hopeit.dataframes
-Version: 0.26.3
+Version: 0.26.5
 Summary: Hopeit Engine Dataframes for Pandas
 Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
 License: Apache 2
@@ -24,8 +24,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Framework :: AsyncIO
 Description-Content-Type: text/markdown
-Requires-Dist: hopeit.engine>=0.26.3
-Requires-Dist: hopeit.fs-storage>=0.26.3
+Requires-Dist: hopeit.engine>=0.26.5
+Requires-Dist: hopeit.fs-storage>=0.26.5
 Provides-Extra: pandas
 Requires-Dist: pandas>=2.2.3; extra == "pandas"
 Requires-Dist: pyarrow>=19.0.1; extra == "pandas"

{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/SOURCES.txt RENAMED Viewed

@@ -9,6 +9,10 @@ src/hopeit/dataframes/__init__.py
 src/hopeit/dataframes/datablocks.py
 src/hopeit/dataframes/dataframe.py
 src/hopeit/dataframes/py.typed
+src/hopeit/dataframes/pandas/__init__.py
+src/hopeit/dataframes/pandas/numpy_mock.py
+src/hopeit/dataframes/pandas/pandas_mock.py
+src/hopeit/dataframes/pandas/py.typed
 src/hopeit/dataframes/serialization/__init__.py
 src/hopeit/dataframes/serialization/dataset.py
 src/hopeit/dataframes/serialization/files.py