PyPI - pyteryx - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pyteryx 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

pyteryx/__init__.py +36 -0
pyteryx/_validators.py +81 -0
pyteryx/_version.py +3 -0
pyteryx/developer.py +373 -0
pyteryx/in_out.py +314 -0
pyteryx/join.py +387 -0
pyteryx/parse.py +325 -0
pyteryx/pipeline.py +136 -0
pyteryx/preparation.py +805 -0
pyteryx/transform.py +437 -0
pyteryx-0.1.0.dist-info/METADATA +287 -0
pyteryx-0.1.0.dist-info/RECORD +15 -0
pyteryx-0.1.0.dist-info/WHEEL +4 -0
pyteryx-0.1.0.dist-info/entry_points.txt +2 -0
pyteryx-0.1.0.dist-info/licenses/LICENSE +21 -0

pyteryx/__init__.py ADDED Viewed

@@ -0,0 +1,36 @@
+"""pyteryx — Alteryx-to-Python migration toolkit.
+Replicate every major Alteryx Designer tool as an independent Python
+function, organised under classes that mirror Alteryx's tool palette
+categories.  Uses **pandas** as the data engine.
+Quick start::
+    from pyteryx import InOut, Preparation, Join, Transform, Parse, Developer
+    df = InOut.input_data("sales.csv")
+    high, low = Preparation.filter(df, "Revenue > 1000")
+    summary = Transform.summarize(high, group_by="Region",
+                                   aggregations={"Revenue": "sum"})
+    InOut.output_data(summary, "summary.parquet")
+"""
+from pyteryx._version import __version__
+from pyteryx.developer import Developer
+from pyteryx.in_out import InOut
+from pyteryx.join import Join
+from pyteryx.parse import Parse
+from pyteryx.pipeline import Pipeline
+from pyteryx.preparation import Preparation
+from pyteryx.transform import Transform
+# Public API of the package: the version string plus one class per tool
+# palette category, each imported above from its own submodule.
+__all__ = [
+    "__version__",
+    "Developer",
+    "InOut",
+    "Join",
+    "Parse",
+    "Pipeline",
+    "Preparation",
+    "Transform",
+]

pyteryx/_validators.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""Shared input validation helpers for pyteryx tool functions.
+Every public tool function validates its inputs through these helpers
+to provide clear, consistent error messages across the library.
+"""
+from __future__ import annotations
+from typing import Sequence
+import pandas as pd
+def validate_dataframe(df: object, param_name: str = "df") -> None:
+    """Raise unless *df* is a ``pandas.DataFrame``.
+    Args:
+        df: Candidate object to check.
+        param_name: Parameter name quoted in the error message.
+    Raises:
+        TypeError: If *df* is not a ``pandas.DataFrame`` instance.
+    """
+    # Guard clause: the happy path returns immediately.
+    if isinstance(df, pd.DataFrame):
+        return
+    raise TypeError(
+        f"'{param_name}' must be a pandas DataFrame, "
+        f"got {type(df).__name__}."
+    )
+def validate_columns(
+    df: pd.DataFrame,
+    columns: str | Sequence[str],
+    param_name: str = "columns",
+) -> list[str]:
+    """Check that every requested column is present in *df*.
+    A bare string is treated as a one-element list, so callers always
+    get a ``list[str]`` back regardless of how they spelled the input.
+    Args:
+        df: The DataFrame whose columns are consulted.
+        columns: A single column name, or a list/tuple of names.
+        param_name: Parameter name quoted in the ``TypeError`` message.
+    Returns:
+        A new list holding the requested names in the order given.
+    Raises:
+        TypeError: If *columns* is neither a string, a list, nor a tuple.
+        KeyError: If at least one requested name is absent from *df*.
+    """
+    if not isinstance(columns, (str, list, tuple)):
+        raise TypeError(
+            f"'{param_name}' must be a string or list of strings, "
+            f"got {type(columns).__name__}."
+        )
+    requested = [columns] if isinstance(columns, str) else list(columns)
+    missing = [name for name in requested if name not in df.columns]
+    if not missing:
+        return requested
+    raise KeyError(
+        f"Column(s) not found in DataFrame: {missing}. "
+        f"Available columns: {list(df.columns)}"
+    )
+def validate_not_empty(df: pd.DataFrame, param_name: str = "df") -> None:
+    """Reject a DataFrame that pandas reports as empty.
+    Args:
+        df: The DataFrame to inspect.
+        param_name: Parameter name quoted in the error message.
+    Raises:
+        ValueError: If ``df.empty`` is true.
+    """
+    if not df.empty:
+        return
+    raise ValueError(f"'{param_name}' must not be an empty DataFrame.")

pyteryx/_version.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Single source of truth for the pyteryx package version."""
+# Imported by ``pyteryx/__init__.py`` and listed in its ``__all__``.
+__version__ = "0.1.0"

pyteryx/developer.py ADDED Viewed

@@ -0,0 +1,373 @@
+"""Developer — Utility and advanced tools.
+Mirrors the Alteryx **Developer** tool palette: base64 encoding and
+decoding, HTTP downloads, schema inspection, dynamic renaming and
+selection, JSON parsing, and data tests.
+All methods are static and return **new** DataFrames.
+"""
+from __future__ import annotations
+import base64
+import json
+from typing import Any, Callable, Sequence
+import pandas as pd
+from pyteryx._validators import validate_columns, validate_dataframe
+class Developer:
+    """Alteryx **Developer** tool palette.
+    Provides static methods for base64 encoding/decoding, downloading,
+    schema inspection, dynamic column renaming and selection, JSON
+    parsing, and data tests.
+    """
+    # ------------------------------------------------------------------ #
+    # Base64 Encode
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def base64_encode(
+        df: pd.DataFrame,
+        column: str,
+        output_column: str | None = None,
+    ) -> pd.DataFrame:
+        """Encode a column's values to Base64 (Alteryx *Base64 Encoder*).
+        Each non-missing value is converted to text, UTF-8 encoded, and
+        Base64 encoded.  Missing values stay missing (``None``) in the
+        output instead of being encoded as the literal text ``"nan"``.
+        Args:
+            df: The input DataFrame.
+            column: Column to encode.
+            output_column: Name for the encoded column.  Defaults to
+                ``{column}_Base64``.
+        Returns:
+            A new DataFrame with the encoded column added; *df* itself is
+            not modified.
+        Example:
+            >>> df = Developer.base64_encode(df, "Password")
+        """
+        validate_dataframe(df)
+        validate_columns(df, column)
+        out = df.copy()
+        out_col = output_column or f"{column}_Base64"
+        source = out[column]
+        # Stringify once for the whole column, but consult the original
+        # values to decide which rows actually hold data.
+        texts = source.astype(str)
+        out[out_col] = [
+            base64.b64encode(text.encode("utf-8")).decode("utf-8")
+            if present
+            else None
+            for text, present in zip(texts, source.notna())
+        ]
+        return out
+    # ------------------------------------------------------------------ #
+    # Base64 Decode
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def base64_decode(
+        df: pd.DataFrame,
+        column: str,
+        output_column: str | None = None,
+    ) -> pd.DataFrame:
+        """Decode a Base64-encoded column (Alteryx *Base64 Encoder — Decode*).
+        Each non-missing value is Base64 decoded and interpreted as UTF-8
+        text.  Missing values stay missing (``None``) in the output; they
+        are not passed to the decoder, which would otherwise fail on the
+        stringified placeholder.
+        Args:
+            df: The input DataFrame.
+            column: Column to decode.
+            output_column: Name for the decoded column.  Defaults to
+                ``{column}_Decoded``.
+        Returns:
+            A new DataFrame with the decoded column added; *df* itself is
+            not modified.
+        Raises:
+            binascii.Error: If a non-missing value is not valid Base64.
+            UnicodeDecodeError: If decoded bytes are not valid UTF-8.
+        Example:
+            >>> df = Developer.base64_decode(df, "Password_Base64")
+        """
+        validate_dataframe(df)
+        validate_columns(df, column)
+        out = df.copy()
+        out_col = output_column or f"{column}_Decoded"
+        source = out[column]
+        texts = source.astype(str)
+        # Only rows that hold real data are decoded; invalid payloads in
+        # those rows still raise, exactly as before.
+        out[out_col] = [
+            base64.b64decode(text.encode("utf-8")).decode("utf-8")
+            if present
+            else None
+            for text, present in zip(texts, source.notna())
+        ]
+        return out
+    # ------------------------------------------------------------------ #
+    # Download
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def download(
+        url: str,
+        params: dict[str, Any] | None = None,
+        output_column: str = "DownloadData",
+    ) -> pd.DataFrame:
+        """Fetch data from a URL (Alteryx *Download*).
+        Attempts to parse the response as JSON.  A JSON array becomes one
+        row per element and a JSON object becomes a single row.  Anything
+        else (non-JSON text, or a JSON scalar) is returned as the raw
+        response text in a single-column, single-row DataFrame.
+        **Note**: This uses ``urllib`` from the standard library to avoid
+        adding ``requests`` as a hard dependency.  The response body is
+        decoded as UTF-8.
+        Args:
+            url: The URL to fetch.
+            params: Optional query parameters.  They are appended to any
+                query string already present in *url*, and placed before
+                a ``#fragment`` if there is one.
+            output_column: Name of the output column used for the raw-text
+                fallback.
+        Returns:
+            A DataFrame containing the downloaded data.
+        Example:
+            >>> df = Developer.download("https://api.example.com/data")
+        """
+        import urllib.parse
+        import urllib.request
+        if params:
+            # Splice the parameters into the URL's query component rather
+            # than blindly appending "?...", which breaks URLs that already
+            # carry a query string or a fragment.
+            parts = urllib.parse.urlsplit(url)
+            extra = urllib.parse.urlencode(params)
+            query = f"{parts.query}&{extra}" if parts.query else extra
+            url = urllib.parse.urlunsplit(parts._replace(query=query))
+        with urllib.request.urlopen(url) as response:  # noqa: S310
+            body = response.read().decode("utf-8")
+        try:
+            data = json.loads(body)
+            if isinstance(data, list):
+                return pd.DataFrame(data)
+            elif isinstance(data, dict):
+                return pd.DataFrame([data])
+        except (json.JSONDecodeError, ValueError):
+            # Deliberate best effort: fall back to the raw text below.
+            pass
+        return pd.DataFrame({output_column: [body]})
+    # ------------------------------------------------------------------ #
+    # Column Info
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def column_info(df: pd.DataFrame) -> pd.DataFrame:
+        """Return schema/metadata about a DataFrame (Alteryx *Column Info*).
+        One output row is produced per input column.  The output always
+        has the six schema columns listed below, even when *df* has no
+        columns at all.  Columns are visited positionally, so duplicate
+        column labels each get their own row.
+        Args:
+            df: The input DataFrame.
+        Returns:
+            A DataFrame with columns ``Name``, ``Type``, ``Size``,
+            ``NonNullCount``, ``NullCount``, ``UniqueCount``.  ``Size`` is
+            the value of ``Series.memory_usage(deep=True)`` for the column.
+        Example:
+            >>> schema = Developer.column_info(df)
+        """
+        validate_dataframe(df)
+        schema_columns = [
+            "Name",
+            "Type",
+            "Size",
+            "NonNullCount",
+            "NullCount",
+            "UniqueCount",
+        ]
+        # ``df.items()`` yields one Series per column position, which stays
+        # well-defined when labels repeat (``df[label]`` would not).
+        rows = [
+            {
+                "Name": name,
+                "Type": str(series.dtype),
+                "Size": series.memory_usage(deep=True),
+                "NonNullCount": int(series.notna().sum()),
+                "NullCount": int(series.isna().sum()),
+                "UniqueCount": int(series.nunique()),
+            }
+            for name, series in df.items()
+        ]
+        # Passing ``columns`` keeps the documented schema for an empty result.
+        return pd.DataFrame(rows, columns=schema_columns)
+    # ------------------------------------------------------------------ #
+    # Dynamic Rename
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def dynamic_rename(
+        df: pd.DataFrame,
+        rename_df: pd.DataFrame,
+        key_col: str = "OldName",
+        new_name_col: str = "NewName",
+        mode: str = "mapping",
+    ) -> pd.DataFrame:
+        """Rename columns from a lookup table (Alteryx *Dynamic Rename*).
+        Args:
+            df: The DataFrame whose columns will be renamed.
+            rename_df: Lookup DataFrame driving the rename.
+            key_col: Column of *rename_df* holding current column names
+                (used only in ``"mapping"`` mode).
+            new_name_col: Column of *rename_df* holding replacement names
+                (used only in ``"mapping"`` mode).
+            mode: ``"mapping"`` pairs *key_col* with *new_name_col* row by
+                row; ``"prefix"`` prepends the top-left cell of
+                *rename_df* to every column name; ``"suffix"`` appends it.
+        Returns:
+            The result of ``df.rename`` with the computed column mapping.
+        Raises:
+            ValueError: If *mode* is not one of the three supported values.
+        Example:
+            >>> mapping = pd.DataFrame({"OldName": ["col_a"], "NewName": ["Column A"]})
+            >>> df = Developer.dynamic_rename(df, mapping)
+        """
+        validate_dataframe(df)
+        validate_dataframe(rename_df, "rename_df")
+        if mode == "mapping":
+            validate_columns(rename_df, key_col, "key_col")
+            validate_columns(rename_df, new_name_col, "new_name_col")
+            current_names = rename_df[key_col]
+            replacement_names = rename_df[new_name_col]
+            return df.rename(columns=dict(zip(current_names, replacement_names)))
+        if mode == "prefix":
+            # The affix is whatever sits in the first row, first column.
+            prefix = str(rename_df.iloc[0, 0])
+            prefixed = {c: f"{prefix}{c}" for c in df.columns}
+            return df.rename(columns=prefixed)
+        if mode == "suffix":
+            suffix = str(rename_df.iloc[0, 0])
+            suffixed = {c: f"{c}{suffix}" for c in df.columns}
+            return df.rename(columns=suffixed)
+        raise ValueError(f"Unknown mode '{mode}'. Use 'mapping', 'prefix', or 'suffix'.")
+    # ------------------------------------------------------------------ #
+    # JSON Parse
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def json_parse(
+        df: pd.DataFrame,
+        column: str,
+        prefix: str | None = None,
+    ) -> pd.DataFrame:
+        """Expand a column of JSON strings into columns (Alteryx *JSON Parse*).
+        Each cell is parsed independently.  A JSON object contributes its
+        keys; any other JSON value is stored under the key ``value``.
+        Cells that are missing, are not strings, or fail to parse
+        contribute no fields.  The source column is removed from the result.
+        Args:
+            df: The input DataFrame.
+            column: The column containing JSON strings.
+            prefix: Prefix for the new columns, joined with ``_``.  ``None``
+                means use *column*; an empty string means no prefix.
+        Returns:
+            A new DataFrame holding the remaining input columns followed by
+            the expanded JSON fields.
+        Example:
+            >>> df = Developer.json_parse(df, "JSON_Data")
+        """
+        validate_dataframe(df)
+        validate_columns(df, column)
+        def _to_record(cell: Any) -> dict:
+            # Anything that is not a usable JSON string maps to "no fields".
+            if pd.isna(cell) or not isinstance(cell, str):
+                return {}
+            try:
+                payload = json.loads(cell)
+            except (json.JSONDecodeError, TypeError):
+                return {}
+            return payload if isinstance(payload, dict) else {"value": payload}
+        records = df[column].apply(_to_record)
+        expanded = pd.json_normalize(records)
+        prefix_str = column if prefix is None else prefix
+        if prefix_str:
+            expanded = expanded.add_prefix(f"{prefix_str}_")
+        remaining = df.drop(columns=[column])
+        # Carry the input's index over so the frames line up row for row.
+        expanded.index = remaining.index
+        return pd.concat([remaining, expanded], axis=1)
+    # ------------------------------------------------------------------ #
+    # Dynamic Select
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def dynamic_select(
+        df: pd.DataFrame,
+        dtype_include: Any | None = None,
+        dtype_exclude: Any | None = None,
+        pattern: str | None = None,
+    ) -> pd.DataFrame:
+        """Select columns dynamically based on data type or pattern (Alteryx *Dynamic Select*).
+        The dtype filter (if any) is applied first and the name pattern
+        (if any) is applied to what remains.  With no selector at all,
+        every column is kept.
+        Args:
+            df: The input DataFrame.
+            dtype_include: Data types to include (e.g., 'number', 'object').
+            dtype_exclude: Data types to exclude.
+            pattern: A regex pattern to match column names against.
+        Returns:
+            A new DataFrame containing only the selected columns.  The
+            result is never the same object as *df*, so callers can modify
+            it without touching their input.
+        Example:
+            >>> df_num = Developer.dynamic_select(df, dtype_include="number")
+            >>> df_sales = Developer.dynamic_select(df, pattern="^Sales_")
+        """
+        validate_dataframe(df)
+        out = df
+        if dtype_include is not None or dtype_exclude is not None:
+            out = out.select_dtypes(include=dtype_include, exclude=dtype_exclude)
+        if pattern is not None:
+            out = out.filter(regex=pattern)
+        if out is df:
+            # No selector ran, so nothing has produced a fresh object yet.
+            out = df.copy()
+        return out
+    # ------------------------------------------------------------------ #
+    # Test
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def test(
+        df: pd.DataFrame,
+        condition_func: Callable,
+        error_msg: str = "Test condition failed",
+    ) -> pd.DataFrame:
+        """Assert a custom condition on the data (Alteryx *Test*).
+        Calls ``condition_func(df)`` once and treats a falsy result as a
+        failed test.
+        Args:
+            df: The input DataFrame.
+            condition_func: Callable that receives *df* and returns a
+                truthy value when the data is acceptable.
+            error_msg: Message carried by the error on failure.
+        Returns:
+            The same *df* object, unchanged, when the condition holds.
+        Raises:
+            ValueError: If the condition evaluates as false.
+        Example:
+            >>> Developer.test(df, lambda d: d["Sales"].sum() > 0, "No sales!")
+        """
+        validate_dataframe(df)
+        outcome = condition_func(df)
+        if outcome:
+            return df
+        raise ValueError(error_msg)
+    # ------------------------------------------------------------------ #
+    # Test Equal
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def test_equal(
+        df_left: pd.DataFrame,
+        df_right: pd.DataFrame,
+        **kwargs: Any,
+    ) -> None:
+        """Check that two data streams are identical (Alteryx *Expect Equal*).
+        Both inputs are type-checked, then the comparison is delegated to
+        ``pandas.testing.assert_frame_equal``.
+        Args:
+            df_left: The first DataFrame.
+            df_right: The second DataFrame.
+            **kwargs: Forwarded unchanged to ``pd.testing.assert_frame_equal``.
+        Raises:
+            AssertionError: If the DataFrames do not match.
+        Example:
+            >>> Developer.test_equal(df1, df2)
+        """
+        # Left is reported under the validator's default name, "df".
+        for frame, label in ((df_left, "df"), (df_right, "df_right")):
+            validate_dataframe(frame, label)
+        compare = pd.testing.assert_frame_equal
+        compare(df_left, df_right, **kwargs)