pyteryx 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyteryx/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """pyteryx — Alteryx-to-Python migration toolkit.
2
+
3
+ Replicate every major Alteryx Designer tool as an independent Python
4
+ function, organised under classes that mirror Alteryx's tool palette
5
+ categories. Uses **pandas** as the data engine.
6
+
7
+ Quick start::
8
+
9
+ from pyteryx import InOut, Preparation, Join, Transform, Parse, Developer
10
+
11
+ df = InOut.input_data("sales.csv")
12
+ high, low = Preparation.filter(df, "Revenue > 1000")
13
+ summary = Transform.summarize(high, group_by="Region",
14
+ aggregations={"Revenue": "sum"})
15
+ InOut.output_data(summary, "summary.parquet")
16
+ """
17
+
18
+ from pyteryx._version import __version__
19
+ from pyteryx.developer import Developer
20
+ from pyteryx.in_out import InOut
21
+ from pyteryx.join import Join
22
+ from pyteryx.parse import Parse
23
+ from pyteryx.pipeline import Pipeline
24
+ from pyteryx.preparation import Preparation
25
+ from pyteryx.transform import Transform
26
+
27
+ __all__ = [
28
+ "__version__",
29
+ "Developer",
30
+ "InOut",
31
+ "Join",
32
+ "Parse",
33
+ "Pipeline",
34
+ "Preparation",
35
+ "Transform",
36
+ ]
pyteryx/_validators.py ADDED
@@ -0,0 +1,81 @@
1
+ """Shared input validation helpers for pyteryx tool functions.
2
+
3
+ Every public tool function validates its inputs through these helpers
4
+ to provide clear, consistent error messages across the library.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Sequence
10
+
11
+ import pandas as pd
12
+
13
+
14
def validate_dataframe(df: object, param_name: str = "df") -> None:
    """Check that *df* is a ``pandas.DataFrame`` and fail loudly otherwise.

    Args:
        df: Candidate object to inspect.
        param_name: Caller-side parameter name, quoted in the error text.

    Raises:
        TypeError: When *df* is anything other than a ``pandas.DataFrame``
            (subclasses are accepted, since ``isinstance`` is used).
    """
    # Guard clause: the happy path returns immediately.
    if isinstance(df, pd.DataFrame):
        return

    actual_type = type(df).__name__
    raise TypeError(
        f"'{param_name}' must be a pandas DataFrame, got {actual_type}."
    )
29
+
30
+
31
def validate_columns(
    df: pd.DataFrame,
    columns: str | Sequence[str],
    param_name: str = "columns",
) -> list[str]:
    """Ensure the specified columns exist in the DataFrame.

    Accepts a single column name (``str``) or any sequence of names
    (``list``, ``tuple``, ``range``, a ``pandas.Index`` such as
    ``df.columns`` ...) and always returns a fresh ``list`` so callers can
    handle both forms uniformly.

    Args:
        df: The DataFrame to check against.
        columns: Column name(s) to validate.
        param_name: Name of the parameter (for error messages).

    Returns:
        A new list containing the validated column names, in input order.

    Raises:
        TypeError: If *columns* is neither a string nor a sequence
            (sets, generators, scalars and ``bytes`` are rejected).
        KeyError: If any column is missing from *df*.
    """
    if isinstance(columns, str):
        # A lone string is one column name, not a sequence of characters.
        names = [columns]
    elif isinstance(columns, (Sequence, pd.Index)) and not isinstance(
        columns, (bytes, bytearray)
    ):
        # Any real sequence is accepted, matching the type annotation.
        # ``pandas.Index`` is listed explicitly because it is not registered
        # as a ``Sequence``; byte strings are sequences of ints, never names.
        names = list(columns)
    else:
        raise TypeError(
            f"'{param_name}' must be a string or list of strings, "
            f"got {type(columns).__name__}."
        )

    available = df.columns
    missing = [name for name in names if name not in available]
    if missing:
        raise KeyError(
            f"Column(s) not found in DataFrame: {missing}. "
            f"Available columns: {list(available)}"
        )
    return names
68
+
69
+
70
def validate_not_empty(df: pd.DataFrame, param_name: str = "df") -> None:
    """Reject DataFrames that pandas reports as empty.

    The check relies on ``DataFrame.empty``, which is true when the frame
    has no rows *or* no columns.

    Args:
        df: The DataFrame to inspect.
        param_name: Caller-side parameter name, quoted in the error text.

    Raises:
        ValueError: When ``df.empty`` is true.
    """
    has_data = not df.empty
    if has_data:
        return
    raise ValueError(f"'{param_name}' must not be an empty DataFrame.")
pyteryx/_version.py ADDED
@@ -0,0 +1,3 @@
1
+ """Single source of truth for the pyteryx package version."""
2
+
3
+ __version__ = "0.1.0"
pyteryx/developer.py ADDED
@@ -0,0 +1,373 @@
1
+ """Developer — Utility and advanced tools.
2
+
3
+ Mirrors the Alteryx **Developer** tool palette: base64 encoding,
4
+ HTTP downloads, schema inspection, and dynamic renaming.
5
+
6
+ All methods are static and return **new** DataFrames.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import base64
12
+ import json
13
+ from typing import Any, Callable, Sequence
14
+
15
+ import pandas as pd
16
+
17
+ from pyteryx._validators import validate_columns, validate_dataframe
18
+
19
+
20
class Developer:
    """Alteryx **Developer** tool palette.

    A namespace of stateless tools for Base64 encoding/decoding, URL
    downloads, schema inspection, JSON expansion, dynamic column
    renaming/selection and data assertions.  Every tool is a
    ``staticmethod``; the class is never instantiated.
    """

    # Output schema of ``column_info``; kept in one place so the result has
    # the same columns even when the input DataFrame has no columns at all.
    _COLUMN_INFO_FIELDS = (
        "Name",
        "Type",
        "Size",
        "NonNullCount",
        "NullCount",
        "UniqueCount",
    )

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _map_non_null(series: pd.Series, func: Callable[[str], str]) -> pd.Series:
        """Apply *func* to the string form of each non-null value.

        Null entries (``None``/``NaN``/``NA``) are emitted as ``None`` rather
        than being stringified to ``"nan"``/``"None"`` and transformed.

        Args:
            series: Source values.
            func: Transformation applied to ``str`` representations.

        Returns:
            An object-dtype Series sharing *series*' index.
        """
        as_text = series.astype(str)
        transformed = [
            func(text) if present else None
            for text, present in zip(as_text, series.notna())
        ]
        return pd.Series(transformed, index=series.index, dtype=object)

    @staticmethod
    def _with_query(url: str, params: dict[str, Any]) -> str:
        """Return *url* with *params* merged into its query string.

        Any query string already present in *url* is kept and the new
        parameters are appended after it; a fragment stays at the end.
        """
        import urllib.parse

        parts = urllib.parse.urlsplit(url)
        extra = urllib.parse.urlencode(params)
        merged = "&".join(piece for piece in (parts.query, extra) if piece)
        return urllib.parse.urlunsplit(parts._replace(query=merged))

    # ------------------------------------------------------------------ #
    # Base64 Encode
    # ------------------------------------------------------------------ #
    @staticmethod
    def base64_encode(
        df: pd.DataFrame,
        column: str,
        output_column: str | None = None,
    ) -> pd.DataFrame:
        """Encode a column's values to Base64 (Alteryx *Base64 Encoder*).

        Values are converted to text, UTF-8 encoded and then Base64
        encoded.  Null values are not encoded; they stay null (``None``)
        in the output column.

        Args:
            df: The input DataFrame.
            column: Column to encode.
            output_column: Name for the encoded column.  Defaults to
                ``{column}_Base64``.

        Returns:
            A new DataFrame with the encoded column added (or replaced, if
            *output_column* names an existing column).

        Raises:
            TypeError: If *df* is not a DataFrame.
            KeyError: If *column* is not present in *df*.

        Example:
            >>> df = Developer.base64_encode(df, "Password")
        """
        validate_dataframe(df)
        validate_columns(df, column, "column")
        out = df.copy()
        out_col = output_column or f"{column}_Base64"
        out[out_col] = Developer._map_non_null(
            out[column],
            lambda text: base64.b64encode(text.encode("utf-8")).decode("utf-8"),
        )
        return out

    # ------------------------------------------------------------------ #
    # Base64 Decode
    # ------------------------------------------------------------------ #
    @staticmethod
    def base64_decode(
        df: pd.DataFrame,
        column: str,
        output_column: str | None = None,
    ) -> pd.DataFrame:
        """Decode a Base64-encoded column (Alteryx *Base64 Encoder — Decode*).

        Null values are skipped and stay null (``None``) in the output
        column instead of being decoded as the text ``"nan"``.

        Args:
            df: The input DataFrame.
            column: Column to decode.
            output_column: Name for the decoded column.  Defaults to
                ``{column}_Decoded``.

        Returns:
            A new DataFrame with the decoded column added (or replaced).

        Raises:
            TypeError: If *df* is not a DataFrame.
            KeyError: If *column* is not present in *df*.
            binascii.Error: If a value is not valid Base64.
            UnicodeDecodeError: If decoded bytes are not valid UTF-8.

        Example:
            >>> df = Developer.base64_decode(df, "Password_Base64")
        """
        validate_dataframe(df)
        validate_columns(df, column, "column")
        out = df.copy()
        out_col = output_column or f"{column}_Decoded"
        out[out_col] = Developer._map_non_null(
            out[column],
            lambda text: base64.b64decode(text.encode("utf-8")).decode("utf-8"),
        )
        return out

    # ------------------------------------------------------------------ #
    # Download
    # ------------------------------------------------------------------ #
    @staticmethod
    def download(
        url: str,
        params: dict[str, Any] | None = None,
        output_column: str = "DownloadData",
    ) -> pd.DataFrame:
        """Fetch data from a URL (Alteryx *Download*).

        The body is parsed as JSON when possible: a JSON array becomes one
        row per element and a JSON object becomes a single row.  Anything
        else (invalid JSON, or a bare JSON scalar) is returned as raw text
        in a one-row, one-column DataFrame named *output_column*.

        **Note**: This uses ``urllib`` from the standard library to avoid
        adding ``requests`` as a hard dependency.

        Args:
            url: The URL to fetch.  It may already contain a query string.
            params: Optional query parameters, merged into *url*.
            output_column: Column name used for the raw-text fallback.

        Returns:
            A DataFrame containing the downloaded data.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body cannot be decoded as text.

        Example:
            >>> df = Developer.download("https://api.example.com/data")
        """
        import urllib.request

        if params:
            url = Developer._with_query(url, params)

        # SECURITY: ``urlopen`` follows whatever scheme the URL names,
        # including ``file:``.  Do not pass untrusted URLs to this tool.
        with urllib.request.urlopen(url) as response:  # noqa: S310
            payload = response.read()
            declared = response.headers.get_content_charset()

        try:
            body = payload.decode(declared or "utf-8")
        except LookupError:
            # The server declared a charset Python does not know.
            body = payload.decode("utf-8")

        try:
            data = json.loads(body)
            if isinstance(data, list):
                return pd.DataFrame(data)
            if isinstance(data, dict):
                return pd.DataFrame([data])
        except ValueError:
            # Best effort by design: invalid JSON (JSONDecodeError is a
            # ValueError) or un-tabulatable data falls back to raw text.
            pass

        return pd.DataFrame({output_column: [body]})

    # ------------------------------------------------------------------ #
    # Column Info
    # ------------------------------------------------------------------ #
    @staticmethod
    def column_info(df: pd.DataFrame) -> pd.DataFrame:
        """Return schema/metadata about a DataFrame (Alteryx *Column Info*).

        Args:
            df: The input DataFrame.

        Returns:
            A DataFrame with one row per input column and the columns
            ``Name``, ``Type``, ``Size``, ``NonNullCount``, ``NullCount``
            and ``UniqueCount``.  ``Size`` is ``Series.memory_usage(deep=True)``
            in bytes, which includes the index.  The schema is the same
            even when *df* has no columns.

        Raises:
            TypeError: If *df* is not a DataFrame.

        Example:
            >>> schema = Developer.column_info(df)
        """
        validate_dataframe(df)

        # ``items()`` yields one Series per column even when labels repeat,
        # whereas ``df[label]`` would return a DataFrame in that case.
        rows = [
            {
                "Name": name,
                "Type": str(series.dtype),
                "Size": series.memory_usage(deep=True),
                "NonNullCount": int(series.notna().sum()),
                "NullCount": int(series.isna().sum()),
                "UniqueCount": int(series.nunique()),
            }
            for name, series in df.items()
        ]
        return pd.DataFrame(rows, columns=list(Developer._COLUMN_INFO_FIELDS))

    # ------------------------------------------------------------------ #
    # Dynamic Rename
    # ------------------------------------------------------------------ #
    @staticmethod
    def dynamic_rename(
        df: pd.DataFrame,
        rename_df: pd.DataFrame,
        key_col: str = "OldName",
        new_name_col: str = "NewName",
        mode: str = "mapping",
    ) -> pd.DataFrame:
        """Rename columns dynamically using a lookup table (Alteryx *Dynamic Rename*).

        Args:
            df: The DataFrame whose columns will be renamed.
            rename_df: A lookup DataFrame with the rename mapping.
            key_col: Column in *rename_df* containing current column names
                (``"mapping"`` mode only).
            new_name_col: Column in *rename_df* containing new names
                (``"mapping"`` mode only).
            mode: ``"mapping"`` uses the lookup table (names absent from
                *df* are ignored); ``"prefix"`` / ``"suffix"`` prepend /
                append the top-left cell of *rename_df* to every column.

        Returns:
            A new DataFrame with renamed columns.

        Raises:
            TypeError: If *df* or *rename_df* is not a DataFrame.
            KeyError: In ``"mapping"`` mode, if *key_col* or *new_name_col*
                is missing from *rename_df*.
            IndexError: In ``"prefix"``/``"suffix"`` mode, if *rename_df*
                has no cells.
            ValueError: If *mode* is not one of the supported values.

        Example:
            >>> mapping = pd.DataFrame({"OldName": ["col_a"], "NewName": ["Column A"]})
            >>> df = Developer.dynamic_rename(df, mapping)
        """
        validate_dataframe(df)
        validate_dataframe(rename_df, "rename_df")

        if mode == "mapping":
            validate_columns(rename_df, key_col, "key_col")
            validate_columns(rename_df, new_name_col, "new_name_col")
            lookup = dict(zip(rename_df[key_col], rename_df[new_name_col]))
            return df.rename(columns=lookup)

        if mode == "prefix":
            affix = str(rename_df.iloc[0, 0])
            return df.rename(columns={name: f"{affix}{name}" for name in df.columns})

        if mode == "suffix":
            affix = str(rename_df.iloc[0, 0])
            return df.rename(columns={name: f"{name}{affix}" for name in df.columns})

        raise ValueError(f"Unknown mode '{mode}'. Use 'mapping', 'prefix', or 'suffix'.")

    # ------------------------------------------------------------------ #
    # JSON Parse
    # ------------------------------------------------------------------ #
    @staticmethod
    def json_parse(
        df: pd.DataFrame,
        column: str,
        prefix: str | None = None,
    ) -> pd.DataFrame:
        """Parse a JSON string column into separate columns (Alteryx *JSON Parse*).

        Each string cell is parsed with ``json.loads``.  JSON objects are
        flattened with ``pandas.json_normalize`` (nested keys joined by
        ``.``); any other JSON value is stored under the key ``value``.
        Cells that are null, not strings, or not valid JSON contribute no
        fields, so their new columns are null.

        Args:
            df: The input DataFrame.
            column: The column containing JSON strings.
            prefix: Prefix for new columns, joined with ``_``.  Defaults to
                the original column name; pass ``""`` for no prefix.

        Returns:
            A new DataFrame in which *column* is replaced by the expanded
            JSON fields.

        Raises:
            TypeError: If *df* is not a DataFrame.
            KeyError: If *column* is not present in *df*.

        Example:
            >>> df = Developer.json_parse(df, "JSON_Data")
        """
        validate_dataframe(df)
        validate_columns(df, column, "column")

        def to_record(value: Any) -> dict:
            # Testing for ``str`` first (instead of ``pd.isna``) also covers
            # list-like cells, for which ``pd.isna`` returns an array whose
            # truth value is ambiguous.
            if not isinstance(value, str):
                return {}
            try:
                parsed = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                return {}
            return parsed if isinstance(parsed, dict) else {"value": parsed}

        records = [to_record(value) for value in df[column]]
        parsed_df = pd.json_normalize(records)

        prefix_str = column if prefix is None else prefix
        if prefix_str:
            parsed_df = parsed_df.add_prefix(f"{prefix_str}_")

        remaining = df.drop(columns=[column])
        # json_normalize numbers rows 0..n-1; restore the caller's index so
        # the concat lines rows up by position rather than by label.
        parsed_df.index = remaining.index
        return pd.concat([remaining, parsed_df], axis=1)

    # ------------------------------------------------------------------ #
    # Dynamic Select
    # ------------------------------------------------------------------ #
    @staticmethod
    def dynamic_select(
        df: pd.DataFrame,
        dtype_include: Any | None = None,
        dtype_exclude: Any | None = None,
        pattern: str | None = None,
    ) -> pd.DataFrame:
        """Select columns dynamically based on data type or pattern (Alteryx *Dynamic Select*).

        The dtype filter (if any) is applied first, then the name pattern.
        With no filter at all, a copy of *df* is returned.

        Args:
            df: The input DataFrame.
            dtype_include: Data types to include (e.g., 'number', 'object'),
                as accepted by ``DataFrame.select_dtypes``.
            dtype_exclude: Data types to exclude.
            pattern: A regex searched for in column names
                (``DataFrame.filter(regex=...)``).

        Returns:
            A new DataFrame containing only the selected columns.

        Raises:
            TypeError: If *df* is not a DataFrame.

        Example:
            >>> df_num = Developer.dynamic_select(df, dtype_include="number")
            >>> df_sales = Developer.dynamic_select(df, pattern="^Sales_")
        """
        validate_dataframe(df)

        selected = df
        if dtype_include is not None or dtype_exclude is not None:
            selected = selected.select_dtypes(include=dtype_include, exclude=dtype_exclude)

        if pattern is not None:
            selected = selected.filter(regex=pattern)

        # Never hand the caller's own object back: the tools in this module
        # promise new DataFrames, so the no-filter case returns a copy.
        return df.copy() if selected is df else selected

    # ------------------------------------------------------------------ #
    # Test
    # ------------------------------------------------------------------ #
    @staticmethod
    def test(
        df: pd.DataFrame,
        condition_func: Callable,
        error_msg: str = "Test condition failed",
    ) -> pd.DataFrame:
        """Verify data using a custom condition (Alteryx *Test*).

        Evaluates ``condition_func(df)``.  If the result is falsy, raises
        a ``ValueError``.

        Args:
            df: The input DataFrame.
            condition_func: A callable taking the DataFrame and returning a
                boolean.
            error_msg: Error message to raise on failure.

        Returns:
            The original DataFrame (the same object, not a copy) if the
            test passes, so the call can be chained in a pipeline.

        Raises:
            TypeError: If *df* is not a DataFrame.
            ValueError: If the condition is false.

        Example:
            >>> Developer.test(df, lambda d: d["Sales"].sum() > 0, "No sales!")
        """
        validate_dataframe(df)

        if condition_func(df):
            return df
        raise ValueError(error_msg)

    # ------------------------------------------------------------------ #
    # Test Equal
    # ------------------------------------------------------------------ #
    @staticmethod
    def test_equal(
        df_left: pd.DataFrame,
        df_right: pd.DataFrame,
        **kwargs: Any,
    ) -> None:
        """Test if two data streams are identical (Alteryx *Expect Equal*).

        Wraps ``pandas.testing.assert_frame_equal``.

        Args:
            df_left: The first DataFrame.
            df_right: The second DataFrame.
            **kwargs: Additional arguments to ``pd.testing.assert_frame_equal``
                (e.g. ``check_dtype=False``).

        Raises:
            TypeError: If either argument is not a DataFrame.
            AssertionError: If the DataFrames do not match.

        Example:
            >>> Developer.test_equal(df1, df2)
        """
        # Name each operand so a type error points at the right argument.
        validate_dataframe(df_left, "df_left")
        validate_dataframe(df_right, "df_right")

        pd.testing.assert_frame_equal(df_left, df_right, **kwargs)