dataforge-studio 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,119 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge-studio
3
+ Version: 1.0.1
4
+ Summary: Studio visual de preprocesamiento de datos — Universidad Alberto Hurtado
5
+ Author-email: Álvaro Riquelme <alvaroriquelme.14@gmail.com>
6
+ License-Expression: LicenseRef-Proprietary
7
+ Project-URL: Homepage, https://dataforgeUAH.github.io/dataici
8
+ Keywords: data,preprocessing,pandas,visual,pipeline,uah
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
12
+ Classifier: Intended Audience :: Education
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: fastapi>=0.110.0
16
+ Requires-Dist: uvicorn[standard]>=0.29.0
17
+ Requires-Dist: pandas>=2.0.0
18
+ Requires-Dist: openpyxl>=3.1.0
19
+ Requires-Dist: python-multipart>=0.0.9
20
+ Requires-Dist: matplotlib>=3.7.0
21
+ Requires-Dist: numpy>=1.24.0
22
+
23
+ # DataICI — v0.2
24
+
25
+ Herramienta visual de preprocesamiento de datos para estudiantes de Ingeniería Civil Industrial.
26
+
27
+ ## Requisitos previos
28
+ - Python 3.10+ → https://python.org
29
+ - Node.js 18+ → https://nodejs.org
30
+
31
+ ---
32
+
33
+ ## Instalación y ejecución
34
+
35
+ ### 1. Backend (FastAPI + pandas)
36
+
37
+ Abre una terminal en la carpeta `dataici/`:
38
+
39
+ ```bash
40
+ # Windows
41
+ cd backend
42
+ pip install -r requirements.txt
43
+ uvicorn main:app --reload
44
+
45
+ # Mac
46
+ cd backend
47
+ pip3 install -r requirements.txt
48
+ uvicorn main:app --reload
49
+ ```
50
+
51
+ Backend corriendo en: http://localhost:8000
52
+
53
+ ---
54
+
55
+ ### 2. Frontend (React)
56
+
57
+ Abre **otra terminal**:
58
+
59
+ ```bash
60
+ cd frontend
61
+ npm install
62
+ npm run dev
63
+ ```
64
+
65
+ App disponible en: http://localhost:5173
66
+
67
+ ---
68
+
69
+ ## Estructura del proyecto
70
+
71
+ ```
72
+ dataici/
73
+ ├── backend/
74
+ │ ├── main.py ← API FastAPI
75
+ │ ├── requirements.txt
76
+ │ └── blocks/ ← un archivo por bloque
77
+ │ ├── load_csv.py
78
+ │ ├── drop_nulls.py
79
+ │ ├── filter_rows.py
80
+ │ ├── groupby.py
81
+ │ └── export_csv.py
82
+
83
+ └── frontend/
84
+ ├── package.json
85
+ ├── vite.config.js
86
+ └── src/
87
+ ├── App.jsx ← app principal
88
+ ├── nodes/
89
+ │ └── BlockNode.jsx ← nodo del canvas
90
+ └── panels/
91
+ ├── Sidebar.jsx ← bloques disponibles
92
+ ├── ParamsPanel.jsx ← parámetros del bloque
93
+ └── PreviewPanel.jsx ← resultados
94
+ ```
95
+
96
+ ---
97
+
98
+ ## Cómo agregar un nuevo bloque
99
+
100
+ Solo crear `backend/blocks/nuevo_bloque.py`. El frontend lo detecta automáticamente.
101
+
102
+ ```python
103
+ METADATA = {
104
+ "type": "mi_bloque",
105
+ "label": "Mi bloque",
106
+ "category": "Limpieza", # Entrada / Salida | Limpieza | Análisis
107
+ "params": [
108
+ {"key": "columna", "label": "Columna", "type": "text", "default": ""},
109
+ {"key": "metodo", "label": "Método", "type": "select", "options": ["a", "b"], "default": "a"},
110
+ {"key": "activo", "label": "Activar", "type": "toggle", "default": False},
111
+ ]
112
+ }
113
+
114
+ def run(df, params):
115
+ col = params.get("columna")
116
+ df = df.drop(columns=[col])
117
+ code = [f'df = df.drop(columns=["{col}"])']
118
+ return df, code
119
+ ```
@@ -0,0 +1,30 @@
1
+ dataici/__init__.py,sha256=UE3F0RhOupkC8xJmkgpfCieOZIHNiIr0dIPU-5MHeTU,102
2
+ dataici/charts.py,sha256=0Lik7kYvjq5YQUEq6I5xOuF2-vaRE2_bRkTrkNDWJBc,7109
3
+ dataici/cli.py,sha256=UqapIUK07u6ZDe3gbsu1dUC5OW1f6rN56grpdChP9B8,936
4
+ dataici/main.py,sha256=lTPXKgcpWZQgno8lhSKrhKclauBMM-XBVRaMVUCGIlQ,12376
5
+ dataici/blocks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ dataici/blocks/aggregate.py,sha256=yO5O58k33eZ9V14V-iQtCOUgaMy6UYCzKaTslKULNoc,1606
7
+ dataici/blocks/append_column.py,sha256=WcLm6EkZ8FJPkl22ddbikQF5EBYVmlyR4SobZ7mJijw,501
8
+ dataici/blocks/concatenate.py,sha256=ejaCTbY0Ak9NXKFvRx7R3Vyqn7TGxNFBztUTNpghqo4,3103
9
+ dataici/blocks/drop_columns.py,sha256=7XgB8AOpOvqcHozBkw-v1rnV69NS_xyR5PIxmSQ1tpQ,627
10
+ dataici/blocks/filter_rows.py,sha256=2C8or2xO3Qrlh2KsoyeYK2wJROk3sg6MyyJtG4z2Ub4,4376
11
+ dataici/blocks/handle_missings.py,sha256=1SiCLujj4JDZPGHw6Ekj17FZjL7fBCNeK6FnEMGpvS0,7351
12
+ dataici/blocks/load_csv.py,sha256=gQcn1bm1tokDYqk_TfHxZu7JBYyELazsOTsOiNRS-BM,2692
13
+ dataici/blocks/read_excel.py,sha256=XCyOBNNwFLXzoqRk51vXEQfKw2ye32JoaSYfEVVfu-s,1738
14
+ dataici/blocks/rename_columns.py,sha256=NyhkgcgDmd_agus2wLZqoetaWZzdYTJnBFRc8U8ZRaw,710
15
+ dataici/blocks/reorder_columns.py,sha256=eyp5Cdthhm2c7D0pcji9eMerPhC1gYmjAdYrDJE1MVc,589
16
+ dataici/blocks/replace_values.py,sha256=GjlLmDTE4GWuCl8G_lyhg29C1NvyGK3sGiUiuA9-NFc,5521
17
+ dataici/blocks/resample.py,sha256=S_jrFBM4Hd5hIkmmcIfQeQ1K46XNTMg6gDehhYbxZSo,2697
18
+ dataici/blocks/sample_rows.py,sha256=GXxaBpw8X5YpfCU52V5XhyyG65OG-tQpb0lNjAhSip8,1611
19
+ dataici/blocks/select_columns.py,sha256=vm4LgKBPwdw4_qugoWwQxavlfMmmU9qOR17svoQoJ7o,591
20
+ dataici/blocks/set_dtypes.py,sha256=bTwfqMXAN_4lr1BuS2OVjtTYg5gh_7MLYfPeF4tJBIo,1599
21
+ dataici/blocks/set_index.py,sha256=AgruV2FTYtQsLh-tFbjClraqc0g58t07eQcRDkU7mjE,674
22
+ dataici/blocks/write_csv.py,sha256=UhNE7fhZAfFoRpkrHs_OXtKNu1kkGvkLSXh0BmjKk2U,1907
23
+ dataici/static/index.html,sha256=ISU1RFLD_nyMQx0F02JpS3XFSsDvQN7djnxEwWT4l-o,394
24
+ dataici/static/assets/index-CYGnphoW.js,sha256=g5zsPhlYAdY4Le_0ziC3FUGZ-w6CgV4ZzlGO0lyUqcU,487756
25
+ dataici/static/assets/index-DLK3-mBP.css,sha256=BRbWdZ92-il_sC1qbpDqlb9TBKn5sJ5dGREYT8Q9D-s,16169
26
+ dataforge_studio-1.0.1.dist-info/METADATA,sha256=6Q8sSmWVZYz9YFBF-t9YxQEpGByxq05ea4LOrrVsQag,3122
27
+ dataforge_studio-1.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
28
+ dataforge_studio-1.0.1.dist-info/entry_points.txt,sha256=WPUWatbninyQUv7yeZUL_46_kSM36g4i_OIQzyKSNPE,45
29
+ dataforge_studio-1.0.1.dist-info/top_level.txt,sha256=GSq62DNLl7S05BlvprGvsaye2dBNocMXqIxxrFvJif4,8
30
+ dataforge_studio-1.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dataici = dataici.cli:main
@@ -0,0 +1 @@
1
+ dataici
dataici/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # DataICI — Studio de Preprocesamiento de Datos
2
+ # Universidad Alberto Hurtado
3
+ __version__ = "1.0.0"
File without changes
@@ -0,0 +1,50 @@
1
+ import json
2
+
3
# Descriptor of the "aggregate" block; every parameter is declared as free text.
METADATA = {
    "type": "aggregate",
    "label": "Aggregate",
    "category": "Resampling",
    "params": [
        {"key": key, "label": label, "type": "text", "default": default}
        for key, label, default in (
            ("custom", "Custom Functions", "false"),
            ("func", "General function", "mean"),
            ("col_funcs", "Per-column funcs", "{}"),
        )
    ],
}
13
+
14
+ # pandas resampler does not have a .unique() — map to nunique
15
+ _ALIASES = {"unique": "nunique"}
16
+
17
+
18
+ def _is_resampler(obj):
19
+ try:
20
+ from pandas.core.resample import DatetimeIndexResampler
21
+ return isinstance(obj, DatetimeIndexResampler)
22
+ except ImportError:
23
+ pass
24
+ return hasattr(obj, "_selected_obj") and not hasattr(obj, "to_dict")
25
+
26
+
27
+ def run(obj, params):
28
+ custom = str(params.get("custom", "false")).strip().lower() == "true"
29
+ func = (params.get("func", "mean") or "mean").strip()
30
+ col_funcs_raw = params.get("col_funcs", "{}")
31
+
32
+ try:
33
+ col_funcs = json.loads(col_funcs_raw) if col_funcs_raw else {}
34
+ except Exception:
35
+ col_funcs = {}
36
+
37
+ is_rs = _is_resampler(obj)
38
+ prefix = "resampler" if is_rs else "df"
39
+
40
+ if custom and col_funcs:
41
+ # Map aliases per-column
42
+ mapped = {col: _ALIASES.get(fn, fn) for col, fn in col_funcs.items()}
43
+ df = obj.agg(mapped)
44
+ code = [f"df = {prefix}.agg({json.dumps(mapped)})"]
45
+ else:
46
+ actual = _ALIASES.get(func, func)
47
+ df = getattr(obj, actual)()
48
+ code = [f"df = {prefix}.{actual}()"]
49
+
50
+ return df, code
@@ -0,0 +1,18 @@
1
+ import numpy as np
2
+ import pandas as pd
3
# Descriptor of the "append_column" block: a single text parameter holding
# the name of the column to add.
METADATA = {
    "type": "append_column",
    "label": "Append a Column",
    "category": "Columns",
    "params": [
        {
            "key": "colname",
            "label": "colname",
            "type": "text",
            "default": "new_col",
        },
    ],
}
11
+
12
def run(df, params):
    """Append an empty column to a copy of ``df``.

    Parameters read from ``params``:
        colname: name of the new column (default "new_col"); surrounding
            whitespace is removed.

    Returns:
        ``(new_df, code_lines)``. The new column holds ``None`` in every row,
        matching the emitted code; the input frame is left untouched.

    Raises:
        ValueError: if the column name is missing or blank.
    """
    raw = params.get("colname", "new_col")
    colname = "" if raw is None else str(raw).strip()
    if not colname:
        raise ValueError("Especifica el nombre de la nueva columna.")
    # Work on a copy so the upstream block's DataFrame is not modified.
    df = df.copy()
    # Real None (not the string 'None') so the result agrees with `code`.
    df[colname] = None
    code = [f'df["{colname}"] = None']
    return df, code
@@ -0,0 +1,70 @@
1
+ import pandas as pd
2
+
3
# Descriptor of the "concatenate" block. It is flagged as multi-input and
# every parameter is declared as free text.
METADATA = {
    "type": "concatenate",
    "label": "Concatenate",
    "category": "DataFrame",
    "multi_input": True,
    "params": [
        {"key": key, "label": label, "type": "text", "default": default}
        for key, label, default in (
            ("input_count", "Número de entradas", "2"),
            ("axis", "Axis", "index"),
            ("join", "Join", "outer"),
            ("change_col_names", "Change column names", "None"),
            ("ignore_index", "Ignore index", "false"),
            ("sort", "Sort", "false"),
        )
    ],
}
17
+
18
+
19
+ def _bool(val):
20
+ if isinstance(val, bool):
21
+ return val
22
+ return str(val).strip().lower() == "true"
23
+
24
+
25
def run(dfs, params):
    """Concatenate the DataFrames in ``dfs`` and return ``(result, code_lines)``.

    Parameters read from ``params``:
        axis: "index" concatenates along axis 0; any other value uses axis 1.
        join: forwarded to ``pd.concat`` (default "outer").
        change_col_names: "prefix" or "suffix" tags every column with its
            1-based input position; only applied when concatenating on axis 1.
        ignore_index: forwarded to ``pd.concat`` only for axis 0.
        sort: forwarded to ``pd.concat``.

    Raises:
        ValueError: if fewer than two DataFrames are supplied.
    """
    axis = 0 if params.get("axis", "index") == "index" else 1
    join = params.get("join", "outer")
    change_cols = params.get("change_col_names", "None")  # None | prefix | suffix
    ignore_index = _bool(params.get("ignore_index", False))
    sort = _bool(params.get("sort", False))

    if len(dfs) < 2:
        raise ValueError("Concatenate necesita al menos 2 DataFrames.")

    # Tag the columns with the input position (axis 1 only).
    if axis == 1 and change_cols in ("prefix", "suffix"):
        as_prefix = change_cols == "prefix"
        relabelled = []
        for n, frame in enumerate(dfs, start=1):
            if as_prefix:
                relabelled.append(frame.rename(columns=lambda c: f"df{n}_{c}"))
            else:
                relabelled.append(frame.rename(columns=lambda c: f"{c}_df{n}"))
        dfs = relabelled

    # Keyword arguments for pd.concat; ignore_index is only sent for axis 0.
    concat_args = {"axis": axis, "join": join, "sort": sort}
    if axis == 0:
        concat_args["ignore_index"] = ignore_index

    df_result = pd.concat(dfs, **concat_args)

    # Equivalent pandas code, using df_1 .. df_N as the input names.
    positions = range(1, len(dfs) + 1)
    code_lines = []
    if axis == 1 and change_cols == "prefix":
        code_lines.extend(
            f"df_{n} = df_{n}.rename(columns=lambda c: f'df{n}_{{c}}')"
            for n in positions
        )
    elif axis == 1 and change_cols == "suffix":
        code_lines.extend(
            f"df_{n} = df_{n}.rename(columns=lambda c: f'{{c}}_df{n}')"
            for n in positions
        )

    frames_repr = ", ".join(f"df_{n}" for n in positions)
    kw_str = ", ".join(f"{name}={value!r}" for name, value in concat_args.items())
    code_lines.append(f"df = pd.concat([{frames_repr}], {kw_str})")

    return df_result, code_lines
@@ -0,0 +1,19 @@
1
# Descriptor of the "drop_columns" block: a single text parameter that holds
# a comma-separated list of column names.
METADATA = {
    "type": "drop_columns",
    "label": "Drop Columns",
    "category": "Columns",
    "params": [
        {
            "key": "columns",
            "label": "columns",
            "type": "text",
            "default": "",
        },
    ],
}
9
+
10
def run(df, params):
    """Drop the columns named in ``params["columns"]`` (comma-separated).

    Returns:
        ``(new_df, code_lines)``.

    Raises:
        ValueError: if no column is named, or a named column is not in ``df``.
    """
    cols = []
    for piece in params.get("columns", "").split(","):
        name = piece.strip()
        if name:
            cols.append(name)

    if not cols:
        raise ValueError("Selecciona al menos una columna para eliminar.")

    missing = [name for name in cols if name not in df.columns]
    if missing:
        raise ValueError(f"Columnas no encontradas: {missing}")

    return df.drop(columns=cols), [f"df = df.drop(columns={cols})"]
@@ -0,0 +1,120 @@
1
+ import json
2
+ import pandas as pd
3
+
4
# Descriptor of the "filter_rows" block: a single text parameter whose
# default is an empty JSON list of conditions.
METADATA = {
    "type": "filter_rows",
    "label": "Filter Rows",
    "category": "DataFrame",
    "params": [
        {
            "key": "conditions",
            "label": "conditions",
            "type": "text",
            "default": "[]",
        },
    ],
}
12
+
13
def run(df, params):
    """Keep only the rows of ``df`` that satisfy a chain of "where" conditions.

    ``params["conditions"]`` is a list of condition dicts, or its JSON text.
    Each condition may contain:
        column: column the condition applies to.
        operator: ``==``, ``!=``, ``<``, ``<=``, ``>``, ``>=``, ``isin``,
            ``isna`` or ``notna``; any other value is treated as ``==``.
        type: how ``value`` is interpreted — "number", "string", "datetime"
            or "column" (compare against another column).
        value: comparison value; for ``isin`` a comma-separated list.
        not: truthy to negate the condition.
        logical: "and", "or" or "xor" — how the condition is combined with
            the ones before it (ignored on the first condition). Unknown
            values are treated as "and" so that the applied mask and the
            emitted code always agree.

    Conditions are folded left to right, without operator precedence.

    Returns:
        ``(filtered_df, code_lines)``.

    Raises:
        ValueError: if the conditions cannot be parsed or are empty, a column
            does not exist, a value cannot be converted, or ``type`` is unknown.
    """
    raw = params.get("conditions", "[]")
    try:
        conditions = json.loads(raw) if isinstance(raw, str) else raw
    except Exception as exc:
        raise ValueError("Error al leer las condiciones.") from exc

    if not conditions:
        raise ValueError("Agrega al menos una condición Where.")

    OP_METHOD = {"==": "eq", "!=": "ne", "<": "lt", "<=": "le", ">": "gt", ">=": "ge"}
    OP_SYM = {"and": "&", "or": "|", "xor": "^"}

    result_mask = None
    cond_lines = []

    for cond in conditions:
        col = cond.get("column", "")
        op = cond.get("operator", "==")
        typ = cond.get("type", "number")
        val = str(cond.get("value", "0")).strip()
        negate = cond.get("not", False)
        logical = cond.get("logical", "and")

        if not col or col not in df.columns:
            raise ValueError(f"Columna '{col}' no encontrada en el DataFrame.")

        s = df[col]
        # repr() quotes every literal so names or values that contain quote
        # characters still produce valid Python in the generated code.
        col_ref = f"df[{col!r}]"

        # ── Build mask ───────────────────────────────────────────────────────
        if op == "isna":
            mask = s.isna()
            code_expr = f"{col_ref}.isna()"

        elif op == "notna":
            mask = s.notna()
            code_expr = f"{col_ref}.notna()"

        elif op == "isin":
            items = [v.strip() for v in val.split(",") if v.strip()]
            if typ == "number":
                try:
                    parsed = [float(v) for v in items]
                except ValueError:
                    raise ValueError(f"isin numérico: valores inválidos → {items}")
            else:
                parsed = [v.strip("'\"") for v in items]
            mask = s.isin(parsed)
            code_expr = f"{col_ref}.isin({parsed!r})"

        else:
            method = OP_METHOD.get(op, "eq")

            if typ == "number":
                try:
                    parsed = float(val)
                except ValueError:
                    raise ValueError(f"Valor numérico inválido: '{val}'")
                mask = getattr(s, method)(parsed)
                code_expr = f"{col_ref}.{method}({parsed})"

            elif typ == "string":
                parsed = val.strip("'\"")
                mask = getattr(s, method)(parsed)
                code_expr = f"{col_ref}.{method}({parsed!r})"

            elif typ == "datetime":
                try:
                    parsed = pd.Timestamp(val)
                except Exception:
                    raise ValueError(f"Fecha inválida: '{val}'")
                mask = getattr(s, method)(parsed)
                code_expr = f"{col_ref}.{method}(pd.Timestamp({val!r}))"

            elif typ == "column":
                if val not in df.columns:
                    raise ValueError(f"Columna de comparación '{val}' no existe.")
                mask = getattr(s, method)(df[val])
                code_expr = f"{col_ref}.{method}(df[{val!r}])"

            else:
                raise ValueError(f"Tipo desconocido: '{typ}'")

        if negate:
            mask = ~mask
            code_expr = f"~({code_expr})"

        # ── Combine masks ────────────────────────────────────────────────────
        if result_mask is None:
            result_mask = mask
            cond_lines.append(f" ({code_expr})")
            continue

        if logical not in OP_SYM:
            logical = "and"
        if logical == "and":
            result_mask = result_mask & mask
        elif logical == "or":
            result_mask = result_mask | mask
        else:
            result_mask = result_mask ^ mask
        cond_lines.append(f" {OP_SYM[logical]} ({code_expr})")

    df = df[result_mask]

    code = ["cond = (", *cond_lines, ")", "df = df[cond]"]
    return df, code
@@ -0,0 +1,160 @@
1
# Descriptor of the "handle_missings" block; every parameter is declared as
# free text. The entries are grouped by the operation they belong to.
METADATA = {
    "type": "handle_missings",
    "label": "Handle Missings",
    "category": "DataFrame",
    "params": [
        {"key": key, "label": label, "type": "text", "default": default}
        for key, label, default in (
            ("all_columns", "All Columns", "true"),
            ("columns", "Columns", ""),
            ("operation", "Operation", "dropna"),
            # dropna
            ("axis", "Axis", "index"),
            ("how", "How", "any"),
            ("thresh", "Thresh (%)", ""),
            # fillna
            ("fill_type", "Fill Type", "value"),
            ("value_type", "Value Type", "number"),
            ("fill_value", "Fill Value", "0"),
            ("fill_method", "Fill Method", "ffill"),
            ("fill_axis", "Fill Axis", "index"),
            ("fill_limit", "Fill Limit", ""),
            # interpolate
            ("interp_method", "Interp Method", "linear"),
            ("interp_axis", "Interp Axis", "index"),
            ("interp_limit", "Interp Limit", ""),
            ("interp_limit_direction", "Limit Direction", "None"),
            ("interp_limit_area", "Limit Area", "None"),
        )
    ],
}
28
+
29
+
30
+ def _parse_limit(raw):
31
+ """Return int or None."""
32
+ try:
33
+ v = int(str(raw).strip())
34
+ return v if v > 0 else None
35
+ except (ValueError, TypeError):
36
+ return None
37
+
38
+
39
+ def run(df, params):
40
+ operation = params.get("operation", "dropna")
41
+ all_columns = params.get("all_columns", "true").strip().lower() != "false"
42
+ columns_raw = params.get("columns", "")
43
+ subset = (
44
+ [c.strip() for c in columns_raw.split(",") if c.strip()]
45
+ if not all_columns and columns_raw.strip()
46
+ else None
47
+ )
48
+
49
+ code = []
50
+
51
+ # ── dropna ────────────────────────────────────────────────────────────────
52
+ if operation == "dropna":
53
+ axis = params.get("axis", "index")
54
+ how = params.get("how", "any")
55
+ thresh_raw = params.get("thresh", "").strip()
56
+
57
+ kwargs = {"axis": axis}
58
+ if subset:
59
+ kwargs["subset"] = subset
60
+
61
+ if thresh_raw:
62
+ try:
63
+ thresh_pct = float(thresh_raw)
64
+ # thresh = minimum number of non-NA values required to keep row/col
65
+ n = df.shape[0] if axis in ("index", "0") else df.shape[1]
66
+ kwargs["thresh"] = max(1, int(thresh_pct / 100.0 * n))
67
+ # thresh and how are mutually exclusive — omit how
68
+ except (ValueError, TypeError):
69
+ kwargs["how"] = how
70
+ else:
71
+ kwargs["how"] = how
72
+
73
+ df = df.dropna(**kwargs)
74
+ code = [f"df = df.dropna({', '.join(f'{k}={repr(v)}' for k, v in kwargs.items())})"]
75
+
76
+ # ── fillna ────────────────────────────────────────────────────────────────
77
+ elif operation == "fillna":
78
+ fill_type = params.get("fill_type", "value")
79
+
80
+ if fill_type == "value":
81
+ value_type = params.get("value_type", "number")
82
+ raw_val = params.get("fill_value", "0")
83
+ if value_type == "number":
84
+ try:
85
+ fill_val = float(raw_val) if "." in str(raw_val) else int(raw_val)
86
+ except (ValueError, TypeError):
87
+ fill_val = 0
88
+ else:
89
+ fill_val = raw_val
90
+
91
+ if subset:
92
+ df[subset] = df[subset].fillna(fill_val)
93
+ code = [f"df[{subset}] = df[{subset}].fillna({repr(fill_val)})"]
94
+ else:
95
+ df = df.fillna(fill_val)
96
+ code = [f"df = df.fillna({repr(fill_val)})"]
97
+
98
+ else: # method
99
+ method = params.get("fill_method", "ffill")
100
+ fill_axis = params.get("fill_axis", "index")
101
+ limit = _parse_limit(params.get("fill_limit", ""))
102
+ axis_val = None if fill_axis in ("None", "") else fill_axis
103
+
104
+ extra = {}
105
+ if axis_val:
106
+ extra["axis"] = axis_val
107
+ if limit:
108
+ extra["limit"] = limit
109
+
110
+ if method == "ffill":
111
+ fn = "ffill"
112
+ if subset:
113
+ df[subset] = df[subset].ffill(**extra)
114
+ code = [f"df[{subset}] = df[{subset}].ffill({_fmt(extra)})"]
115
+ else:
116
+ df = df.ffill(**extra)
117
+ code = [f"df = df.ffill({_fmt(extra)})"]
118
+ elif method == "bfill":
119
+ fn = "bfill"
120
+ if subset:
121
+ df[subset] = df[subset].bfill(**extra)
122
+ code = [f"df[{subset}] = df[{subset}].bfill({_fmt(extra)})"]
123
+ else:
124
+ df = df.bfill(**extra)
125
+ code = [f"df = df.bfill({_fmt(extra)})"]
126
+ else:
127
+ # method == "None" — no-op
128
+ code = ["# fillna: method=None — no action taken"]
129
+
130
+ # ── interpolate ───────────────────────────────────────────────────────────
131
+ elif operation == "interpolate":
132
+ imethod = params.get("interp_method", "linear")
133
+ iaxis = params.get("interp_axis", "index")
134
+ limit = _parse_limit(params.get("interp_limit", ""))
135
+ limit_dir = params.get("interp_limit_direction", "None")
136
+ limit_area = params.get("interp_limit_area", "None")
137
+
138
+ kwargs = {"method": imethod}
139
+ if iaxis and iaxis != "None":
140
+ kwargs["axis"] = iaxis
141
+ if limit:
142
+ kwargs["limit"] = limit
143
+ if limit_dir and limit_dir != "None":
144
+ kwargs["limit_direction"] = limit_dir
145
+ if limit_area and limit_area != "None":
146
+ kwargs["limit_area"] = limit_area
147
+
148
+ if subset:
149
+ df[subset] = df[subset].interpolate(**kwargs)
150
+ code = [f"df[{subset}] = df[{subset}].interpolate({_fmt(kwargs)})"]
151
+ else:
152
+ df = df.interpolate(**kwargs)
153
+ code = [f"df = df.interpolate({_fmt(kwargs)})"]
154
+
155
+ return df, code
156
+
157
+
158
+ def _fmt(kwargs):
159
+ """Format a dict as keyword arguments string."""
160
+ return ", ".join(f"{k}={repr(v)}" for k, v in kwargs.items())