dataforge-studio 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ import pandas as pd
2
+ import os
3
+
4
# Descriptor of the "Read CSV" block: its type id, display label, category
# and the list of parameters (key, label, widget type, default value).
METADATA = dict(
    type="load_csv",
    label="Read CSV",
    category="Data I/O",
    params=[
        dict(key="path", label="path", type="text", default=""),
        dict(key="multiple_files", label="read multiple files", type="toggle", default=False),
        dict(key="name", label="name", type="text", default=""),
        dict(key="sep", label="sep", type="text", default=","),
        dict(key="header", label="header (infer / None / int)", type="text", default="infer"),
        dict(key="decimal", label="decimal", type="text", default="."),
        dict(key="thousands", label="thousands", type="text", default=""),
    ],
)
18
+
19
def run(df, params):
    """Read one CSV file, or every ``*.csv`` file of a folder, into a DataFrame.

    Args:
        df: Incoming DataFrame. It is not read; the loaded data replaces it.
        params: Block settings. Keys read: ``path``, ``name``, ``sep``
            (the two characters ``\\t`` mean a tab), ``decimal``,
            ``thousands``, ``multiple_files`` and ``header`` (``"infer"``,
            ``"None"`` or an integer row number; anything else falls back
            to ``"infer"``).

    Returns:
        Tuple ``(df, code)``: the loaded DataFrame and a list of source lines
        that perform the same read with pandas.

    Raises:
        FileNotFoundError: ``multiple_files`` is set and ``path`` contains no
            ``*.csv`` file.
    """
    import glob

    # str(... or "") keeps the block usable when a parameter arrives as None.
    path = str(params.get("path", "") or "").strip()
    name = str(params.get("name", "") or "").strip()
    sep = params.get("sep", ",") or ","
    decimal = params.get("decimal", ".") or "."
    thousands = params.get("thousands", "") or None
    multiple = params.get("multiple_files", False)

    raw_header = str(params.get("header", "infer")).strip()
    if raw_header == "None":
        header = None
    else:
        try:
            header = int(raw_header)
        except ValueError:
            # Covers "infer", the empty string and any unparsable text.
            header = "infer"

    if sep == "\\t":
        sep = "\t"

    read_kwargs = {
        "sep": sep,
        "header": header,
        "decimal": decimal,
        "thousands": thousands,
    }

    # Every literal in the generated code goes through repr() so that
    # backslashes, quotes and tabs are escaped into valid Python source.
    if multiple:
        pattern = os.path.join(path, "*.csv")
        files = sorted(glob.glob(pattern))
        if not files:
            raise FileNotFoundError(f"No se encontraron archivos CSV en: {path}")
        df = pd.concat(
            [pd.read_csv(f, **read_kwargs) for f in files],
            ignore_index=True,
        )
        kw_src = ", ".join(f"{k}={v!r}" for k, v in read_kwargs.items())
        code = [
            "import glob",
            f"files = glob.glob({pattern!r})",
            f"df = pd.concat([pd.read_csv(f, {kw_src}) for f in sorted(files)], ignore_index=True)",
        ]
    else:
        full_path = os.path.join(path, name) if name else path
        df = pd.read_csv(full_path, **read_kwargs)
        code = [
            "df = pd.read_csv(",
            f" {full_path!r},",
            f" sep={sep!r},",
            f" header={header!r},",
            f" decimal={decimal!r},",
            f" thousands={thousands!r}",
            ")",
        ]

    return df, code
@@ -0,0 +1,47 @@
1
+ import pandas as pd
2
+ import os
3
+
4
# Descriptor of the "Read Excel" block: its type id, display label, category
# and the list of parameters (key, label, widget type, default value).
METADATA = dict(
    type="read_excel",
    label="Read Excel",
    category="Data I/O",
    params=[
        dict(key="path", label="path", type="text", default=""),
        dict(key="multiple_files", label="read multiple files", type="toggle", default=False),
        dict(key="name", label="name", type="text", default=""),
        dict(key="header", label="header (None / int)", type="text", default="0"),
    ],
)
15
+
16
def run(df, params):
    """Read one Excel file, or every ``*.xlsx`` file of a folder, into a DataFrame.

    Args:
        df: Incoming DataFrame. It is not read; the loaded data replaces it.
        params: Block settings. Keys read: ``path``, ``name``,
            ``multiple_files`` and ``header`` (``"None"`` or a non-negative
            integer; anything else falls back to ``0``).

    Returns:
        Tuple ``(df, code)``: the loaded DataFrame and a list of source lines
        that perform the same read with pandas.

    Raises:
        FileNotFoundError: ``multiple_files`` is set and ``path`` contains no
            ``*.xlsx`` file.
    """
    import glob

    # str(... or "") keeps the block usable when a parameter arrives as None.
    path = str(params.get("path", "") or "").strip()
    name = str(params.get("name", "") or "").strip()
    multiple = params.get("multiple_files", False)

    raw_header = str(params.get("header", "0")).strip()
    if raw_header == "None":
        header = None
    elif raw_header.isdigit():
        header = int(raw_header)
    else:
        header = 0

    # Paths go through repr() so backslashes and quotes are escaped in the
    # generated source instead of producing an invalid string literal.
    if multiple:
        pattern = os.path.join(path, "*.xlsx")
        files = sorted(glob.glob(pattern))
        if not files:
            raise FileNotFoundError(f"No se encontraron archivos Excel en: {path}")
        df = pd.concat(
            [pd.read_excel(f, header=header) for f in files],
            ignore_index=True,
        )
        code = [
            "import glob",
            f"files = glob.glob({pattern!r})",
            f"df = pd.concat([pd.read_excel(f, header={header!r}) for f in sorted(files)], ignore_index=True)",
        ]
    else:
        full_path = os.path.join(path, name) if name else path
        df = pd.read_excel(full_path, header=header)
        code = [
            "df = pd.read_excel(",
            f" {full_path!r},",
            f" header={header!r}",
            ")",
        ]

    return df, code
@@ -0,0 +1,25 @@
1
# Descriptor of the "Rename Columns" block: its type id, display label,
# category and its single text parameter.
METADATA = dict(
    type="rename_columns",
    label="Rename Columns",
    category="Columns",
    params=[
        dict(key="mapping", label="mapping", type="text", default=""),
    ],
)
9
+
10
def run(df, params):
    """Rename columns according to an ``old:new|old:new`` mapping string.

    Args:
        df: DataFrame whose columns are renamed (a renamed copy is returned).
        params: Block settings; ``mapping`` holds ``|``-separated
            ``old:new`` pairs. Pairs without ``:`` or with an empty side
            are skipped.

    Returns:
        Tuple ``(df, code)``: the renamed DataFrame and the one-line pandas
        equivalent.

    Raises:
        ValueError: no usable pair was found in ``mapping``.
    """
    renames = {}
    for chunk in params.get("mapping", "").split("|"):
        source, colon, target = chunk.strip().partition(":")
        if not colon:
            continue
        source = source.strip()
        target = target.strip()
        if source and target:
            renames[source] = target

    if len(renames) == 0:
        raise ValueError("Define al menos un renombrado.")

    renamed = df.rename(columns=renames)
    return renamed, [f"df = df.rename(columns={renames})"]
@@ -0,0 +1,19 @@
1
# Descriptor of the "Reorder Columns" block: its type id, display label,
# category and its single text parameter.
METADATA = dict(
    type="reorder_columns",
    label="Reorder Columns",
    category="Columns",
    params=[
        dict(key="columns", label="columns", type="text", default=""),
    ],
)
9
+
10
def run(df, params):
    """Return the DataFrame restricted to the listed columns, in that order.

    Args:
        df: Source DataFrame.
        params: Block settings; ``columns`` is a comma-separated list of
            column names (blank entries are ignored).

    Returns:
        Tuple ``(df, code)``: ``df[columns]`` and the one-line pandas
        equivalent.

    Raises:
        ValueError: ``columns`` is empty, or names a column that is not in
            ``df``.
    """
    order = list(filter(None, map(str.strip, params.get("columns", "").split(","))))
    if len(order) == 0:
        raise ValueError("Define el orden de columnas.")

    absent = []
    for column in order:
        if column not in df.columns:
            absent.append(column)
    if absent:
        raise ValueError(f"Columnas no encontradas: {absent}")

    reordered = df[order]
    return reordered, [f"df = df[{order}]"]
@@ -0,0 +1,154 @@
1
+ import json
2
+ import math
3
+ import pandas as pd
4
+
5
# Descriptor of the "Replace Values" block: its type id, display label,
# category and the list of parameters (key, label, widget type, default).
METADATA = dict(
    type="replace_values",
    label="Replace Values",
    category="DataFrame",
    params=[
        dict(key="target_col", label="Column to replace", type="text", default=""),
        dict(key="with_type", label="With type", type="text", default="number"),
        dict(key="with_value", label="Value", type="text", default=""),
        dict(key="with_format", label="Datetime format", type="text", default="%Y-%m-%d %H:%M:%S"),
        dict(key="conditions", label="Conditions (JSON)", type="text", default="[]"),
    ],
)
17
+
18
+
19
+ # ── Condition mask builder ────────────────────────────────────────────────────
20
+ def _parse_cond_value(raw, vtype, df):
21
+ if vtype == 'column':
22
+ return df[raw] if raw in df.columns else raw
23
+ if vtype == 'number':
24
+ try:
25
+ return float(raw) if '.' in str(raw) else int(raw)
26
+ except (ValueError, TypeError):
27
+ return raw
28
+ if vtype == 'datetime':
29
+ try:
30
+ return pd.Timestamp(raw)
31
+ except Exception:
32
+ return raw
33
+ return raw # string
34
+
35
+
36
+ def _single_mask(df, cond):
37
+ col = cond.get('column', '')
38
+ operator = cond.get('operator', '==')
39
+ vtype = cond.get('type', 'string')
40
+ raw_val = cond.get('value', '')
41
+ negate = cond.get('not', False)
42
+
43
+ if col not in df.columns:
44
+ return pd.Series([True] * len(df), index=df.index)
45
+
46
+ s = df[col]
47
+
48
+ if operator == 'isna':
49
+ mask = s.isna()
50
+ elif operator == 'notna':
51
+ mask = s.notna()
52
+ elif operator == 'isin':
53
+ vals = [v.strip() for v in str(raw_val).split(',')]
54
+ mask = s.isin(vals)
55
+ else:
56
+ val = _parse_cond_value(raw_val, vtype, df)
57
+ ops = {'==': s == val, '!=': s != val,
58
+ '<': s < val, '<=': s <= val,
59
+ '>': s > val, '>=': s >= val}
60
+ mask = ops.get(operator, pd.Series([True] * len(df), index=df.index))
61
+
62
+ return ~mask if negate else mask
63
+
64
+
65
+ def _build_mask(df, conds):
66
+ if not conds:
67
+ return pd.Series([True] * len(df), index=df.index)
68
+ mask = _single_mask(df, conds[0])
69
+ for cond in conds[1:]:
70
+ m2 = _single_mask(df, cond)
71
+ logical = cond.get('logical', 'and')
72
+ if logical == 'or':
73
+ mask = mask | m2
74
+ elif logical == 'xor':
75
+ mask = mask ^ m2
76
+ else:
77
+ mask = mask & m2
78
+ return mask
79
+
80
+
81
+ def _mask_repr(conds):
82
+ if not conds:
83
+ return "slice(None)"
84
+ parts = []
85
+ for c in conds:
86
+ col = c.get('column', '')
87
+ op = c.get('operator', '==')
88
+ val = c.get('value', '')
89
+ not_ = 'NOT ' if c.get('not') else ''
90
+ parts.append(f"{not_}df['{col}'] {op} {repr(val)}")
91
+ return ' & '.join(f"({p})" for p in parts)
92
+
93
+
94
+ # ── Main run ──────────────────────────────────────────────────────────────────
95
def run(df, params):
    """Overwrite values of one column on the rows selected by the conditions.

    The DataFrame is modified in place and also returned.

    Args:
        df: DataFrame to modify.
        params: Block settings. Keys read:
            ``target_col``: column to write into.
            ``with_type``: ``'number'``, ``'na'``, ``'string'``,
            ``'datetime'`` or ``'column'``; any other value changes nothing.
            ``with_value``: replacement value (a column name for
            ``'column'``).
            ``with_format``: only echoed as a comment in the generated code.
            ``conditions``: JSON text of a list of condition dicts, or the
            list itself; empty means every row.

    Returns:
        Tuple ``(df, code)``: the modified DataFrame and the generated
        source lines.

    Raises:
        ValueError: ``target_col`` is missing from ``df``; ``conditions`` is
            not valid JSON or not a list; the datetime value cannot be
            parsed; the source column does not exist.
    """
    target_col = str(params.get('target_col', '') or '').strip()
    with_type = params.get('with_type', 'number')
    raw_value = params.get('with_value', '')
    with_format = str(params.get('with_format', '%Y-%m-%d %H:%M:%S') or '').strip()

    if not target_col or target_col not in df.columns:
        raise ValueError(f"replace_values: columna '{target_col}' no encontrada en el DataFrame.")

    # Malformed conditions must not degrade to "no conditions": that selects
    # every row and would overwrite the whole column.
    conds = params.get('conditions', '[]')
    if isinstance(conds, str):
        text = conds.strip()
        try:
            conds = json.loads(text) if text else []
        except json.JSONDecodeError as err:
            raise ValueError(f"replace_values: 'conditions' no es JSON válido: {err}") from err
    if conds is None:
        conds = []
    if not isinstance(conds, list):
        raise ValueError("replace_values: 'conditions' debe ser una lista de condiciones.")

    mask = _build_mask(df, conds)
    mask_repr = _mask_repr(conds)
    # repr() escapes quotes/backslashes in column names for the generated code.
    target_src = repr(target_col)
    code = []

    # ── number ──
    if with_type == 'number':
        text = str(raw_value).strip()
        try:
            replacement = int(text)
        except ValueError:
            try:
                replacement = float(text)
            except ValueError:
                replacement = 0
        if isinstance(replacement, float) and not math.isfinite(replacement):
            # nan/inf have no literal form; keep the historical fallback.
            replacement = 0
        df.loc[mask, target_col] = replacement
        code = [f"df.loc[{mask_repr}, {target_src}] = {replacement!r}"]

    # ── na (NaN) ──
    elif with_type == 'na':
        df.loc[mask, target_col] = float('nan')
        code = [f"df.loc[{mask_repr}, {target_src}] = float('nan')"]

    # ── string ──
    elif with_type == 'string':
        df.loc[mask, target_col] = raw_value
        code = [f"df.loc[{mask_repr}, {target_src}] = {raw_value!r}"]

    # ── datetime ──
    elif with_type == 'datetime':
        try:
            replacement = pd.Timestamp(raw_value)
        except Exception:
            raise ValueError(f"replace_values: no se pudo parsear '{raw_value}' como datetime.")
        df.loc[mask, target_col] = replacement
        fmt_comment = f"  # format: {with_format}" if with_format else ""
        code = [f"df.loc[{mask_repr}, {target_src}] = pd.Timestamp({raw_value!r}){fmt_comment}"]

    # ── column (copy values from another column) ──
    elif with_type == 'column':
        src_col = str(raw_value).strip()
        if src_col not in df.columns:
            raise ValueError(f"replace_values: columna fuente '{src_col}' no encontrada.")
        # .values drops the index so the assignment is positional.
        df.loc[mask, target_col] = df.loc[mask, src_col].values
        code = [f"df.loc[{mask_repr}, {target_src}] = df.loc[{mask_repr}, {src_col!r}]"]

    else:
        code = [f"# replace_values: with_type='{with_type}' no reconocido"]

    return df, code
@@ -0,0 +1,68 @@
1
# Descriptor of the "Resample" block: its type id, display label, category
# and the list of parameters (key, label, widget type, default value).
METADATA = dict(
    type="resample",
    label="Resample",
    category="Resampling",
    params=[
        dict(key="rule", label="Rule (ej: 1S, 1T, 2H, 1D)", type="text", default=""),
        dict(key="sparse", label="Sparse resampling", type="text", default="true"),
        dict(key="closed", label="closed", type="text", default="None"),
        dict(key="label", label="label", type="text", default="None"),
        dict(key="kind", label="kind", type="text", default="None"),
        dict(key="origin", label="origin", type="text", default="epoch"),
        dict(key="offset", label="offset", type="text", default=""),
    ],
)

# Signal to main.py that this block returns a Resampler, not a DataFrame
IS_RESAMPLER = True
18
+
19
+
20
+ def _parse_bool(val):
21
+ if isinstance(val, bool):
22
+ return val
23
+ return str(val).strip().lower() not in ("false", "0", "")
24
+
25
+
26
def run(df, params):
    """Create a pandas Resampler for ``df`` from the block settings.

    When ``sparse`` is on (anything but ``false``/``0``/empty) only ``rule``
    is used. Otherwise ``closed``, ``label``, ``kind``, ``origin`` and
    ``offset`` are forwarded to ``DataFrame.resample`` whenever they are set
    (the text ``"None"`` means unset for the first three).

    Args:
        df: DataFrame with a ``DatetimeIndex``.
        params: Block settings (keys listed above plus ``rule``).

    Returns:
        Tuple ``(resampler, code)``: the un-aggregated Resampler object and
        the one-line pandas equivalent.

    Raises:
        ValueError: ``rule`` is empty or the index is not a DatetimeIndex.
    """
    import pandas as pd

    def _text(key, fallback):
        # Missing/empty settings collapse to the fallback before stripping.
        return (params.get(key, fallback) or fallback).strip()

    rule = _text("rule", "")
    sparse = params.get("sparse", "true")
    if not isinstance(sparse, bool):
        sparse = str(sparse).strip().lower() not in ("false", "0", "")
    closed = _text("closed", "None")
    label_val = _text("label", "None")
    kind = _text("kind", "None")
    origin = _text("origin", "epoch")
    offset = _text("offset", "")

    if rule == "":
        raise ValueError("resample: debes especificar una regla (e.g. '1T', '1H', '1D').")

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError(
            "resample: el índice debe ser DatetimeIndex. "
            "Usa el bloque Set Index con una columna datetime antes de este bloque."
        )

    if sparse:
        return df.resample(rule), [f'resampler = df.resample("{rule}")']

    # (keyword, value, text that means "not set") in the order they are emitted.
    candidates = (
        ("closed", closed, "None"),
        ("label", label_val, "None"),
        ("kind", kind, "None"),
        ("origin", origin, ""),
        ("offset", offset, ""),
    )
    options = {}
    for keyword, value, unset in candidates:
        if value and value != unset:
            options[keyword] = value

    resampler = df.resample(rule, **options)
    rendered = [f'"{rule}"'] + [f"{k}={repr(v)}" for k, v in options.items()]
    code = [f"resampler = df.resample({', '.join(rendered)})"]

    # Return the Resampler object — aggregation is done by the Aggregate block
    return resampler, code
@@ -0,0 +1,49 @@
1
# Descriptor of the "Sample Rows" block: its type id, display label, category
# and the list of parameters (key, label, widget type, default value).
METADATA = dict(
    type="sample_rows",
    label="Sample Rows",
    category="DataFrame",
    params=[
        dict(key="n", label="n", type="text", default=""),
        dict(key="frac", label="frac", type="text", default=""),
        dict(key="random_state", label="random state", type="text", default=""),
        dict(key="ignore_index", label="ignore index", type="text", default="false"),
    ],
)
12
+
13
+
14
+ def _parse_bool(val):
15
+ if isinstance(val, bool):
16
+ return val
17
+ return str(val).strip().lower() == "true"
18
+
19
+
20
def run(df, params):
    """Return a random sample of the rows of ``df``.

    Args:
        df: DataFrame to sample from.
        params: Block settings. Keys read: ``n`` (integer row count),
            ``frac`` (fraction of rows; takes precedence over ``n``),
            ``random_state`` (integer seed) and ``ignore_index``. Empty
            values and the text ``none`` mean "not set"; with neither ``n``
            nor ``frac`` set, ``DataFrame.sample`` is called without a size.

    Returns:
        Tuple ``(df, code)``: the sampled DataFrame and the one-line pandas
        equivalent.

    Raises:
        ValueError: ``n``, ``frac`` or ``random_state`` is set but is not a
            valid number. The value used to be dropped silently, which made
            the block sample with pandas' default size instead.
    """
    def _setting(key):
        # Text of an optional setting, "" when it is absent or "none".
        text = str(params.get(key, "") or "").strip()
        return "" if text.lower() == "none" else text

    def _convert(key, text, cast):
        try:
            return cast(text)
        except ValueError as err:
            raise ValueError(f"sample_rows: valor inválido para '{key}': {text!r}.") from err

    n_raw = _setting("n")
    fr_raw = _setting("frac")
    rs_raw = _setting("random_state")

    flag = params.get("ignore_index", False)
    ignore_index = flag if isinstance(flag, bool) else str(flag).strip().lower() == "true"

    kwargs = {"ignore_index": ignore_index}

    # frac takes precedence over n if both are provided
    if fr_raw:
        kwargs["frac"] = _convert("frac", fr_raw, float)
    elif n_raw:
        kwargs["n"] = _convert("n", n_raw, int)

    if rs_raw:
        kwargs["random_state"] = _convert("random_state", rs_raw, int)

    df = df.sample(**kwargs)
    kw_s = ", ".join(f"{k}={v!r}" for k, v in kwargs.items())
    code = [f"df = df.sample({kw_s})"]
    return df, code
@@ -0,0 +1,19 @@
1
# Descriptor of the "Select Columns" block: its type id, display label,
# category and its single text parameter.
METADATA = dict(
    type="select_columns",
    label="Select Columns",
    category="Columns",
    params=[
        dict(key="columns", label="columns", type="text", default=""),
    ],
)
9
+
10
def run(df, params):
    """Keep only the listed columns of ``df``.

    Args:
        df: Source DataFrame.
        params: Block settings; ``columns`` is a comma-separated list of
            column names (blank entries are ignored).

    Returns:
        Tuple ``(df, code)``: ``df[columns]`` and the one-line pandas
        equivalent.

    Raises:
        ValueError: ``columns`` is empty, or names a column that is not in
            ``df``.
    """
    requested = params.get("columns", "")
    trimmed = (piece.strip() for piece in requested.split(","))
    cols = [name for name in trimmed if name]
    if len(cols) == 0:
        raise ValueError("Selecciona al menos una columna.")

    missing = [name for name in cols if name not in df.columns]
    if len(missing) > 0:
        raise ValueError(f"Columnas no encontradas: {missing}")

    subset = df[cols]
    return subset, [f"df = df[{cols}]"]
@@ -0,0 +1,46 @@
1
# Descriptor of the "Set Dtypes" block: its type id, display label, category
# and its single text parameter.
METADATA = dict(
    type="set_dtypes",
    label="Set Dtypes",
    category="DataFrame",
    params=[
        dict(key="mapping", label="mapping", type="text", default=""),
    ],
)
9
+
10
def run(df, params):
    """Convert columns of ``df`` to the requested types, in place.

    Args:
        df: DataFrame to convert; its columns are reassigned in place.
        params: Block settings; ``mapping`` holds ``;``-separated
            ``column:type`` pairs. ``datetime`` and ``numeric`` coerce
            unparsable values to missing, ``categorical`` becomes the
            ``category`` dtype, and any other type is passed to ``astype``
            with ``errors='ignore'``. Pairs without ``:`` or with an empty
            side are skipped.

    Returns:
        Tuple ``(df, code)``: the same DataFrame and the generated source
        lines (two per converted column).

    Raises:
        ValueError: ``mapping`` is empty or contains no usable pair.
    """
    import pandas as pd

    raw = params.get("mapping", "") or ""
    if not raw.strip():
        raise ValueError("Especifica al menos un mapeo columna:tipo.")

    dtype_map = {}
    for pair in raw.split(";"):
        col, colon, dtype = pair.partition(":")
        col, dtype = col.strip(), dtype.strip()
        # An empty column or type can never be applied; skip it rather than
        # failing later on df[""] / astype("").
        if colon and col and dtype:
            dtype_map[col] = dtype

    if not dtype_map:
        raise ValueError("Especifica al menos un mapeo columna:tipo.")

    code = []
    for col, dtype in dtype_map.items():
        # repr() escapes quotes/backslashes in names for the generated code.
        col_src = repr(col)
        if dtype == "datetime":
            df[col] = pd.to_datetime(df[col], format='mixed', errors='coerce')
            converted = "pd.to_datetime(s, format='mixed', errors='coerce')"
        elif dtype == "numeric":
            df[col] = pd.to_numeric(df[col], errors='coerce')
            converted = "pd.to_numeric(s, errors='coerce')"
        elif dtype == "categorical":
            df[col] = df[col].astype("category")
            converted = "s.astype('category')"
        else:
            df[col] = df[col].astype(dtype, errors='ignore')
            # Emit the same errors= policy that was just executed.
            converted = f"s.astype({dtype!r}, errors='ignore')"
        code.append(f"s = df[{col_src}]")
        code.append(f"df[{col_src}] = {converted}")

    return df, code
@@ -0,0 +1,24 @@
1
# Descriptor of the "Set Index" block: its type id, display label, category
# and its single text parameter.
METADATA = dict(
    type="set_index",
    label="Set Index",
    category="Index",
    params=[
        dict(key="columns", label="columns", type="text", default=""),
    ],
)
9
+
10
def run(df, params):
    """Use one or more columns as the index of ``df``, modifying it in place.

    The columns are kept as regular columns as well (``drop=False``).

    Args:
        df: DataFrame whose index is replaced in place.
        params: Block settings; ``columns`` is a comma-separated list of
            column names.

    Returns:
        Tuple ``(df, code)``: the same DataFrame object and the generated
        source lines.

    Raises:
        ValueError: ``columns`` is empty.
    """
    requested = params.get("columns", "").strip()
    if requested == "":
        raise ValueError("Selecciona al menos una columna para usar como índice.")

    index_cols = []
    for piece in requested.split(","):
        piece = piece.strip()
        if piece:
            index_cols.append(piece)

    df.set_index(index_cols, drop=False, inplace=True)

    code = ["df.set_index("]
    code.append(f" {index_cols},")
    code.extend([" drop=False,", " inplace=True,", ")"])
    return df, code
@@ -0,0 +1,49 @@
1
+ import os
2
+
3
# Descriptor of the "Write CSV" block: its type id, display label, category
# and the list of parameters (key, label, widget type, default value).
METADATA = dict(
    type="write_csv",
    label="Write CSV",
    category="Data I/O",
    params=[
        dict(key="path", label="path", type="text", default=""),
        dict(key="name", label="name", type="text", default="output.csv"),
        dict(key="sep", label="sep", type="text", default=","),
        dict(key="na_rep", label="NA rep", type="text", default=""),
        dict(key="float_format", label="float format", type="text", default=""),
        dict(key="header", label="header", type="toggle", default=True),
        dict(key="index", label="index", type="toggle", default=True),
        dict(key="decimal", label="decimal", type="text", default="."),
    ],
)
18
+
19
def run(df, params):
    """Write ``df`` to a CSV file and return it unchanged.

    Args:
        df: DataFrame to write.
        params: Block settings. Keys read: ``path`` (folder, optional),
            ``name`` (file name; blank falls back to ``output.csv``),
            ``sep`` (the two characters ``\\t`` mean a tab), ``na_rep``,
            ``float_format``, ``header``, ``index`` and ``decimal``.

    Returns:
        Tuple ``(df, code)``: the input DataFrame and the generated source
        lines for the equivalent ``to_csv`` call.
    """
    # str(... or "") keeps the block usable when a parameter arrives as None.
    path = str(params.get("path", "") or "").strip()
    # A blank name would make the target resolve to the folder itself.
    name = str(params.get("name", "output.csv") or "").strip() or "output.csv"
    sep = params.get("sep", ",") or ","
    na_rep = params.get("na_rep", "") or ""
    float_format = params.get("float_format", "") or None
    header = params.get("header", True)
    index = params.get("index", True)
    decimal = params.get("decimal", ".") or "."

    if sep == "\\t":
        sep = "\t"

    full_path = os.path.join(path, name) if path else name

    df.to_csv(full_path, sep=sep, na_rep=na_rep, float_format=float_format,
              header=header, index=index, decimal=decimal)

    # Every literal goes through repr() so backslashes, quotes and tabs are
    # escaped into valid Python source.
    code = [
        "df.to_csv(",
        f" {full_path!r},",
        f" sep={sep!r},",
        f" na_rep={na_rep!r},",
        f" float_format={float_format!r},",
        f" header={header!r},",
        f" index={index!r},",
        f" decimal={decimal!r}",
        ")",
    ]
    return df, code