dataforge-studio 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_studio-1.0.1.dist-info/METADATA +119 -0
- dataforge_studio-1.0.1.dist-info/RECORD +30 -0
- dataforge_studio-1.0.1.dist-info/WHEEL +5 -0
- dataforge_studio-1.0.1.dist-info/entry_points.txt +2 -0
- dataforge_studio-1.0.1.dist-info/top_level.txt +1 -0
- dataici/__init__.py +3 -0
- dataici/blocks/__init__.py +0 -0
- dataici/blocks/aggregate.py +50 -0
- dataici/blocks/append_column.py +18 -0
- dataici/blocks/concatenate.py +70 -0
- dataici/blocks/drop_columns.py +19 -0
- dataici/blocks/filter_rows.py +120 -0
- dataici/blocks/handle_missings.py +160 -0
- dataici/blocks/load_csv.py +68 -0
- dataici/blocks/read_excel.py +47 -0
- dataici/blocks/rename_columns.py +25 -0
- dataici/blocks/reorder_columns.py +19 -0
- dataici/blocks/replace_values.py +154 -0
- dataici/blocks/resample.py +68 -0
- dataici/blocks/sample_rows.py +49 -0
- dataici/blocks/select_columns.py +19 -0
- dataici/blocks/set_dtypes.py +46 -0
- dataici/blocks/set_index.py +24 -0
- dataici/blocks/write_csv.py +49 -0
- dataici/charts.py +202 -0
- dataici/cli.py +35 -0
- dataici/main.py +349 -0
- dataici/static/assets/index-CYGnphoW.js +74 -0
- dataici/static/assets/index-DLK3-mBP.css +1 -0
- dataici/static/index.html +13 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
# Descriptor of the "Read CSV" block: its type id, display label, category and
# one entry per parameter (key, label, input type, default). The keys listed
# here are the ones run() looks up in its `params` argument.
METADATA = {
    "type": "load_csv",
    "label": "Read CSV",
    "category": "Data I/O",
    "params": [
        {"key": "path", "label": "path", "type": "text", "default": ""},
        {"key": "multiple_files", "label": "read multiple files", "type": "toggle", "default": False},
        {"key": "name", "label": "name", "type": "text", "default": ""},
        {"key": "sep", "label": "sep", "type": "text", "default": ","},
        {"key": "header", "label": "header (infer / None / int)", "type": "text", "default": "infer"},
        {"key": "decimal", "label": "decimal", "type": "text", "default": "."},
        {"key": "thousands", "label": "thousands", "type": "text", "default": ""},
    ]
}
|
|
18
|
+
|
|
19
|
+
def run(df, params):
    """Read one CSV file, or every ``*.csv`` file in a folder, with pandas.

    Args:
        df: Incoming DataFrame. It is not read; the loaded data replaces it.
        params: Block parameters. Keys read here: ``path``, ``name``,
            ``multiple_files``, ``sep`` (a backslash followed by ``t`` is
            turned into a real tab), ``header`` (``"infer"``, ``"None"`` or
            an integer; anything else falls back to ``"infer"``),
            ``decimal`` and ``thousands`` (empty means none).

    Returns:
        tuple: ``(df, code)`` where ``code`` is a list of Python source lines
        that repeat the same read.

    Raises:
        FileNotFoundError: In multi-file mode, when the folder contains no
            ``*.csv`` file.
    """
    path = (params.get("path") or "").strip()
    name = (params.get("name") or "").strip()
    sep = params.get("sep", ",") or ","
    decimal = params.get("decimal", ".") or "."
    thousands = params.get("thousands", "") or None
    multiple = params.get("multiple_files", False)

    # str() lets a caller pass the header as an int or None instead of text.
    raw_header = str(params.get("header", "infer")).strip()
    if raw_header == "None":
        header = None
    elif raw_header in ("infer", ""):
        header = "infer"
    else:
        try:
            header = int(raw_header)
        except ValueError:
            header = "infer"

    # The UI can only deliver the two characters backslash + t.
    if sep == "\\t":
        sep = "\t"

    read_kwargs = {
        "sep": sep,
        "header": header,
        "decimal": decimal,
        "thousands": thousands,
    }

    if multiple:
        import glob

        pattern = os.path.join(path, "*.csv")
        files = sorted(glob.glob(pattern))
        if not files:
            raise FileNotFoundError(f"No se encontraron archivos CSV en: {path}")
        frames = [pd.read_csv(f, **read_kwargs) for f in files]
        df = pd.concat(frames, ignore_index=True)
        # repr() escapes quotes, backslashes and tabs, so the snippet is
        # always valid Python and carries every option that was really used.
        kwargs_src = ", ".join(f"{key}={value!r}" for key, value in read_kwargs.items())
        code = [
            "import glob",
            f"files = glob.glob({pattern!r})",
            f"df = pd.concat([pd.read_csv(f, {kwargs_src}) for f in sorted(files)], ignore_index=True)",
        ]
    else:
        full_path = os.path.join(path, name) if name else path
        df = pd.read_csv(full_path, **read_kwargs)
        code = [
            "df = pd.read_csv(",
            f" {full_path!r},",
            f" sep={sep!r},",
            f" header={header!r},",
            f" decimal={decimal!r},",
            f" thousands={thousands!r}",
            ")",
        ]

    return df, code
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
# Descriptor of the "Read Excel" block: type id, display label, category and
# one entry per parameter (key, label, input type, default). run() looks these
# keys up in its `params` argument.
METADATA = {
    "type": "read_excel",
    "label": "Read Excel",
    "category": "Data I/O",
    "params": [
        {"key": "path", "label": "path", "type": "text", "default": ""},
        {"key": "multiple_files", "label": "read multiple files", "type": "toggle", "default": False},
        {"key": "name", "label": "name", "type": "text", "default": ""},
        {"key": "header", "label": "header (None / int)", "type": "text", "default": "0"},
    ]
}
|
|
15
|
+
|
|
16
|
+
def run(df, params):
    """Read one Excel file, or every ``*.xlsx`` file in a folder, with pandas.

    Args:
        df: Incoming DataFrame. It is not read; the loaded data replaces it.
        params: Block parameters. Keys read here: ``path``, ``name``,
            ``multiple_files`` and ``header`` (``"None"`` or a non-negative
            integer; anything else falls back to row 0).

    Returns:
        tuple: ``(df, code)`` where ``code`` is a list of Python source lines
        that repeat the same read.

    Raises:
        FileNotFoundError: In multi-file mode, when the folder contains no
            ``*.xlsx`` file.
    """
    path = (params.get("path") or "").strip()
    name = (params.get("name") or "").strip()
    multiple = params.get("multiple_files", False)

    # str() lets a caller pass the header as an int or None instead of text.
    raw_header = str(params.get("header", "0")).strip()
    if raw_header == "None":
        header = None
    elif raw_header.isdigit():
        header = int(raw_header)
    else:
        header = 0

    if multiple:
        import glob

        pattern = os.path.join(path, "*.xlsx")
        files = sorted(glob.glob(pattern))
        if not files:
            raise FileNotFoundError(f"No se encontraron archivos Excel en: {path}")
        frames = [pd.read_excel(f, header=header) for f in files]
        df = pd.concat(frames, ignore_index=True)
        # repr() keeps the snippet valid when the path holds quotes or
        # backslashes (e.g. Windows paths).
        code = [
            "import glob",
            f"files = glob.glob({pattern!r})",
            f"df = pd.concat([pd.read_excel(f, header={header!r}) for f in sorted(files)], ignore_index=True)",
        ]
    else:
        full_path = os.path.join(path, name) if name else path
        df = pd.read_excel(full_path, header=header)
        code = [
            "df = pd.read_excel(",
            f" {full_path!r},",
            f" header={header!r}",
            ")",
        ]

    return df, code
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Descriptor of the "Rename Columns" block. Its single text parameter
# `mapping` is parsed by run() as `old:new` pairs separated by `|`.
METADATA = {
    "type": "rename_columns",
    "label": "Rename Columns",
    "category": "Columns",
    "params": [
        {"key": "mapping", "label": "mapping", "type": "text", "default": ""},
    ]
}
|
|
9
|
+
|
|
10
|
+
def run(df, params):
    """Rename columns according to an ``old:new|old2:new2`` mapping string.

    Each ``|``-separated chunk is split on its first colon. Chunks without a
    colon, or with an empty side after trimming, are skipped.

    Returns:
        tuple: the renamed DataFrame and the matching ``df.rename`` source line.

    Raises:
        ValueError: If no usable pair is left after parsing.
    """
    renames = {}
    for chunk in params.get("mapping", "").split("|"):
        source, colon, target = chunk.partition(":")
        source = source.strip()
        target = target.strip()
        if colon and source and target:
            renames[source] = target

    if len(renames) == 0:
        raise ValueError("Define al menos un renombrado.")

    renamed = df.rename(columns=renames)
    return renamed, [f"df = df.rename(columns={renames})"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Descriptor of the "Reorder Columns" block. Its single text parameter
# `columns` is read by run() as a comma-separated list of column names.
METADATA = {
    "type": "reorder_columns",
    "label": "Reorder Columns",
    "category": "Columns",
    "params": [
        {"key": "columns", "label": "columns", "type": "text", "default": ""},
    ]
}
|
|
9
|
+
|
|
10
|
+
def run(df, params):
    """Return the DataFrame restricted to the listed columns, in that order.

    ``params["columns"]`` is a comma-separated list; blank entries are dropped.

    Raises:
        ValueError: If the list is empty or names a column the frame lacks.
    """
    wanted = []
    for token in params.get("columns", "").split(","):
        token = token.strip()
        if token:
            wanted.append(token)

    if len(wanted) == 0:
        raise ValueError("Define el orden de columnas.")

    absent = [label for label in wanted if label not in df.columns]
    if len(absent) > 0:
        raise ValueError(f"Columnas no encontradas: {absent}")

    return df[wanted], [f"df = df[{wanted}]"]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import math
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
# Descriptor of the "Replace Values" block: type id, display label, category
# and one entry per parameter (key, label, input type, default). run() reads
# `target_col`, `with_type`, `with_value`, `with_format` and `conditions`
# (a JSON-encoded list of condition objects).
METADATA = {
    "type": "replace_values",
    "label": "Replace Values",
    "category": "DataFrame",
    "params": [
        {"key": "target_col", "label": "Column to replace", "type": "text", "default": ""},
        {"key": "with_type", "label": "With type", "type": "text", "default": "number"},
        {"key": "with_value", "label": "Value", "type": "text", "default": ""},
        {"key": "with_format", "label": "Datetime format", "type": "text", "default": "%Y-%m-%d %H:%M:%S"},
        {"key": "conditions", "label": "Conditions (JSON)", "type": "text", "default": "[]"},
    ]
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ── Condition mask builder ────────────────────────────────────────────────────
|
|
20
|
+
def _parse_cond_value(raw, vtype, df):
    """Turn a condition's raw value into the object it is compared against.

    Args:
        raw: The value as stored in the condition (usually text).
        vtype: ``'column'``, ``'number'``, ``'datetime'``; anything else is
            treated as a plain string.
        df: DataFrame used to resolve ``'column'`` values.

    Returns:
        ``df[raw]`` for an existing column, an ``int``/``float`` for numbers,
        a ``pd.Timestamp`` for datetimes, and ``raw`` unchanged whenever the
        conversion is not possible.
    """
    if vtype == 'column':
        return df[raw] if raw in df.columns else raw
    if vtype == 'number':
        # Try int first, then float: this also accepts forms without a dot
        # such as "1e3", which the dot test used to leave unparsed.
        text = str(raw).strip()
        for cast in (int, float):
            try:
                return cast(text)
            except ValueError:
                continue
        return raw
    if vtype == 'datetime':
        try:
            return pd.Timestamp(raw)
        except Exception:
            # Deliberately broad: any parse failure falls back to the raw
            # value instead of aborting the mask construction.
            return raw
    return raw  # string
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _single_mask(df, cond):
    """Build the boolean row mask described by a single condition dict.

    Keys read from ``cond``: ``column``, ``operator`` (``isna``, ``notna``,
    ``isin``, ``==``, ``!=``, ``<``, ``<=``, ``>``, ``>=``), ``type``,
    ``value`` and ``not``.

    Returns:
        A boolean Series indexed like ``df``. A condition on a column that is
        not in ``df``, or with an unknown operator, selects every row.
    """
    import operator as _operator

    col = cond.get('column', '')
    op_name = cond.get('operator', '==')
    vtype = cond.get('type', 'string')
    raw_val = cond.get('value', '')
    negate = cond.get('not', False)

    if col not in df.columns:
        return pd.Series([True] * len(df), index=df.index)

    s = df[col]

    if op_name == 'isna':
        mask = s.isna()
    elif op_name == 'notna':
        mask = s.notna()
    elif op_name == 'isin':
        if isinstance(raw_val, (list, tuple)):
            # A JSON array is used element by element, keeping native types.
            vals = [v.strip() if isinstance(v, str) else v for v in raw_val]
        else:
            vals = [v.strip() for v in str(raw_val).split(',')]
        mask = s.isin(vals)
    else:
        comparisons = {
            '==': _operator.eq, '!=': _operator.ne,
            '<': _operator.lt, '<=': _operator.le,
            '>': _operator.gt, '>=': _operator.ge,
        }
        compare = comparisons.get(op_name)
        if compare is None:
            mask = pd.Series([True] * len(df), index=df.index)
        else:
            # Only the requested comparison runs, so an ordering operator
            # that is invalid for this dtype cannot break '==' or '!='.
            mask = compare(s, _parse_cond_value(raw_val, vtype, df))

    return ~mask if negate else mask
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _build_mask(df, conds):
    """Fold a list of conditions into one boolean mask, left to right.

    The first condition seeds the mask. Each later one is merged using its
    own ``logical`` key: ``'or'``, ``'xor'``, or anything else for AND. With
    no conditions every row is selected.
    """
    if not conds:
        return pd.Series([True] * len(df), index=df.index)

    combined = _single_mask(df, conds[0])
    for entry in conds[1:]:
        current = _single_mask(df, entry)
        joiner = entry.get('logical', 'and')
        if joiner == 'xor':
            combined = combined ^ current
        elif joiner == 'or':
            combined = combined | current
        else:
            combined = combined & current
    return combined
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _mask_repr(conds):
    """Render conditions as pandas source code that selects the same rows.

    The expression mirrors the left-to-right folding done when the mask is
    built: each condition joins the accumulated expression with ``&``, ``|``
    or ``^`` according to its ``logical`` key, and negation is written as
    ``~(...)``.

    Returns:
        ``"slice(None)"`` when there are no conditions, otherwise a boolean
        expression over ``df``.
    """
    if not conds:
        return "slice(None)"

    def _value_src(val, vtype):
        # Source text for the right-hand side of a comparison.
        if vtype == 'column':
            # NOTE: assumes the named column exists; when it does not, the
            # executed mask compares against the plain string instead.
            return f"df[{val!r}]"
        if vtype in ('number', 'datetime'):
            parsed = _parse_cond_value(val, vtype, None)
            if parsed is pd.NaT:
                return "pd.NaT"
            if isinstance(parsed, pd.Timestamp):
                return f"pd.Timestamp({val!r})"
            if isinstance(parsed, float) and not math.isfinite(parsed):
                # repr() of nan/inf is not a valid Python literal.
                return f"float({repr(parsed)!r})"
            return repr(parsed)
        return repr(val)

    def _term(c):
        # Source text for one condition, without the outer parentheses.
        series = f"df[{c.get('column', '')!r}]"
        op = c.get('operator', '==')
        val = c.get('value', '')
        if op in ('isna', 'notna'):
            expr = f"{series}.{op}()"
        elif op == 'isin':
            if isinstance(val, (list, tuple)):
                items = [v.strip() if isinstance(v, str) else v for v in val]
            else:
                items = [v.strip() for v in str(val).split(',')]
            expr = f"{series}.isin({items!r})"
        elif op in ('==', '!=', '<', '<=', '>', '>='):
            expr = f"{series} {op} {_value_src(val, c.get('type', 'string'))}"
        else:
            # Unknown operators select every row when the mask is built.
            expr = "pd.Series(True, index=df.index)"
        return f"~({expr})" if c.get('not') else expr

    symbols = {'or': '|', 'xor': '^'}
    result = f"({_term(conds[0])})"
    previous = None
    for c in conds[1:]:
        symbol = symbols.get(c.get('logical', 'and'), '&')
        # Python gives '&', '^' and '|' different precedences; wrap what has
        # been built so far whenever the operator changes, to keep strict
        # left-to-right evaluation.
        if previous is not None and symbol != previous:
            result = f"({result})"
        result = f"{result} {symbol} ({_term(c)})"
        previous = symbol
    return result
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ── Main run ──────────────────────────────────────────────────────────────────
|
|
95
|
+
def run(df, params):
    """Overwrite values of one column on the rows selected by the conditions.

    Args:
        df: DataFrame to modify. The assignment is done in place through
            ``df.loc`` and the same object is returned.
        params: Block parameters. ``target_col`` is the column to write;
            ``with_type`` is one of ``number``, ``na``, ``string``,
            ``datetime`` or ``column``; ``with_value`` is the replacement (or
            the source column name); ``with_format`` is only echoed as a
            comment in the generated code; ``conditions`` is a JSON list of
            condition objects (a list is also accepted as-is). Blank or
            missing conditions select every row.

    Returns:
        tuple: ``(df, code)`` where ``code`` is a list of Python source lines.
        An unrecognised ``with_type`` changes nothing and yields a comment.

    Raises:
        ValueError: If the target or source column is missing, the datetime
            cannot be parsed, or ``conditions`` is not a valid JSON list.
    """
    target_col = (params.get('target_col') or '').strip()
    with_type = params.get('with_type', 'number')
    raw_value = params.get('with_value', '')
    with_format = (params.get('with_format', '%Y-%m-%d %H:%M:%S') or '').strip()

    if not target_col or target_col not in df.columns:
        raise ValueError(f"replace_values: columna '{target_col}' no encontrada en el DataFrame.")

    raw_conds = params.get('conditions', '[]')
    if isinstance(raw_conds, (list, tuple)):
        conds = list(raw_conds)
    elif raw_conds is None or not str(raw_conds).strip():
        conds = []
    else:
        # Malformed conditions must not degrade to "no conditions": that
        # would silently overwrite the whole column.
        try:
            conds = json.loads(raw_conds)
        except (TypeError, ValueError) as exc:
            raise ValueError("replace_values: 'conditions' no es un JSON válido.") from exc
        if conds is None:
            conds = []
        elif not isinstance(conds, list):
            raise ValueError("replace_values: 'conditions' debe ser una lista JSON.")

    mask = _build_mask(df, conds)
    mask_repr = _mask_repr(conds)
    code = []

    # ── number ──
    if with_type == 'number':
        # int first, then float, so "1e3" is a number too. Unparsable text
        # keeps the historical fallback of 0.
        text = str(raw_value).strip()
        try:
            replacement = int(text)
        except ValueError:
            try:
                replacement = float(text)
            except ValueError:
                replacement = 0
        df.loc[mask, target_col] = replacement
        if isinstance(replacement, float) and not math.isfinite(replacement):
            # repr() of nan/inf is not a valid Python literal.
            value_src = f"float({repr(replacement)!r})"
        else:
            value_src = repr(replacement)
        code = [f"df.loc[{mask_repr}, {target_col!r}] = {value_src}"]

    # ── na (NaN) ──
    elif with_type == 'na':
        df.loc[mask, target_col] = float('nan')
        code = [f"df.loc[{mask_repr}, {target_col!r}] = float('nan')"]

    # ── string ──
    elif with_type == 'string':
        df.loc[mask, target_col] = raw_value
        code = [f"df.loc[{mask_repr}, {target_col!r}] = {repr(raw_value)}"]

    # ── datetime ──
    elif with_type == 'datetime':
        try:
            replacement = pd.Timestamp(raw_value)
        except Exception as exc:
            raise ValueError(f"replace_values: no se pudo parsear '{raw_value}' como datetime.") from exc
        df.loc[mask, target_col] = replacement
        fmt_comment = f" # format: {with_format}" if with_format else ""
        code = [f"df.loc[{mask_repr}, {target_col!r}] = pd.Timestamp({repr(raw_value)}){fmt_comment}"]

    # ── column (copy values from another column) ──
    elif with_type == 'column':
        src_col = str(raw_value).strip()
        if src_col not in df.columns:
            raise ValueError(f"replace_values: columna fuente '{src_col}' no encontrada.")
        df.loc[mask, target_col] = df.loc[mask, src_col].values
        code = [f"df.loc[{mask_repr}, {target_col!r}] = df.loc[{mask_repr}, {src_col!r}]"]

    else:
        code = [f"# replace_values: with_type='{with_type}' no reconocido"]

    return df, code
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Descriptor of the "Resample" block: type id, display label, category and one
# entry per parameter (key, label, input type, default). All parameters are
# text inputs; run() treats the literal text "None" as "not set" for
# `closed`, `label` and `kind`.
METADATA = {
    "type": "resample",
    "label": "Resample",
    "category": "Resampling",
    "params": [
        {"key": "rule", "label": "Rule (ej: 1S, 1T, 2H, 1D)", "type": "text", "default": ""},
        {"key": "sparse", "label": "Sparse resampling", "type": "text", "default": "true"},
        {"key": "closed", "label": "closed", "type": "text", "default": "None"},
        {"key": "label", "label": "label", "type": "text", "default": "None"},
        {"key": "kind", "label": "kind", "type": "text", "default": "None"},
        {"key": "origin", "label": "origin", "type": "text", "default": "epoch"},
        {"key": "offset", "label": "offset", "type": "text", "default": ""},
    ]
}

# Signal to main.py that this block returns a Resampler, not a DataFrame
IS_RESAMPLER = True
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _parse_bool(val):
    """Interpret a parameter value as a boolean that defaults to true.

    Real booleans pass through. For anything else the trimmed, lower-cased
    text is false when it is empty or one of ``false``, ``0``, ``no``,
    ``off``; every other text counts as true.
    """
    if isinstance(val, bool):
        return val
    # "no" and "off" are accepted as false as well, so the common ways of
    # writing a negative answer are not read as true.
    return str(val).strip().lower() not in ("false", "0", "no", "off", "")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def run(df, params):
    """Create a pandas Resampler over a DataFrame with a DatetimeIndex.

    With ``sparse`` true only the rule is passed to ``df.resample``.
    Otherwise ``closed``, ``label`` and ``kind`` are forwarded unless they are
    empty or the text ``"None"``, and ``origin`` and ``offset`` are forwarded
    when non-empty.

    Returns:
        tuple: the Resampler (not a DataFrame) and the source line creating it.

    Raises:
        ValueError: If no rule is given or the index is not a DatetimeIndex.
    """
    import pandas as pd

    def _text(key, fallback):
        # Missing, None and empty values all collapse to the fallback.
        return (params.get(key, fallback) or fallback).strip()

    rule = _text("rule", "")
    sparse = _parse_bool(params.get("sparse", "true"))
    optional = (
        ("closed", _text("closed", "None")),
        ("label", _text("label", "None")),
        ("kind", _text("kind", "None")),
    )
    plain = (
        ("origin", _text("origin", "epoch")),
        ("offset", _text("offset", "")),
    )

    if rule == "":
        raise ValueError("resample: debes especificar una regla (e.g. '1T', '1H', '1D').")

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError(
            "resample: el índice debe ser DatetimeIndex. "
            "Usa el bloque Set Index con una columna datetime antes de este bloque."
        )

    options = {}
    if not sparse:
        for key, value in optional:
            if value and value != "None":
                options[key] = value
        for key, value in plain:
            if value:
                options[key] = value

    resampler = df.resample(rule, **options)
    pieces = [f'"{rule}"']
    pieces.extend(f"{key}={repr(value)}" for key, value in options.items())
    code = ["resampler = df.resample(" + ", ".join(pieces) + ")"]

    # The Resampler itself is returned; no aggregation is applied here.
    return resampler, code
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Descriptor of the "Sample Rows" block: type id, display label, category and
# one entry per parameter (key, label, input type, default). All parameters
# are text inputs that run() converts before calling DataFrame.sample.
METADATA = {
    "type": "sample_rows",
    "label": "Sample Rows",
    "category": "DataFrame",
    "params": [
        {"key": "n", "label": "n", "type": "text", "default": ""},
        {"key": "frac", "label": "frac", "type": "text", "default": ""},
        {"key": "random_state", "label": "random state", "type": "text", "default": ""},
        {"key": "ignore_index", "label": "ignore index", "type": "text", "default": "false"},
    ]
}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _parse_bool(val):
    """Interpret a parameter value as a boolean that defaults to false.

    Real booleans pass through. For anything else the trimmed, lower-cased
    text is true when it is one of ``true``, ``1``, ``yes``, ``on``; every
    other text counts as false.
    """
    if isinstance(val, bool):
        return val
    # "1", "yes" and "on" are accepted as true as well, so the common ways of
    # writing a positive answer are not read as false.
    return str(val).strip().lower() in ("true", "1", "yes", "on")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def run(df, params):
    """Draw a random sample of rows with ``DataFrame.sample``.

    ``frac`` wins over ``n`` when both are filled in. A value that is empty,
    the text ``none`` (any case) or not convertible to the expected number is
    left out of the call, so pandas falls back to its own default.

    Returns:
        tuple: the sampled DataFrame and the matching ``df.sample`` line.
    """
    def _field(key):
        return str(params.get(key, "") or "").strip()

    def _given(text):
        return bool(text) and text.lower() not in ("", "none")

    count_text = _field("n")
    frac_text = _field("frac")
    seed_text = _field("random_state")

    options = {"ignore_index": _parse_bool(params.get("ignore_index", False))}

    # (keyword, converter, text) triples, in the order they appear in the call.
    pending = []
    # frac takes precedence over n if both are provided
    if _given(frac_text):
        pending.append(("frac", float, frac_text))
    elif _given(count_text):
        pending.append(("n", int, count_text))
    if _given(seed_text):
        pending.append(("random_state", int, seed_text))

    for keyword, convert, text in pending:
        try:
            options[keyword] = convert(text)
        except ValueError:
            # Unparsable numbers are ignored on purpose (lenient input).
            pass

    sampled = df.sample(**options)
    arguments = ", ".join(f"{key}={repr(value)}" for key, value in options.items())
    return sampled, [f"df = df.sample({arguments})"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Descriptor of the "Select Columns" block. Its single text parameter
# `columns` is read by run() as a comma-separated list of column names.
METADATA = {
    "type": "select_columns",
    "label": "Select Columns",
    "category": "Columns",
    "params": [
        {"key": "columns", "label": "columns", "type": "text", "default": ""},
    ]
}
|
|
9
|
+
|
|
10
|
+
def run(df, params):
    """Keep only the listed columns of the DataFrame, in the given order.

    ``params["columns"]`` is a comma-separated list; blank entries are dropped.

    Raises:
        ValueError: If the list is empty or names a column the frame lacks.
    """
    pieces = (part.strip() for part in params.get("columns", "").split(","))
    chosen = list(filter(None, pieces))

    if len(chosen) == 0:
        raise ValueError("Selecciona al menos una columna.")

    unknown = [label for label in chosen if label not in df.columns]
    if len(unknown) > 0:
        raise ValueError(f"Columnas no encontradas: {unknown}")

    subset = df[chosen]
    return subset, [f"df = df[{chosen}]"]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Descriptor of the "Set Dtypes" block. Its single text parameter `mapping`
# is parsed by run() as `column:dtype` pairs separated by `;`.
METADATA = {
    "type": "set_dtypes",
    "label": "Set Dtypes",
    "category": "DataFrame",
    "params": [
        {"key": "mapping", "label": "mapping", "type": "text", "default": ""},
    ]
}
|
|
9
|
+
|
|
10
|
+
def run(df, params):
    """Convert columns to the dtypes given in a ``col:dtype;col2:dtype2`` string.

    The dtype names ``datetime``, ``numeric`` and ``categorical`` map to
    ``pd.to_datetime``, ``pd.to_numeric`` (both coercing failures to missing
    values) and ``astype("category")``. Any other name is passed to
    ``astype`` with ``errors='ignore'``, which leaves the column untouched
    when the conversion fails. Columns are assigned on ``df`` itself.

    Returns:
        tuple: ``(df, code)`` where ``code`` holds two source lines per
        converted column.

    Raises:
        ValueError: If the mapping is blank or contains no usable pair.
    """
    import pandas as pd

    raw = params.get("mapping", "")
    if not raw.strip():
        raise ValueError("Especifica al menos un mapeo columna:tipo.")

    dtype_map = {}
    for pair in raw.split(";"):
        col, colon, dtype = pair.partition(":")
        col, dtype = col.strip(), dtype.strip()
        # Pairs with a blank side cannot name a real column or dtype.
        if colon and col and dtype:
            dtype_map[col] = dtype

    if not dtype_map:
        raise ValueError("Especifica al menos un mapeo columna:tipo.")

    code = []
    for col, dtype in dtype_map.items():
        code.append(f"s = df[{col!r}]")
        if dtype == "datetime":
            df[col] = pd.to_datetime(df[col], format='mixed', errors='coerce')
            code.append(f"df[{col!r}] = pd.to_datetime(s, format='mixed', errors='coerce')")
        elif dtype == "numeric":
            df[col] = pd.to_numeric(df[col], errors='coerce')
            code.append(f"df[{col!r}] = pd.to_numeric(s, errors='coerce')")
        elif dtype == "categorical":
            df[col] = df[col].astype("category")
            code.append(f"df[{col!r}] = s.astype('category')")
        else:
            df[col] = df[col].astype(dtype, errors='ignore')
            # The snippet carries errors='ignore' too, so replaying it
            # behaves exactly like the conversion above.
            code.append(f"df[{col!r}] = s.astype({dtype!r}, errors='ignore')")

    return df, code
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Descriptor of the "Set Index" block. Its single text parameter `columns`
# is read by run() as a comma-separated list of column names.
METADATA = {
    "type": "set_index",
    "label": "Set Index",
    "category": "Index",
    "params": [
        {"key": "columns", "label": "columns", "type": "text", "default": ""},
    ]
}
|
|
9
|
+
|
|
10
|
+
def run(df, params):
    """Use the listed columns as the index, keeping them as columns too.

    The index is set in place on ``df`` (``drop=False, inplace=True``) and the
    same object is returned.

    Raises:
        ValueError: If ``params["columns"]`` is blank.
    """
    requested = params.get("columns", "").strip()
    if requested == "":
        raise ValueError("Selecciona al menos una columna para usar como índice.")

    index_cols = []
    for part in requested.split(","):
        part = part.strip()
        if part:
            index_cols.append(part)

    df.set_index(index_cols, drop=False, inplace=True)

    code = ["df.set_index("]
    code.append(" " + str(index_cols) + ",")
    code.extend([" drop=False,", " inplace=True,", ")"])
    return df, code
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
# Descriptor of the "Write CSV" block: type id, display label, category and
# one entry per parameter (key, label, input type, default). run() forwards
# these values to DataFrame.to_csv.
METADATA = {
    "type": "write_csv",
    "label": "Write CSV",
    "category": "Data I/O",
    "params": [
        {"key": "path", "label": "path", "type": "text", "default": ""},
        {"key": "name", "label": "name", "type": "text", "default": "output.csv"},
        {"key": "sep", "label": "sep", "type": "text", "default": ","},
        {"key": "na_rep", "label": "NA rep", "type": "text", "default": ""},
        {"key": "float_format", "label": "float format", "type": "text", "default": ""},
        {"key": "header", "label": "header", "type": "toggle", "default": True},
        {"key": "index", "label": "index", "type": "toggle", "default": True},
        {"key": "decimal", "label": "decimal", "type": "text", "default": "."},
    ]
}
|
|
18
|
+
|
|
19
|
+
def run(df, params):
    """Write the DataFrame to a CSV file and pass the DataFrame through.

    Args:
        df: DataFrame to write; returned unchanged.
        params: Block parameters. Keys read here: ``path``, ``name`` (falls
            back to ``output.csv`` when blank), ``sep`` (a backslash followed
            by ``t`` is turned into a real tab), ``na_rep``, ``float_format``
            (empty means none), ``header``, ``index`` and ``decimal``.

    Returns:
        tuple: ``(df, code)`` where ``code`` is a list of Python source lines
        that repeat the same ``to_csv`` call.
    """
    path = (params.get("path") or "").strip()
    # A blank name would turn the target into a directory path.
    name = (params.get("name") or "").strip() or "output.csv"
    sep = params.get("sep", ",") or ","
    na_rep = params.get("na_rep", "") or ""
    float_format = params.get("float_format", "") or None
    header = params.get("header", True)
    index = params.get("index", True)
    decimal = params.get("decimal", ".") or "."

    # The UI can only deliver the two characters backslash + t.
    if sep == "\\t":
        sep = "\t"

    full_path = os.path.join(path, name) if path else name

    df.to_csv(full_path, sep=sep, na_rep=na_rep, float_format=float_format,
              header=header, index=index, decimal=decimal)

    # repr() escapes quotes, backslashes and tabs, so the snippet stays valid
    # Python for Windows paths or a quoted NA marker.
    code = [
        "df.to_csv(",
        f" {full_path!r},",
        f" sep={sep!r},",
        f" na_rep={na_rep!r},",
        f" float_format={float_format!r},",
        f" header={header!r},",
        f" index={index!r},",
        f" decimal={decimal!r}",
        ")",
    ]
    return df, code
|