dataforge-studio 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataici/charts.py ADDED
@@ -0,0 +1,202 @@
1
+ import io, base64
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib
5
+ matplotlib.use('Agg')
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as ticker
8
+ import warnings
9
+ warnings.filterwarnings('ignore')
10
+
11
# Shared look for every figure produced by this module: white background,
# thin grey axes, no top/right spines.
_CHART_RC = {
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'axes.edgecolor': '#cccccc',
    'axes.linewidth': 0.8,
    'axes.spines.top': False,
    'axes.spines.right': False,
}
# Tick colour and label size are identical on both axes.
for _axis in ('xtick', 'ytick'):
    _CHART_RC[f'{_axis}.color'] = '#666666'
    _CHART_RC[f'{_axis}.labelsize'] = 9
plt.rcParams.update(_CHART_RC)

# Output resolution used when figures are saved as PNG.
DPI = 150

# Palette: one dark blue for scatter/line marks; histogram and boxplot share
# the same light-blue fill and mid-blue outline.
SCATTER_COLOR = '#1a56a0'
HIST_FACE = BOX_FACE = '#aec7e8'
HIST_EDGE = BOX_EDGE = '#1f77b4'
30
+
31
+
32
def _to_b64(fig):
    """Render *fig* as a PNG and return it base64-encoded.

    Args:
        fig: the matplotlib figure to render.

    Returns:
        str: the PNG bytes, base64-encoded and decoded to text.

    The figure is closed whether or not rendering succeeds, so a failing
    ``savefig`` does not leave the figure registered with pyplot.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', dpi=DPI, facecolor='white',
                    edgecolor='none', bbox_inches='tight')
    finally:
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode()
39
+
40
+
41
def make_scatter(col, indices, values):
    """Scatter plot of every (index, value) pair, returned as a base64 PNG.

    All points are drawn (no sampling); overlapping translucent markers give
    a density effect, like DataBruin.

    Args:
        col: column name, used as the y-axis label.
        indices: x positions, convertible to float64.
        values: y values, convertible to float64; same length as *indices*.

    Returns:
        str | None: the encoded image, or None when no pair is finite in
        both coordinates.
    """
    idx_arr = np.asarray(indices, dtype=np.float64)
    val_arr = np.asarray(values, dtype=np.float64)
    # Drop pairs where either coordinate is NaN/Inf.
    mask = np.isfinite(idx_arr) & np.isfinite(val_arr)
    idx_arr, val_arr = idx_arr[mask], val_arr[mask]
    if idx_arr.size == 0:
        return None

    x_max = float(idx_arr.max())
    # 2% padding on both sides. When the largest index is 0 (a single point
    # at position 0) fall back to a unit-based pad so the limits differ.
    pad = (x_max if x_max > 0 else 1.0) * 0.02

    fig, ax = plt.subplots(figsize=(14, 5))
    try:
        ax.scatter(idx_arr, val_arr,
                   s=8, c=SCATTER_COLOR, alpha=0.6,
                   linewidths=0, rasterized=True)
        ax.set_ylabel(col, fontsize=9, color='#444')
        ax.xaxis.set_major_formatter(
            ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
        ax.set_xlim(left=-pad, right=x_max + pad)
        ax.margins(y=0.08)
        fig.subplots_adjust(left=0.08, right=0.99, top=0.96, bottom=0.10)
        return _to_b64(fig)
    except Exception:
        # Never leave a half-built figure registered with pyplot.
        plt.close(fig)
        raise
63
+
64
+
65
def make_histogram(values):
    """Frequency histogram of the finite entries of *values*, as a base64 PNG.

    Args:
        values: numbers convertible to float64; NaN/Inf entries are ignored.

    Returns:
        str | None: the encoded image, or None when no finite value remains.
    """
    arr = np.asarray(values, dtype=np.float64)
    arr = arr[np.isfinite(arr)]
    if arr.size == 0:
        return None

    # Sturges' formula (log2(n) + 1), clamped to 5..20 bins, keeps the bin
    # count low like DataBruin's ~10-15 bins.
    n_bins = int(np.ceil(np.log2(arr.size) + 1))
    n_bins = max(5, min(n_bins, 20))

    fig, ax = plt.subplots(figsize=(6, 3.5))
    try:
        ax.hist(arr, bins=n_bins, color=HIST_FACE, edgecolor=HIST_EDGE,
                linewidth=0.4)
        ax.set_ylabel('Frequency', fontsize=8, color='#444')
        ax.xaxis.set_major_formatter(
            ticker.FuncFormatter(lambda x, _: f'{x:g}'))
        ax.yaxis.set_major_formatter(
            ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
        ax.margins(x=0.02)
        fig.subplots_adjust(left=0.14, right=0.97, top=0.95, bottom=0.12)
        return _to_b64(fig)
    except Exception:
        # Never leave a half-built figure registered with pyplot.
        plt.close(fig)
        raise
84
+
85
+
86
def make_boxplot(col, values):
    """Horizontal boxplot with a vertical column label, as a base64 PNG.

    Args:
        col: column name, drawn rotated 90 degrees as the single y tick label.
        values: numbers convertible to float64; NaN/Inf entries are ignored.

    Returns:
        str | None: the encoded image, or None when fewer than two finite
        values remain.
    """
    arr = np.asarray(values, dtype=np.float64)
    arr = arr[np.isfinite(arr)]
    if arr.size < 2:
        return None

    fig, ax = plt.subplots(figsize=(6, 2.4))
    try:
        ax.boxplot(
            arr, vert=False, patch_artist=True, widths=0.55,
            flierprops=dict(marker='o', markersize=3, markerfacecolor='white',
                            markeredgecolor='#555', markeredgewidth=0.8,
                            alpha=0.5),
            medianprops=dict(color=BOX_EDGE, linewidth=2.5),
            boxprops=dict(facecolor=BOX_FACE, edgecolor=BOX_EDGE,
                          linewidth=1.5),
            whiskerprops=dict(color='#333', linewidth=1.5),
            capprops=dict(color='#333', linewidth=1.5),
        )
        # The single box sits at y=1; label it with the column name.
        ax.set_yticks([1])
        ax.set_yticklabels([col], fontsize=7, rotation=90, va='center')
        ax.xaxis.set_major_formatter(
            ticker.FuncFormatter(lambda x, _: f'{x:g}'))
        ax.margins(x=0.04)
        fig.subplots_adjust(left=0.10, right=0.97, top=0.93, bottom=0.18)
        return _to_b64(fig)
    except Exception:
        # Never leave a half-built figure registered with pyplot.
        plt.close(fig)
        raise
107
+
108
+
109
def make_datetime_line(col, indices, timestamps_ms):
    """Line chart of datetime values against row position, as a base64 PNG.

    Args:
        col: column name (currently not drawn on the chart).
        indices: x positions, one per timestamp.
        timestamps_ms: epoch timestamps in milliseconds; ``None`` entries are
            skipped together with their index.

    Returns:
        str | None: the encoded image, or None when every timestamp is None.
    """
    import datetime

    # Convert with plain epoch arithmetic instead of fromtimestamp():
    # generate_column_charts produces these values with Timestamp.timestamp(),
    # so going through the machine's local timezone would shift every point
    # by the local UTC offset. Arithmetic also accepts pre-1970 values on
    # every platform.
    epoch = datetime.datetime(1970, 1, 1)
    pairs = [(i, epoch + datetime.timedelta(milliseconds=t))
             for i, t in zip(indices, timestamps_ms) if t is not None]
    if not pairs:
        return None
    idxs, dates = zip(*pairs)

    import matplotlib.dates as mdates

    fig, ax = plt.subplots(figsize=(11, 4))
    try:
        ax.plot(idxs, dates, color=SCATTER_COLOR, linewidth=1)
        ax.yaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax.xaxis.set_major_formatter(
            ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
        ax.set_xlim(left=0)
        ax.margins(y=0.08)
        fig.subplots_adjust(left=0.12, right=0.98, top=0.95, bottom=0.12)
        return _to_b64(fig)
    except Exception:
        # Never leave a half-built figure registered with pyplot.
        plt.close(fig)
        raise
127
+
128
+
129
def generate_column_charts(df, col):
    """Build the chart images for one column of *df*.

    Numeric columns get ``scatter``, ``hist`` and, when they hold at least
    two distinct values, ``box``. A non-numeric, non-datetime column is also
    treated as numeric when more than half of its entries convert with
    ``pd.to_numeric``. Datetime columns get one line chart, stored under
    ``scatter``. Anything else yields an empty dict.

    Failures are reported with ``print`` and swallowed, so one broken chart
    never prevents the others from being returned.

    Args:
        df: the DataFrame holding the column.
        col: name of the column to chart.

    Returns:
        dict: chart name -> base64 PNG string (a numeric chart may map to
        None when its builder found no finite data).
    """
    out = {}
    try:
        s = df[col].copy()
        n = len(df)

        # pandas' dtype helpers understand extension dtypes such as "Int64";
        # np.issubdtype raises TypeError on those, which used to drop the
        # column's charts entirely. The numpy check is kept for plain numpy
        # dtypes so their classification is unchanged.
        is_num = pd.api.types.is_numeric_dtype(s.dtype) or (
            isinstance(s.dtype, np.dtype) and np.issubdtype(s.dtype, np.number)
        )
        is_dt = pd.api.types.is_datetime64_any_dtype(s.dtype)

        # Try numeric conversion for object columns
        if not is_num and not is_dt:
            try:
                s_conv = pd.to_numeric(s, errors='coerce')
                if s_conv.notna().sum() > len(s) * 0.5:
                    s = s_conv
                    is_num = True
            except Exception:
                # Not convertible: leave the column classified as non-numeric.
                pass

        if is_num:
            try:
                arr = s.to_numpy(dtype=np.float64, na_value=np.nan)
                mask = ~np.isnan(arr)
                valid_idxs = np.where(mask)[0]
                valid_vals = arr[mask]

                if len(valid_vals) == 0:
                    return out

                # Scatter — all points
                try:
                    out['scatter'] = make_scatter(col, valid_idxs, valid_vals)
                except Exception as e:
                    print(f"[charts] scatter error for '{col}': {e}")

                # Histogram
                try:
                    out['hist'] = make_histogram(valid_vals)
                except Exception as e:
                    print(f"[charts] hist error for '{col}': {e}")

                # Boxplot — needs at least 2 unique values
                try:
                    if len(np.unique(valid_vals)) >= 2:
                        out['box'] = make_boxplot(col, valid_vals)
                except Exception as e:
                    print(f"[charts] box error for '{col}': {e}")

            except Exception as e:
                print(f"[charts] numeric processing error for '{col}': {e}")

        elif is_dt:
            try:
                # Nothing to sample from an empty frame (the step computation
                # below would divide by zero).
                if n == 0:
                    return out

                # Evenly sample at most 5000 rows across the WHOLE column.
                # The step is rounded up so the sample never has to be
                # truncated, which would drop the tail of the data.
                sz = min(5000, n)
                step = -(-n // sz)
                idxs = list(range(0, n, step))
                samp = s.iloc[idxs]
                ts_list = []
                for v in samp:
                    try:
                        ts_list.append(int(pd.Timestamp(v).timestamp() * 1000))
                    except Exception:
                        # Missing/unconvertible value: keep the slot so the
                        # timestamps stay aligned with idxs.
                        ts_list.append(None)
                img = make_datetime_line(col, idxs, ts_list)
                if img:
                    out['scatter'] = img
            except Exception as e:
                print(f"[charts] datetime error for '{col}': {e}")

    except Exception as e:
        print(f"[charts] top-level error for '{col}': {e}")

    return out
dataici/cli.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ DataForge — punto de entrada de línea de comandos.
3
+
4
+ Uso:
5
+ dataici # corre en 127.0.0.1:8000
6
+ dataici --port 8080 # puerto personalizado
7
+ """
8
+ import argparse
9
+ import uvicorn
10
+
11
+
12
def main():
    """Command-line entry point: parse --host/--port and start the server.

    Prints a short banner with the URL, then hands control to uvicorn, which
    serves ``dataici.main:app`` until interrupted.
    """
    cli = argparse.ArgumentParser(
        description="DataForge — Studio de Preprocesamiento de Datos (UAH)",
        prog="dataici",
    )
    cli.add_argument("--host", help="Host (default: 127.0.0.1)", default="127.0.0.1")
    cli.add_argument("--port", help="Puerto (default: 8000)", type=int, default=8000)
    opts = cli.parse_args()

    banner = (
        f"\n 🚀 DataForge corriendo en http://{opts.host}:{opts.port}",
        " → Vuelve a la página y haz clic en 'Abrir DataForge'\n",
    )
    print(*banner, sep="\n")

    server_options = {
        "host": opts.host,
        "port": opts.port,
        "reload": False,
        "log_level": "warning",
    }
    uvicorn.run("dataici.main:app", **server_options)


if __name__ == "__main__":
    main()
dataici/main.py ADDED
@@ -0,0 +1,349 @@
1
+ from fastapi import FastAPI, APIRouter
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.responses import FileResponse
5
+ from pydantic import BaseModel
6
+ from typing import Any, Optional
7
+ from collections import defaultdict, deque
8
+ import importlib
9
+ import math
10
+ import json
11
+ import os
12
+ import uuid
13
+ from datetime import datetime
14
+
15
+ # ── Paths ─────────────────────────────────────────────────────────────────────
16
# Directory containing this module; bundled resources are resolved against it.
_HERE = os.path.dirname(__file__)

# Projects stored in ~/.dataici/projects.json so they survive package updates
_DATAICI_DIR = os.path.join(os.path.expanduser("~"), ".dataici")
# Created at import time, so importing this module writes to the home directory.
os.makedirs(_DATAICI_DIR, exist_ok=True)
PROJECTS_FILE = os.path.join(_DATAICI_DIR, "projects.json")

# Static frontend files (pre-built, bundled with the package)
STATIC_DIR = os.path.join(_HERE, "static")
25
+
26
def _load_projects():
    """Return the stored projects, read from ``PROJECTS_FILE``.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist yet.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    # EAFP: open directly rather than exists()+open(), which fails if the
    # file disappears between the two calls.
    try:
        with open(PROJECTS_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
31
+
32
def _save_projects(projects):
    """Persist *projects* to ``PROJECTS_FILE`` as indented UTF-8 JSON.

    The data is written to a sibling temporary file and then moved over the
    real file with ``os.replace``. A failure while serializing or writing
    therefore never leaves a truncated projects file behind.

    Args:
        projects: JSON-serializable mapping of projects.
    """
    tmp_path = f"{PROJECTS_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(projects, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, PROJECTS_FILE)
    except Exception:
        # Do not leave a stale temporary file next to the real one.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
35
+
36
+
37
+ def _sanitize(obj):
38
+ """Recursively replace NaN/Inf floats with None so JSON serialization never fails."""
39
+ if isinstance(obj, float):
40
+ if math.isnan(obj) or math.isinf(obj):
41
+ return None
42
+ return obj
43
+ if isinstance(obj, dict):
44
+ return {k: _sanitize(v) for k, v in obj.items()}
45
+ if isinstance(obj, list):
46
+ return [_sanitize(v) for v in obj]
47
+ return obj
48
+
49
app = FastAPI(title="DataICI API")
# NOTE(review): CORS accepts any origin, method and header. Confirm this is
# intended, given that the /api/run endpoint imports and runs pipeline blocks.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Module-level cache: run_pipeline stores its last result under "df" and
# get_charts reads it back.
_cache = {}

# ── All API routes live under /api ────────────────────────────────────────────
api = APIRouter(prefix="/api")
60
+
61
class NodeDef(BaseModel):
    """One pipeline node as sent in a run request."""

    # Identifier that edges use to refer to this node.
    id: str
    # Block name; run_pipeline imports it as ``dataici.blocks.<type>``.
    type: str
    # Block parameters; run_pipeline drops keys not declared in the block's METADATA.
    params: dict[str, Any] = {}
65
+
66
class EdgeDef(BaseModel):
    """A directed connection from one node's output to another node's input."""

    # Id of the node producing the data.
    source: str
    # Id of the node consuming the data.
    target: str
    sourceHandle: Optional[str] = None
    # Input slot on the target; run_pipeline treats a missing value as "input-0".
    targetHandle: Optional[str] = None
71
+
72
class PipelineRequest(BaseModel):
    """Request body of the run endpoint: the pipeline graph to execute."""

    nodes: list[NodeDef]
    edges: list[EdgeDef]
75
+
76
@api.get("/health")
def root():
    """Health check: return a fixed status message and version string."""
    # NOTE(review): the version is hard-coded here; confirm it is kept in
    # sync with the released package version.
    return {"status": "DataICI backend running", "version": "1.0.0"}
79
+
80
+ # ── Project management ────────────────────────────────────────────────────────
81
@api.get("/projects")
def list_projects():
    """Return all stored projects, most recently created first."""
    stored = list(_load_projects().values())
    stored.sort(key=lambda proj: proj["created_at"], reverse=True)
    return stored
85
+
86
@api.post("/projects")
def create_project(body: dict):
    """Create an empty project and persist it.

    Args:
        body: request JSON; optional ``name`` (default "Sin nombre") and
            ``description`` (default "").

    Returns:
        dict: the new project, with a fresh ``id``, creation/modification
        timestamps and empty ``nodes``/``edges`` lists.
    """
    projects = _load_projects()
    # Ids keep only the first 8 hex characters of a UUID, so a collision with
    # an existing project is possible. Draw again instead of silently
    # overwriting that project.
    pid = uuid.uuid4().hex[:8]
    while pid in projects:
        pid = uuid.uuid4().hex[:8]
    now = datetime.now().isoformat()
    proj = {
        "id": pid,
        "name": body.get("name", "Sin nombre"),
        "description": body.get("description", ""),
        "created_at": now,
        "modified_at": now,
        "nodes": [],
        "edges": [],
    }
    projects[pid] = proj
    _save_projects(projects)
    return proj
103
+
104
@api.get("/projects/{pid}")
def get_project(pid: str):
    """Return the project stored under *pid*, or an error payload if unknown."""
    try:
        return _load_projects()[pid]
    except KeyError:
        return {"error": "Proyecto no encontrado"}
110
+
111
@api.put("/projects/{pid}")
def update_project(pid: str, body: dict):
    """Replace a project's nodes and/or edges and refresh its ``modified_at``.

    Keys missing from *body* keep their stored value. Returns the updated
    project, or an error payload when *pid* is unknown.
    """
    store = _load_projects()
    try:
        project = store[pid]
    except KeyError:
        return {"error": "Proyecto no encontrado"}

    project["nodes"] = body.get("nodes", project["nodes"])
    project["edges"] = body.get("edges", project["edges"])
    project["modified_at"] = datetime.now().isoformat()
    _save_projects(store)
    return project
123
+
124
@api.delete("/projects/{pid}")
def delete_project(pid: str):
    """Delete the project stored under *pid*; unknown ids are ignored.

    Always answers ``{"ok": True}``. The projects file is rewritten only when
    something was actually removed.
    """
    store = _load_projects()
    try:
        del store[pid]
    except KeyError:
        pass
    else:
        _save_projects(store)
    return {"ok": True}
131
+
132
+ # ── Resampler helpers ─────────────────────────────────────────────────────────
133
+ def _is_resampler(obj):
134
+ try:
135
+ from pandas.core.resample import DatetimeIndexResampler
136
+ return isinstance(obj, DatetimeIndexResampler)
137
+ except ImportError:
138
+ pass
139
+ return hasattr(obj, "_selected_obj") and hasattr(obj, "mean") and not hasattr(obj, "to_dict")
140
+
141
+
142
+ def _resampler_to_display(resampler):
143
+ windows = []
144
+ for key, group in resampler:
145
+ if group.empty:
146
+ continue
147
+ group_head = group.head(5)
148
+ raw_idx_name = group.index.name or "Time"
149
+ idx_col = raw_idx_name if raw_idx_name not in group.columns else f"{raw_idx_name}__idx"
150
+ group_reset = group_head.rename_axis(idx_col).reset_index()
151
+ data_cols = list(group.columns)
152
+ windows.append({
153
+ "timestamp": str(key),
154
+ "n_rows": int(len(group)),
155
+ "index_col": idx_col,
156
+ "columns": data_cols,
157
+ "data": group_reset.where(group_reset.notna(), other=None).to_dict(orient="records"),
158
+ })
159
+ if len(windows) >= 4:
160
+ break
161
+ return {
162
+ "is_resampler": True,
163
+ "n_windows": len(windows),
164
+ "windows": windows,
165
+ }
166
+
167
+
168
def _order_nodes(req):
    """Wire the graph and sort its nodes so every node follows its inputs.

    Returns:
        tuple ``(topo_order, incoming, error)``. On success *error* is None,
        *topo_order* lists node ids in execution order and *incoming* maps
        each target id to ``{input handle: source id}``. On failure the first
        two items are None and *error* is a user-facing message.
    """
    known_ids = {n.id for n in req.nodes}
    incoming = defaultdict(dict)
    outgoing = defaultdict(list)
    in_degree = {n.id: 0 for n in req.nodes}

    for edge in req.edges:
        # An edge pointing at a node that is not in the request used to
        # raise an unhandled KeyError below; report it to the caller instead.
        if edge.source not in known_ids or edge.target not in known_ids:
            return None, None, (
                f"La conexión '{edge.source}' → '{edge.target}' "
                "apunta a un nodo que no existe."
            )
        handle = edge.targetHandle or "input-0"
        incoming[edge.target][handle] = edge.source
        outgoing[edge.source].append(edge.target)
        in_degree[edge.target] += 1

    # Repeatedly emit nodes whose inputs have all been emitted already.
    queue = deque([n.id for n in req.nodes if in_degree[n.id] == 0])
    topo_order = []
    while queue:
        nid = queue.popleft()
        topo_order.append(nid)
        for nb in outgoing[nid]:
            in_degree[nb] -= 1
            if in_degree[nb] == 0:
                queue.append(nb)

    if len(topo_order) != len(req.nodes):
        return None, None, "El pipeline tiene un ciclo o nodos sin conectar."
    return topo_order, incoming, None


def _summarize_dataframe(df):
    """Build the preview and statistics payload for a result DataFrame.

    Includes the first 100 rows, column names, named index levels, shape,
    total null count, dtypes, ``describe`` output, box statistics for numeric
    columns and the 20 most frequent values of every column. Statistics are
    best-effort: a column whose statistics fail is left out.
    """
    df_safe = df.where(df.notna(), other=None)

    try:
        describe = df.describe(include="all").fillna("").astype(str).to_dict()
    except Exception:
        describe = {}

    # Datetime columns get their own describe entry, overriding the generic one.
    for col in df.select_dtypes(include=["datetime64"]).columns:
        try:
            s = df[col].dropna()
            describe[col] = {
                "count": str(len(s)), "mean": str(s.mean()), "min": str(s.min()),
                "25%": str(s.quantile(0.25)), "50%": str(s.median()),
                "75%": str(s.quantile(0.75)), "max": str(s.max()),
            }
        except Exception:
            pass  # best-effort: keep whatever describe() produced

    box_stats = {}
    for col in df.select_dtypes(include="number").columns:
        try:
            clean = df[col].dropna().astype(float)
            box_stats[col] = {
                "q1": float(clean.quantile(0.25)), "med": float(clean.quantile(0.5)),
                "q3": float(clean.quantile(0.75)), "min": float(clean.min()),
                "max": float(clean.max()), "count": int(len(clean)),
                "missing": int(df[col].isnull().sum()),
            }
        except Exception:
            pass  # best-effort: skip this column's box statistics

    value_counts = {}
    for col in df.columns:
        try:
            vc = df[col].value_counts(dropna=True).head(20)
            value_counts[col] = [
                {"value": str(k), "count": int(v), "pct": round(float(v) / len(df), 6)}
                for k, v in vc.items()
            ]
        except Exception:
            pass  # best-effort: e.g. unhashable values

    return {
        "data": df_safe.head(100).to_dict(orient="records"),
        "columns": list(df.columns),
        "index_names": [n for n in df.index.names if n is not None],
        "shape": list(df.shape),
        "nulls": int(df.isnull().sum().sum()),
        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
        "describe": describe,
        "box_stats": box_stats,
        "value_counts": value_counts,
    }


@api.post("/run")
def run_pipeline(req: PipelineRequest):
    """Execute the pipeline graph and return a preview of its final result.

    Nodes run in dependency order. Each node's block module is imported from
    ``dataici.blocks`` (and reloaded), its ``run`` function is called with
    the upstream result(s) and the node's parameters, and the code lines it
    returns are appended to the generated script. The result of the last
    node in that order is cached for the charts endpoint and summarized.

    Returns:
        dict: on success, the summary payload plus ``code``; for a resampler
        result, a window preview plus ``code``; on any failure,
        ``{"error": message}``.
    """
    nodes_by_id = {n.id: n for n in req.nodes}
    topo_order, incoming, error = _order_nodes(req)
    if error:
        return {"error": error}

    results = {}
    all_code = ["import pandas as pd", ""]
    last_result = None

    for nid in topo_order:
        node = nodes_by_id[nid]
        params = dict(node.params)
        module_name = f"dataici.blocks.{node.type}"
        try:
            module = importlib.import_module(module_name)
            importlib.reload(module)
            meta = getattr(module, "METADATA", {})
            if meta:
                # Only pass parameters the block declares.
                valid_keys = {p["key"] for p in meta.get("params", [])}
                params = {k: v for k, v in params.items() if k in valid_keys}
            multi_input = meta.get("multi_input", False)
            node_incoming = incoming.get(nid, {})

            if multi_input:
                # Feed inputs ordered by the number in their "input-N" handle.
                sorted_handles = sorted(
                    node_incoming.keys(),
                    key=lambda h: int(h.split("-")[1]) if h and "-" in h else 0,
                )
                input_dfs = [results[node_incoming[h]] for h in sorted_handles if node_incoming.get(h) in results]
                if len(input_dfs) < 2:
                    return {"error": f"'{node.type}' necesita al menos 2 entradas conectadas."}
                result, code_lines = module.run(input_dfs, params)
            else:
                if node_incoming:
                    src_id = node_incoming.get("input-0") or list(node_incoming.values())[0]
                    result_in = results.get(src_id)
                else:
                    result_in = None
                result, code_lines = module.run(result_in, params)

            results[nid] = result
            last_result = result
            all_code.extend(code_lines)
            all_code.append("")

        except ModuleNotFoundError as exc:
            # Only call the block "not found" when the missing module is the
            # block itself (or one of its parent packages). A missing
            # dependency of an existing block is reported by name instead.
            missing = exc.name
            if missing is None or module_name == missing or module_name.startswith(missing + "."):
                return {"error": f"Bloque '{node.type}' no encontrado."}
            return {"error": str(exc)}
        except Exception as e:
            return {"error": str(e)}

    if last_result is None:
        return {"error": "Pipeline vacío o sin resultado."}

    code = "\n".join(all_code).strip()

    if _is_resampler(last_result):
        resampler_info = _resampler_to_display(last_result)
        # Charts are drawn from the data the resampler was built on.
        _cache["df"] = last_result._selected_obj
        return _sanitize({**resampler_info, "code": code})

    df = last_result
    _cache["df"] = df
    return _sanitize({**_summarize_dataframe(df), "code": code})
310
+
311
@api.get("/charts")
def get_charts(col: str):
    """Return the chart images for column *col* of the last pipeline result.

    Answers an error payload when no result is cached, when the column is
    unknown, or when chart generation raises.
    """
    cached = _cache.get("df")
    if cached is None:
        return {"error": "No hay datos. Ejecuta el pipeline primero."}
    if col not in cached.columns:
        return {"error": f"Columna '{col}' no encontrada."}

    try:
        from dataici.charts import generate_column_charts
        charts = generate_column_charts(cached, col)
    except Exception as exc:
        return {"error": str(exc)}
    return charts
323
+
324
@api.get("/blocks")
def list_blocks():
    """Return the METADATA of every block module bundled under ``blocks/``.

    Modules are visited in file-name order; files starting with ``_`` and
    modules without a ``METADATA`` attribute are skipped. A module that
    fails to import is skipped too, with the reason printed so a broken
    block does not vanish without a trace.

    Returns:
        list: one METADATA object per loadable block (empty when the blocks
        directory does not exist).
    """
    blocks_dir = os.path.join(_HERE, "blocks")
    if not os.path.isdir(blocks_dir):
        return []

    blocks = []
    for fname in sorted(os.listdir(blocks_dir)):
        if not fname.endswith(".py") or fname.startswith("_"):
            continue
        try:
            mod = importlib.import_module(f"dataici.blocks.{fname[:-3]}")
        except Exception as e:
            print(f"[blocks] could not import '{fname}': {e}")
            continue
        if hasattr(mod, "METADATA"):
            blocks.append(mod.METADATA)
    return blocks
337
+
338
+ # ── Register API router ───────────────────────────────────────────────────────
339
app.include_router(api)

# ── Serve built frontend ──────────────────────────────────────────────────────
# Only wired up when the pre-built frontend is present next to this module.
if os.path.exists(STATIC_DIR):
    assets_dir = os.path.join(STATIC_DIR, "assets")
    if os.path.exists(assets_dir):
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    @app.get("/{full_path:path}", include_in_schema=False)
    def serve_spa(full_path: str):
        """Catch-all GET route: answer any path with the frontend's index.html.

        *full_path* is ignored; the same file is returned for every path.
        """
        return FileResponse(os.path.join(STATIC_DIR, "index.html"))