PyPI - dataforge-studio - Versions diffs - 1.0.1__py3-none-any.whl - Mend

dataforge-studio 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

dataforge_studio-1.0.1.dist-info/METADATA +119 -0
dataforge_studio-1.0.1.dist-info/RECORD +30 -0
dataforge_studio-1.0.1.dist-info/WHEEL +5 -0
dataforge_studio-1.0.1.dist-info/entry_points.txt +2 -0
dataforge_studio-1.0.1.dist-info/top_level.txt +1 -0
dataici/__init__.py +3 -0
dataici/blocks/__init__.py +0 -0
dataici/blocks/aggregate.py +50 -0
dataici/blocks/append_column.py +18 -0
dataici/blocks/concatenate.py +70 -0
dataici/blocks/drop_columns.py +19 -0
dataici/blocks/filter_rows.py +120 -0
dataici/blocks/handle_missings.py +160 -0
dataici/blocks/load_csv.py +68 -0
dataici/blocks/read_excel.py +47 -0
dataici/blocks/rename_columns.py +25 -0
dataici/blocks/reorder_columns.py +19 -0
dataici/blocks/replace_values.py +154 -0
dataici/blocks/resample.py +68 -0
dataici/blocks/sample_rows.py +49 -0
dataici/blocks/select_columns.py +19 -0
dataici/blocks/set_dtypes.py +46 -0
dataici/blocks/set_index.py +24 -0
dataici/blocks/write_csv.py +49 -0
dataici/charts.py +202 -0
dataici/cli.py +35 -0
dataici/main.py +349 -0
dataici/static/assets/index-CYGnphoW.js +74 -0
dataici/static/assets/index-DLK3-mBP.css +1 -0
dataici/static/index.html +13 -0

dataici/charts.py ADDED Viewed

@@ -0,0 +1,202 @@
+import io, base64
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import warnings
+warnings.filterwarnings('ignore')
+# Global Matplotlib defaults for every figure rendered by this module:
+# top/right spines hidden, light-grey axis edges, small grey tick labels
+# and white figure/axes backgrounds.
+plt.rcParams.update({
+    'axes.spines.top':    False,
+    'axes.spines.right':  False,
+    'axes.edgecolor':     '#cccccc',
+    'axes.linewidth':     0.8,
+    'xtick.color':        '#666666',
+    'ytick.color':        '#666666',
+    'xtick.labelsize':    9,
+    'ytick.labelsize':    9,
+    'figure.facecolor':   'white',
+    'axes.facecolor':     'white',
+})
+# Colours used by the chart builders below (SCATTER_COLOR is shared by the
+# scatter plot and the datetime line chart).
+SCATTER_COLOR = '#1a56a0'
+HIST_FACE     = '#aec7e8'
+HIST_EDGE     = '#1f77b4'
+BOX_FACE      = '#aec7e8'
+BOX_EDGE      = '#1f77b4'
+# Resolution handed to fig.savefig() in _to_b64.
+DPI           = 150
+def _to_b64(fig):
+    buf = io.BytesIO()
+    fig.savefig(buf, format='png', dpi=DPI, facecolor='white', edgecolor='none',
+                bbox_inches='tight')
+    plt.close(fig)
+    buf.seek(0)
+    return base64.b64encode(buf.read()).decode()
+def make_scatter(col, indices, values):
+    """ALL points — overlap creates density effect like DataBruin."""
+    idx_arr = np.asarray(indices, dtype=np.float64)
+    val_arr = np.asarray(values,  dtype=np.float64)
+    # Remove any remaining NaN/Inf
+    mask    = np.isfinite(idx_arr) & np.isfinite(val_arr)
+    idx_arr, val_arr = idx_arr[mask], val_arr[mask]
+    if len(idx_arr) == 0:
+        return None
+    x_max = float(idx_arr.max())
+    fig, ax = plt.subplots(figsize=(14, 5))
+    ax.scatter(idx_arr, val_arr,
+               s=8, c=SCATTER_COLOR, alpha=0.6,
+               linewidths=0, rasterized=True)
+    ax.set_ylabel(col, fontsize=9, color='#444')
+    ax.xaxis.set_major_formatter(
+        ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
+    ax.set_xlim(left=-x_max * 0.02, right=x_max * 1.02)
+    ax.margins(y=0.08)
+    fig.subplots_adjust(left=0.08, right=0.99, top=0.96, bottom=0.10)
+    return _to_b64(fig)
+def make_histogram(values):
+    """Histogram — fewer bins like DataBruin."""
+    arr = np.asarray(values, dtype=np.float64)
+    arr = arr[np.isfinite(arr)]
+    if len(arr) == 0:
+        return None
+    # DataBruin visually shows ~10-15 bins
+    # Use Scott's rule which gives fewer bins than Sturges for large n
+    n_bins = int(np.ceil(np.log2(len(arr)) + 1))
+    n_bins = max(5, min(n_bins, 20))  # cap at 20, min 5
+    fig, ax = plt.subplots(figsize=(6, 3.5))
+    ax.hist(arr, bins=n_bins, color=HIST_FACE, edgecolor=HIST_EDGE, linewidth=0.4)
+    ax.set_ylabel('Frequency', fontsize=8, color='#444')
+    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x:g}'))
+    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
+    ax.margins(x=0.02)
+    fig.subplots_adjust(left=0.14, right=0.97, top=0.95, bottom=0.12)
+    return _to_b64(fig)
+def make_boxplot(col, values):
+    """Horizontal boxplot with vertical label."""
+    arr = np.asarray(values, dtype=np.float64)
+    arr = arr[np.isfinite(arr)]
+    if len(arr) < 2:
+        return None
+    fig, ax = plt.subplots(figsize=(6, 2.4))
+    ax.boxplot(arr, vert=False, patch_artist=True, widths=0.55,
+        flierprops=dict(marker='o', markersize=3, markerfacecolor='white',
+                        markeredgecolor='#555', markeredgewidth=0.8, alpha=0.5),
+        medianprops=dict(color=BOX_EDGE, linewidth=2.5),
+        boxprops=dict(facecolor=BOX_FACE, edgecolor=BOX_EDGE, linewidth=1.5),
+        whiskerprops=dict(color='#333', linewidth=1.5),
+        capprops=dict(color='#333', linewidth=1.5),
+    )
+    ax.set_yticks([1])
+    ax.set_yticklabels([col], fontsize=7, rotation=90, va='center')
+    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x:g}'))
+    ax.margins(x=0.04)
+    fig.subplots_adjust(left=0.10, right=0.97, top=0.93, bottom=0.18)
+    return _to_b64(fig)
+def make_datetime_line(col, indices, timestamps_ms):
+    """Line chart for datetime columns."""
+    import datetime
+    pairs = [(i, datetime.datetime.fromtimestamp(t / 1000))
+             for i, t in zip(indices, timestamps_ms) if t is not None]
+    if not pairs:
+        return None
+    idxs, dates = zip(*pairs)
+    fig, ax = plt.subplots(figsize=(11, 4))
+    ax.plot(idxs, dates, color=SCATTER_COLOR, linewidth=1)
+    import matplotlib.dates as mdates
+    ax.yaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
+    ax.xaxis.set_major_formatter(
+        ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
+    ax.set_xlim(left=0)
+    ax.margins(y=0.08)
+    fig.subplots_adjust(left=0.12, right=0.98, top=0.95, bottom=0.12)
+    return _to_b64(fig)
+def generate_column_charts(df, col):
+    out = {}
+    try:
+        s     = df[col].copy()
+        dtype = str(s.dtype)
+        n     = len(df)
+        is_num = np.issubdtype(s.dtype, np.number)
+        is_dt  = 'datetime' in dtype
+        # Try numeric conversion for object columns
+        if not is_num and not is_dt:
+            try:
+                s_conv = pd.to_numeric(s, errors='coerce')
+                if s_conv.notna().sum() > len(s) * 0.5:
+                    s      = s_conv
+                    is_num = True
+            except Exception:
+                pass
+        if is_num:
+            try:
+                arr        = s.to_numpy(dtype=np.float64, na_value=np.nan)
+                mask       = ~np.isnan(arr)
+                valid_idxs = np.where(mask)[0]
+                valid_vals = arr[mask]
+                if len(valid_vals) == 0:
+                    return out
+                # Scatter — all points
+                try:
+                    out['scatter'] = make_scatter(col, valid_idxs, valid_vals)
+                except Exception as e:
+                    print(f"[charts] scatter error for '{col}': {e}")
+                # Histogram
+                try:
+                    out['hist'] = make_histogram(valid_vals)
+                except Exception as e:
+                    print(f"[charts] hist error for '{col}': {e}")
+                # Boxplot — needs at least 2 unique values
+                try:
+                    if len(np.unique(valid_vals)) >= 2:
+                        out['box'] = make_boxplot(col, valid_vals)
+                except Exception as e:
+                    print(f"[charts] box error for '{col}': {e}")
+            except Exception as e:
+                print(f"[charts] numeric processing error for '{col}': {e}")
+        elif is_dt:
+            try:
+                sz   = min(5000, n)
+                step = max(1, n // sz)
+                idxs = list(range(0, n, step))[:sz]
+                samp = s.iloc[idxs]
+                ts_list = []
+                for v in samp:
+                    try:
+                        ts_list.append(int(pd.Timestamp(v).timestamp() * 1000))
+                    except Exception:
+                        ts_list.append(None)
+                img = make_datetime_line(col, idxs, ts_list)
+                if img:
+                    out['scatter'] = img
+            except Exception as e:
+                print(f"[charts] datetime error for '{col}': {e}")
+    except Exception as e:
+        print(f"[charts] top-level error for '{col}': {e}")
+    return out

dataici/cli.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""
+DataForge — punto de entrada de línea de comandos.
+Uso:
+    dataici                 # corre en 127.0.0.1:8000
+    dataici --port 8080     # puerto personalizado
+"""
+import argparse
+import uvicorn
+def main():
+    parser = argparse.ArgumentParser(
+        prog="dataici",
+        description="DataForge — Studio de Preprocesamiento de Datos (UAH)",
+    )
+    parser.add_argument("--host", default="127.0.0.1", help="Host (default: 127.0.0.1)")
+    parser.add_argument("--port", default=8000, type=int, help="Puerto (default: 8000)")
+    args = parser.parse_args()
+    url = f"http://{args.host}:{args.port}"
+    print(f"\n  🚀 DataForge corriendo en {url}")
+    print(f"  → Vuelve a la página y haz clic en 'Abrir DataForge'\n")
+    uvicorn.run(
+        "dataici.main:app",
+        host=args.host,
+        port=args.port,
+        reload=False,
+        log_level="warning",
+    )
+if __name__ == "__main__":
+    main()

dataici/main.py ADDED Viewed

@@ -0,0 +1,349 @@
+from fastapi import FastAPI, APIRouter
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+from typing import Any, Optional
+from collections import defaultdict, deque
+import importlib
+import math
+import json
+import os
+import uuid
+from datetime import datetime
+# ── Paths ─────────────────────────────────────────────────────────────────────
+# Directory containing this module; the bundled static/ and blocks/ folders
+# are resolved relative to it.
+_HERE = os.path.dirname(__file__)
+# Projects stored in ~/.dataici/projects.json so they survive package updates
+_DATAICI_DIR  = os.path.join(os.path.expanduser("~"), ".dataici")
+# Import-time side effect: the directory is created as soon as this module loads.
+os.makedirs(_DATAICI_DIR, exist_ok=True)
+PROJECTS_FILE = os.path.join(_DATAICI_DIR, "projects.json")
+# Static frontend files (pre-built, bundled with the package)
+STATIC_DIR = os.path.join(_HERE, "static")
+def _load_projects():
+    if not os.path.exists(PROJECTS_FILE):
+        return {}
+    with open(PROJECTS_FILE, "r", encoding="utf-8") as f:
+        return json.load(f)
+def _save_projects(projects):
+    with open(PROJECTS_FILE, "w", encoding="utf-8") as f:
+        json.dump(projects, f, ensure_ascii=False, indent=2)
+def _sanitize(obj):
+    """Recursively replace NaN/Inf floats with None so JSON serialization never fails."""
+    if isinstance(obj, float):
+        if math.isnan(obj) or math.isinf(obj):
+            return None
+        return obj
+    if isinstance(obj, dict):
+        return {k: _sanitize(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_sanitize(v) for v in obj]
+    return obj
+app = FastAPI(title="DataICI API")
+# NOTE(review): wildcard origins/methods/headers let any web page open in the
+# user's browser call this local API. Confirm this is required (the CLI tells
+# the user to return to an external page) or restrict the allowed origins.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Process-wide store: run_pipeline saves its last result under "df" and
+# get_charts reads it back.
+_cache = {}
+# ── All API routes live under /api ────────────────────────────────────────────
+api = APIRouter(prefix="/api")
+# One pipeline node as received in a /run request.
+class NodeDef(BaseModel):
+    # Node identifier; edges refer to nodes by this value.
+    id: str
+    # Block name; run_pipeline imports the module dataici.blocks.<type>.
+    type: str
+    # Block parameters; when the block defines METADATA, run_pipeline keeps
+    # only the keys listed in METADATA["params"].
+    params: dict[str, Any] = {}
+# A directed connection from one node's output to another node's input.
+class EdgeDef(BaseModel):
+    # Ids of the upstream (source) and downstream (target) nodes.
+    source: str
+    target: str
+    # Not read by the code visible in this file.
+    sourceHandle: Optional[str] = None
+    # Input slot on the target node; run_pipeline uses "input-0" when unset.
+    targetHandle: Optional[str] = None
+# Request body of POST /api/run: the whole graph to execute.
+class PipelineRequest(BaseModel):
+    # Every node of the graph; run_pipeline orders them topologically itself.
+    nodes: list[NodeDef]
+    # Connections between the nodes above.
+    edges: list[EdgeDef]
+@api.get("/health")
+def root():
+    return {"status": "DataICI backend running", "version": "1.0.0"}
+# ── Project management ────────────────────────────────────────────────────────
+@api.get("/projects")
+def list_projects():
+    projects = _load_projects()
+    return sorted(projects.values(), key=lambda p: p["created_at"], reverse=True)
+@api.post("/projects")
+def create_project(body: dict):
+    projects = _load_projects()
+    pid  = str(uuid.uuid4())[:8]
+    now  = datetime.now().isoformat()
+    proj = {
+        "id": pid,
+        "name":        body.get("name", "Sin nombre"),
+        "description": body.get("description", ""),
+        "created_at":  now,
+        "modified_at": now,
+        "nodes": [],
+        "edges": [],
+    }
+    projects[pid] = proj
+    _save_projects(projects)
+    return proj
+@api.get("/projects/{pid}")
+def get_project(pid: str):
+    projects = _load_projects()
+    if pid not in projects:
+        return {"error": "Proyecto no encontrado"}
+    return projects[pid]
+@api.put("/projects/{pid}")
+def update_project(pid: str, body: dict):
+    projects = _load_projects()
+    if pid not in projects:
+        return {"error": "Proyecto no encontrado"}
+    projects[pid].update({
+        "nodes":       body.get("nodes",  projects[pid]["nodes"]),
+        "edges":       body.get("edges",  projects[pid]["edges"]),
+        "modified_at": datetime.now().isoformat(),
+    })
+    _save_projects(projects)
+    return projects[pid]
+@api.delete("/projects/{pid}")
+def delete_project(pid: str):
+    projects = _load_projects()
+    if pid in projects:
+        del projects[pid]
+        _save_projects(projects)
+    return {"ok": True}
+# ── Resampler helpers ─────────────────────────────────────────────────────────
+def _is_resampler(obj):
+    try:
+        from pandas.core.resample import DatetimeIndexResampler
+        return isinstance(obj, DatetimeIndexResampler)
+    except ImportError:
+        pass
+    return hasattr(obj, "_selected_obj") and hasattr(obj, "mean") and not hasattr(obj, "to_dict")
+def _resampler_to_display(resampler):
+    windows = []
+    for key, group in resampler:
+        if group.empty:
+            continue
+        group_head = group.head(5)
+        raw_idx_name = group.index.name or "Time"
+        idx_col = raw_idx_name if raw_idx_name not in group.columns else f"{raw_idx_name}__idx"
+        group_reset = group_head.rename_axis(idx_col).reset_index()
+        data_cols   = list(group.columns)
+        windows.append({
+            "timestamp": str(key),
+            "n_rows":    int(len(group)),
+            "index_col": idx_col,
+            "columns":   data_cols,
+            "data":      group_reset.where(group_reset.notna(), other=None).to_dict(orient="records"),
+        })
+        if len(windows) >= 4:
+            break
+    return {
+        "is_resampler": True,
+        "n_windows":    len(windows),
+        "windows":      windows,
+    }
+@api.post("/run")
+def run_pipeline(req: PipelineRequest):
+    import pandas as pd
+    import numpy as np
+    nodes_by_id = {n.id: n for n in req.nodes}
+    incoming  = defaultdict(dict)
+    outgoing  = defaultdict(list)
+    for edge in req.edges:
+        handle = edge.targetHandle or "input-0"
+        incoming[edge.target][handle] = edge.source
+        outgoing[edge.source].append(edge.target)
+    in_degree  = {n.id: 0 for n in req.nodes}
+    for edge in req.edges:
+        in_degree[edge.target] += 1
+    queue      = deque([n.id for n in req.nodes if in_degree[n.id] == 0])
+    topo_order = []
+    while queue:
+        nid = queue.popleft()
+        topo_order.append(nid)
+        for nb in outgoing[nid]:
+            in_degree[nb] -= 1
+            if in_degree[nb] == 0:
+                queue.append(nb)
+    if len(topo_order) != len(req.nodes):
+        return {"error": "El pipeline tiene un ciclo o nodos sin conectar."}
+    results  = {}
+    all_code = ["import pandas as pd", ""]
+    last_result = None
+    for nid in topo_order:
+        node   = nodes_by_id[nid]
+        params = dict(node.params)
+        try:
+            module = importlib.import_module(f"dataici.blocks.{node.type}")
+            importlib.reload(module)
+            meta = getattr(module, "METADATA", {})
+            if meta:
+                valid_keys = {p["key"] for p in meta.get("params", [])}
+                params = {k: v for k, v in params.items() if k in valid_keys}
+            multi_input   = meta.get("multi_input", False)
+            node_incoming = incoming.get(nid, {})
+            if multi_input:
+                sorted_handles = sorted(
+                    node_incoming.keys(),
+                    key=lambda h: int(h.split("-")[1]) if h and "-" in h else 0,
+                )
+                input_dfs = [results[node_incoming[h]] for h in sorted_handles if node_incoming.get(h) in results]
+                if len(input_dfs) < 2:
+                    return {"error": f"'{node.type}' necesita al menos 2 entradas conectadas."}
+                result, code_lines = module.run(input_dfs, params)
+            else:
+                if node_incoming:
+                    src_id = node_incoming.get("input-0") or list(node_incoming.values())[0]
+                    result_in = results.get(src_id)
+                else:
+                    result_in = None
+                result, code_lines = module.run(result_in, params)
+            results[nid] = result
+            last_result = result
+            all_code.extend(code_lines)
+            all_code.append("")
+        except ModuleNotFoundError:
+            return {"error": f"Bloque '{node.type}' no encontrado."}
+        except Exception as e:
+            return {"error": str(e)}
+    if last_result is None:
+        return {"error": "Pipeline vacío o sin resultado."}
+    if _is_resampler(last_result):
+        resampler_info = _resampler_to_display(last_result)
+        _cache["df"] = last_result._selected_obj
+        return _sanitize({
+            **resampler_info,
+            "code": "\n".join(all_code).strip(),
+        })
+    df = last_result
+    _cache["df"] = df
+    df_safe = df.where(df.notna(), other=None)
+    try:
+        describe = df.describe(include="all").fillna("").astype(str).to_dict()
+    except Exception:
+        describe = {}
+    for col in df.select_dtypes(include=["datetime64"]).columns:
+        try:
+            s = df[col].dropna()
+            describe[col] = {
+                "count": str(len(s)), "mean": str(s.mean()), "min": str(s.min()),
+                "25%": str(s.quantile(0.25)), "50%": str(s.median()),
+                "75%": str(s.quantile(0.75)), "max": str(s.max()),
+            }
+        except Exception:
+            pass
+    box_stats = {}
+    for col in df.select_dtypes(include="number").columns:
+        try:
+            clean = df[col].dropna().astype(float)
+            box_stats[col] = {
+                "q1": float(clean.quantile(0.25)), "med": float(clean.quantile(0.5)),
+                "q3": float(clean.quantile(0.75)), "min": float(clean.min()),
+                "max": float(clean.max()), "count": int(len(clean)),
+                "missing": int(df[col].isnull().sum()),
+            }
+        except Exception:
+            pass
+    value_counts = {}
+    for col in df.columns:
+        try:
+            vc = df[col].value_counts(dropna=True).head(20)
+            value_counts[col] = [
+                {"value": str(k), "count": int(v), "pct": round(float(v)/len(df), 6)}
+                for k, v in vc.items()
+            ]
+        except Exception:
+            pass
+    index_names = [n for n in df.index.names if n is not None]
+    return _sanitize({
+        "data":         df_safe.head(100).to_dict(orient="records"),
+        "columns":      list(df.columns),
+        "index_names":  index_names,
+        "shape":        list(df.shape),
+        "nulls":        int(df.isnull().sum().sum()),
+        "dtypes":       {col: str(dtype) for col, dtype in df.dtypes.items()},
+        "describe":     describe,
+        "box_stats":    box_stats,
+        "value_counts": value_counts,
+        "code":         "\n".join(all_code).strip(),
+    })
+@api.get("/charts")
+def get_charts(col: str):
+    df = _cache.get("df")
+    if df is None:
+        return {"error": "No hay datos. Ejecuta el pipeline primero."}
+    if col not in df.columns:
+        return {"error": f"Columna '{col}' no encontrada."}
+    try:
+        from dataici.charts import generate_column_charts
+        return generate_column_charts(df, col)
+    except Exception as e:
+        return {"error": str(e)}
+@api.get("/blocks")
+def list_blocks():
+    blocks = []
+    blocks_dir = os.path.join(_HERE, "blocks")
+    for fname in sorted(os.listdir(blocks_dir)):
+        if fname.endswith(".py") and not fname.startswith("_"):
+            try:
+                mod = importlib.import_module(f"dataici.blocks.{fname[:-3]}")
+                if hasattr(mod, "METADATA"):
+                    blocks.append(mod.METADATA)
+            except Exception:
+                pass
+    return blocks
+# ── Register API router ───────────────────────────────────────────────────────
+# Registered before the catch-all below so the defined /api routes match first.
+app.include_router(api)
+# ── Serve built frontend ──────────────────────────────────────────────────────
+# The frontend routes are only added when the bundled static directory exists.
+if os.path.exists(STATIC_DIR):
+    assets_dir = os.path.join(STATIC_DIR, "assets")
+    if os.path.exists(assets_dir):
+        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
+    # NOTE(review): this catch-all also answers GET paths under /api that no
+    # API route defines, returning index.html instead of a 404 — confirm that
+    # is intended.
+    @app.get("/{full_path:path}", include_in_schema=False)
+    def serve_spa(full_path: str):
+        """Return index.html for any GET path (presumably for client-side routing)."""
+        return FileResponse(os.path.join(STATIC_DIR, "index.html"))