rowbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rowbase/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """Rowbase SDK — declare data pipelines as Python functions."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from rowbase.config import get_config as _get_config
6
+ from rowbase.dataset import dataset
7
+ from rowbase.errors import RowbaseError
8
+ from rowbase.pipeline import pipeline
9
+ from rowbase.source import source
10
+
11
+
12
class _ConfigProxy:
    """Lazy proxy so `rowbase.config.get(...)` works without explicit loading."""

    def get(self, key: str, default: object = None) -> object:
        # Resolve the real config on every call so changes made after import
        # time are always visible to callers.
        return _get_config().get(key, default)
17
+
18
+
19
# Module-level singleton so callers can write `rowbase.config.get("key")`.
config = _ConfigProxy()

__all__ = ["RowbaseError", "config", "dataset", "pipeline", "source"]
22
+
23
+
24
def connect(api_key: str | None = None) -> None:
    """Connect to the Rowbase platform. No-op in Phase 1."""
    # NOTE(review): intentionally empty — the body is just this docstring,
    # so calling connect() currently has no effect at all.
File without changes
@@ -0,0 +1,83 @@
1
+ """Scoped pipeline context registry.
2
+
3
+ The @pipeline decorator creates a PipelineContext and sets it as the current
4
+ context via a ContextVar. source() and @dataset register into this context
5
+ during pipeline discovery.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import contextvars
11
+ from dataclasses import dataclass, field
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable
16
+
17
+ from pydantic import BaseModel
18
+
19
+
20
@dataclass
class SourceMetadata:
    """Metadata for a registered source."""

    # Unique source name within the pipeline context.
    name: str
    # Expected columns: a bare list of names, a name -> type mapping, or
    # None when no column contract is declared.
    columns: list[str] | dict[str, type] | None = None
    description: str = ""
    # Options forwarded to the file reader — exact keys depend on the
    # reader implementation; not validated here.
    reader_options: dict[str, Any] | None = None
    # When True, a run may proceed without this source being supplied.
    optional: bool = False
29
+
30
+
31
@dataclass
class DatasetMetadata:
    """Metadata for a registered dataset."""

    # Unique dataset name within the pipeline context.
    name: str
    # The user function that computes this dataset.
    fn: Callable[..., Any]
    # Optional pydantic model used to validate the dataset.
    schema: type[BaseModel] | None = None
    # Policy applied when schema validation fails; "fail" is the default.
    # Accepted values are enforced elsewhere, not in this module.
    on_schema_error: str = "fail"
    description: str = ""
    # Names of upstream sources/datasets this dataset consumes.
    depends_on: list[str] = field(default_factory=list)
    # NOTE(review): despite its name this is a bool flag (default True);
    # its meaning is not visible from this module — confirm with callers.
    metadata: bool = True
42
+
43
+
44
@dataclass
class PipelineContext:
    """Scoped registry for a single pipeline's sources and datasets."""

    sources: dict[str, SourceMetadata] = field(default_factory=dict)
    datasets: dict[str, DatasetMetadata] = field(default_factory=dict)
    # Names explicitly marked as published outputs of the pipeline.
    published: set[str] = field(default_factory=set)

    def register_source(self, meta: SourceMetadata) -> None:
        # Keyed by name; last registration wins when a name is reused.
        self.sources[meta.name] = meta

    def register_dataset(self, meta: DatasetMetadata) -> None:
        # Keyed by name; last registration wins when a name is reused.
        self.datasets[meta.name] = meta

    def mark_published(self, name: str) -> None:
        self.published.add(name)

    @property
    def all_names(self) -> set[str]:
        """All registered names — sources and datasets combined."""
        return set(self.sources) | set(self.datasets)
64
+
65
+
66
# A ContextVar (not a plain module global) so concurrent or nested pipeline
# discovery each observe their own "current" context.
_current_context: contextvars.ContextVar[PipelineContext | None] = contextvars.ContextVar(
    "_current_context", default=None
)
69
+
70
+
71
def get_current_context() -> PipelineContext:
    """Get the current pipeline context. Raises if called outside a pipeline function."""
    current = _current_context.get()
    # Guard clause: a missing context means registration was attempted
    # outside pipeline discovery.
    if current is not None:
        return current
    raise RuntimeError(
        "source() and @dataset must be called inside a @pipeline-decorated function."
    )
79
+
80
+
81
def set_current_context(ctx: PipelineContext | None) -> contextvars.Token[PipelineContext | None]:
    """Set the current pipeline context. Returns a token for resetting.

    Pass the returned token to `_current_context.reset()` to restore the
    previous context once pipeline discovery finishes.
    """
    return _current_context.set(ctx)
File without changes
rowbase/api/client.py ADDED
@@ -0,0 +1,212 @@
1
+ """Rowbase API client — deploy pipelines and submit runs to the platform."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from rowbase.errors import RowbaseError
13
+
14
# Default points at a local dev server; override via ROWBASE_API_URL.
DEFAULT_API_URL = "http://localhost:8000"
# Where `rowbase auth login` persists the API key.
CREDENTIALS_PATH = Path.home() / ".rowbase" / "credentials.json"
16
+
17
+
18
def _get_api_url() -> str:
    """Resolve the API base URL: env override first, else the built-in default."""
    configured = os.environ.get("ROWBASE_API_URL")
    return configured if configured is not None else DEFAULT_API_URL
20
+
21
+
22
def _get_api_key() -> str | None:
    """Resolve the API key: ROWBASE_API_KEY env var first, then the credentials file.

    Returns None when no key is configured anywhere.
    """
    key = os.environ.get("ROWBASE_API_KEY")
    if key:
        return key
    if CREDENTIALS_PATH.exists():
        # A corrupt or hand-edited credentials file should read as
        # "not authenticated" rather than crash every CLI command
        # (previously json.JSONDecodeError propagated to the user).
        try:
            data = json.loads(CREDENTIALS_PATH.read_text())
        except (OSError, json.JSONDecodeError):
            return None
        if isinstance(data, dict):
            return data.get("api_key")
        return None
    return None
30
+
31
+
32
def save_credentials(api_key: str) -> None:
    """Save API key to ~/.rowbase/credentials.json with owner-only permissions."""
    CREDENTIALS_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Create the file 0o600 from the start: the previous write_text()
    # followed by chmod() left a window where the key was world-readable
    # on a shared machine.
    fd = os.open(CREDENTIALS_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as fh:
        json.dump({"api_key": api_key}, fh)
    # If the file pre-existed with looser permissions, tighten them too
    # (os.open's mode only applies on creation).
    CREDENTIALS_PATH.chmod(0o600)
37
+
38
+
39
def clear_credentials() -> None:
    """Remove stored credentials."""
    # unlink(missing_ok=True) is a no-op when no credentials file exists,
    # so no exists() pre-check is needed.
    CREDENTIALS_PATH.unlink(missing_ok=True)
43
+
44
+
45
def get_credentials_info() -> dict[str, Any]:
    """Return current auth state for display."""
    api_key = _get_api_key()
    key_source: str | None = None
    if api_key:
        # Report where the key came from, mirroring _get_api_key's
        # lookup order (env var wins over the credentials file).
        if os.environ.get("ROWBASE_API_KEY"):
            key_source = "env (ROWBASE_API_KEY)"
        else:
            key_source = str(CREDENTIALS_PATH)
    return {
        "authenticated": api_key is not None,
        "api_url": _get_api_url(),
        # Show only a prefix of the key so full credentials never hit logs.
        "key_prefix": api_key[:20] + "..." if api_key else None,
        "source": key_source,
    }
61
+
62
+
63
class RowbaseClient:
    """Synchronous HTTP client for the Rowbase API.

    The base URL and key are resolved from the constructor arguments, then
    the environment, then the stored credentials file. Every request helper
    raises RowbaseError (code "API_ERROR") for any HTTP status >= 400.
    """

    def __init__(self, api_url: str | None = None, api_key: str | None = None) -> None:
        """Create a client; raises RowbaseError when no API key is available."""
        self.api_url = (api_url or _get_api_url()).rstrip("/")
        self.api_key = api_key or _get_api_key()
        if not self.api_key:
            raise RowbaseError(
                code="AUTH_REQUIRED",
                message="No API key configured",
                hint="Run 'rowbase auth login' or set ROWBASE_API_KEY environment variable.",
            )
        # Generous timeout: submitting a run may block while the server works.
        self._client = httpx.Client(
            base_url=f"{self.api_url}/api/v1",
            headers={"Authorization": f"Bearer {self.api_key}"},
            timeout=300.0,
        )

    def _raise_for_error(self, response: httpx.Response) -> None:
        """Translate any HTTP error response into a RowbaseError."""
        if response.status_code >= 400:
            try:
                body = response.json()
                msg = body.get("message") or body.get("detail") or str(body)
            except Exception:
                # Error body isn't JSON (e.g. proxy/gateway HTML) — use raw text.
                msg = response.text
            raise RowbaseError(
                code="API_ERROR",
                message=f"API error ({response.status_code}): {msg}",
                details={"status_code": response.status_code},
            )

    def list_pipelines(self) -> list[dict[str, Any]]:
        """List all pipelines."""
        resp = self._client.get("/pipelines")
        self._raise_for_error(resp)
        return resp.json()

    def register_pipeline(
        self,
        name: str,
        source_code: str,
        description: str | None = None,
        tags: list[str] | None = None,
        auxiliary_files: dict[str, str] | None = None,
    ) -> dict[str, Any]:
        """Register or update a pipeline.

        Optional fields are omitted from the payload entirely (not sent as
        null/empty) when falsy.
        """
        payload: dict[str, Any] = {"name": name, "source_code": source_code}
        if description:
            payload["description"] = description
        if tags:
            payload["tags"] = tags
        if auxiliary_files:
            payload["auxiliary_files"] = auxiliary_files
        resp = self._client.post("/pipelines", json=payload)
        self._raise_for_error(resp)
        return resp.json()

    def discover_pipeline(self, pipeline_id: str) -> dict[str, Any]:
        """Discover pipeline structure (sources, datasets, DAG)."""
        resp = self._client.post(f"/pipelines/{pipeline_id}/discover")
        self._raise_for_error(resp)
        return resp.json()

    def list_runs(self, pipeline_id: str) -> list[dict[str, Any]]:
        """List all runs for a pipeline."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs")
        self._raise_for_error(resp)
        return resp.json()

    def submit_run(
        self,
        pipeline_id: str,
        files: dict[str, Path],
    ) -> dict[str, Any]:
        """Submit a pipeline run with input files.

        `files` maps source name -> local file path; files are sent as a
        multipart upload alongside the ordered source-name list.
        """
        file_tuples: list[tuple[str, tuple[str, Any]]] = []
        source_names: list[str] = []
        try:
            # Open the files inside the try so that a failed open() on a
            # later file still closes the handles opened before it (the
            # previous version leaked them in that case).
            for source_name, file_path in files.items():
                source_names.append(source_name)
                file_tuples.append(
                    ("files", (file_path.name, file_path.open("rb")))
                )
            resp = self._client.post(
                f"/pipelines/{pipeline_id}/runs",
                files=file_tuples,
                data={"source_names": json.dumps(source_names)},
            )
        finally:
            for _, (_, f) in file_tuples:
                f.close()

        self._raise_for_error(resp)
        return resp.json()

    def get_run(self, pipeline_id: str, run_id: str) -> dict[str, Any]:
        """Fetch a single run's status/summary."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}")
        self._raise_for_error(resp)
        return resp.json()

    def get_dataset_data(
        self, pipeline_id: str, run_id: str, dataset_name: str
    ) -> dict[str, Any]:
        """Fetch the materialized data for one dataset of a run."""
        resp = self._client.get(
            f"/pipelines/{pipeline_id}/runs/{run_id}/datasets/{dataset_name}/data"
        )
        self._raise_for_error(resp)
        return resp.json()

    def get_step_metrics(self, pipeline_id: str, run_id: str) -> list[dict[str, Any]]:
        """Fetch per-step execution metrics for a run."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}/step-metrics")
        self._raise_for_error(resp)
        return resp.json()

    def get_logs(self, pipeline_id: str, run_id: str) -> list[dict[str, Any]]:
        """Fetch captured log records for a run."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}/logs")
        self._raise_for_error(resp)
        return resp.json()

    def get_dataset_detail(
        self, pipeline_id: str, run_id: str, dataset_name: str
    ) -> dict[str, Any]:
        """Fetch metadata/detail for one dataset of a run (not its rows)."""
        resp = self._client.get(
            f"/pipelines/{pipeline_id}/runs/{run_id}/datasets/{dataset_name}"
        )
        self._raise_for_error(resp)
        return resp.json()

    def get_errors(self, pipeline_id: str, run_id: str) -> list[dict[str, Any]]:
        """Fetch structured errors recorded for a run."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}/errors")
        self._raise_for_error(resp)
        return resp.json()
196
+
197
+
198
def collect_auxiliary_files(pipeline_path: Path) -> dict[str, str]:
    """Scan for auxiliary modules (lib/, configs/) relative to the pipeline file."""
    root = pipeline_path.resolve().parent
    wanted_suffixes = {".py", ".yaml", ".yml", ".json"}
    found: dict[str, str] = {}

    for dir_name in ("lib", "configs"):
        base = root / dir_name
        if not base.is_dir():
            continue
        # Recursive scan: keep only regular files with a recognized suffix,
        # keyed by their path relative to the project directory.
        for path in base.rglob("*"):
            if path.is_file() and path.suffix in wanted_suffixes:
                found[str(path.relative_to(root))] = path.read_text()

    return found
File without changes
@@ -0,0 +1,54 @@
1
+ """Authentication commands: login, logout, status."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Annotated
6
+
7
+ import typer
8
+
9
+ from rowbase.cli.formatters import print_error, print_success
10
+
11
# Sub-app mounted as `rowbase auth ...`; running it bare prints the help text.
app = typer.Typer(no_args_is_help=True)
12
+
13
+
14
@app.command()
def login(
    api_key: Annotated[str | None, typer.Option("--api-key", "-k", help="API key")] = None,
) -> None:
    """Store an API key for the Rowbase platform."""
    from rowbase.api.client import save_credentials

    # Fall back to an interactive hidden prompt when --api-key was not given.
    key = api_key if api_key is not None else typer.prompt("API key", hide_input=True)

    # Cheap local sanity check before persisting anything to disk.
    if not key.startswith("rb_key_"):
        print_error("Invalid API key format. Expected prefix: rb_key_")
        raise typer.Exit(code=1)

    save_credentials(key)
    print_success(f"API key saved ({key[:20]}...)")
30
+
31
+
32
@app.command()
def logout() -> None:
    """Remove stored credentials."""
    # Lazy import keeps CLI startup light (same pattern as the sibling commands).
    from rowbase.api.client import clear_credentials

    clear_credentials()
    print_success("Credentials cleared.")
39
+
40
+
41
@app.command()
def status() -> None:
    """Show current authentication status."""
    from rowbase.api.client import get_credentials_info

    info = get_credentials_info()
    if info["authenticated"]:
        # Plain string: the original was an f-string with no placeholders
        # (ruff F541).
        print_success("Authenticated")
        typer.echo(f" API URL: {info['api_url']}")
        typer.echo(f" Key: {info['key_prefix']}")
        typer.echo(f" Source: {info['source']}")
    else:
        print_error("Not authenticated")
        typer.echo(" Run 'rowbase auth login' or set ROWBASE_API_KEY env var.")
@@ -0,0 +1,72 @@
1
+ """Data commands: inspect local files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+
10
+ from rowbase.cli.formatters import print_error, print_json
11
+
12
# Sub-app mounted as `rowbase data ...`; running it bare prints the help text.
app = typer.Typer(no_args_is_help=True)
13
+
14
+
15
@app.command()
def inspect(
    file_path: Annotated[Path, typer.Argument(help="Path to a data file (csv, parquet, xlsx)")],
    fmt: Annotated[str, typer.Option("--format", "-f")] = "text",
) -> None:
    """Inspect a local data file: show columns, types, row count, and sample values."""
    # NOTE(review): the help text above promises sample values, but none are
    # printed below — confirm intent. Also, `inspect` shadows the stdlib
    # `inspect` module name within this module (harmless here).

    from rowbase.io.readers import _read_file

    if not file_path.exists():
        print_error(f"File not found: {file_path}")
        raise typer.Exit(code=1)

    try:
        df = _read_file(file_path)
    except Exception as e:
        print_error(f"Failed to read file: {e}")
        raise typer.Exit(code=1) from e

    # Per-column summary. The .null_count()/.n_unique() calls suggest a
    # polars-style DataFrame — confirm against rowbase.io.readers.
    col_info = []
    for col_name in df.columns:
        series = df[col_name]
        col_info.append(
            {
                "name": col_name,
                "dtype": str(series.dtype),
                "null_count": series.null_count(),
                "unique_count": series.n_unique(),
            }
        )

    if fmt == "json":
        print_json(
            {
                "file": str(file_path),
                "rows": df.shape[0],
                "columns": col_info,
            }
        )
    else:
        # rich is imported lazily so `--format json` output stays import-light.
        from rich.console import Console
        from rich.table import Table

        console = Console()
        console.print(
            f"\n[bold]{file_path.name}[/bold] — {df.shape[0]:,} rows, {df.shape[1]} columns\n"
        )

        table = Table()
        table.add_column("Column", style="cyan")
        table.add_column("Type", style="green")
        table.add_column("Nulls", justify="right")
        table.add_column("Unique", justify="right")

        for c in col_info:
            table.add_row(c["name"], c["dtype"], str(c["null_count"]), str(c["unique_count"]))

        console.print(table)
@@ -0,0 +1,62 @@
1
+ """Dataset commands: test a single dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+
10
+ from rowbase.cli.formatters import print_error, print_json, print_success
11
+ from rowbase.cli.pipeline_cmd import FormatOption, _load_pipeline, _parse_inputs
12
+
13
# Sub-app mounted as `rowbase dataset ...`; running it bare prints the help text.
app = typer.Typer(no_args_is_help=True)
14
+
15
+
16
@app.command()
def test(
    dataset_name: Annotated[str, typer.Option("--dataset", "-d")],
    pipeline: Annotated[Path, typer.Option("--pipeline", "-p")] = Path("pipeline.py"),
    inputs: Annotated[list[str] | None, typer.Option("--input", "-i")] = None,
    sample_rows: Annotated[int, typer.Option("--sample-rows")] = 5,
    fmt: FormatOption = "text",
) -> None:
    """Test a single dataset with sample data."""
    from rowbase.execution import PipelineRunner

    # Calling the decorated pipeline function performs discovery and returns
    # a spec carrying the populated PipelineContext.
    pipeline_fn = _load_pipeline(pipeline)
    spec = pipeline_fn()

    # Unknown dataset: fail fast and list what is available.
    if dataset_name not in spec.context.datasets:
        print_error(f"Dataset {dataset_name!r} not found in pipeline.")
        available = sorted(spec.context.datasets.keys())
        typer.echo(f" Available: {', '.join(available)}")
        raise typer.Exit(code=1)

    # NOTE(review): this runs the whole pipeline (limited by sample_rows),
    # not just the requested dataset's subgraph — confirm PipelineRunner
    # semantics if isolation is expected.
    input_map = _parse_inputs(inputs or [])
    runner = PipelineRunner()
    result = runner.run(spec, input_map, sample_rows=sample_rows)

    # A registered dataset may still produce no output (e.g. an upstream
    # failure); surface the run's errors in that case.
    if dataset_name not in result.datasets:
        print_error(f"Dataset {dataset_name!r} did not produce output.")
        for err in result.errors:
            print_error(f" [{err.code}] {err.message}")
        raise typer.Exit(code=1)

    ds_result = result.datasets[dataset_name]

    if fmt == "json":
        print_json(
            {
                "dataset": dataset_name,
                "row_count": ds_result.row_count,
                "columns": ds_result.columns,
                "status": result.status,
                "errors": [e.to_dict() for e in result.errors],
            }
        )
    else:
        print_success(
            f"Dataset {dataset_name!r}: {ds_result.row_count} rows, {len(ds_result.columns)} columns"
        )
        typer.echo(f" Columns: {', '.join(ds_result.columns)}")
@@ -0,0 +1,106 @@
1
+ """Output formatters for CLI commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
# Shared consoles: regular output on stdout, errors on stderr so error text
# never pollutes machine-readable stdout (e.g. `--format json`).
console = Console()
error_console = Console(stderr=True)
13
+
14
+
15
def print_json(data: dict[str, Any]) -> None:
    """Print data as formatted JSON to stdout."""
    # default=str stringifies non-JSON types (paths, datetimes) rather than
    # raising; plain print() keeps stdout free of rich markup.
    rendered = json.dumps(data, indent=2, default=str)
    print(rendered)
18
+
19
+
20
def print_success(message: str) -> None:
    """Print a green check-marked success line to stdout."""
    console.print(f"[green]✓[/green] {message}")


def print_error(message: str) -> None:
    """Print a red cross-marked error line to stderr."""
    error_console.print(f"[red]✗[/red] {message}")


def print_warning(message: str) -> None:
    """Print a yellow-bang warning line to stdout."""
    console.print(f"[yellow]![/yellow] {message}")
30
+
31
+
32
def print_dag_table(
    sources: list[dict[str, Any]],
    datasets: list[dict[str, Any]],
    published: set[str],
) -> None:
    """Print pipeline DAG as a rich table."""
    table = Table(title="Pipeline Graph")
    table.add_column("Name", style="cyan")
    table.add_column("Type", style="green")
    table.add_column("Depends On")
    table.add_column("Published", justify="center")

    # Sources first: they have no dependencies and are never published.
    for entry in sources:
        table.add_row(entry["name"], "source", "", "")

    for entry in datasets:
        table.add_row(
            entry["name"],
            "dataset",
            ", ".join(entry.get("depends_on", [])),
            "✓" if entry["name"] in published else "",
        )

    console.print(table)
53
+
54
+
55
def _print_captured_output(result: dict[str, Any]) -> None:
    """Print captured stdout, stderr, and log records from dataset functions."""
    # The "Captured output:" banner is printed lazily, exactly once, right
    # before the first piece of output actually found.
    has_output = False

    for name, ds in result.get("datasets", {}).items():
        for stream in ("stdout", "stderr"):
            if ds.get(stream):
                if not has_output:
                    console.print("\n[dim]Captured output:[/dim]")
                    has_output = True
                console.print(f" [cyan]{name}[/cyan] ({stream}):")
                for line in ds[stream].rstrip().splitlines():
                    console.print(f" {line}")

    for log in result.get("logs", []):
        if not has_output:
            console.print("\n[dim]Captured output:[/dim]")
            has_output = True
        # WARNING/ERROR/CRITICAL levels are highlighted; everything else dim.
        level_color = {"WARNING": "yellow", "ERROR": "red", "CRITICAL": "red"}.get(
            log["level"], "dim"
        )
        console.print(
            f" [cyan]{log['dataset']}[/cyan] "
            f"[{level_color}]{log['level']}[/{level_color}]: {log['message']}"
        )
80
+
81
+
82
def print_run_result(result: dict[str, Any]) -> None:
    """Print a run result as a rich table.

    Reads keys: status, run_id, duration_ms, and optionally datasets
    (name -> {row_count, columns, ...}), logs, and errors.
    """
    status = result["status"]
    # success -> green, partial -> yellow, anything else -> red.
    color = "green" if status == "success" else "yellow" if status == "partial" else "red"
    console.print(f"\nRun [bold]{result['run_id']}[/bold] — [{color}]{status}[/{color}]")
    console.print(f"Duration: {result['duration_ms']}ms")

    if result.get("datasets"):
        table = Table(title="Datasets")
        table.add_column("Name", style="cyan")
        table.add_column("Rows", justify="right")
        table.add_column("Columns")

        for name, ds in result["datasets"].items():
            table.add_row(name, str(ds["row_count"]), ", ".join(ds["columns"]))

        console.print(table)

    _print_captured_output(result)

    if result.get("errors"):
        for err in result["errors"]:
            print_error(f"[{err['code']}] {err['message']}")
            if err.get("hint"):
                console.print(f" Hint: {err['hint']}")