rowbase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rowbase/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """Rowbase SDK — declare data pipelines as Python functions."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from rowbase.config import get_config as _get_config
6
+ from rowbase.dataset import dataset
7
+ from rowbase.errors import RowbaseError
8
+ from rowbase.pipeline import pipeline
9
+ from rowbase.source import source
10
+
11
+
12
class _ConfigProxy:
    """Lazy proxy so `rowbase.config.get(...)` works without explicit loading."""

    def get(self, key: str, default: object = None) -> object:
        # Resolve the real config on every call so changes made after import
        # time are always visible to callers.
        return _get_config().get(key, default)
17
+
18
+
19
# Module-level singleton so callers can write `rowbase.config.get("key")`.
config = _ConfigProxy()

__all__ = ["RowbaseError", "config", "dataset", "pipeline", "source"]
22
+
23
+
24
def connect(api_key: str | None = None) -> None:
    """Connect to the Rowbase platform. No-op in Phase 1."""
    # NOTE(review): intentionally empty — the body is just this docstring,
    # so calling connect() currently has no effect at all.
File without changes
@@ -0,0 +1,83 @@
1
+ """Scoped pipeline context registry.
2
+
3
+ The @pipeline decorator creates a PipelineContext and sets it as the current
4
+ context via a ContextVar. source() and @dataset register into this context
5
+ during pipeline discovery.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import contextvars
11
+ from dataclasses import dataclass, field
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable
16
+
17
+ from pydantic import BaseModel
18
+
19
+
20
@dataclass
class SourceMetadata:
    """Metadata for a registered source."""

    # Unique source name within the pipeline context.
    name: str
    # Expected columns: a bare list of names, a name -> type mapping, or
    # None when no column contract is declared.
    columns: list[str] | dict[str, type] | None = None
    description: str = ""
    # Options forwarded to the file reader — exact keys depend on the
    # reader implementation; not validated here.
    reader_options: dict[str, Any] | None = None
    # When True, a run may proceed without this source being supplied.
    optional: bool = False
29
+
30
+
31
@dataclass
class DatasetMetadata:
    """Metadata for a registered dataset."""

    # Unique dataset name within the pipeline context.
    name: str
    # The user function that computes this dataset.
    fn: Callable[..., Any]
    # Optional pydantic model used to validate the dataset.
    schema: type[BaseModel] | None = None
    # Policy applied when schema validation fails; "fail" is the default.
    # Accepted values are enforced elsewhere, not in this module.
    on_schema_error: str = "fail"
    description: str = ""
    # Names of upstream sources/datasets this dataset consumes.
    depends_on: list[str] = field(default_factory=list)
    # NOTE(review): despite its name this is a bool flag (default True);
    # its meaning is not visible from this module — confirm with callers.
    metadata: bool = True
42
+
43
+
44
@dataclass
class PipelineContext:
    """Scoped registry for a single pipeline's sources and datasets."""

    sources: dict[str, SourceMetadata] = field(default_factory=dict)
    datasets: dict[str, DatasetMetadata] = field(default_factory=dict)
    # Names explicitly marked as published outputs of the pipeline.
    published: set[str] = field(default_factory=set)

    def register_source(self, meta: SourceMetadata) -> None:
        # Keyed by name; last registration wins when a name is reused.
        self.sources[meta.name] = meta

    def register_dataset(self, meta: DatasetMetadata) -> None:
        # Keyed by name; last registration wins when a name is reused.
        self.datasets[meta.name] = meta

    def mark_published(self, name: str) -> None:
        self.published.add(name)

    @property
    def all_names(self) -> set[str]:
        """All registered names — sources and datasets combined."""
        return set(self.sources) | set(self.datasets)
64
+
65
+
66
# A ContextVar (not a plain module global) so concurrent or nested pipeline
# discovery each observe their own "current" context.
_current_context: contextvars.ContextVar[PipelineContext | None] = contextvars.ContextVar(
    "_current_context", default=None
)
69
+
70
+
71
def get_current_context() -> PipelineContext:
    """Get the current pipeline context. Raises if called outside a pipeline function."""
    current = _current_context.get()
    # Guard clause: a missing context means registration was attempted
    # outside pipeline discovery.
    if current is not None:
        return current
    raise RuntimeError(
        "source() and @dataset must be called inside a @pipeline-decorated function."
    )
79
+
80
+
81
def set_current_context(ctx: PipelineContext | None) -> contextvars.Token[PipelineContext | None]:
    """Set the current pipeline context. Returns a token for resetting.

    Pass the returned token to `_current_context.reset()` to restore the
    previous context once pipeline discovery finishes.
    """
    return _current_context.set(ctx)
File without changes
rowbase/api/client.py ADDED
@@ -0,0 +1,212 @@
1
+ """Rowbase API client — deploy pipelines and submit runs to the platform."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from rowbase.errors import RowbaseError
13
+
14
# Default points at a local dev server; override via ROWBASE_API_URL.
DEFAULT_API_URL = "http://localhost:8000"
# Where `rowbase auth login` persists the API key.
CREDENTIALS_PATH = Path.home() / ".rowbase" / "credentials.json"
16
+
17
+
18
def _get_api_url() -> str:
    """Resolve the API base URL: env override first, else the built-in default."""
    configured = os.environ.get("ROWBASE_API_URL")
    return configured if configured is not None else DEFAULT_API_URL
20
+
21
+
22
def _get_api_key() -> str | None:
    """Resolve the API key: ROWBASE_API_KEY env var first, then the credentials file.

    Returns None when no key is configured anywhere.
    """
    key = os.environ.get("ROWBASE_API_KEY")
    if key:
        return key
    if CREDENTIALS_PATH.exists():
        # A corrupt or hand-edited credentials file should read as
        # "not authenticated" rather than crash every CLI command
        # (previously json.JSONDecodeError propagated to the user).
        try:
            data = json.loads(CREDENTIALS_PATH.read_text())
        except (OSError, json.JSONDecodeError):
            return None
        if isinstance(data, dict):
            return data.get("api_key")
        return None
    return None
30
+
31
+
32
def save_credentials(api_key: str) -> None:
    """Save API key to ~/.rowbase/credentials.json with owner-only permissions."""
    CREDENTIALS_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Create the file 0o600 from the start: the previous write_text()
    # followed by chmod() left a window where the key was world-readable
    # on a shared machine.
    fd = os.open(CREDENTIALS_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as fh:
        json.dump({"api_key": api_key}, fh)
    # If the file pre-existed with looser permissions, tighten them too
    # (os.open's mode only applies on creation).
    CREDENTIALS_PATH.chmod(0o600)
37
+
38
+
39
def clear_credentials() -> None:
    """Remove stored credentials."""
    # unlink(missing_ok=True) is a no-op when no credentials file exists,
    # so no exists() pre-check is needed.
    CREDENTIALS_PATH.unlink(missing_ok=True)
43
+
44
+
45
def get_credentials_info() -> dict[str, Any]:
    """Return current auth state for display."""
    api_key = _get_api_key()
    key_source: str | None = None
    if api_key:
        # Report where the key came from, mirroring _get_api_key's
        # lookup order (env var wins over the credentials file).
        if os.environ.get("ROWBASE_API_KEY"):
            key_source = "env (ROWBASE_API_KEY)"
        else:
            key_source = str(CREDENTIALS_PATH)
    return {
        "authenticated": api_key is not None,
        "api_url": _get_api_url(),
        # Show only a prefix of the key so full credentials never hit logs.
        "key_prefix": api_key[:20] + "..." if api_key else None,
        "source": key_source,
    }
61
+
62
+
63
class RowbaseClient:
    """Synchronous HTTP client for the Rowbase API.

    The base URL and key are resolved from the constructor arguments, then
    the environment, then the stored credentials file. Every request helper
    raises RowbaseError (code "API_ERROR") for any HTTP status >= 400.
    """

    def __init__(self, api_url: str | None = None, api_key: str | None = None) -> None:
        """Create a client; raises RowbaseError when no API key is available."""
        self.api_url = (api_url or _get_api_url()).rstrip("/")
        self.api_key = api_key or _get_api_key()
        if not self.api_key:
            raise RowbaseError(
                code="AUTH_REQUIRED",
                message="No API key configured",
                hint="Run 'rowbase auth login' or set ROWBASE_API_KEY environment variable.",
            )
        # Generous timeout: submitting a run may block while the server works.
        self._client = httpx.Client(
            base_url=f"{self.api_url}/api/v1",
            headers={"Authorization": f"Bearer {self.api_key}"},
            timeout=300.0,
        )

    def _raise_for_error(self, response: httpx.Response) -> None:
        """Translate any HTTP error response into a RowbaseError."""
        if response.status_code >= 400:
            try:
                body = response.json()
                msg = body.get("message") or body.get("detail") or str(body)
            except Exception:
                # Error body isn't JSON (e.g. proxy/gateway HTML) — use raw text.
                msg = response.text
            raise RowbaseError(
                code="API_ERROR",
                message=f"API error ({response.status_code}): {msg}",
                details={"status_code": response.status_code},
            )

    def list_pipelines(self) -> list[dict[str, Any]]:
        """List all pipelines."""
        resp = self._client.get("/pipelines")
        self._raise_for_error(resp)
        return resp.json()

    def register_pipeline(
        self,
        name: str,
        source_code: str,
        description: str | None = None,
        tags: list[str] | None = None,
        auxiliary_files: dict[str, str] | None = None,
    ) -> dict[str, Any]:
        """Register or update a pipeline.

        Optional fields are omitted from the payload entirely (not sent as
        null/empty) when falsy.
        """
        payload: dict[str, Any] = {"name": name, "source_code": source_code}
        if description:
            payload["description"] = description
        if tags:
            payload["tags"] = tags
        if auxiliary_files:
            payload["auxiliary_files"] = auxiliary_files
        resp = self._client.post("/pipelines", json=payload)
        self._raise_for_error(resp)
        return resp.json()

    def discover_pipeline(self, pipeline_id: str) -> dict[str, Any]:
        """Discover pipeline structure (sources, datasets, DAG)."""
        resp = self._client.post(f"/pipelines/{pipeline_id}/discover")
        self._raise_for_error(resp)
        return resp.json()

    def list_runs(self, pipeline_id: str) -> list[dict[str, Any]]:
        """List all runs for a pipeline."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs")
        self._raise_for_error(resp)
        return resp.json()

    def submit_run(
        self,
        pipeline_id: str,
        files: dict[str, Path],
    ) -> dict[str, Any]:
        """Submit a pipeline run with input files.

        `files` maps source name -> local file path; files are sent as a
        multipart upload alongside the ordered source-name list.
        """
        file_tuples: list[tuple[str, tuple[str, Any]]] = []
        source_names: list[str] = []
        try:
            # Open the files inside the try so that a failed open() on a
            # later file still closes the handles opened before it (the
            # previous version leaked them in that case).
            for source_name, file_path in files.items():
                source_names.append(source_name)
                file_tuples.append(
                    ("files", (file_path.name, file_path.open("rb")))
                )
            resp = self._client.post(
                f"/pipelines/{pipeline_id}/runs",
                files=file_tuples,
                data={"source_names": json.dumps(source_names)},
            )
        finally:
            for _, (_, f) in file_tuples:
                f.close()

        self._raise_for_error(resp)
        return resp.json()

    def get_run(self, pipeline_id: str, run_id: str) -> dict[str, Any]:
        """Fetch a single run's status/summary."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}")
        self._raise_for_error(resp)
        return resp.json()

    def get_dataset_data(
        self, pipeline_id: str, run_id: str, dataset_name: str
    ) -> dict[str, Any]:
        """Fetch the materialized data for one dataset of a run."""
        resp = self._client.get(
            f"/pipelines/{pipeline_id}/runs/{run_id}/datasets/{dataset_name}/data"
        )
        self._raise_for_error(resp)
        return resp.json()

    def get_step_metrics(self, pipeline_id: str, run_id: str) -> list[dict[str, Any]]:
        """Fetch per-step execution metrics for a run."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}/step-metrics")
        self._raise_for_error(resp)
        return resp.json()

    def get_logs(self, pipeline_id: str, run_id: str) -> list[dict[str, Any]]:
        """Fetch captured log records for a run."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}/logs")
        self._raise_for_error(resp)
        return resp.json()

    def get_dataset_detail(
        self, pipeline_id: str, run_id: str, dataset_name: str
    ) -> dict[str, Any]:
        """Fetch metadata/detail for one dataset of a run (not its rows)."""
        resp = self._client.get(
            f"/pipelines/{pipeline_id}/runs/{run_id}/datasets/{dataset_name}"
        )
        self._raise_for_error(resp)
        return resp.json()

    def get_errors(self, pipeline_id: str, run_id: str) -> list[dict[str, Any]]:
        """Fetch structured errors recorded for a run."""
        resp = self._client.get(f"/pipelines/{pipeline_id}/runs/{run_id}/errors")
        self._raise_for_error(resp)
        return resp.json()
196
+
197
+
198
def collect_auxiliary_files(pipeline_path: Path) -> dict[str, str]:
    """Scan for auxiliary modules (lib/, configs/) relative to the pipeline file."""
    root = pipeline_path.resolve().parent
    wanted_suffixes = {".py", ".yaml", ".yml", ".json"}
    found: dict[str, str] = {}

    for dir_name in ("lib", "configs"):
        base = root / dir_name
        if not base.is_dir():
            continue
        # Recursive scan: keep only regular files with a recognized suffix,
        # keyed by their path relative to the project directory.
        for path in base.rglob("*"):
            if path.is_file() and path.suffix in wanted_suffixes:
                found[str(path.relative_to(root))] = path.read_text()

    return found
File without changes
@@ -0,0 +1,54 @@
1
+ """Authentication commands: login, logout, status."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Annotated
6
+
7
+ import typer
8
+
9
+ from rowbase.cli.formatters import print_error, print_success
10
+
11
# Sub-app mounted as `rowbase auth ...`; running it bare prints the help text.
app = typer.Typer(no_args_is_help=True)
12
+
13
+
14
@app.command()
def login(
    api_key: Annotated[str | None, typer.Option("--api-key", "-k", help="API key")] = None,
) -> None:
    """Store an API key for the Rowbase platform."""
    from rowbase.api.client import save_credentials

    # Fall back to an interactive hidden prompt when --api-key was not given.
    key = api_key if api_key is not None else typer.prompt("API key", hide_input=True)

    # Cheap local sanity check before persisting anything to disk.
    if not key.startswith("rb_key_"):
        print_error("Invalid API key format. Expected prefix: rb_key_")
        raise typer.Exit(code=1)

    save_credentials(key)
    print_success(f"API key saved ({key[:20]}...)")
30
+
31
+
32
@app.command()
def logout() -> None:
    """Remove stored credentials."""
    # Lazy import keeps CLI startup light (same pattern as the sibling commands).
    from rowbase.api.client import clear_credentials

    clear_credentials()
    print_success("Credentials cleared.")
39
+
40
+
41
@app.command()
def status() -> None:
    """Show current authentication status."""
    from rowbase.api.client import get_credentials_info

    info = get_credentials_info()
    if info["authenticated"]:
        # Plain string: the original was an f-string with no placeholders
        # (ruff F541).
        print_success("Authenticated")
        typer.echo(f" API URL: {info['api_url']}")
        typer.echo(f" Key: {info['key_prefix']}")
        typer.echo(f" Source: {info['source']}")
    else:
        print_error("Not authenticated")
        typer.echo(" Run 'rowbase auth login' or set ROWBASE_API_KEY env var.")
@@ -0,0 +1,72 @@
1
+ """Data commands: inspect local files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+
10
+ from rowbase.cli.formatters import print_error, print_json
11
+
12
# Sub-app mounted as `rowbase data ...`; running it bare prints the help text.
app = typer.Typer(no_args_is_help=True)
13
+
14
+
15
@app.command()
def inspect(
    file_path: Annotated[Path, typer.Argument(help="Path to a data file (csv, parquet, xlsx)")],
    fmt: Annotated[str, typer.Option("--format", "-f")] = "text",
) -> None:
    """Inspect a local data file: show columns, types, row count, and sample values."""
    # NOTE(review): the help text above promises sample values, but none are
    # printed below — confirm intent. Also, `inspect` shadows the stdlib
    # `inspect` module name within this module (harmless here).

    from rowbase.io.readers import _read_file

    if not file_path.exists():
        print_error(f"File not found: {file_path}")
        raise typer.Exit(code=1)

    try:
        df = _read_file(file_path)
    except Exception as e:
        print_error(f"Failed to read file: {e}")
        raise typer.Exit(code=1) from e

    # Per-column summary. The .null_count()/.n_unique() calls suggest a
    # polars-style DataFrame — confirm against rowbase.io.readers.
    col_info = []
    for col_name in df.columns:
        series = df[col_name]
        col_info.append(
            {
                "name": col_name,
                "dtype": str(series.dtype),
                "null_count": series.null_count(),
                "unique_count": series.n_unique(),
            }
        )

    if fmt == "json":
        print_json(
            {
                "file": str(file_path),
                "rows": df.shape[0],
                "columns": col_info,
            }
        )
    else:
        # rich is imported lazily so `--format json` output stays import-light.
        from rich.console import Console
        from rich.table import Table

        console = Console()
        console.print(
            f"\n[bold]{file_path.name}[/bold] — {df.shape[0]:,} rows, {df.shape[1]} columns\n"
        )

        table = Table()
        table.add_column("Column", style="cyan")
        table.add_column("Type", style="green")
        table.add_column("Nulls", justify="right")
        table.add_column("Unique", justify="right")

        for c in col_info:
            table.add_row(c["name"], c["dtype"], str(c["null_count"]), str(c["unique_count"]))

        console.print(table)
@@ -0,0 +1,62 @@
1
+ """Dataset commands: test a single dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+
10
+ from rowbase.cli.formatters import print_error, print_json, print_success
11
+ from rowbase.cli.pipeline_cmd import FormatOption, _load_pipeline, _parse_inputs
12
+
13
# Sub-app mounted as `rowbase dataset ...`; running it bare prints the help text.
app = typer.Typer(no_args_is_help=True)
14
+
15
+
16
@app.command()
def test(
    dataset_name: Annotated[str, typer.Option("--dataset", "-d")],
    pipeline: Annotated[Path, typer.Option("--pipeline", "-p")] = Path("pipeline.py"),
    inputs: Annotated[list[str] | None, typer.Option("--input", "-i")] = None,
    sample_rows: Annotated[int, typer.Option("--sample-rows")] = 5,
    fmt: FormatOption = "text",
) -> None:
    """Test a single dataset with sample data."""
    from rowbase.execution import PipelineRunner

    # Calling the decorated pipeline function performs discovery and returns
    # a spec carrying the populated PipelineContext.
    pipeline_fn = _load_pipeline(pipeline)
    spec = pipeline_fn()

    # Unknown dataset: fail fast and list what is available.
    if dataset_name not in spec.context.datasets:
        print_error(f"Dataset {dataset_name!r} not found in pipeline.")
        available = sorted(spec.context.datasets.keys())
        typer.echo(f" Available: {', '.join(available)}")
        raise typer.Exit(code=1)

    # NOTE(review): this runs the whole pipeline (limited by sample_rows),
    # not just the requested dataset's subgraph — confirm PipelineRunner
    # semantics if isolation is expected.
    input_map = _parse_inputs(inputs or [])
    runner = PipelineRunner()
    result = runner.run(spec, input_map, sample_rows=sample_rows)

    # A registered dataset may still produce no output (e.g. an upstream
    # failure); surface the run's errors in that case.
    if dataset_name not in result.datasets:
        print_error(f"Dataset {dataset_name!r} did not produce output.")
        for err in result.errors:
            print_error(f" [{err.code}] {err.message}")
        raise typer.Exit(code=1)

    ds_result = result.datasets[dataset_name]

    if fmt == "json":
        print_json(
            {
                "dataset": dataset_name,
                "row_count": ds_result.row_count,
                "columns": ds_result.columns,
                "status": result.status,
                "errors": [e.to_dict() for e in result.errors],
            }
        )
    else:
        print_success(
            f"Dataset {dataset_name!r}: {ds_result.row_count} rows, {len(ds_result.columns)} columns"
        )
        typer.echo(f" Columns: {', '.join(ds_result.columns)}")
@@ -0,0 +1,106 @@
1
+ """Output formatters for CLI commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
# Shared consoles: regular output on stdout, errors on stderr so error text
# never pollutes machine-readable stdout (e.g. `--format json`).
console = Console()
error_console = Console(stderr=True)
13
+
14
+
15
def print_json(data: dict[str, Any]) -> None:
    """Print data as formatted JSON to stdout."""
    # default=str stringifies non-JSON types (paths, datetimes) rather than
    # raising; plain print() keeps stdout free of rich markup.
    rendered = json.dumps(data, indent=2, default=str)
    print(rendered)
18
+
19
+
20
def print_success(message: str) -> None:
    """Print a green check-marked success line to stdout."""
    console.print(f"[green]✓[/green] {message}")


def print_error(message: str) -> None:
    """Print a red cross-marked error line to stderr."""
    error_console.print(f"[red]✗[/red] {message}")


def print_warning(message: str) -> None:
    """Print a yellow-bang warning line to stdout."""
    console.print(f"[yellow]![/yellow] {message}")
30
+
31
+
32
def print_dag_table(
    sources: list[dict[str, Any]],
    datasets: list[dict[str, Any]],
    published: set[str],
) -> None:
    """Print pipeline DAG as a rich table."""
    table = Table(title="Pipeline Graph")
    table.add_column("Name", style="cyan")
    table.add_column("Type", style="green")
    table.add_column("Depends On")
    table.add_column("Published", justify="center")

    # Sources first: they have no dependencies and are never published.
    for entry in sources:
        table.add_row(entry["name"], "source", "", "")

    for entry in datasets:
        table.add_row(
            entry["name"],
            "dataset",
            ", ".join(entry.get("depends_on", [])),
            "✓" if entry["name"] in published else "",
        )

    console.print(table)
53
+
54
+
55
def _print_captured_output(result: dict[str, Any]) -> None:
    """Print captured stdout, stderr, and log records from dataset functions."""
    # The "Captured output:" banner is printed lazily, exactly once, right
    # before the first piece of output actually found.
    has_output = False

    for name, ds in result.get("datasets", {}).items():
        for stream in ("stdout", "stderr"):
            if ds.get(stream):
                if not has_output:
                    console.print("\n[dim]Captured output:[/dim]")
                    has_output = True
                console.print(f" [cyan]{name}[/cyan] ({stream}):")
                for line in ds[stream].rstrip().splitlines():
                    console.print(f" {line}")

    for log in result.get("logs", []):
        if not has_output:
            console.print("\n[dim]Captured output:[/dim]")
            has_output = True
        # WARNING/ERROR/CRITICAL levels are highlighted; everything else dim.
        level_color = {"WARNING": "yellow", "ERROR": "red", "CRITICAL": "red"}.get(
            log["level"], "dim"
        )
        console.print(
            f" [cyan]{log['dataset']}[/cyan] "
            f"[{level_color}]{log['level']}[/{level_color}]: {log['message']}"
        )
80
+
81
+
82
def print_run_result(result: dict[str, Any]) -> None:
    """Print a run result as a rich table.

    Reads keys: status, run_id, duration_ms, and optionally datasets
    (name -> {row_count, columns, ...}), logs, and errors.
    """
    status = result["status"]
    # success -> green, partial -> yellow, anything else -> red.
    color = "green" if status == "success" else "yellow" if status == "partial" else "red"
    console.print(f"\nRun [bold]{result['run_id']}[/bold] — [{color}]{status}[/{color}]")
    console.print(f"Duration: {result['duration_ms']}ms")

    if result.get("datasets"):
        table = Table(title="Datasets")
        table.add_column("Name", style="cyan")
        table.add_column("Rows", justify="right")
        table.add_column("Columns")

        for name, ds in result["datasets"].items():
            table.add_row(name, str(ds["row_count"]), ", ".join(ds["columns"]))

        console.print(table)

    _print_captured_output(result)

    if result.get("errors"):
        for err in result["errors"]:
            print_error(f"[{err['code']}] {err['message']}")
            if err.get("hint"):
                console.print(f" Hint: {err['hint']}")