rowbase 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. rowbase-0.1.0/PKG-INFO +195 -0
  2. rowbase-0.1.0/README.md +166 -0
  3. rowbase-0.1.0/pyproject.toml +95 -0
  4. rowbase-0.1.0/src/rowbase/__init__.py +25 -0
  5. rowbase-0.1.0/src/rowbase/_internal/__init__.py +0 -0
  6. rowbase-0.1.0/src/rowbase/_internal/registry.py +83 -0
  7. rowbase-0.1.0/src/rowbase/api/__init__.py +0 -0
  8. rowbase-0.1.0/src/rowbase/api/client.py +212 -0
  9. rowbase-0.1.0/src/rowbase/cli/__init__.py +0 -0
  10. rowbase-0.1.0/src/rowbase/cli/auth_cmd.py +54 -0
  11. rowbase-0.1.0/src/rowbase/cli/data_cmd.py +72 -0
  12. rowbase-0.1.0/src/rowbase/cli/dataset_cmd.py +62 -0
  13. rowbase-0.1.0/src/rowbase/cli/formatters.py +106 -0
  14. rowbase-0.1.0/src/rowbase/cli/init_cmd.py +76 -0
  15. rowbase-0.1.0/src/rowbase/cli/main.py +32 -0
  16. rowbase-0.1.0/src/rowbase/cli/pipeline_cmd.py +341 -0
  17. rowbase-0.1.0/src/rowbase/cli/runs_cmd.py +277 -0
  18. rowbase-0.1.0/src/rowbase/config.py +105 -0
  19. rowbase-0.1.0/src/rowbase/dag.py +127 -0
  20. rowbase-0.1.0/src/rowbase/dataset.py +110 -0
  21. rowbase-0.1.0/src/rowbase/errors.py +55 -0
  22. rowbase-0.1.0/src/rowbase/execution.py +249 -0
  23. rowbase-0.1.0/src/rowbase/io/__init__.py +0 -0
  24. rowbase-0.1.0/src/rowbase/io/readers.py +104 -0
  25. rowbase-0.1.0/src/rowbase/io/writers.py +102 -0
  26. rowbase-0.1.0/src/rowbase/pipeline.py +79 -0
  27. rowbase-0.1.0/src/rowbase/py.typed +0 -0
  28. rowbase-0.1.0/src/rowbase/schema.py +68 -0
  29. rowbase-0.1.0/src/rowbase/source.py +88 -0
  30. rowbase-0.1.0/src/rowbase/templates/.gitkeep +0 -0
  31. rowbase-0.1.0/tests/test_config.py +103 -0
  32. rowbase-0.1.0/tests/test_dag.py +172 -0
  33. rowbase-0.1.0/tests/test_dataset.py +175 -0
  34. rowbase-0.1.0/tests/test_errors.py +90 -0
  35. rowbase-0.1.0/tests/test_execution.py +518 -0
  36. rowbase-0.1.0/tests/test_pipeline.py +200 -0
  37. rowbase-0.1.0/tests/test_readers.py +95 -0
  38. rowbase-0.1.0/tests/test_schema.py +96 -0
  39. rowbase-0.1.0/tests/test_source.py +85 -0
  40. rowbase-0.1.0/tests/test_writers.py +103 -0
  41. rowbase-0.1.0/uv.lock +616 -0
rowbase-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: rowbase
3
+ Version: 0.1.0
4
+ Summary: Rowbase SDK — declare data pipelines as Python functions
5
+ Author-email: Rowbase Team <team@rowbase.com>
6
+ License-Expression: LicenseRef-Proprietary
7
+ Keywords: data,etl,pipelines,polars,sdk
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: Other/Proprietary License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Topic :: Database
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Classifier: Typing :: Typed
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: httpx>=0.27.0
18
+ Requires-Dist: polars>=1.0
19
+ Requires-Dist: pyarrow>=15.0
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: pyyaml>=6.0
22
+ Requires-Dist: rich>=13.0
23
+ Requires-Dist: typer>=0.15.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy>=1.13.0; extra == 'dev'
26
+ Requires-Dist: pytest>=8.3.0; extra == 'dev'
27
+ Requires-Dist: ruff>=0.8.0; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # Rowbase
31
+
32
+ Declare data pipelines as Python functions. Rowbase handles DAG construction, dependency resolution, execution, and validation — so you can focus on your transforms.
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install rowbase
38
+ ```
39
+
40
+ ## Quick Start
41
+
42
+ ```python
43
+ import polars as pl
44
+ import rowbase
45
+
46
+ @rowbase.pipeline
47
+ def my_pipeline():
48
+ orders = rowbase.source("orders", columns=["id", "email", "total", "country"])
49
+
50
+ @rowbase.dataset("cleaned", data_from=orders)
51
+ def cleaned(orders: pl.DataFrame) -> pl.DataFrame:
52
+ return orders.filter(pl.col("email").is_not_null())
53
+
54
+ @rowbase.dataset("domestic", data_from=cleaned)
55
+ def domestic(cleaned: pl.DataFrame) -> pl.DataFrame:
56
+ return cleaned.filter(pl.col("country") == "US")
57
+
58
+ yield domestic
59
+ ```
60
+
61
+ Run it:
62
+
63
+ ```bash
64
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/
65
+ ```
66
+
67
+ ## Core Concepts
68
+
69
+ ### Sources
70
+
71
+ Declare data inputs with `source()`. Each source maps to a file provided at runtime.
72
+
73
+ ```python
74
+ orders = rowbase.source("orders", columns=["id", "email", "total"])
75
+ ```
76
+
77
+ Supported formats: CSV, Parquet, Excel. Detected automatically by file extension.
78
+
79
+ ### Datasets
80
+
81
+ Transform functions decorated with `@dataset`. They consume sources or other datasets and return a Polars DataFrame.
82
+
83
+ ```python
84
+ @rowbase.dataset("summary", data_from=[orders, returns])
85
+ def summary(orders: pl.DataFrame, returns: pl.DataFrame) -> pl.DataFrame:
86
+ return orders.join(returns, on="order_id", how="left")
87
+ ```
88
+
89
+ ### Pipelines
90
+
91
+ Generator functions that wire sources and datasets together. `yield` a dataset to mark it as a published output — non-yielded datasets are intermediate.
92
+
93
+ ```python
94
+ @rowbase.pipeline
95
+ def my_pipeline():
96
+ raw = rowbase.source("raw")
97
+
98
+ @rowbase.dataset("cleaned", data_from=raw)
99
+ def cleaned(raw: pl.DataFrame) -> pl.DataFrame:
100
+ return raw.drop_nulls()
101
+
102
+ @rowbase.dataset("aggregated", data_from=cleaned)
103
+ def aggregated(cleaned: pl.DataFrame) -> pl.DataFrame:
104
+ return cleaned.group_by("category").agg(pl.col("amount").sum())
105
+
106
+ yield aggregated # published output
107
+ ```
108
+
109
+ ### Schema Validation
110
+
111
+ Validate dataset outputs with Pydantic models.
112
+
113
+ ```python
114
+ from pydantic import BaseModel
115
+
116
+ class OrderSchema(BaseModel):
117
+ id: int
118
+ email: str
119
+ total: float
120
+
121
+ @rowbase.dataset("validated", data_from=orders, schema=OrderSchema, on_schema_error="skip")
122
+ def validated(orders: pl.DataFrame) -> pl.DataFrame:
123
+ return orders
124
+ ```
125
+
126
+ `on_schema_error` options: `"fail"` (default), `"skip"`, `"collect"`.
127
+
128
+ ### Configuration
129
+
130
+ Define config in `rowbase.yaml`:
131
+
132
+ ```yaml
133
+ config:
134
+ api_key: secret_key
135
+ threshold: 100
136
+ ```
137
+
138
+ Access values in your pipeline:
139
+
140
+ ```python
141
+ rowbase.config.get("api_key")
142
+ ```
143
+
144
+ Environment variable overrides follow the pattern `ROWBASE_CONFIG_<KEY>`.
145
+
146
+ ## CLI
147
+
148
+ ```
149
+ rowbase pipeline validate -p pipeline.py # Check DAG structure
150
+ rowbase pipeline info -p pipeline.py # Show sources, datasets, and graph
151
+ rowbase pipeline dry-run -p pipeline.py -i orders=orders.csv --sample-rows 5
152
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/ --output-format parquet
153
+ rowbase pipeline deploy -p pipeline.py # Deploy to Rowbase platform
154
+
155
+ rowbase dataset test -d cleaned -p pipeline.py -i orders=orders.csv
156
+
157
+ rowbase data inspect data.csv # Inspect file structure
158
+
159
+ rowbase auth login # Authenticate
160
+ rowbase auth status # Check auth state
161
+
162
+ rowbase runs list # View past runs
163
+ rowbase runs show <run_id> # Run details
164
+ rowbase runs download <run_id> -o ./results/ # Download outputs
165
+
166
+ rowbase init # Initialize a new project
167
+ ```
168
+
169
+ ## Programmatic Usage
170
+
171
+ ```python
172
+ from pathlib import Path
173
+ from rowbase.execution import PipelineRunner
174
+
175
+ spec = my_pipeline()
176
+ runner = PipelineRunner()
177
+ result = runner.run(
178
+ spec,
179
+ inputs={"orders": Path("orders.csv")},
180
+ output_dir=Path("output/"),
181
+ output_format="parquet",
182
+ )
183
+
184
+ print(result.status) # "success", "partial", or "failed"
185
+ for name, df in result.dataframes.items():
186
+ print(f"{name}: {df.shape[0]} rows")
187
+ ```
188
+
189
+ ## Output Formats
190
+
191
+ Parquet, CSV, NDJSON, and Excel. Set via `--output-format` in the CLI or `output_format` in `PipelineRunner.run()`.
192
+
193
+ ## License
194
+
195
+ Proprietary. All rights reserved.
@@ -0,0 +1,166 @@
1
+ # Rowbase
2
+
3
+ Declare data pipelines as Python functions. Rowbase handles DAG construction, dependency resolution, execution, and validation — so you can focus on your transforms.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install rowbase
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ import polars as pl
15
+ import rowbase
16
+
17
+ @rowbase.pipeline
18
+ def my_pipeline():
19
+ orders = rowbase.source("orders", columns=["id", "email", "total", "country"])
20
+
21
+ @rowbase.dataset("cleaned", data_from=orders)
22
+ def cleaned(orders: pl.DataFrame) -> pl.DataFrame:
23
+ return orders.filter(pl.col("email").is_not_null())
24
+
25
+ @rowbase.dataset("domestic", data_from=cleaned)
26
+ def domestic(cleaned: pl.DataFrame) -> pl.DataFrame:
27
+ return cleaned.filter(pl.col("country") == "US")
28
+
29
+ yield domestic
30
+ ```
31
+
32
+ Run it:
33
+
34
+ ```bash
35
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/
36
+ ```
37
+
38
+ ## Core Concepts
39
+
40
+ ### Sources
41
+
42
+ Declare data inputs with `source()`. Each source maps to a file provided at runtime.
43
+
44
+ ```python
45
+ orders = rowbase.source("orders", columns=["id", "email", "total"])
46
+ ```
47
+
48
+ Supported formats: CSV, Parquet, Excel. Detected automatically by file extension.
49
+
50
+ ### Datasets
51
+
52
+ Transform functions decorated with `@dataset`. They consume sources or other datasets and return a Polars DataFrame.
53
+
54
+ ```python
55
+ @rowbase.dataset("summary", data_from=[orders, returns])
56
+ def summary(orders: pl.DataFrame, returns: pl.DataFrame) -> pl.DataFrame:
57
+ return orders.join(returns, on="order_id", how="left")
58
+ ```
59
+
60
+ ### Pipelines
61
+
62
+ Generator functions that wire sources and datasets together. `yield` a dataset to mark it as a published output — non-yielded datasets are intermediate.
63
+
64
+ ```python
65
+ @rowbase.pipeline
66
+ def my_pipeline():
67
+ raw = rowbase.source("raw")
68
+
69
+ @rowbase.dataset("cleaned", data_from=raw)
70
+ def cleaned(raw: pl.DataFrame) -> pl.DataFrame:
71
+ return raw.drop_nulls()
72
+
73
+ @rowbase.dataset("aggregated", data_from=cleaned)
74
+ def aggregated(cleaned: pl.DataFrame) -> pl.DataFrame:
75
+ return cleaned.group_by("category").agg(pl.col("amount").sum())
76
+
77
+ yield aggregated # published output
78
+ ```
79
+
80
+ ### Schema Validation
81
+
82
+ Validate dataset outputs with Pydantic models.
83
+
84
+ ```python
85
+ from pydantic import BaseModel
86
+
87
+ class OrderSchema(BaseModel):
88
+ id: int
89
+ email: str
90
+ total: float
91
+
92
+ @rowbase.dataset("validated", data_from=orders, schema=OrderSchema, on_schema_error="skip")
93
+ def validated(orders: pl.DataFrame) -> pl.DataFrame:
94
+ return orders
95
+ ```
96
+
97
+ `on_schema_error` options: `"fail"` (default), `"skip"`, `"collect"`.
98
+
99
+ ### Configuration
100
+
101
+ Define config in `rowbase.yaml`:
102
+
103
+ ```yaml
104
+ config:
105
+ api_key: secret_key
106
+ threshold: 100
107
+ ```
108
+
109
+ Access values in your pipeline:
110
+
111
+ ```python
112
+ rowbase.config.get("api_key")
113
+ ```
114
+
115
+ Environment variable overrides follow the pattern `ROWBASE_CONFIG_<KEY>`.
116
+
117
+ ## CLI
118
+
119
+ ```
120
+ rowbase pipeline validate -p pipeline.py # Check DAG structure
121
+ rowbase pipeline info -p pipeline.py # Show sources, datasets, and graph
122
+ rowbase pipeline dry-run -p pipeline.py -i orders=orders.csv --sample-rows 5
123
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/ --output-format parquet
124
+ rowbase pipeline deploy -p pipeline.py # Deploy to Rowbase platform
125
+
126
+ rowbase dataset test -d cleaned -p pipeline.py -i orders=orders.csv
127
+
128
+ rowbase data inspect data.csv # Inspect file structure
129
+
130
+ rowbase auth login # Authenticate
131
+ rowbase auth status # Check auth state
132
+
133
+ rowbase runs list # View past runs
134
+ rowbase runs show <run_id> # Run details
135
+ rowbase runs download <run_id> -o ./results/ # Download outputs
136
+
137
+ rowbase init # Initialize a new project
138
+ ```
139
+
140
+ ## Programmatic Usage
141
+
142
+ ```python
143
+ from pathlib import Path
144
+ from rowbase.execution import PipelineRunner
145
+
146
+ spec = my_pipeline()
147
+ runner = PipelineRunner()
148
+ result = runner.run(
149
+ spec,
150
+ inputs={"orders": Path("orders.csv")},
151
+ output_dir=Path("output/"),
152
+ output_format="parquet",
153
+ )
154
+
155
+ print(result.status) # "success", "partial", or "failed"
156
+ for name, df in result.dataframes.items():
157
+ print(f"{name}: {df.shape[0]} rows")
158
+ ```
159
+
160
+ ## Output Formats
161
+
162
+ Parquet, CSV, NDJSON, and Excel. Set via `--output-format` in the CLI or `output_format` in `PipelineRunner.run()`.
163
+
164
+ ## License
165
+
166
+ Proprietary. All rights reserved.
@@ -0,0 +1,95 @@
1
+ [project]
2
+ name = "rowbase"
3
+ version = "0.1.0"
4
+ description = "Rowbase SDK — declare data pipelines as Python functions"
5
+ readme = "README.md"
6
+ license = "LicenseRef-Proprietary"
7
+ requires-python = ">=3.12"
8
+ authors = [
9
+ { name = "Rowbase Team", email = "team@rowbase.com" },
10
+ ]
11
+ keywords = ["data", "pipelines", "etl", "polars", "sdk"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Developers",
15
+ "License :: Other/Proprietary License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Topic :: Database",
19
+ "Topic :: Software Development :: Libraries :: Python Modules",
20
+ "Typing :: Typed",
21
+ ]
22
+ dependencies = [
23
+ "polars>=1.0",
24
+ "typer>=0.15.0",
25
+ "rich>=13.0",
26
+ "pydantic>=2.0",
27
+ "pyyaml>=6.0",
28
+ "pyarrow>=15.0",
29
+ "httpx>=0.27.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=8.3.0",
35
+ "ruff>=0.8.0",
36
+ "mypy>=1.13.0",
37
+ ]
38
+
39
+ [project.scripts]
40
+ rowbase = "rowbase.cli.main:app"
41
+
42
+ [build-system]
43
+ requires = ["hatchling"]
44
+ build-backend = "hatchling.build"
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["src/rowbase"]
48
+
49
+ [tool.pytest.ini_options]
50
+ testpaths = ["tests"]
51
+
52
+ [tool.ruff]
53
+ target-version = "py312"
54
+ line-length = 100
55
+ src = ["src", "tests"]
56
+
57
+ [tool.ruff.lint]
58
+ select = [
59
+ "E", # pycodestyle errors
60
+ "W", # pycodestyle warnings
61
+ "F", # pyflakes
62
+ "I", # isort
63
+ "N", # pep8-naming
64
+ "UP", # pyupgrade
65
+ "B", # flake8-bugbear
66
+ "SIM", # flake8-simplify
67
+ "PLR", # pylint refactor
68
+ "PLW", # pylint warnings
69
+ "RUF", # ruff-specific
70
+ ]
71
+ ignore = [
72
+ "E501", # line length (handled by formatter)
73
+ "PLR0913", # too many arguments
74
+ ]
75
+
76
+ [tool.ruff.lint.per-file-ignores]
77
+ "__init__.py" = ["F401"]
78
+ "tests/**/*.py" = [
79
+ "ARG", # unused fixture arguments
80
+ "S101", # assert statements
81
+ "F841", # source() calls are side-effectful, handles often unused
82
+ "PLR2004", # magic values in assertions
83
+ ]
84
+
85
+ [tool.ruff.lint.isort]
86
+ known-first-party = ["rowbase"]
87
+
88
+ [tool.ruff.format]
89
+ quote-style = "double"
90
+
91
+ [tool.mypy]
92
+ python_version = "3.12"
93
+ strict = false
94
+ warn_return_any = true
95
+ warn_unused_configs = true
@@ -0,0 +1,25 @@
1
+ """Rowbase SDK — declare data pipelines as Python functions."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from rowbase.config import get_config as _get_config
6
+ from rowbase.dataset import dataset
7
+ from rowbase.errors import RowbaseError
8
+ from rowbase.pipeline import pipeline
9
+ from rowbase.source import source
10
+
11
+
12
class _ConfigProxy:
    """Lazy proxy so `rowbase.config.get(...)` works without explicit loading.

    Defers to :func:`rowbase.config.get_config` on every access, so the real
    config is loaded only when first needed.
    """

    def get(self, key: str, default: object = None) -> object:
        """Look up *key* in the loaded config, returning *default* when absent."""
        cfg = _get_config()
        return cfg.get(key, default)
17
+
18
+
19
# Singleton proxy exposed as `rowbase.config`; keys are resolved lazily on access.
config = _ConfigProxy()

# Explicit public API of the package.
__all__ = ["RowbaseError", "config", "dataset", "pipeline", "source"]
22
+
23
+
24
+ def connect(api_key: str | None = None) -> None:
25
+ """Connect to the Rowbase platform. No-op in Phase 1."""
File without changes
@@ -0,0 +1,83 @@
1
+ """Scoped pipeline context registry.
2
+
3
+ The @pipeline decorator creates a PipelineContext and sets it as the current
4
+ context via a ContextVar. source() and @dataset register into this context
5
+ during pipeline discovery.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import contextvars
11
+ from dataclasses import dataclass, field
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable
16
+
17
+ from pydantic import BaseModel
18
+
19
+
20
@dataclass
class SourceMetadata:
    """Metadata for a registered source."""

    # Unique source name within the pipeline; used as the registry key.
    name: str
    # Expected columns: either a bare list of names or a name -> type mapping.
    columns: list[str] | dict[str, type] | None = None
    description: str = ""
    # Extra keyword options forwarded to the file reader — NOTE(review):
    # presumably consumed by rowbase.io.readers; confirm against that module.
    reader_options: dict[str, Any] | None = None
    # When True the source may be omitted at runtime — NOTE(review): semantics
    # of a missing optional source are not visible here; confirm in execution.
    optional: bool = False
29
+
30
+
31
@dataclass
class DatasetMetadata:
    """Metadata for a registered dataset."""

    # Unique dataset name within the pipeline; used as the registry key.
    name: str
    # The transform function executed to produce this dataset.
    fn: Callable[..., Any]
    # Optional Pydantic model used to validate the dataset's output.
    schema: type[BaseModel] | None = None
    # Validation failure policy — NOTE(review): README documents
    # "fail" / "skip" / "collect"; confirm the executor accepts exactly these.
    on_schema_error: str = "fail"
    description: str = ""
    # Names of upstream sources/datasets this dataset consumes.
    depends_on: list[str] = field(default_factory=list)
    # NOTE(review): purpose unclear from this file — the name suggests
    # "emit/record metadata", but nothing visible reads it; confirm semantics
    # (possibly a misnamed flag).
    metadata: bool = True
42
+
43
+
44
@dataclass
class PipelineContext:
    """Scoped registry for a single pipeline's sources and datasets.

    One instance is created per @pipeline discovery pass; source() and
    @dataset register into it, and yielded datasets are marked published.
    """

    sources: dict[str, SourceMetadata] = field(default_factory=dict)
    datasets: dict[str, DatasetMetadata] = field(default_factory=dict)
    published: set[str] = field(default_factory=set)

    def register_source(self, meta: SourceMetadata) -> None:
        """Record *meta* under its name; a duplicate name silently replaces."""
        self.sources[meta.name] = meta

    def register_dataset(self, meta: DatasetMetadata) -> None:
        """Record *meta* under its name; a duplicate name silently replaces."""
        self.datasets[meta.name] = meta

    def mark_published(self, name: str) -> None:
        """Mark *name* as a published (yielded) pipeline output."""
        self.published.add(name)

    @property
    def all_names(self) -> set[str]:
        """Every registered source and dataset name, as one set."""
        return self.sources.keys() | self.datasets.keys()
64
+
65
+
66
# Task-local holder for the active pipeline context; None means we are
# outside any @pipeline-decorated function.
_current_context: contextvars.ContextVar[PipelineContext | None] = contextvars.ContextVar(
    "_current_context", default=None
)


def get_current_context() -> PipelineContext:
    """Return the active pipeline context.

    Raises:
        RuntimeError: when called outside a @pipeline-decorated function.
    """
    ctx = _current_context.get()
    if ctx is not None:
        return ctx
    raise RuntimeError(
        "source() and @dataset must be called inside a @pipeline-decorated function."
    )


def set_current_context(ctx: PipelineContext | None) -> contextvars.Token[PipelineContext | None]:
    """Install *ctx* as the current pipeline context.

    Returns the ContextVar token so the caller can restore the prior context.
    """
    return _current_context.set(ctx)
File without changes