rowbase 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. rowbase-0.1.0/PKG-INFO +195 -0
  2. rowbase-0.1.0/README.md +166 -0
  3. rowbase-0.1.0/pyproject.toml +95 -0
  4. rowbase-0.1.0/src/rowbase/__init__.py +25 -0
  5. rowbase-0.1.0/src/rowbase/_internal/__init__.py +0 -0
  6. rowbase-0.1.0/src/rowbase/_internal/registry.py +83 -0
  7. rowbase-0.1.0/src/rowbase/api/__init__.py +0 -0
  8. rowbase-0.1.0/src/rowbase/api/client.py +212 -0
  9. rowbase-0.1.0/src/rowbase/cli/__init__.py +0 -0
  10. rowbase-0.1.0/src/rowbase/cli/auth_cmd.py +54 -0
  11. rowbase-0.1.0/src/rowbase/cli/data_cmd.py +72 -0
  12. rowbase-0.1.0/src/rowbase/cli/dataset_cmd.py +62 -0
  13. rowbase-0.1.0/src/rowbase/cli/formatters.py +106 -0
  14. rowbase-0.1.0/src/rowbase/cli/init_cmd.py +76 -0
  15. rowbase-0.1.0/src/rowbase/cli/main.py +32 -0
  16. rowbase-0.1.0/src/rowbase/cli/pipeline_cmd.py +341 -0
  17. rowbase-0.1.0/src/rowbase/cli/runs_cmd.py +277 -0
  18. rowbase-0.1.0/src/rowbase/config.py +105 -0
  19. rowbase-0.1.0/src/rowbase/dag.py +127 -0
  20. rowbase-0.1.0/src/rowbase/dataset.py +110 -0
  21. rowbase-0.1.0/src/rowbase/errors.py +55 -0
  22. rowbase-0.1.0/src/rowbase/execution.py +249 -0
  23. rowbase-0.1.0/src/rowbase/io/__init__.py +0 -0
  24. rowbase-0.1.0/src/rowbase/io/readers.py +104 -0
  25. rowbase-0.1.0/src/rowbase/io/writers.py +102 -0
  26. rowbase-0.1.0/src/rowbase/pipeline.py +79 -0
  27. rowbase-0.1.0/src/rowbase/py.typed +0 -0
  28. rowbase-0.1.0/src/rowbase/schema.py +68 -0
  29. rowbase-0.1.0/src/rowbase/source.py +88 -0
  30. rowbase-0.1.0/src/rowbase/templates/.gitkeep +0 -0
  31. rowbase-0.1.0/tests/test_config.py +103 -0
  32. rowbase-0.1.0/tests/test_dag.py +172 -0
  33. rowbase-0.1.0/tests/test_dataset.py +175 -0
  34. rowbase-0.1.0/tests/test_errors.py +90 -0
  35. rowbase-0.1.0/tests/test_execution.py +518 -0
  36. rowbase-0.1.0/tests/test_pipeline.py +200 -0
  37. rowbase-0.1.0/tests/test_readers.py +95 -0
  38. rowbase-0.1.0/tests/test_schema.py +96 -0
  39. rowbase-0.1.0/tests/test_source.py +85 -0
  40. rowbase-0.1.0/tests/test_writers.py +103 -0
  41. rowbase-0.1.0/uv.lock +616 -0
rowbase-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: rowbase
3
+ Version: 0.1.0
4
+ Summary: Rowbase SDK — declare data pipelines as Python functions
5
+ Author-email: Rowbase Team <team@rowbase.com>
6
+ License-Expression: LicenseRef-Proprietary
7
+ Keywords: data,etl,pipelines,polars,sdk
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: Other/Proprietary License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Topic :: Database
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Classifier: Typing :: Typed
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: httpx>=0.27.0
18
+ Requires-Dist: polars>=1.0
19
+ Requires-Dist: pyarrow>=15.0
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: pyyaml>=6.0
22
+ Requires-Dist: rich>=13.0
23
+ Requires-Dist: typer>=0.15.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy>=1.13.0; extra == 'dev'
26
+ Requires-Dist: pytest>=8.3.0; extra == 'dev'
27
+ Requires-Dist: ruff>=0.8.0; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # Rowbase
31
+
32
+ Declare data pipelines as Python functions. Rowbase handles DAG construction, dependency resolution, execution, and validation — so you can focus on your transforms.
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install rowbase
38
+ ```
39
+
40
+ ## Quick Start
41
+
42
+ ```python
43
+ import polars as pl
44
+ import rowbase
45
+
46
+ @rowbase.pipeline
47
+ def my_pipeline():
48
+ orders = rowbase.source("orders", columns=["id", "email", "total", "country"])
49
+
50
+ @rowbase.dataset("cleaned", data_from=orders)
51
+ def cleaned(orders: pl.DataFrame) -> pl.DataFrame:
52
+ return orders.filter(pl.col("email").is_not_null())
53
+
54
+ @rowbase.dataset("domestic", data_from=cleaned)
55
+ def domestic(cleaned: pl.DataFrame) -> pl.DataFrame:
56
+ return cleaned.filter(pl.col("country") == "US")
57
+
58
+ yield domestic
59
+ ```
60
+
61
+ Run it:
62
+
63
+ ```bash
64
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/
65
+ ```
66
+
67
+ ## Core Concepts
68
+
69
+ ### Sources
70
+
71
+ Declare data inputs with `source()`. Each source maps to a file provided at runtime.
72
+
73
+ ```python
74
+ orders = rowbase.source("orders", columns=["id", "email", "total"])
75
+ ```
76
+
77
+ Supported formats: CSV, Parquet, Excel. Detected automatically by file extension.
78
+
79
+ ### Datasets
80
+
81
+ Transform functions decorated with `@dataset`. They consume sources or other datasets and return a Polars DataFrame.
82
+
83
+ ```python
84
+ @rowbase.dataset("summary", data_from=[orders, returns])
85
+ def summary(orders: pl.DataFrame, returns: pl.DataFrame) -> pl.DataFrame:
86
+ return orders.join(returns, on="order_id", how="left")
87
+ ```
88
+
89
+ ### Pipelines
90
+
91
+ Generator functions that wire sources and datasets together. `yield` a dataset to mark it as a published output — non-yielded datasets are intermediate.
92
+
93
+ ```python
94
+ @rowbase.pipeline
95
+ def my_pipeline():
96
+ raw = rowbase.source("raw")
97
+
98
+ @rowbase.dataset("cleaned", data_from=raw)
99
+ def cleaned(raw: pl.DataFrame) -> pl.DataFrame:
100
+ return raw.drop_nulls()
101
+
102
+ @rowbase.dataset("aggregated", data_from=cleaned)
103
+ def aggregated(cleaned: pl.DataFrame) -> pl.DataFrame:
104
+ return cleaned.group_by("category").agg(pl.col("amount").sum())
105
+
106
+ yield aggregated # published output
107
+ ```
108
+
109
+ ### Schema Validation
110
+
111
+ Validate dataset outputs with Pydantic models.
112
+
113
+ ```python
114
+ from pydantic import BaseModel
115
+
116
+ class OrderSchema(BaseModel):
117
+ id: int
118
+ email: str
119
+ total: float
120
+
121
+ @rowbase.dataset("validated", data_from=orders, schema=OrderSchema, on_schema_error="skip")
122
+ def validated(orders: pl.DataFrame) -> pl.DataFrame:
123
+ return orders
124
+ ```
125
+
126
+ `on_schema_error` options: `"fail"` (default), `"skip"`, `"collect"`.
127
+
128
+ ### Configuration
129
+
130
+ Define config in `rowbase.yaml`:
131
+
132
+ ```yaml
133
+ config:
134
+ api_key: secret_key
135
+ threshold: 100
136
+ ```
137
+
138
+ Access values in your pipeline:
139
+
140
+ ```python
141
+ rowbase.config.get("api_key")
142
+ ```
143
+
144
+ Environment variable overrides follow the pattern `ROWBASE_CONFIG_<KEY>`.
145
+
146
+ ## CLI
147
+
148
+ ```
149
+ rowbase pipeline validate -p pipeline.py # Check DAG structure
150
+ rowbase pipeline info -p pipeline.py # Show sources, datasets, and graph
151
+ rowbase pipeline dry-run -p pipeline.py -i orders=orders.csv --sample-rows 5
152
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/ --output-format parquet
153
+ rowbase pipeline deploy -p pipeline.py # Deploy to Rowbase platform
154
+
155
+ rowbase dataset test -d cleaned -p pipeline.py -i orders=orders.csv
156
+
157
+ rowbase data inspect data.csv # Inspect file structure
158
+
159
+ rowbase auth login # Authenticate
160
+ rowbase auth status # Check auth state
161
+
162
+ rowbase runs list # View past runs
163
+ rowbase runs show <run_id> # Run details
164
+ rowbase runs download <run_id> -o ./results/ # Download outputs
165
+
166
+ rowbase init # Initialize a new project
167
+ ```
168
+
169
+ ## Programmatic Usage
170
+
171
+ ```python
172
+ from pathlib import Path
173
+ from rowbase.execution import PipelineRunner
174
+
175
+ spec = my_pipeline()
176
+ runner = PipelineRunner()
177
+ result = runner.run(
178
+ spec,
179
+ inputs={"orders": Path("orders.csv")},
180
+ output_dir=Path("output/"),
181
+ output_format="parquet",
182
+ )
183
+
184
+ print(result.status) # "success", "partial", or "failed"
185
+ for name, df in result.dataframes.items():
186
+ print(f"{name}: {df.shape[0]} rows")
187
+ ```
188
+
189
+ ## Output Formats
190
+
191
+ Parquet, CSV, NDJSON, and Excel. Set via `--output-format` in the CLI or `output_format` in `PipelineRunner.run()`.
192
+
193
+ ## License
194
+
195
+ Proprietary. All rights reserved.
@@ -0,0 +1,166 @@
1
+ # Rowbase
2
+
3
+ Declare data pipelines as Python functions. Rowbase handles DAG construction, dependency resolution, execution, and validation — so you can focus on your transforms.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install rowbase
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ import polars as pl
15
+ import rowbase
16
+
17
+ @rowbase.pipeline
18
+ def my_pipeline():
19
+ orders = rowbase.source("orders", columns=["id", "email", "total", "country"])
20
+
21
+ @rowbase.dataset("cleaned", data_from=orders)
22
+ def cleaned(orders: pl.DataFrame) -> pl.DataFrame:
23
+ return orders.filter(pl.col("email").is_not_null())
24
+
25
+ @rowbase.dataset("domestic", data_from=cleaned)
26
+ def domestic(cleaned: pl.DataFrame) -> pl.DataFrame:
27
+ return cleaned.filter(pl.col("country") == "US")
28
+
29
+ yield domestic
30
+ ```
31
+
32
+ Run it:
33
+
34
+ ```bash
35
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/
36
+ ```
37
+
38
+ ## Core Concepts
39
+
40
+ ### Sources
41
+
42
+ Declare data inputs with `source()`. Each source maps to a file provided at runtime.
43
+
44
+ ```python
45
+ orders = rowbase.source("orders", columns=["id", "email", "total"])
46
+ ```
47
+
48
+ Supported formats: CSV, Parquet, Excel. Detected automatically by file extension.
49
+
50
+ ### Datasets
51
+
52
+ Transform functions decorated with `@dataset`. They consume sources or other datasets and return a Polars DataFrame.
53
+
54
+ ```python
55
+ @rowbase.dataset("summary", data_from=[orders, returns])
56
+ def summary(orders: pl.DataFrame, returns: pl.DataFrame) -> pl.DataFrame:
57
+ return orders.join(returns, on="order_id", how="left")
58
+ ```
59
+
60
+ ### Pipelines
61
+
62
+ Generator functions that wire sources and datasets together. `yield` a dataset to mark it as a published output — non-yielded datasets are intermediate.
63
+
64
+ ```python
65
+ @rowbase.pipeline
66
+ def my_pipeline():
67
+ raw = rowbase.source("raw")
68
+
69
+ @rowbase.dataset("cleaned", data_from=raw)
70
+ def cleaned(raw: pl.DataFrame) -> pl.DataFrame:
71
+ return raw.drop_nulls()
72
+
73
+ @rowbase.dataset("aggregated", data_from=cleaned)
74
+ def aggregated(cleaned: pl.DataFrame) -> pl.DataFrame:
75
+ return cleaned.group_by("category").agg(pl.col("amount").sum())
76
+
77
+ yield aggregated # published output
78
+ ```
79
+
80
+ ### Schema Validation
81
+
82
+ Validate dataset outputs with Pydantic models.
83
+
84
+ ```python
85
+ from pydantic import BaseModel
86
+
87
+ class OrderSchema(BaseModel):
88
+ id: int
89
+ email: str
90
+ total: float
91
+
92
+ @rowbase.dataset("validated", data_from=orders, schema=OrderSchema, on_schema_error="skip")
93
+ def validated(orders: pl.DataFrame) -> pl.DataFrame:
94
+ return orders
95
+ ```
96
+
97
+ `on_schema_error` options: `"fail"` (default), `"skip"`, `"collect"`.
98
+
99
+ ### Configuration
100
+
101
+ Define config in `rowbase.yaml`:
102
+
103
+ ```yaml
104
+ config:
105
+ api_key: secret_key
106
+ threshold: 100
107
+ ```
108
+
109
+ Access values in your pipeline:
110
+
111
+ ```python
112
+ rowbase.config.get("api_key")
113
+ ```
114
+
115
+ Environment variable overrides follow the pattern `ROWBASE_CONFIG_<KEY>`.
116
+
117
+ ## CLI
118
+
119
+ ```
120
+ rowbase pipeline validate -p pipeline.py # Check DAG structure
121
+ rowbase pipeline info -p pipeline.py # Show sources, datasets, and graph
122
+ rowbase pipeline dry-run -p pipeline.py -i orders=orders.csv --sample-rows 5
123
+ rowbase pipeline run -p pipeline.py -i orders=orders.csv -o ./output/ --output-format parquet
124
+ rowbase pipeline deploy -p pipeline.py # Deploy to Rowbase platform
125
+
126
+ rowbase dataset test -d cleaned -p pipeline.py -i orders=orders.csv
127
+
128
+ rowbase data inspect data.csv # Inspect file structure
129
+
130
+ rowbase auth login # Authenticate
131
+ rowbase auth status # Check auth state
132
+
133
+ rowbase runs list # View past runs
134
+ rowbase runs show <run_id> # Run details
135
+ rowbase runs download <run_id> -o ./results/ # Download outputs
136
+
137
+ rowbase init # Initialize a new project
138
+ ```
139
+
140
+ ## Programmatic Usage
141
+
142
+ ```python
143
+ from pathlib import Path
144
+ from rowbase.execution import PipelineRunner
145
+
146
+ spec = my_pipeline()
147
+ runner = PipelineRunner()
148
+ result = runner.run(
149
+ spec,
150
+ inputs={"orders": Path("orders.csv")},
151
+ output_dir=Path("output/"),
152
+ output_format="parquet",
153
+ )
154
+
155
+ print(result.status) # "success", "partial", or "failed"
156
+ for name, df in result.dataframes.items():
157
+ print(f"{name}: {df.shape[0]} rows")
158
+ ```
159
+
160
+ ## Output Formats
161
+
162
+ Parquet, CSV, NDJSON, and Excel. Set via `--output-format` in the CLI or `output_format` in `PipelineRunner.run()`.
163
+
164
+ ## License
165
+
166
+ Proprietary. All rights reserved.
@@ -0,0 +1,95 @@
1
+ [project]
2
+ name = "rowbase"
3
+ version = "0.1.0"
4
+ description = "Rowbase SDK — declare data pipelines as Python functions"
5
+ readme = "README.md"
6
+ license = "LicenseRef-Proprietary"
7
+ requires-python = ">=3.12"
8
+ authors = [
9
+ { name = "Rowbase Team", email = "team@rowbase.com" },
10
+ ]
11
+ keywords = ["data", "pipelines", "etl", "polars", "sdk"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Intended Audience :: Developers",
15
+ "License :: Other/Proprietary License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.12",
18
+ "Topic :: Database",
19
+ "Topic :: Software Development :: Libraries :: Python Modules",
20
+ "Typing :: Typed",
21
+ ]
22
+ dependencies = [
23
+ "polars>=1.0",
24
+ "typer>=0.15.0",
25
+ "rich>=13.0",
26
+ "pydantic>=2.0",
27
+ "pyyaml>=6.0",
28
+ "pyarrow>=15.0",
29
+ "httpx>=0.27.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=8.3.0",
35
+ "ruff>=0.8.0",
36
+ "mypy>=1.13.0",
37
+ ]
38
+
39
+ [project.scripts]
40
+ rowbase = "rowbase.cli.main:app"
41
+
42
+ [build-system]
43
+ requires = ["hatchling"]
44
+ build-backend = "hatchling.build"
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["src/rowbase"]
48
+
49
+ [tool.pytest.ini_options]
50
+ testpaths = ["tests"]
51
+
52
+ [tool.ruff]
53
+ target-version = "py312"
54
+ line-length = 100
55
+ src = ["src", "tests"]
56
+
57
+ [tool.ruff.lint]
58
+ select = [
59
+ "E", # pycodestyle errors
60
+ "W", # pycodestyle warnings
61
+ "F", # pyflakes
62
+ "I", # isort
63
+ "N", # pep8-naming
64
+ "UP", # pyupgrade
65
+ "B", # flake8-bugbear
66
+ "SIM", # flake8-simplify
67
+ "PLR", # pylint refactor
68
+ "PLW", # pylint warnings
69
+ "RUF", # ruff-specific
70
+ ]
71
+ ignore = [
72
+ "E501", # line length (handled by formatter)
73
+ "PLR0913", # too many arguments
74
+ ]
75
+
76
+ [tool.ruff.lint.per-file-ignores]
77
+ "__init__.py" = ["F401"]
78
+ "tests/**/*.py" = [
79
+ "ARG", # unused fixture arguments
80
+ "S101", # assert statements
81
+ "F841", # source() calls are side-effectful, handles often unused
82
+ "PLR2004", # magic values in assertions
83
+ ]
84
+
85
+ [tool.ruff.lint.isort]
86
+ known-first-party = ["rowbase"]
87
+
88
+ [tool.ruff.format]
89
+ quote-style = "double"
90
+
91
+ [tool.mypy]
92
+ python_version = "3.12"
93
+ strict = false
94
+ warn_return_any = true
95
+ warn_unused_configs = true
@@ -0,0 +1,25 @@
1
+ """Rowbase SDK — declare data pipelines as Python functions."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from rowbase.config import get_config as _get_config
6
+ from rowbase.dataset import dataset
7
+ from rowbase.errors import RowbaseError
8
+ from rowbase.pipeline import pipeline
9
+ from rowbase.source import source
10
+
11
+
12
class _ConfigProxy:
    """Lazy proxy so `rowbase.config.get(...)` works without explicit loading.

    Defers to :func:`rowbase.config.get_config` on every access, so the real
    config is loaded only when first needed.
    """

    def get(self, key: str, default: object = None) -> object:
        """Look up *key* in the loaded config, returning *default* when absent."""
        cfg = _get_config()
        return cfg.get(key, default)
17
+
18
+
19
# Singleton proxy exposed as `rowbase.config`; keys are resolved lazily on access.
config = _ConfigProxy()

# Explicit public API of the package.
__all__ = ["RowbaseError", "config", "dataset", "pipeline", "source"]
22
+
23
+
24
+ def connect(api_key: str | None = None) -> None:
25
+ """Connect to the Rowbase platform. No-op in Phase 1."""
File without changes
@@ -0,0 +1,83 @@
1
+ """Scoped pipeline context registry.
2
+
3
+ The @pipeline decorator creates a PipelineContext and sets it as the current
4
+ context via a ContextVar. source() and @dataset register into this context
5
+ during pipeline discovery.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import contextvars
11
+ from dataclasses import dataclass, field
12
+ from typing import TYPE_CHECKING, Any
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Callable
16
+
17
+ from pydantic import BaseModel
18
+
19
+
20
@dataclass
class SourceMetadata:
    """Metadata for a registered source."""

    # Unique source name within the pipeline; used as the registry key.
    name: str
    # Expected columns: either a bare list of names or a name -> type mapping.
    columns: list[str] | dict[str, type] | None = None
    description: str = ""
    # Extra keyword options forwarded to the file reader — NOTE(review):
    # presumably consumed by rowbase.io.readers; confirm against that module.
    reader_options: dict[str, Any] | None = None
    # When True the source may be omitted at runtime — NOTE(review): semantics
    # of a missing optional source are not visible here; confirm in execution.
    optional: bool = False
29
+
30
+
31
@dataclass
class DatasetMetadata:
    """Metadata for a registered dataset."""

    # Unique dataset name within the pipeline; used as the registry key.
    name: str
    # The transform function executed to produce this dataset.
    fn: Callable[..., Any]
    # Optional Pydantic model used to validate the dataset's output.
    schema: type[BaseModel] | None = None
    # Validation failure policy — NOTE(review): README documents
    # "fail" / "skip" / "collect"; confirm the executor accepts exactly these.
    on_schema_error: str = "fail"
    description: str = ""
    # Names of upstream sources/datasets this dataset consumes.
    depends_on: list[str] = field(default_factory=list)
    # NOTE(review): purpose unclear from this file — the name suggests
    # "emit/record metadata", but nothing visible reads it; confirm semantics
    # (possibly a misnamed flag).
    metadata: bool = True
42
+
43
+
44
@dataclass
class PipelineContext:
    """Scoped registry for a single pipeline's sources and datasets.

    One instance is created per @pipeline discovery pass; source() and
    @dataset register into it, and yielded datasets are marked published.
    """

    sources: dict[str, SourceMetadata] = field(default_factory=dict)
    datasets: dict[str, DatasetMetadata] = field(default_factory=dict)
    published: set[str] = field(default_factory=set)

    def register_source(self, meta: SourceMetadata) -> None:
        """Record *meta* under its name; a duplicate name silently replaces."""
        self.sources[meta.name] = meta

    def register_dataset(self, meta: DatasetMetadata) -> None:
        """Record *meta* under its name; a duplicate name silently replaces."""
        self.datasets[meta.name] = meta

    def mark_published(self, name: str) -> None:
        """Mark *name* as a published (yielded) pipeline output."""
        self.published.add(name)

    @property
    def all_names(self) -> set[str]:
        """Every registered source and dataset name, as one set."""
        return self.sources.keys() | self.datasets.keys()
64
+
65
+
66
# Task-local holder for the active pipeline context; None means we are
# outside any @pipeline-decorated function.
_current_context: contextvars.ContextVar[PipelineContext | None] = contextvars.ContextVar(
    "_current_context", default=None
)


def get_current_context() -> PipelineContext:
    """Return the active pipeline context.

    Raises:
        RuntimeError: when called outside a @pipeline-decorated function.
    """
    ctx = _current_context.get()
    if ctx is not None:
        return ctx
    raise RuntimeError(
        "source() and @dataset must be called inside a @pipeline-decorated function."
    )


def set_current_context(ctx: PipelineContext | None) -> contextvars.Token[PipelineContext | None]:
    """Install *ctx* as the current pipeline context.

    Returns the ContextVar token so the caller can restore the prior context.
    """
    return _current_context.set(ctx)
File without changes