dpyr 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dpyr-0.0.1/PKG-INFO +69 -0
- dpyr-0.0.1/README.md +53 -0
- dpyr-0.0.1/docs/DESIGN.md +132 -0
- dpyr-0.0.1/docs/ROADMAP.md +100 -0
- dpyr-0.0.1/docs/SEMANTICS.md +35 -0
- dpyr-0.0.1/docs/TESTING.md +81 -0
- dpyr-0.0.1/pyproject.toml +35 -0
- dpyr-0.0.1/src/dpyr/__init__.py +17 -0
dpyr-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dpyr
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: dplyr for Python: tidy piped verbs over polars and duckdb, with real autocompletion. Name reservation — API under active development.
|
|
5
|
+
Project-URL: Repository, https://github.com/maximerivest/dataframe
|
|
6
|
+
Author-email: Maxime Rivest <mrive052@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: data-analysis,dataframe,dplyr,duckdb,polars,tidyverse
|
|
9
|
+
Classifier: Development Status :: 1 - Planning
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# dpyr
|
|
18
|
+
|
|
19
|
+
**dplyr for Python.** A tidy, pipe-style data manipulation API — fronting
|
|
20
|
+
[polars](https://pola.rs) and [duckdb](https://duckdb.org) — with real IDE
|
|
21
|
+
autocompletion and dplyr-faithful semantics, verified by differential testing
|
|
22
|
+
against dplyr itself.
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from dpyr import read_parquet, col, n, desc, starts_with
|
|
26
|
+
|
|
27
|
+
(
|
|
28
|
+
read_parquet("starwars.parquet")
|
|
29
|
+
.filter(col.height > 180, col.mass < 100)
|
|
30
|
+
.mutate(bmi = col.mass / (col.height / 100) ** 2)
|
|
31
|
+
.group_by(col.species)
|
|
32
|
+
.summarize(
|
|
33
|
+
n = n(),
|
|
34
|
+
mean_bmi = col.bmi.mean(),
|
|
35
|
+
)
|
|
36
|
+
.arrange(desc(col.mean_bmi))
|
|
37
|
+
)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Principles (the elevator pitch)
|
|
41
|
+
|
|
42
|
+
1. **dplyr's vocabulary, Python's idiom.** The verbs are dplyr's, verbatim
|
|
43
|
+
(`filter`, `mutate`, `select`, `arrange`, `group_by`, `summarize`,
|
|
44
|
+
joins, tidyselect). The pipe is Python's: method chaining.
|
|
45
|
+
2. **As lazy as possible internally, as eager as possible observably.**
|
|
46
|
+
Verbs build a plan; schema errors raise immediately on the offending
|
|
47
|
+
line; displaying/exporting auto-collects. Interactive feel, query-engine
|
|
48
|
+
performance.
|
|
49
|
+
3. **Autocompletion is a feature, not an accident.** The `col` proxy and
|
|
50
|
+
per-schema stub generation make column names and column-typed methods
|
|
51
|
+
complete in any IDE.
|
|
52
|
+
4. **Two backends, one semantics.** polars (in-memory/files) and duckdb
|
|
53
|
+
(SQL pushdown) must agree, bit-for-bit modulo the documented semantics
|
|
54
|
+
spec. Verified continuously.
|
|
55
|
+
5. **dplyr is the oracle.** Compatibility is demonstrated, not claimed:
|
|
56
|
+
golden outputs are generated by actual dplyr in CI.
|
|
57
|
+
|
|
58
|
+
## Project documents
|
|
59
|
+
|
|
60
|
+
| Doc | What it pins down |
|
|
61
|
+
|---|---|
|
|
62
|
+
| [docs/DESIGN.md](docs/DESIGN.md) | API design, laziness/materialization model, autocompletion strategy, architecture |
|
|
63
|
+
| [docs/SEMANTICS.md](docs/SEMANTICS.md) | The conformance spec: every deliberate decision where R, polars and duckdb disagree |
|
|
64
|
+
| [docs/TESTING.md](docs/TESTING.md) | Test strategy: dplyr-as-oracle, backend differential tests, Hypothesis properties |
|
|
65
|
+
| [docs/ROADMAP.md](docs/ROADMAP.md) | Epics and stories to MVP, in dependency order |
|
|
66
|
+
|
|
67
|
+
## Status
|
|
68
|
+
|
|
69
|
+
Pre-MVP. The plan is in [docs/ROADMAP.md](docs/ROADMAP.md).
|
dpyr-0.0.1/README.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# dpyr
|
|
2
|
+
|
|
3
|
+
**dplyr for Python.** A tidy, pipe-style data manipulation API — fronting
|
|
4
|
+
[polars](https://pola.rs) and [duckdb](https://duckdb.org) — with real IDE
|
|
5
|
+
autocompletion and dplyr-faithful semantics, verified by differential testing
|
|
6
|
+
against dplyr itself.
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
from dpyr import read_parquet, col, n, desc, starts_with
|
|
10
|
+
|
|
11
|
+
(
|
|
12
|
+
read_parquet("starwars.parquet")
|
|
13
|
+
.filter(col.height > 180, col.mass < 100)
|
|
14
|
+
.mutate(bmi = col.mass / (col.height / 100) ** 2)
|
|
15
|
+
.group_by(col.species)
|
|
16
|
+
.summarize(
|
|
17
|
+
n = n(),
|
|
18
|
+
mean_bmi = col.bmi.mean(),
|
|
19
|
+
)
|
|
20
|
+
.arrange(desc(col.mean_bmi))
|
|
21
|
+
)
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Principles (the elevator pitch)
|
|
25
|
+
|
|
26
|
+
1. **dplyr's vocabulary, Python's idiom.** The verbs are dplyr's, verbatim
|
|
27
|
+
(`filter`, `mutate`, `select`, `arrange`, `group_by`, `summarize`,
|
|
28
|
+
joins, tidyselect). The pipe is Python's: method chaining.
|
|
29
|
+
2. **As lazy as possible internally, as eager as possible observably.**
|
|
30
|
+
Verbs build a plan; schema errors raise immediately on the offending
|
|
31
|
+
line; displaying/exporting auto-collects. Interactive feel, query-engine
|
|
32
|
+
performance.
|
|
33
|
+
3. **Autocompletion is a feature, not an accident.** The `col` proxy and
|
|
34
|
+
per-schema stub generation make column names and column-typed methods
|
|
35
|
+
complete in any IDE.
|
|
36
|
+
4. **Two backends, one semantics.** polars (in-memory/files) and duckdb
|
|
37
|
+
(SQL pushdown) must agree, bit-for-bit modulo the documented semantics
|
|
38
|
+
spec. Verified continuously.
|
|
39
|
+
5. **dplyr is the oracle.** Compatibility is demonstrated, not claimed:
|
|
40
|
+
golden outputs are generated by actual dplyr in CI.
|
|
41
|
+
|
|
42
|
+
## Project documents
|
|
43
|
+
|
|
44
|
+
| Doc | What it pins down |
|
|
45
|
+
|---|---|
|
|
46
|
+
| [docs/DESIGN.md](docs/DESIGN.md) | API design, laziness/materialization model, autocompletion strategy, architecture |
|
|
47
|
+
| [docs/SEMANTICS.md](docs/SEMANTICS.md) | The conformance spec: every deliberate decision where R, polars and duckdb disagree |
|
|
48
|
+
| [docs/TESTING.md](docs/TESTING.md) | Test strategy: dplyr-as-oracle, backend differential tests, Hypothesis properties |
|
|
49
|
+
| [docs/ROADMAP.md](docs/ROADMAP.md) | Epics and stories to MVP, in dependency order |
|
|
50
|
+
|
|
51
|
+
## Status
|
|
52
|
+
|
|
53
|
+
Pre-MVP. The plan is in [docs/ROADMAP.md](docs/ROADMAP.md).
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# DESIGN — intent and architecture
|
|
2
|
+
|
|
3
|
+
This document records the design decisions and *why* they were made, so the
|
|
4
|
+
project stays coherent as it grows. Changes to anything in here deserve a
|
|
5
|
+
discussion, not a drive-by PR.
|
|
6
|
+
|
|
7
|
+
## 1. The problem
|
|
8
|
+
|
|
9
|
+
dplyr's ergonomics rest on non-standard evaluation (bare column names),
|
|
10
|
+
which Python cannot do. Prior ports each gave up something essential:
|
|
11
|
+
|
|
12
|
+
- **dfply / dplython** — emulated `%>%` with `>>` operator hacks; alien to
|
|
13
|
+
Python, broke tooling, unmaintained.
|
|
14
|
+
- **siuba** — closest in spirit (`_` proxy, SQL backend) but weak typing
|
|
15
|
+
and completion.
|
|
16
|
+
- **tidypolars** — dplyr verb names over polars, but string column refs:
|
|
17
|
+
no completion, no expression typing.
|
|
18
|
+
- **ibis** — the right architecture (expression IR, many backends, lazy
|
|
19
|
+
with interactive mode) but the API drifted far from dplyr.
|
|
20
|
+
|
|
21
|
+
We take ibis's architecture, dplyr's API, and add the missing piece nobody
|
|
22
|
+
shipped: schema-aware autocompletion.
|
|
23
|
+
|
|
24
|
+
## 2. API surface
|
|
25
|
+
|
|
26
|
+
### Verbs (dplyr names, verbatim)
|
|
27
|
+
|
|
28
|
+
MVP set: `filter`, `mutate`, `select`, `rename`, `arrange`, `group_by`
|
|
29
|
+
(+ implicit ungroup after `summarize`), `summarize`/`summarise`,
|
|
30
|
+
`distinct`, `slice_head`/`slice_tail`/`slice_sample`, `count`,
|
|
31
|
+
`left_join`/`inner_join`/`right_join`/`full_join`/`semi_join`/`anti_join`,
|
|
32
|
+
`pivot_longer`, `pivot_wider`, `pull`.
|
|
33
|
+
|
|
34
|
+
Pipe = method chaining. No `>>`/`|` operator overloading, ever.
|
|
35
|
+
New columns via kwargs: `mutate(bmi = ...)`, `summarize(n = n())`.
|
|
36
|
+
|
|
37
|
+
### The `col` proxy
|
|
38
|
+
|
|
39
|
+
`col.height > 180` builds an expression tree (our IR), not a value.
|
|
40
|
+
Column-typed expression classes (`NumExpr`, `StrExpr`, `BoolExpr`,
|
|
41
|
+
`DtExpr`, ...) carry the appropriate methods (`.mean()`, `.str_detect()`,
|
|
42
|
+
`.year()`), so completion is type-correct.
|
|
43
|
+
|
|
44
|
+
Helpers as plain functions: `n()`, `desc()`, `if_else()`, `case_when()`,
|
|
45
|
+
`across()`, and tidyselect (`starts_with`, `ends_with`, `contains`,
|
|
46
|
+
`matches`, `where`, `everything`); negation via unary minus.
|
|
47
|
+
|
|
48
|
+
### Autocompletion strategy (the differentiator)
|
|
49
|
+
|
|
50
|
+
Three tiers, weakest to strongest:
|
|
51
|
+
|
|
52
|
+
1. **Runtime**: `DFrame.__getattr__`/`__dir__` and a frame-bound `df.c`
|
|
53
|
+
proxy populated from the live schema → Jupyter/REPL completion for free.
|
|
54
|
+
2. **Generic typing**: `DFrame[S]` parameterized by a `Cols` schema class;
|
|
55
|
+
`filter(lambda c: c.height > 180)` completes via the lambda's inferred
|
|
56
|
+
parameter type.
|
|
57
|
+
3. **Stub codegen**: a CLI (`dpyr stubgen data/*.parquet`) reads
|
|
58
|
+
parquet/duckdb schemas and emits `Cols` subclasses + typed module
|
|
59
|
+
attributes, giving full static completion and type-checking in any IDE.
|
|
60
|
+
|
|
61
|
+
## 3. Materialization model
|
|
62
|
+
|
|
63
|
+
**Schema-eager, data-lazy, display-eager.** (The core UX decision.)
|
|
64
|
+
|
|
65
|
+
- *Schema-eager*: every verb validates inputs against the known schema and
|
|
66
|
+
computes its output schema synchronously. Wrong column name, type
|
|
67
|
+
mismatch, bad group reference → exception on that line, with a one-frame
|
|
68
|
+
traceback. Requires only metadata; costs nothing.
|
|
69
|
+
- *Data-lazy*: verbs append to a logical plan. No intermediate
|
|
70
|
+
materialization within a chain → query fusion, predicate pushdown into
|
|
71
|
+
parquet/duckdb.
|
|
72
|
+
- *Display-eager* (default on): materialization happens automatically at
|
|
73
|
+
the boundaries where a value escapes the expression world —
|
|
74
|
+
`__repr__`/`_repr_html_` (fetch a capped preview, like tibble's 10-row
|
|
75
|
+
print), `len`, `.shape`, iteration, `.to_polars()`, `.to_pandas()`,
|
|
76
|
+
plotting hooks. In a notebook this *feels* fully eager; in a pipeline
|
|
77
|
+
that only collects at the end, the same code gets full laziness.
|
|
78
|
+
`mode="lazy"` / `.lazy()` opts out for production.
|
|
79
|
+
|
|
80
|
+
Rationale: dplyr rose to fame *eager*; immediate errors and immediate
|
|
81
|
+
results are why interactive analysis felt good. Immediate errors need only
|
|
82
|
+
the schema. Immediate results need collection only at display points.
|
|
83
|
+
|
|
84
|
+
### Sharp edges, handled deliberately
|
|
85
|
+
|
|
86
|
+
- **Repeated re-execution**: cache results on first materialization, keyed
|
|
87
|
+
by plan hash. Plus an explicit `.persist()` checkpoint verb
|
|
88
|
+
(polars: collected frame; duckdb: `CREATE TEMP TABLE`).
|
|
89
|
+
- **Source mutation between displays**: `persist()` is the snapshot
|
|
90
|
+
operator; documented honestly.
|
|
91
|
+
- **Schema-needs-data ops** (`pivot_wider` — output columns come from
|
|
92
|
+
values): implicitly persist their input, compute, continue. Users never
|
|
93
|
+
see the distinction.
|
|
94
|
+
- **Provenance in print**: repr shows collected rows plus
|
|
95
|
+
`# source: parquet (lazy) · showing 10 of ~87 rows`.
|
|
96
|
+
|
|
97
|
+
## 4. Architecture
|
|
98
|
+
|
|
99
|
+
```
|
|
100
|
+
user API (verbs, col proxy)
|
|
101
|
+
│ builds
|
|
102
|
+
▼
|
|
103
|
+
logical plan + expression IR ←— schema inference/validation lives here
|
|
104
|
+
│ compiles to
|
|
105
|
+
├──────────────► polars LazyFrame (expression 1:1 mapping)
|
|
106
|
+
└──────────────► SQL string / relation for duckdb
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
- The IR is *ours* and small; semantics (NA handling, sort stability,
|
|
110
|
+
grouped ordering — see SEMANTICS.md) are pinned in the IR, and each
|
|
111
|
+
backend compiler is responsible for complying, inserting casts/sorts
|
|
112
|
+
where the engine's defaults differ.
|
|
113
|
+
- Backend chosen at the source: `read_parquet/read_csv/from_polars/
|
|
114
|
+
from_pandas` → polars; `from_duckdb(con, "tbl")` / `read_sql` → duckdb.
|
|
115
|
+
- `group_by` returns `GroupedDFrame` (separate type → separate completion
|
|
116
|
+
surface), auto-ungrouping after `summarize`, matching dplyr.
|
|
117
|
+
|
|
118
|
+
## 5. Non-goals (MVP)
|
|
119
|
+
|
|
120
|
+
- No pandas execution backend (only conversion in/out).
|
|
121
|
+
- No R-style NSE magic via frame inspection / AST tricks.
|
|
122
|
+
- No plotting, no modeling. Frames out are polars/pandas; the ecosystem
|
|
123
|
+
does the rest.
|
|
124
|
+
- No distributed execution.
|
|
125
|
+
|
|
126
|
+
## 6. Name
|
|
127
|
+
|
|
128
|
+
PyPI name: `dpyr` — dplyr with the L dropped, so "py" sits in the middle:
|
|
129
|
+
dplyr-for-Python in four characters. Chosen 2026-06-09 after `dataframe`
|
|
130
|
+
turned out to be PyPI policy-blocked and `dataframes`/`table`/most
|
|
131
|
+
expressive English words were squatted; `tibble` and `dplyr` were free but
|
|
132
|
+
carry Posit brand-risk. Import name: `dpyr`.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# ROADMAP — epics & stories to MVP
|
|
2
|
+
|
|
3
|
+
Ordered by dependency. A story is done when its tests (per TESTING.md) are
|
|
4
|
+
green on both backends. MVP = Epics 0–7 complete.
|
|
5
|
+
|
|
6
|
+
**MVP definition:** a user can `uv add dpyr`, read parquet/CSV or a
|
|
7
|
+
duckdb table, run the core dplyr verb set with the `col` proxy, get
|
|
8
|
+
immediate schema errors and auto-displayed results in Jupyter, and trust
|
|
9
|
+
the output because it's differentially tested against dplyr.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Epic 0 — Project foundation
|
|
14
|
+
- **0.1** Repo scaffold: uv project, `pyproject.toml`, ruff, mypy/pyright,
|
|
15
|
+
pytest, pre-commit, GitHub Actions skeleton. ✅ (this commit)
|
|
16
|
+
- **0.2** Reserve `dpyr` on PyPI with a 0.0.1 placeholder that
|
|
17
|
+
errors helpfully on import-and-use. (`dataframe` is PyPI policy-blocked;
|
|
18
|
+
see DESIGN.md §6 for the naming history.)
|
|
19
|
+
- **0.3** Commit intent docs (DESIGN/SEMANTICS/TESTING/ROADMAP). ✅
|
|
20
|
+
|
|
21
|
+
## Epic 1 — Expression IR & schema engine (the core)
|
|
22
|
+
- **1.1** Dtype system: Int64/Float64/Bool/Str/Date/Datetime/Null + NA
|
|
23
|
+
model (SEMANTICS S1).
|
|
24
|
+
- **1.2** Expression nodes: column refs, literals, arithmetic/comparison/
|
|
25
|
+
boolean ops, function calls; typed expr classes (NumExpr, StrExpr, ...).
|
|
26
|
+
- **1.3** `col` proxy producing typed expressions; `desc()`, `n()`,
|
|
27
|
+
`if_else()`, `case_when()`.
|
|
28
|
+
- **1.4** Logical plan nodes for every MVP verb; **schema inference +
|
|
29
|
+
validation per node** with did-you-mean errors and one-frame tracebacks.
|
|
30
|
+
- **1.5** Plan hashing (for the materialization cache) and stable repr
|
|
31
|
+
(for IR snapshot tests).
|
|
32
|
+
- Tests: unit + Hypothesis property 1 (schema soundness).
|
|
33
|
+
|
|
34
|
+
## Epic 2 — polars backend
|
|
35
|
+
- **2.1** Sources: `read_parquet`, `read_csv`, `from_polars`, `from_pandas`.
|
|
36
|
+
- **2.2** Compile IR → polars LazyFrame for: filter, mutate, select,
|
|
37
|
+
rename, arrange, distinct, slices.
|
|
38
|
+
- **2.3** group_by/summarize incl. grouped ordering (S7) and ungroup
|
|
39
|
+
semantics (S9).
|
|
40
|
+
- **2.4** Joins (all six) with dplyr suffix & NA-matching rules (S10–S11).
|
|
41
|
+
- **2.5** Semantics shims: sort stability/NA position (S3), int division
|
|
42
|
+
(S4), counts dtype (S13), division by zero (S14).
|
|
43
|
+
|
|
44
|
+
## Epic 3 — Materialization model (the UX)
|
|
45
|
+
- **3.1** Display-eager boundaries: `__repr__`/`_repr_html_` with capped
|
|
46
|
+
preview + provenance line, `len`, `.shape`, iteration, `.to_polars()`,
|
|
47
|
+
`.to_pandas()`, `.pull()`.
|
|
48
|
+
- **3.2** Result cache keyed by plan hash; invalidation on `persist()`.
|
|
49
|
+
- **3.3** `.persist()` checkpoint verb; `.lazy()` / `mode="lazy"` opt-out.
|
|
50
|
+
- **3.4** Implicit persist for schema-needs-data ops (groundwork for
|
|
51
|
+
pivot_wider).
|
|
52
|
+
- Tests: metamorphic laws on persist/collect/repr equivalence.
|
|
53
|
+
|
|
54
|
+
## Epic 4 — Oracle harness (start early, runs forever)
|
|
55
|
+
- **4.1** YAML spec format + Python spec runner.
|
|
56
|
+
- **4.2** `oracle/run_specs.R` + pinned rocker container; golden parquet
|
|
57
|
+
generation; fixture metadata (dplyr version).
|
|
58
|
+
- **4.3** Comparison/normalization harness implementing SEMANTICS
|
|
59
|
+
S6/S17/S19.
|
|
60
|
+
- **4.4** CI: per-push spec tests vs committed goldens; nightly golden
|
|
61
|
+
drift job.
|
|
62
|
+
- **4.5** Seed corpus: ~12 hand-written specs per MVP verb + ported dplyr
|
|
63
|
+
doc examples.
|
|
64
|
+
|
|
65
|
+
## Epic 5 — duckdb backend
|
|
66
|
+
- **5.1** Sources: `from_duckdb(con, table)`, `read_sql`.
|
|
67
|
+
- **5.2** IR → SQL compiler for the Epic-2 verb set, with semantics shims
|
|
68
|
+
(casts, ORDER BY NULLS LAST, etc.).
|
|
69
|
+
- **5.3** `persist()` as `CREATE TEMP TABLE`.
|
|
70
|
+
- **5.4** Hypothesis property 2: backend agreement, in CI on every push.
|
|
71
|
+
|
|
72
|
+
## Epic 6 — tidyselect, across, reshaping
|
|
73
|
+
- **6.1** tidyselect: `starts_with/ends_with/contains/matches/where/
|
|
74
|
+
everything`, negation; works in `select`, `rename`, `distinct`.
|
|
75
|
+
- **6.2** `across()` in mutate/summarize with `names=` templating.
|
|
76
|
+
- **6.3** `pivot_longer`; `pivot_wider` (uses 3.4); `count`; `pull`.
|
|
77
|
+
- **6.4** `GroupedDFrame` type with its own completion surface; grouped
|
|
78
|
+
filter/mutate window semantics.
|
|
79
|
+
|
|
80
|
+
## Epic 7 — Autocompletion & developer experience
|
|
81
|
+
- **7.1** Runtime completion: `__dir__` from live schema; frame-bound
|
|
82
|
+
`df.c` proxy.
|
|
83
|
+
- **7.2** `DFrame[S]` generic typing + lambda-style `filter(lambda c: ...)`.
|
|
84
|
+
- **7.3** `dpyr stubgen` CLI: parquet/duckdb schema → `Cols` stubs;
|
|
85
|
+
stubs checked by mypy+pyright in CI.
|
|
86
|
+
- **7.4** Error message polish pass (the "feels eager" acceptance test:
|
|
87
|
+
every user mistake surfaces on the line that made it).
|
|
88
|
+
|
|
89
|
+
## Epic 8 — Hardening & release (MVP gate)
|
|
90
|
+
- **8.1** Port dplyr testthat regression tests for MVP verbs.
|
|
91
|
+
- **8.2** Nightly Hypothesis-vs-oracle fuzzing job.
|
|
92
|
+
- **8.3** Docs site: tutorial mirroring the dplyr vignette, dplyr→dpyr
|
|
93
|
+
cheat sheet, SEMANTICS as a public page.
|
|
94
|
+
- **8.4** `0.1.0` to PyPI; announce with the differential-test count as
|
|
95
|
+
the headline ("passes N dplyr-generated golden tests").
|
|
96
|
+
|
|
97
|
+
## Post-MVP (parking lot)
|
|
98
|
+
`separate/unite`, `nest`, window function breadth, `slice_min/max`,
|
|
99
|
+
list-columns, streaming collect, arrow Flight sources, sqlite/postgres
|
|
100
|
+
backends via the duckdb SQL layer, plugin API for custom verbs.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# SEMANTICS — the conformance spec
|
|
2
|
+
|
|
3
|
+
Where R/dplyr, polars and duckdb disagree, this file records the decision.
|
|
4
|
+
Every row below must be encoded as a test that links back here. Comparison
|
|
5
|
+
against the dplyr oracle is checked *modulo these documented divergences* —
|
|
6
|
+
never fuzzily.
|
|
7
|
+
|
|
8
|
+
Legend: **R** = follow dplyr · **P** = follow polars/duckdb · **pinned** =
|
|
9
|
+
our own rule, backends forced to comply.
|
|
10
|
+
|
|
11
|
+
| # | Area | dplyr | polars/duckdb | Decision |
|
|
12
|
+
|---|------|-------|---------------|----------|
|
|
13
|
+
| S1 | Missing values | typed `NA`, `NaN` distinct | `null` vs `NaN` | **pinned**: NA ↔ null bidirectionally; NaN preserved as NaN; document |
|
|
14
|
+
| S2 | `mean/sum/...` with missing | `NA` unless `na.rm=TRUE` | ignore nulls | **P**, with `na_rm: bool = True` kwarg for familiarity |
|
|
15
|
+
| S3 | Sort: NA position & stability | NAs last, stable sort | varies per engine | **pinned**: stable, NAs last; `desc()` keeps NAs last |
|
|
16
|
+
| S4 | `int / int` | promotes to double | varies | **R** (saner) |
|
|
17
|
+
| S5 | Integer overflow | promotes / warns | wraps or errors | **pinned**: Int64 default; overflow errors |
|
|
18
|
+
| S6 | String ordering / collation | locale-dependent (!) | byte/UTF-8 | **pinned**: C-locale codepoint order — *known divergence from R*; oracle harness normalizes |
|
|
19
|
+
| S7 | Grouped result ordering | sorted by group keys | hash order | **R**: sort by keys |
|
|
20
|
+
| S8 | Empty groups / zero-row inputs | specific dplyr behaviors | varies | **R**; port dplyr regression tests |
|
|
21
|
+
| S9 | `summarize` ungrouping | drops last group level | n/a | **R**, including the multi-key behavior |
|
|
22
|
+
| S10 | Join key NA matching | `NA` matches `NA` by default | SQL: NULL ≠ NULL | **R** default, `na_matches="never"` opt-out (mirrors dplyr arg) |
|
|
23
|
+
| S11 | Join suffixes | `.x` / `.y` | `_right` etc. | **R**: `(".x", ".y")` |
|
|
24
|
+
| S12 | Boolean with NA (3-valued logic) | NA propagates; `filter` drops NA | same in SQL | **R/SQL** (they agree); test it anyway |
|
|
25
|
+
| S13 | `n()` / counts dtype | integer | u32/i64 | **pinned**: Int64 |
|
|
26
|
+
| S14 | Division by zero | `Inf`/`NaN` | varies (duckdb errors on int) | **R**: `Inf`/`-Inf`/`NaN`, cast first on duckdb |
|
|
27
|
+
| S15 | `case_when` no match | `NA` | null | agree; pin the result dtype unification rule |
|
|
28
|
+
| S16 | Date/time zones | rich, messy | UTC-leaning | **pinned**: tz-aware UTC default; naive allowed; conversions explicit |
|
|
29
|
+
| S17 | Factors | core R type | none | not supported; oracle harness converts factors → strings before compare |
|
|
30
|
+
| S18 | Recycling length-1 values in `mutate` | yes | literals broadcast | **R** for scalars only; no general recycling |
|
|
31
|
+
| S19 | Float comparison in tests | — | — | harness: sort-normalize where order unspecified + ULP tolerance |
|
|
32
|
+
|
|
33
|
+
Process: when a differential test fails and the cause is a *new* semantic
|
|
34
|
+
disagreement, the fix is (1) add a row here, (2) encode it in the harness
|
|
35
|
+
normalization or backend compiler, (3) add a dedicated test naming the row.
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# TESTING — strategy
|
|
2
|
+
|
|
3
|
+
Three layers. dplyr is the oracle; polars↔duckdb agreement is the
|
|
4
|
+
workhorse; Hypothesis finds what nobody thought to write.
|
|
5
|
+
|
|
6
|
+
## Layer 1 — differential testing against dplyr (the oracle)
|
|
7
|
+
|
|
8
|
+
Test cases live in a neutral spec format (YAML), input data + verb chain:
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
# tests/specs/summarize/grouped_mean.yaml
|
|
12
|
+
input: starwars
|
|
13
|
+
chain:
|
|
14
|
+
- filter: "height > 180"
|
|
15
|
+
- group_by: [species]
|
|
16
|
+
- summarize: {n: "n()", mh: "mean(height, na.rm=TRUE)"}
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
- `oracle/run_specs.R` translates each spec to dplyr code, runs it, writes
|
|
20
|
+
the result as **parquet** (lossless types, NA fidelity — never CSV) into
|
|
21
|
+
`tests/golden/`.
|
|
22
|
+
- pytest runs the same spec through our library (both backends) and
|
|
23
|
+
asserts frame equality via the normalization harness (see SEMANTICS.md
|
|
24
|
+
S6/S17/S19).
|
|
25
|
+
|
|
26
|
+
**R stays out of the inner loop.** Golden parquets are generated offline
|
|
27
|
+
and committed. Day-to-day `pytest` is pure Python. A separate CI job in a
|
|
28
|
+
pinned `rocker/tidyverse` container regenerates all goldens and fails on
|
|
29
|
+
drift — catching both our bugs and wrongly-encoded expectations. dplyr
|
|
30
|
+
version is pinned and recorded in fixture metadata.
|
|
31
|
+
|
|
32
|
+
## Layer 2 — mined test corpus
|
|
33
|
+
|
|
34
|
+
- Port **dplyr's documented examples** (every verb's examples, vignettes,
|
|
35
|
+
relevant R4DS chapters) into specs. They double as our tutorial docs.
|
|
36
|
+
- Read **dplyr's testthat suite** and port its *regression tests* — each
|
|
37
|
+
is a documented historical bug (zero-row groups, grouped filter, join
|
|
38
|
+
multiplicities, `across()` corners).
|
|
39
|
+
- **dbplyr's translation tests** inform the duckdb compiler.
|
|
40
|
+
|
|
41
|
+
## Layer 3 — property-based testing (Hypothesis)
|
|
42
|
+
|
|
43
|
+
Strategies generate small random frames (mixed dtypes, nulls, empty,
|
|
44
|
+
single-row, duplicates) and type-correct random verb chains (a grammar
|
|
45
|
+
over the IR).
|
|
46
|
+
|
|
47
|
+
Properties, in order of cost:
|
|
48
|
+
|
|
49
|
+
1. **Schema soundness** (no oracle): predicted output schema ==
|
|
50
|
+
actually-collected schema. Cheap; run always. This is the test of the
|
|
51
|
+
schema-eager promise.
|
|
52
|
+
2. **Backend agreement** (no oracle): polars result == duckdb result for
|
|
53
|
+
the same plan. Our most important invariant. Run always.
|
|
54
|
+
3. **Metamorphic laws** (no oracle):
|
|
55
|
+
- `filter(p).filter(q)` ≡ `filter(p & q)`
|
|
56
|
+
- `arrange ∘ filter` ≡ `filter ∘ arrange` (as multisets)
|
|
57
|
+
- `mutate` of an unused column doesn't change downstream `summarize`
|
|
58
|
+
- chain collected once ≡ collected verb-by-verb with `persist()`
|
|
59
|
+
between every step ← *specifically exercises the lazy/eager machinery*
|
|
60
|
+
- repr (display-eager) then collect ≡ collect directly (cache
|
|
61
|
+
correctness)
|
|
62
|
+
4. **Oracle fuzzing** (needs R): sampled generated cases through the
|
|
63
|
+
dplyr oracle. Nightly job, not per-commit.
|
|
64
|
+
|
|
65
|
+
## Unit layers (ordinary but required)
|
|
66
|
+
|
|
67
|
+
- IR construction & schema inference per verb (golden IR snapshots).
|
|
68
|
+
- duckdb compiler: IR → SQL string snapshots.
|
|
69
|
+
- Error UX: wrong column names raise immediately with short tracebacks
|
|
70
|
+
and did-you-mean suggestions (asserted on message content).
|
|
71
|
+
- Stubgen: generated stubs compile under mypy & pyright in CI.
|
|
72
|
+
|
|
73
|
+
## CI matrix
|
|
74
|
+
|
|
75
|
+
| Job | Trigger | Needs R |
|
|
76
|
+
|---|---|---|
|
|
77
|
+
| pytest (specs vs committed goldens, both backends) | every push | no |
|
|
78
|
+
| Hypothesis schema/backend/metamorphic | every push | no |
|
|
79
|
+
| golden regeneration + drift check | nightly + golden-touching PRs | yes (docker) |
|
|
80
|
+
| oracle fuzzing | nightly | yes (docker) |
|
|
81
|
+
| mypy/pyright incl. generated stubs | every push | no |
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dpyr"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "dplyr for Python: tidy piped verbs over polars and duckdb, with real autocompletion. Name reservation — API under active development."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [{ name = "Maxime Rivest", email = "mrive052@gmail.com" }]
|
|
9
|
+
keywords = ["dplyr", "dataframe", "polars", "duckdb", "tidyverse", "data-analysis"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 1 - Planning",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Topic :: Scientific/Engineering",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Repository = "https://github.com/maximerivest/dataframe"
|
|
20
|
+
|
|
21
|
+
[dependency-groups]
|
|
22
|
+
dev = ["pytest>=8", "ruff", "mypy", "hypothesis"]
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["hatchling"]
|
|
26
|
+
build-backend = "hatchling.build"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["src/dpyr"]
|
|
30
|
+
|
|
31
|
+
[tool.ruff]
|
|
32
|
+
line-length = 100
|
|
33
|
+
|
|
34
|
+
[tool.pytest.ini_options]
|
|
35
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""dpyr — dplyr for Python, fronting polars and duckdb.
|
|
2
|
+
|
|
3
|
+
This release reserves the package name while the library is under active
|
|
4
|
+
development. See https://github.com/maximerivest/dpyr for the design
|
|
5
|
+
documents and roadmap.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.0.1"


def __getattr__(name: str):
    """Explain that the requested API does not exist in this placeholder release.

    This is the module-level attribute hook (PEP 562): Python calls it only
    when normal lookup of ``name`` on the module has already failed.

    Args:
        name: The attribute that was requested, e.g. ``"filter"``.

    Raises:
        AttributeError: If ``name`` is a dunder name (``__all__``,
            ``__wrapped__``, ...). The interpreter and tooling probe for
            these with ``hasattr``/``getattr`` and treat only
            ``AttributeError`` as "absent"; any other exception type would
            break ``from dpyr import *`` and similar introspection.
        NotImplementedError: For every other name, with a message pointing
            the user at the roadmap.
    """
    # Protocol probes must see a plain "missing attribute", not the
    # user-facing placeholder error below.
    if name.startswith("__") and name.endswith("__"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    raise NotImplementedError(
        f"dpyr.{name} is not available yet: version {__version__} is a "
        "name-reservation release. The dplyr-style API (filter, mutate, "
        "group_by, summarize, ...) is under development — see "
        "https://github.com/maximerivest/dpyr for the roadmap."
    )
|