dpyr 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dpyr-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,69 @@
1
+ Metadata-Version: 2.4
2
+ Name: dpyr
3
+ Version: 0.0.1
4
+ Summary: dplyr for Python: tidy piped verbs over polars and duckdb, with real autocompletion. Name reservation — API under active development.
5
+ Project-URL: Repository, https://github.com/maximerivest/dataframe
6
+ Author-email: Maxime Rivest <mrive052@gmail.com>
7
+ License: MIT
8
+ Keywords: data-analysis,dataframe,dplyr,duckdb,polars,tidyverse
9
+ Classifier: Development Status :: 1 - Planning
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Scientific/Engineering
14
+ Requires-Python: >=3.10
15
+ Description-Content-Type: text/markdown
16
+
17
+ # dpyr
18
+
19
+ **dplyr for Python.** A tidy, pipe-style data manipulation API — fronting
20
+ [polars](https://pola.rs) and [duckdb](https://duckdb.org) — with real IDE
21
+ autocompletion and dplyr-faithful semantics, verified by differential testing
22
+ against dplyr itself.
23
+
24
+ ```python
25
+ from dpyr import read_parquet, col, n, desc, starts_with
26
+
27
+ (
28
+ read_parquet("starwars.parquet")
29
+ .filter(col.height > 180, col.mass < 100)
30
+ .mutate(bmi = col.mass / (col.height / 100) ** 2)
31
+ .group_by(col.species)
32
+ .summarize(
33
+ n = n(),
34
+ mean_bmi = col.bmi.mean(),
35
+ )
36
+ .arrange(desc(col.mean_bmi))
37
+ )
38
+ ```
39
+
40
+ ## Principles (the elevator pitch)
41
+
42
+ 1. **dplyr's vocabulary, Python's idiom.** The verbs are dplyr's, verbatim
43
+ (`filter`, `mutate`, `select`, `arrange`, `group_by`, `summarize`,
44
+ joins, tidyselect). The pipe is Python's: method chaining.
45
+ 2. **As lazy as possible internally, as eager as possible observably.**
46
+ Verbs build a plan; schema errors raise immediately on the offending
47
+ line; displaying/exporting auto-collects. Interactive feel, query-engine
48
+ performance.
49
+ 3. **Autocompletion is a feature, not an accident.** The `col` proxy and
50
+ per-schema stub generation make column names and column-typed methods
51
+ complete in any IDE.
52
+ 4. **Two backends, one semantics.** polars (in-memory/files) and duckdb
53
+ (SQL pushdown) must agree, bit-for-bit modulo the documented semantics
54
+ spec. Verified continuously.
55
+ 5. **dplyr is the oracle.** Compatibility is demonstrated, not claimed:
56
+ golden outputs are generated by actual dplyr in CI.
57
+
58
+ ## Project documents
59
+
60
+ | Doc | What it pins down |
61
+ |---|---|
62
+ | [docs/DESIGN.md](docs/DESIGN.md) | API design, laziness/materialization model, autocompletion strategy, architecture |
63
+ | [docs/SEMANTICS.md](docs/SEMANTICS.md) | The conformance spec: every deliberate decision where R, polars and duckdb disagree |
64
+ | [docs/TESTING.md](docs/TESTING.md) | Test strategy: dplyr-as-oracle, backend differential tests, Hypothesis properties |
65
+ | [docs/ROADMAP.md](docs/ROADMAP.md) | Epics and stories to MVP, in dependency order |
66
+
67
+ ## Status
68
+
69
+ Pre-MVP. The plan is in [docs/ROADMAP.md](docs/ROADMAP.md).
dpyr-0.0.1/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # dpyr
2
+
3
+ **dplyr for Python.** A tidy, pipe-style data manipulation API — fronting
4
+ [polars](https://pola.rs) and [duckdb](https://duckdb.org) — with real IDE
5
+ autocompletion and dplyr-faithful semantics, verified by differential testing
6
+ against dplyr itself.
7
+
8
+ ```python
9
+ from dpyr import read_parquet, col, n, desc, starts_with
10
+
11
+ (
12
+ read_parquet("starwars.parquet")
13
+ .filter(col.height > 180, col.mass < 100)
14
+ .mutate(bmi = col.mass / (col.height / 100) ** 2)
15
+ .group_by(col.species)
16
+ .summarize(
17
+ n = n(),
18
+ mean_bmi = col.bmi.mean(),
19
+ )
20
+ .arrange(desc(col.mean_bmi))
21
+ )
22
+ ```
23
+
24
+ ## Principles (the elevator pitch)
25
+
26
+ 1. **dplyr's vocabulary, Python's idiom.** The verbs are dplyr's, verbatim
27
+ (`filter`, `mutate`, `select`, `arrange`, `group_by`, `summarize`,
28
+ joins, tidyselect). The pipe is Python's: method chaining.
29
+ 2. **As lazy as possible internally, as eager as possible observably.**
30
+ Verbs build a plan; schema errors raise immediately on the offending
31
+ line; displaying/exporting auto-collects. Interactive feel, query-engine
32
+ performance.
33
+ 3. **Autocompletion is a feature, not an accident.** The `col` proxy and
34
+ per-schema stub generation make column names and column-typed methods
35
+ complete in any IDE.
36
+ 4. **Two backends, one semantics.** polars (in-memory/files) and duckdb
37
+ (SQL pushdown) must agree, bit-for-bit modulo the documented semantics
38
+ spec. Verified continuously.
39
+ 5. **dplyr is the oracle.** Compatibility is demonstrated, not claimed:
40
+ golden outputs are generated by actual dplyr in CI.
41
+
42
+ ## Project documents
43
+
44
+ | Doc | What it pins down |
45
+ |---|---|
46
+ | [docs/DESIGN.md](docs/DESIGN.md) | API design, laziness/materialization model, autocompletion strategy, architecture |
47
+ | [docs/SEMANTICS.md](docs/SEMANTICS.md) | The conformance spec: every deliberate decision where R, polars and duckdb disagree |
48
+ | [docs/TESTING.md](docs/TESTING.md) | Test strategy: dplyr-as-oracle, backend differential tests, Hypothesis properties |
49
+ | [docs/ROADMAP.md](docs/ROADMAP.md) | Epics and stories to MVP, in dependency order |
50
+
51
+ ## Status
52
+
53
+ Pre-MVP. The plan is in [docs/ROADMAP.md](docs/ROADMAP.md).
@@ -0,0 +1,132 @@
1
+ # DESIGN — intent and architecture
2
+
3
+ This document records the design decisions and *why* they were made, so the
4
+ project stays coherent as it grows. Changes to anything in here deserve a
5
+ discussion, not a drive-by PR.
6
+
7
+ ## 1. The problem
8
+
9
+ dplyr's ergonomics rest on non-standard evaluation (bare column names),
10
+ which Python cannot do. Prior ports each gave up something essential:
11
+
12
+ - **dfply / dplython** — emulated `%>%` with `>>` operator hacks; alien to
13
+ Python, broke tooling, unmaintained.
14
+ - **siuba** — closest in spirit (`_` proxy, SQL backend) but weak typing
15
+ and completion.
16
+ - **tidypolars** — dplyr verb names over polars, but string column refs:
17
+ no completion, no expression typing.
18
+ - **ibis** — the right architecture (expression IR, many backends, lazy
19
+ with interactive mode) but the API drifted far from dplyr.
20
+
21
+ We take ibis's architecture, dplyr's API, and add the missing piece nobody
22
+ shipped: schema-aware autocompletion.
23
+
24
+ ## 2. API surface
25
+
26
+ ### Verbs (dplyr names, verbatim)
27
+
28
+ MVP set: `filter`, `mutate`, `select`, `rename`, `arrange`, `group_by`
29
+ (+ implicit ungroup after `summarize`), `summarize`/`summarise`,
30
+ `distinct`, `slice_head`/`slice_tail`/`slice_sample`, `count`,
31
+ `left_join`/`inner_join`/`right_join`/`full_join`/`semi_join`/`anti_join`,
32
+ `pivot_longer`, `pivot_wider`, `pull`.
33
+
34
+ Pipe = method chaining. No `>>`/`|` operator overloading, ever.
35
+ New columns via kwargs: `mutate(bmi = ...)`, `summarize(n = n())`.
36
+
37
+ ### The `col` proxy
38
+
39
+ `col.height > 180` builds an expression tree (our IR), not a value.
40
+ Column-typed expression classes (`NumExpr`, `StrExpr`, `BoolExpr`,
41
+ `DtExpr`, ...) carry the appropriate methods (`.mean()`, `.str_detect()`,
42
+ `.year()`), so completion is type-correct.
43
+
44
+ Helpers as plain functions: `n()`, `desc()`, `if_else()`, `case_when()`,
45
+ `across()`, and tidyselect (`starts_with`, `ends_with`, `contains`,
46
+ `matches`, `where`, `everything`); negation via unary minus.
47
+
48
+ ### Autocompletion strategy (the differentiator)
49
+
50
+ Three tiers, weakest to strongest:
51
+
52
+ 1. **Runtime**: `DFrame.__getattr__`/`__dir__` and a frame-bound `df.c`
53
+ proxy populated from the live schema → Jupyter/REPL completion for free.
54
+ 2. **Generic typing**: `DFrame[S]` parameterized by a `Cols` schema class;
55
+ `filter(lambda c: c.height > 180)` completes via the lambda's inferred
56
+ parameter type.
57
+ 3. **Stub codegen**: a CLI (`dpyr stubgen data/*.parquet`) reads
58
+ parquet/duckdb schemas and emits `Cols` subclasses + typed module
59
+ attributes, giving full static completion and type-checking in any IDE.
60
+
61
+ ## 3. Materialization model
62
+
63
+ **Schema-eager, data-lazy, display-eager.** (The core UX decision.)
64
+
65
+ - *Schema-eager*: every verb validates inputs against the known schema and
66
+ computes its output schema synchronously. Wrong column name, type
67
+ mismatch, bad group reference → exception on that line, with a one-frame
68
+ traceback. Requires only metadata; costs nothing.
69
+ - *Data-lazy*: verbs append to a logical plan. No intermediate
70
+ materialization within a chain → query fusion, predicate pushdown into
71
+ parquet/duckdb.
72
+ - *Display-eager* (default on): materialization happens automatically at
73
+ the boundaries where a value escapes the expression world —
74
+ `__repr__`/`_repr_html_` (fetch a capped preview, like tibble's 10-row
75
+ print), `len`, `.shape`, iteration, `.to_polars()`, `.to_pandas()`,
76
+ plotting hooks. In a notebook this *feels* fully eager; in a pipeline
77
+ that only collects at the end, the same code gets full laziness.
78
+ `mode="lazy"` / `.lazy()` opts out for production.
79
+
80
+ Rationale: dplyr rose to fame *eager*; immediate errors and immediate
81
+ results are why interactive analysis felt good. Immediate errors need only
82
+ the schema. Immediate results need collection only at display points.
83
+
84
+ ### Sharp edges, handled deliberately
85
+
86
+ - **Repeated re-execution**: cache results on first materialization, keyed
87
+ by plan hash. Plus an explicit `.persist()` checkpoint verb
88
+ (polars: collected frame; duckdb: `CREATE TEMP TABLE`).
89
+ - **Source mutation between displays**: `persist()` is the snapshot
90
+ operator; documented honestly.
91
+ - **Schema-needs-data ops** (`pivot_wider` — output columns come from
92
+ values): implicitly persist their input, compute, continue. Users never
93
+ see the distinction.
94
+ - **Provenance in print**: repr shows collected rows plus
95
+ `# source: parquet (lazy) · showing 10 of ~87 rows`.
96
+
97
+ ## 4. Architecture
98
+
99
+ ```
100
+ user API (verbs, col proxy)
101
+ │ builds
102
+
103
+ logical plan + expression IR ←— schema inference/validation lives here
104
+ │ compiles to
105
+ ├──────────────► polars LazyFrame (expression 1:1 mapping)
106
+ └──────────────► SQL string / relation for duckdb
107
+ ```
108
+
109
+ - The IR is *ours* and small; semantics (NA handling, sort stability,
110
+ grouped ordering — see SEMANTICS.md) are pinned in the IR, and each
111
+ backend compiler is responsible for complying, inserting casts/sorts
112
+ where the engine's defaults differ.
113
+ - Backend chosen at the source: `read_parquet/read_csv/from_polars/
114
+ from_pandas` → polars; `from_duckdb(con, "tbl")` / `read_sql` → duckdb.
115
+ - `group_by` returns `GroupedDFrame` (separate type → separate completion
116
+ surface), auto-ungrouping after `summarize`, matching dplyr.
117
+
118
+ ## 5. Non-goals (MVP)
119
+
120
+ - No pandas execution backend (only conversion in/out).
121
+ - No R-style NSE magic via frame inspection / AST tricks.
122
+ - No plotting, no modeling. Frames out are polars/pandas; the ecosystem
123
+ does the rest.
124
+ - No distributed execution.
125
+
126
+ ## 6. Name
127
+
128
+ PyPI name: `dpyr` — dplyr with the L dropped, so "py" sits in the middle:
129
+ dplyr-for-Python in four characters. Chosen 2026-06-09 after `dataframe`
130
+ turned out to be PyPI policy-blocked and `dataframes`/`table`/most
131
+ expressive English words were squatted; `tibble` and `dplyr` were free but
132
+ carry Posit brand-risk. Import name: `dpyr`.
@@ -0,0 +1,100 @@
1
+ # ROADMAP — epics & stories to MVP
2
+
3
+ Ordered by dependency. A story is done when its tests (per TESTING.md) are
4
+ green on both backends. MVP = Epics 0–7 complete.
5
+
6
+ **MVP definition:** a user can `uv add dpyr`, read parquet/CSV or a
7
+ duckdb table, run the core dplyr verb set with the `col` proxy, get
8
+ immediate schema errors and auto-displayed results in Jupyter, and trust
9
+ the output because it's differentially tested against dplyr.
10
+
11
+ ---
12
+
13
+ ## Epic 0 — Project foundation
14
+ - **0.1** Repo scaffold: uv project, `pyproject.toml`, ruff, mypy/pyright,
15
+ pytest, pre-commit, GitHub Actions skeleton. ✅ (this commit)
16
+ - **0.2** Reserve `dpyr` on PyPI with a 0.0.1 placeholder that
17
+ errors helpfully on import-and-use. (`dataframe` is PyPI policy-blocked;
18
+ see DESIGN.md §6 for the naming history.)
19
+ - **0.3** Commit intent docs (DESIGN/SEMANTICS/TESTING/ROADMAP). ✅
20
+
21
+ ## Epic 1 — Expression IR & schema engine (the core)
22
+ - **1.1** Dtype system: Int64/Float64/Bool/Str/Date/Datetime/Null + NA
23
+ model (SEMANTICS S1).
24
+ - **1.2** Expression nodes: column refs, literals, arithmetic/comparison/
25
+ boolean ops, function calls; typed expr classes (NumExpr, StrExpr, ...).
26
+ - **1.3** `col` proxy producing typed expressions; `desc()`, `n()`,
27
+ `if_else()`, `case_when()`.
28
+ - **1.4** Logical plan nodes for every MVP verb; **schema inference +
29
+ validation per node** with did-you-mean errors and one-frame tracebacks.
30
+ - **1.5** Plan hashing (for the materialization cache) and stable repr
31
+ (for IR snapshot tests).
32
+ - Tests: unit + Hypothesis property 1 (schema soundness).
33
+
34
+ ## Epic 2 — polars backend
35
+ - **2.1** Sources: `read_parquet`, `read_csv`, `from_polars`, `from_pandas`.
36
+ - **2.2** Compile IR → polars LazyFrame for: filter, mutate, select,
37
+ rename, arrange, distinct, slices.
38
+ - **2.3** group_by/summarize incl. grouped ordering (S7) and ungroup
39
+ semantics (S9).
40
+ - **2.4** Joins (all six) with dplyr suffix & NA-matching rules (S10–S11).
41
+ - **2.5** Semantics shims: sort stability/NA position (S3), int division
42
+ (S4), counts dtype (S13), division by zero (S14).
43
+
44
+ ## Epic 3 — Materialization model (the UX)
45
+ - **3.1** Display-eager boundaries: `__repr__`/`_repr_html_` with capped
46
+ preview + provenance line, `len`, `.shape`, iteration, `.to_polars()`,
47
+ `.to_pandas()`, `.pull()`.
48
+ - **3.2** Result cache keyed by plan hash; invalidation on `persist()`.
49
+ - **3.3** `.persist()` checkpoint verb; `.lazy()` / `mode="lazy"` opt-out.
50
+ - **3.4** Implicit persist for schema-needs-data ops (groundwork for
51
+ pivot_wider).
52
+ - Tests: metamorphic laws on persist/collect/repr equivalence.
53
+
54
+ ## Epic 4 — Oracle harness (start early, runs forever)
55
+ - **4.1** YAML spec format + Python spec runner.
56
+ - **4.2** `oracle/run_specs.R` + pinned rocker container; golden parquet
57
+ generation; fixture metadata (dplyr version).
58
+ - **4.3** Comparison/normalization harness implementing SEMANTICS
59
+ S6/S17/S19.
60
+ - **4.4** CI: per-push spec tests vs committed goldens; nightly golden
61
+ drift job.
62
+ - **4.5** Seed corpus: ~12 hand-written specs per MVP verb + ported dplyr
63
+ doc examples.
64
+
65
+ ## Epic 5 — duckdb backend
66
+ - **5.1** Sources: `from_duckdb(con, table)`, `read_sql`.
67
+ - **5.2** IR → SQL compiler for the Epic-2 verb set, with semantics shims
68
+ (casts, ORDER BY NULLS LAST, etc.).
69
+ - **5.3** `persist()` as `CREATE TEMP TABLE`.
70
+ - **5.4** Hypothesis property 2: backend agreement, in CI on every push.
71
+
72
+ ## Epic 6 — tidyselect, across, reshaping
73
+ - **6.1** tidyselect: `starts_with/ends_with/contains/matches/where/
74
+ everything`, negation; works in `select`, `rename`, `distinct`.
75
+ - **6.2** `across()` in mutate/summarize with `names=` templating.
76
+ - **6.3** `pivot_longer`; `pivot_wider` (uses 3.4); `count`; `pull`.
77
+ - **6.4** `GroupedDFrame` type with its own completion surface; grouped
78
+ filter/mutate window semantics.
79
+
80
+ ## Epic 7 — Autocompletion & developer experience
81
+ - **7.1** Runtime completion: `__dir__` from live schema; frame-bound
82
+ `df.c` proxy.
83
+ - **7.2** `DFrame[S]` generic typing + lambda-style `filter(lambda c: ...)`.
84
+ - **7.3** `dpyr stubgen` CLI: parquet/duckdb schema → `Cols` stubs;
85
+ stubs checked by mypy+pyright in CI.
86
+ - **7.4** Error message polish pass (the "feels eager" acceptance test:
87
+ every user mistake surfaces on the line that made it).
88
+
89
+ ## Epic 8 — Hardening & release (MVP gate)
90
+ - **8.1** Port dplyr testthat regression tests for MVP verbs.
91
+ - **8.2** Nightly Hypothesis-vs-oracle fuzzing job.
92
+ - **8.3** Docs site: tutorial mirroring the dplyr vignette, dplyr→dpyr
93
+ cheat sheet, SEMANTICS as a public page.
94
+ - **8.4** `0.1.0` to PyPI; announce with the differential-test count as
95
+ the headline ("passes N dplyr-generated golden tests").
96
+
97
+ ## Post-MVP (parking lot)
98
+ `separate/unite`, `nest`, window function breadth, `slice_min/max`,
99
+ list-columns, streaming collect, arrow Flight sources, sqlite/postgres
100
+ backends via the duckdb SQL layer, plugin API for custom verbs.
@@ -0,0 +1,35 @@
1
+ # SEMANTICS — the conformance spec
2
+
3
+ Where R/dplyr, polars and duckdb disagree, this file records the decision.
4
+ Every row below must be encoded as a test that links back here. Comparison
5
+ against the dplyr oracle is checked *modulo these documented divergences* —
6
+ never fuzzily.
7
+
8
+ Legend: **R** = follow dplyr · **P** = follow polars/duckdb · **pinned** =
9
+ our own rule, backends forced to comply.
10
+
11
+ | # | Area | dplyr | polars/duckdb | Decision |
12
+ |---|------|-------|---------------|----------|
13
+ | S1 | Missing values | typed `NA`, `NaN` distinct | `null` vs `NaN` | **pinned**: NA ↔ null bidirectionally; NaN preserved as NaN; document |
14
+ | S2 | `mean/sum/...` with missing | `NA` unless `na.rm=TRUE` | ignore nulls | **P**, with `na_rm: bool = True` kwarg for familiarity |
15
+ | S3 | Sort: NA position & stability | NAs last, stable sort | varies per engine | **pinned**: stable, NAs last; `desc()` keeps NAs last |
16
+ | S4 | `int / int` | promotes to double | varies | **R** (saner) |
17
+ | S5 | Integer overflow | promotes / warns | wraps or errors | **pinned**: Int64 default; overflow errors |
18
+ | S6 | String ordering / collation | locale-dependent (!) | byte/UTF-8 | **pinned**: C-locale codepoint order — *known divergence from R*; oracle harness normalizes |
19
+ | S7 | Grouped result ordering | sorted by group keys | hash order | **R**: sort by keys |
20
+ | S8 | Empty groups / zero-row inputs | specific dplyr behaviors | varies | **R**; port dplyr regression tests |
21
+ | S9 | `summarize` ungrouping | drops last group level | n/a | **R**, including the multi-key behavior |
22
+ | S10 | Join key NA matching | `NA` matches `NA` by default | SQL: NULL ≠ NULL | **R** default, `na_matches="never"` opt-out (mirrors dplyr arg) |
23
+ | S11 | Join suffixes | `.x` / `.y` | `_right` etc. | **R**: `(".x", ".y")` |
24
+ | S12 | Boolean with NA (3-valued logic) | NA propagates; `filter` drops NA | same in SQL | **R/SQL** (they agree); test it anyway |
25
+ | S13 | `n()` / counts dtype | integer | u32/i64 | **pinned**: Int64 |
26
+ | S14 | Division by zero | `Inf`/`NaN` | varies (duckdb errors on int) | **R**: `Inf`/`-Inf`/`NaN`, cast first on duckdb |
27
+ | S15 | `case_when` no match | `NA` | null | agree; pin the result dtype unification rule |
28
+ | S16 | Date/time zones | rich, messy | UTC-leaning | **pinned**: tz-aware UTC default; naive allowed; conversions explicit |
29
+ | S17 | Factors | core R type | none | not supported; oracle harness converts factors → strings before compare |
30
+ | S18 | Recycling length-1 values in `mutate` | yes | literals broadcast | **R** for scalars only; no general recycling |
31
+ | S19 | Float comparison in tests | — | — | harness: sort-normalize where order unspecified + ULP tolerance |
32
+
33
+ Process: when a differential test fails and the cause is a *new* semantic
34
+ disagreement, the fix is (1) add a row here, (2) encode it in the harness
35
+ normalization or backend compiler, (3) add a dedicated test naming the row.
@@ -0,0 +1,81 @@
1
+ # TESTING — strategy
2
+
3
+ Three layers. dplyr is the oracle; polars↔duckdb agreement is the
4
+ workhorse; Hypothesis finds what nobody thought to write.
5
+
6
+ ## Layer 1 — differential testing against dplyr (the oracle)
7
+
8
+ Test cases live in a neutral spec format (YAML), input data + verb chain:
9
+
10
+ ```yaml
11
+ # tests/specs/summarize/grouped_mean.yaml
12
+ input: starwars
13
+ chain:
14
+ - filter: "height > 180"
15
+ - group_by: [species]
16
+ - summarize: {n: "n()", mh: "mean(height, na.rm=TRUE)"}
17
+ ```
18
+
19
+ - `oracle/run_specs.R` translates each spec to dplyr code, runs it, writes
20
+ the result as **parquet** (lossless types, NA fidelity — never CSV) into
21
+ `tests/golden/`.
22
+ - pytest runs the same spec through our library (both backends) and
23
+ asserts frame equality via the normalization harness (see SEMANTICS.md
24
+ S6/S17/S19).
25
+
26
+ **R stays out of the inner loop.** Golden parquets are generated offline
27
+ and committed. Day-to-day `pytest` is pure Python. A separate CI job in a
28
+ pinned `rocker/tidyverse` container regenerates all goldens and fails on
29
+ drift — catching both our bugs and wrongly-encoded expectations. dplyr
30
+ version is pinned and recorded in fixture metadata.
31
+
32
+ ## Layer 2 — mined test corpus
33
+
34
+ - Port **dplyr's documented examples** (every verb's examples, vignettes,
35
+ relevant R4DS chapters) into specs. They double as our tutorial docs.
36
+ - Read **dplyr's testthat suite** and port its *regression tests* — each
37
+ is a documented historical bug (zero-row groups, grouped filter, join
38
+ multiplicities, `across()` corners).
39
+ - **dbplyr's translation tests** inform the duckdb compiler.
40
+
41
+ ## Layer 3 — property-based testing (Hypothesis)
42
+
43
+ Strategies generate small random frames (mixed dtypes, nulls, empty,
44
+ single-row, duplicates) and type-correct random verb chains (a grammar
45
+ over the IR).
46
+
47
+ Properties, in order of cost:
48
+
49
+ 1. **Schema soundness** (no oracle): predicted output schema ==
50
+ actually-collected schema. Cheap; run always. This is the test of the
51
+ schema-eager promise.
52
+ 2. **Backend agreement** (no oracle): polars result == duckdb result for
53
+ the same plan. Our most important invariant. Run always.
54
+ 3. **Metamorphic laws** (no oracle):
55
+ - `filter(p).filter(q)` ≡ `filter(p & q)`
56
+ - `arrange ∘ filter` ≡ `filter ∘ arrange` (as multisets)
57
+ - `mutate` of an unused column doesn't change downstream `summarize`
58
+ - chain collected once ≡ collected verb-by-verb with `persist()`
59
+ between every step ← *specifically exercises the lazy/eager machinery*
60
+ - repr (display-eager) then collect ≡ collect directly (cache
61
+ correctness)
62
+ 4. **Oracle fuzzing** (needs R): sampled generated cases through the
63
+ dplyr oracle. Nightly job, not per-commit.
64
+
65
+ ## Unit layers (ordinary but required)
66
+
67
+ - IR construction & schema inference per verb (golden IR snapshots).
68
+ - duckdb compiler: IR → SQL string snapshots.
69
+ - Error UX: wrong column names raise immediately with short tracebacks
70
+ and did-you-mean suggestions (asserted on message content).
71
+ - Stubgen: generated stubs compile under mypy & pyright in CI.
72
+
73
+ ## CI matrix
74
+
75
+ | Job | Trigger | Needs R |
76
+ |---|---|---|
77
+ | pytest (specs vs committed goldens, both backends) | every push | no |
78
+ | Hypothesis schema/backend/metamorphic | every push | no |
79
+ | golden regeneration + drift check | nightly + golden-touching PRs | yes (docker) |
80
+ | oracle fuzzing | nightly | yes (docker) |
81
+ | mypy/pyright incl. generated stubs | every push | no |
@@ -0,0 +1,35 @@
1
+ [project]
2
+ name = "dpyr"
3
+ version = "0.0.1"
4
+ description = "dplyr for Python: tidy piped verbs over polars and duckdb, with real autocompletion. Name reservation — API under active development."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "Maxime Rivest", email = "mrive052@gmail.com" }]
9
+ keywords = ["dplyr", "dataframe", "polars", "duckdb", "tidyverse", "data-analysis"]
10
+ classifiers = [
11
+ "Development Status :: 1 - Planning",
12
+ "Intended Audience :: Science/Research",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Topic :: Scientific/Engineering",
16
+ ]
17
+
18
+ [project.urls]
19
+ Repository = "https://github.com/maximerivest/dataframe"
20
+
21
+ [dependency-groups]
22
+ dev = ["pytest>=8", "ruff", "mypy", "hypothesis"]
23
+
24
+ [build-system]
25
+ requires = ["hatchling"]
26
+ build-backend = "hatchling.build"
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/dpyr"]
30
+
31
+ [tool.ruff]
32
+ line-length = 100
33
+
34
+ [tool.pytest.ini_options]
35
+ testpaths = ["tests"]
@@ -0,0 +1,17 @@
1
+ """dpyr — dplyr for Python, fronting polars and duckdb.
2
+
3
+ This release reserves the package name while the library is under active
4
+ development. See https://github.com/maximerivest/dpyr for the design
5
+ documents and roadmap.
6
+ """
7
+
8
+ __version__ = "0.0.1"
9
+
10
+
11
+ def __getattr__(name: str):
12
+ raise NotImplementedError(
13
+ f"dpyr.{name} is not available yet: version {__version__} is a "
14
+ "name-reservation release. The dplyr-style API (filter, mutate, "
15
+ "group_by, summarize, ...) is under development — see "
16
+ "https://github.com/maximerivest/dpyr for the roadmap."
17
+ )