rpy-bridge 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. rpy_bridge-0.5.0/PKG-INFO +297 -0
  2. rpy_bridge-0.5.0/README.md +238 -0
  3. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/pyproject.toml +1 -1
  4. rpy_bridge-0.5.0/src/rpy_bridge/__init__.py +14 -0
  5. rpy_bridge-0.5.0/src/rpy_bridge/compare.py +106 -0
  6. rpy_bridge-0.5.0/src/rpy_bridge/convert.py +63 -0
  7. rpy_bridge-0.5.0/src/rpy_bridge/core.py +505 -0
  8. rpy_bridge-0.5.0/src/rpy_bridge/dataframe.py +74 -0
  9. rpy_bridge-0.5.0/src/rpy_bridge/env.py +108 -0
  10. rpy_bridge-0.5.0/src/rpy_bridge/logging.py +50 -0
  11. rpy_bridge-0.5.0/src/rpy_bridge/renv.py +149 -0
  12. rpy_bridge-0.5.0/src/rpy_bridge/rpy2_loader.py +71 -0
  13. rpy_bridge-0.5.0/src/rpy_bridge.egg-info/PKG-INFO +297 -0
  14. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/src/rpy_bridge.egg-info/SOURCES.txt +8 -1
  15. rpy_bridge-0.4.0/PKG-INFO +0 -258
  16. rpy_bridge-0.4.0/README.md +0 -199
  17. rpy_bridge-0.4.0/src/rpy_bridge/__init__.py +0 -38
  18. rpy_bridge-0.4.0/src/rpy_bridge/rpy2_utils.py +0 -1221
  19. rpy_bridge-0.4.0/src/rpy_bridge.egg-info/PKG-INFO +0 -258
  20. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/LICENSE +0 -0
  21. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/README.rst +0 -0
  22. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/setup.cfg +0 -0
  23. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/src/rpy_bridge/py.typed +0 -0
  24. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/src/rpy_bridge.egg-info/dependency_links.txt +0 -0
  25. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/src/rpy_bridge.egg-info/requires.txt +0 -0
  26. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/src/rpy_bridge.egg-info/top_level.txt +0 -0
  27. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/tests/test_package_call.py +0 -0
  28. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/tests/test_py2r.py +0 -0
  29. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/tests/test_roundtrip.py +0 -0
  30. {rpy_bridge-0.4.0 → rpy_bridge-0.5.0}/tests/test_wrapper.py +0 -0
@@ -0,0 +1,297 @@
1
+ Metadata-Version: 2.4
2
+ Name: rpy-bridge
3
+ Version: 0.5.0
4
+ Summary: Python-to-R interoperability engine with environment management, type-safe conversions, data normalization, and safe R function execution.
5
+ Author-email: Victoria Cheung <victoriakcheung@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Victoria Cheung
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Acknowledgement: This project builds on work originally developed at
29
+ Revolution Medicines and interfaces with the rpy2 project, which is licensed
30
+ under the GNU General Public License version 2 or later.
31
+
32
+ Project-URL: Homepage, https://github.com/vic-cheung/rpy-bridge
33
+ Project-URL: Issue Tracker, https://github.com/vic-cheung/rpy-bridge/issues
34
+ Keywords: python,r,rpy2,python-r,interoperability,data-science,statistics,bioinformatics
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python
37
+ Classifier: Programming Language :: Python :: 3
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Programming Language :: Python :: 3.12
40
+ Classifier: Intended Audience :: Developers
41
+ Classifier: Intended Audience :: Science/Research
42
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
43
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
44
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
45
+ Requires-Python: >=3.11
46
+ Description-Content-Type: text/markdown
47
+ License-File: LICENSE
48
+ Requires-Dist: numpy>=1.24
49
+ Requires-Dist: pandas>=2.0
50
+ Requires-Dist: loguru>=0.7
51
+ Provides-Extra: r
52
+ Requires-Dist: rpy2>=3.5; extra == "r"
53
+ Provides-Extra: dev
54
+ Requires-Dist: ipykernel>=7.1.0; extra == "dev"
55
+ Provides-Extra: docs
56
+ Requires-Dist: sphinx; extra == "docs"
57
+ Requires-Dist: myst-parser; extra == "docs"
58
+ Dynamic: license-file
59
+
60
+ # rpy-bridge
61
+
62
+ **rpy-bridge** is a Python-controlled **R execution orchestrator** that enables
63
+ Python code to run R functions, scripts, and packages with **reproducible
64
+ filesystem and environment semantics**.
65
+
66
+ It is built on top of `rpy2`, but unlike thin wrappers, rpy-bridge stabilizes how
67
+ R code is executed when invoked from Python: project roots are inferred, `renv`
68
+ environments can be activated out-of-tree, relative paths behave as expected,
69
+ and return values are normalized for safe Python consumption.
70
+
71
+ This makes rpy-bridge suitable for production pipelines, CI, and bilingual
72
+ Python/R teams where R code must run reliably outside an interactive R session.
73
+
74
+ **Latest release:** [`rpy-bridge` on PyPI](https://pypi.org/project/rpy-bridge/)
75
+
76
+ ---
77
+
78
+ ## What this is (and is not)
79
+
80
+ rpy-bridge **is not a thin rpy2 wrapper**.
81
+
82
+ Typical rpy2 usage assumes:
83
+ - the Python working directory is the R project root
84
+ - `renv` lives next to the executing script
85
+ - relative paths resolve correctly by default
86
+ - all R code executes in `globalenv()`
87
+
88
+ These assumptions break quickly in real-world Python workflows.
89
+
90
+ rpy-bridge instead provides a **controlled R runtime** with explicit guarantees
91
+ around execution context, filesystem behavior, and environment activation.
92
+
93
+ ---
94
+
95
+ ## Core capabilities
96
+
97
+ ### 1. R execution orchestration
98
+
99
+ - Embeds R via `rpy2` with deterministic startup behavior
100
+ - Disables interactive and GUI-dependent hooks for headless execution
101
+ - Loads R scripts into isolated namespaces (not `globalenv()`)
102
+
103
+ ### 2. Project root inference and path stability
104
+
105
+ - Infers R project roots using markers such as:
106
+ `.git`, `.Rproj`, `renv.lock`, `DESCRIPTION`, `.here`
107
+ - Executes R code from the inferred project root regardless of Python CWD
108
+ - Preserves relative-path behavior expected by R scripts
109
+ - Supports R code using `here::here()` or project-local data
110
+
111
+ ### 3. Out-of-tree `renv` activation
112
+
113
+ - Activates `renv` projects located **outside** the calling Python directory
114
+ - Sources `.Rprofile` and `.Renviron` to reproduce R startup semantics
115
+ - Does not require R scripts and `renv` to live in the same directory
116
+
117
+ ### 4. Python ↔ R data conversion
118
+
119
+ - Converts Python scalars, lists, dicts, and pandas objects into R equivalents
120
+ - Converts R vectors, lists, and data.frames back into Python-native types
121
+ - Handles nested structures, missing values, and mixed types robustly
122
+
123
+ ### 5. Data normalization and diagnostics
124
+
125
+ - Post-processes R data.frames to fix dtype, timezone, and NA semantics
126
+ - Normalizes column types for reliable Python-side comparison
127
+ - Supports structured mismatch diagnostics between Python and R data
128
+
129
+ ### 6. Function invocation across scripts and packages
130
+
131
+ - Calls functions defined in sourced R scripts, base R, or installed packages
132
+ - Supports qualified function names (e.g. `stats::median`)
133
+ - Executes functions within the active project and library context
134
+
135
+ ---
136
+
137
+ ## Calling base R functions and managing packages
138
+
139
+ In addition to sourcing local R scripts, rpy-bridge supports calling functions
140
+ from base R and installed packages directly from Python.
141
+
142
+ Current support includes:
143
+
144
+ - Calling base R functions without a local R script
145
+ - Executing functions from installed R packages within the active environment
146
+
147
+ Planned extensions (roadmap):
148
+
149
+ - Programmatic installation of R packages into the active `renv` or system
150
+ environment when explicitly enabled
151
+ - Declarative package requirements at the Python call site
152
+ - Safe, opt-in package installation for CI and ephemeral environments
153
+
154
+ Package installation is intentionally **not automatic by default** to preserve
155
+ reproducibility and avoid side effects during execution.
156
+
157
+ ---
158
+
159
+ ## Installation
160
+
161
+ ### Prerequisites
162
+
163
+ - System R installed and available on `PATH`
164
+ - Python 3.11+
165
+
166
+ ### From PyPI
167
+
168
+ Install rpy-bridge with rpy2 for full R support:
169
+
170
+ ```bash
171
+ python3 -m pip install rpy-bridge rpy2
172
+ ```
173
+
174
+ Using `uv`:
175
+
176
+ ```bash
177
+ uv add rpy-bridge rpy2
178
+ ```
179
+
180
+ ### Development install
181
+
182
+ ```bash
183
+ python3 -m pip install -e .
184
+ ```
185
+
186
+ or:
187
+
188
+ ```bash
189
+ uv sync
190
+ ```
191
+
192
+ ### Required Python dependencies
193
+
194
+ - `rpy2`
195
+ - `pandas`
196
+ - `numpy`
197
+
198
+ ---
199
+
200
+ ## Usage
201
+
202
+ ### Call a function from a local R script
203
+
204
+ ```python
205
+ from pathlib import Path
206
+ from rpy_bridge import RFunctionCaller
207
+
208
+ project_dir = Path("/path/to/your-r-project")
209
+ script = project_dir / "scripts" / "example.R"
210
+
211
+ caller = RFunctionCaller(
212
+ path_to_renv=project_dir,
213
+ script_path=script,
214
+ )
215
+
216
+ result = caller.call("some_function", 42, named_arg="value")
217
+ ```
218
+
219
+ ### Call base R functions (no local script)
220
+
221
+ ```python
222
+ from rpy_bridge import RFunctionCaller
223
+
224
+ caller = RFunctionCaller(path_to_renv=None)
225
+
226
+ samples = caller.call("stats::rnorm", 10, mean=0, sd=1)
227
+ median_val = caller.call("stats::median", samples)
228
+ ```
229
+
230
+ ---
231
+
232
+ ## Round-trip Python ↔ R behavior
233
+
234
+ rpy-bridge attempts to convert Python objects to R and back. Most objects used in
235
+ scientific and ML pipelines round-trip cleanly, but some heterogeneous Python
236
+ structures may be wrapped or slightly altered due to differences in R’s type
237
+ system.
238
+
239
+ | Python type | Round-trip fidelity | Notes |
240
+ | ---------------------------------------------- | ------------------- | --------------------------------------------------------------------- |
241
+ | `int`, `float`, `bool`, `str` | High | Scalars convert directly |
242
+ | Homogeneous `list` of numbers/strings | High | Converted to atomic R vectors |
243
+ | Nested homogeneous lists | High | Converted to nested R lists |
244
+ | `pandas.DataFrame` / `pd.Series` | High | Converted to `data.frame` and normalized on return |
245
+ | Mixed-type `list` or `dict` | Partial | May be wrapped in single-element vectors |
246
+ | `None` / `pd.NA` | High | Converted to R `NULL` |
247
+
248
+ ---
249
+
250
+ ## R setup helpers
251
+
252
+ Helper scripts are provided in `examples/r-deps/` to prepare R environments.
253
+
254
+ - Install system R dependencies (macOS / Homebrew):
255
+
256
+ ```bash
257
+ bash examples/r-deps/install_r_dev_deps_homebrew.sh
258
+ ```
259
+
260
+ - Initialize an `renv` project:
261
+
262
+ ```r
263
+ source("examples/r-deps/setup_env.R")
264
+ ```
265
+
266
+ - Restore the environment on a new machine:
267
+
268
+ ```r
269
+ renv::restore()
270
+ ```
271
+
272
+ ---
273
+
274
+ ## Who this is for
275
+
276
+ rpy-bridge is designed for:
277
+
278
+ - Python-first pipelines that rely on mature R code
279
+ - Teams where R logic must remain authoritative
280
+ - CI or production systems that cannot rely on interactive R sessions
281
+ - Multi-repo or multi-directory projects with non-trivial filesystem layouts
282
+
283
+ It is **not** intended as a convenience wrapper for exploratory R usage.
284
+
285
+ ---
286
+
287
+ ## Licensing
288
+
289
+ - rpy-bridge is released under the MIT License © 2025 Victoria Cheung
290
+ - Depends on [`rpy2`](https://rpy2.github.io), licensed under the GNU GPL (v2 or later)
291
+
292
+ ---
293
+
294
+ ## Acknowledgements
295
+
296
+ This package was spun out of internal tooling I wrote at Revolution Medicines.
297
+ Thanks to the team there for supporting its open-source release.
@@ -0,0 +1,238 @@
1
+ # rpy-bridge
2
+
3
+ **rpy-bridge** is a Python-controlled **R execution orchestrator** that enables
4
+ Python code to run R functions, scripts, and packages with **reproducible
5
+ filesystem and environment semantics**.
6
+
7
+ It is built on top of `rpy2`, but unlike thin wrappers, rpy-bridge stabilizes how
8
+ R code is executed when invoked from Python: project roots are inferred, `renv`
9
+ environments can be activated out-of-tree, relative paths behave as expected,
10
+ and return values are normalized for safe Python consumption.
11
+
12
+ This makes rpy-bridge suitable for production pipelines, CI, and bilingual
13
+ Python/R teams where R code must run reliably outside an interactive R session.
14
+
15
+ **Latest release:** [`rpy-bridge` on PyPI](https://pypi.org/project/rpy-bridge/)
16
+
17
+ ---
18
+
19
+ ## What this is (and is not)
20
+
21
+ rpy-bridge **is not a thin rpy2 wrapper**.
22
+
23
+ Typical rpy2 usage assumes:
24
+ - the Python working directory is the R project root
25
+ - `renv` lives next to the executing script
26
+ - relative paths resolve correctly by default
27
+ - all R code executes in `globalenv()`
28
+
29
+ These assumptions break quickly in real-world Python workflows.
30
+
31
+ rpy-bridge instead provides a **controlled R runtime** with explicit guarantees
32
+ around execution context, filesystem behavior, and environment activation.
33
+
34
+ ---
35
+
36
+ ## Core capabilities
37
+
38
+ ### 1. R execution orchestration
39
+
40
+ - Embeds R via `rpy2` with deterministic startup behavior
41
+ - Disables interactive and GUI-dependent hooks for headless execution
42
+ - Loads R scripts into isolated namespaces (not `globalenv()`)
43
+
44
+ ### 2. Project root inference and path stability
45
+
46
+ - Infers R project roots using markers such as:
47
+ `.git`, `.Rproj`, `renv.lock`, `DESCRIPTION`, `.here`
48
+ - Executes R code from the inferred project root regardless of Python CWD
49
+ - Preserves relative-path behavior expected by R scripts
50
+ - Supports R code using `here::here()` or project-local data
51
+
52
+ ### 3. Out-of-tree `renv` activation
53
+
54
+ - Activates `renv` projects located **outside** the calling Python directory
55
+ - Sources `.Rprofile` and `.Renviron` to reproduce R startup semantics
56
+ - Does not require R scripts and `renv` to live in the same directory
57
+
58
+ ### 4. Python ↔ R data conversion
59
+
60
+ - Converts Python scalars, lists, dicts, and pandas objects into R equivalents
61
+ - Converts R vectors, lists, and data.frames back into Python-native types
62
+ - Handles nested structures, missing values, and mixed types robustly
63
+
64
+ ### 5. Data normalization and diagnostics
65
+
66
+ - Post-processes R data.frames to fix dtype, timezone, and NA semantics
67
+ - Normalizes column types for reliable Python-side comparison
68
+ - Supports structured mismatch diagnostics between Python and R data
69
+
70
+ ### 6. Function invocation across scripts and packages
71
+
72
+ - Calls functions defined in sourced R scripts, base R, or installed packages
73
+ - Supports qualified function names (e.g. `stats::median`)
74
+ - Executes functions within the active project and library context
75
+
76
+ ---
77
+
78
+ ## Calling base R functions and managing packages
79
+
80
+ In addition to sourcing local R scripts, rpy-bridge supports calling functions
81
+ from base R and installed packages directly from Python.
82
+
83
+ Current support includes:
84
+
85
+ - Calling base R functions without a local R script
86
+ - Executing functions from installed R packages within the active environment
87
+
88
+ Planned extensions (roadmap):
89
+
90
+ - Programmatic installation of R packages into the active `renv` or system
91
+ environment when explicitly enabled
92
+ - Declarative package requirements at the Python call site
93
+ - Safe, opt-in package installation for CI and ephemeral environments
94
+
95
+ Package installation is intentionally **not automatic by default** to preserve
96
+ reproducibility and avoid side effects during execution.
97
+
98
+ ---
99
+
100
+ ## Installation
101
+
102
+ ### Prerequisites
103
+
104
+ - System R installed and available on `PATH`
105
+ - Python 3.11+
106
+
107
+ ### From PyPI
108
+
109
+ Install rpy-bridge with rpy2 for full R support:
110
+
111
+ ```bash
112
+ python3 -m pip install rpy-bridge rpy2
113
+ ```
114
+
115
+ Using `uv`:
116
+
117
+ ```bash
118
+ uv add rpy-bridge rpy2
119
+ ```
120
+
121
+ ### Development install
122
+
123
+ ```bash
124
+ python3 -m pip install -e .
125
+ ```
126
+
127
+ or:
128
+
129
+ ```bash
130
+ uv sync
131
+ ```
132
+
133
+ ### Required Python dependencies
134
+
135
+ - `rpy2`
136
+ - `pandas`
137
+ - `numpy`
138
+
139
+ ---
140
+
141
+ ## Usage
142
+
143
+ ### Call a function from a local R script
144
+
145
+ ```python
146
+ from pathlib import Path
147
+ from rpy_bridge import RFunctionCaller
148
+
149
+ project_dir = Path("/path/to/your-r-project")
150
+ script = project_dir / "scripts" / "example.R"
151
+
152
+ caller = RFunctionCaller(
153
+ path_to_renv=project_dir,
154
+ script_path=script,
155
+ )
156
+
157
+ result = caller.call("some_function", 42, named_arg="value")
158
+ ```
159
+
160
+ ### Call base R functions (no local script)
161
+
162
+ ```python
163
+ from rpy_bridge import RFunctionCaller
164
+
165
+ caller = RFunctionCaller(path_to_renv=None)
166
+
167
+ samples = caller.call("stats::rnorm", 10, mean=0, sd=1)
168
+ median_val = caller.call("stats::median", samples)
169
+ ```
170
+
171
+ ---
172
+
173
+ ## Round-trip Python ↔ R behavior
174
+
175
+ rpy-bridge attempts to convert Python objects to R and back. Most objects used in
176
+ scientific and ML pipelines round-trip cleanly, but some heterogeneous Python
177
+ structures may be wrapped or slightly altered due to differences in R’s type
178
+ system.
179
+
180
+ | Python type | Round-trip fidelity | Notes |
181
+ | ---------------------------------------------- | ------------------- | --------------------------------------------------------------------- |
182
+ | `int`, `float`, `bool`, `str` | High | Scalars convert directly |
183
+ | Homogeneous `list` of numbers/strings | High | Converted to atomic R vectors |
184
+ | Nested homogeneous lists | High | Converted to nested R lists |
185
+ | `pandas.DataFrame` / `pd.Series` | High | Converted to `data.frame` and normalized on return |
186
+ | Mixed-type `list` or `dict` | Partial | May be wrapped in single-element vectors |
187
+ | `None` / `pd.NA` | High | Converted to R `NULL` |
188
+
189
+ ---
190
+
191
+ ## R setup helpers
192
+
193
+ Helper scripts are provided in `examples/r-deps/` to prepare R environments.
194
+
195
+ - Install system R dependencies (macOS / Homebrew):
196
+
197
+ ```bash
198
+ bash examples/r-deps/install_r_dev_deps_homebrew.sh
199
+ ```
200
+
201
+ - Initialize an `renv` project:
202
+
203
+ ```r
204
+ source("examples/r-deps/setup_env.R")
205
+ ```
206
+
207
+ - Restore the environment on a new machine:
208
+
209
+ ```r
210
+ renv::restore()
211
+ ```
212
+
213
+ ---
214
+
215
+ ## Who this is for
216
+
217
+ rpy-bridge is designed for:
218
+
219
+ - Python-first pipelines that rely on mature R code
220
+ - Teams where R logic must remain authoritative
221
+ - CI or production systems that cannot rely on interactive R sessions
222
+ - Multi-repo or multi-directory projects with non-trivial filesystem layouts
223
+
224
+ It is **not** intended as a convenience wrapper for exploratory R usage.
225
+
226
+ ---
227
+
228
+ ## Licensing
229
+
230
+ - rpy-bridge is released under the MIT License © 2025 Victoria Cheung
231
+ - Depends on [`rpy2`](https://rpy2.github.io), licensed under the GNU GPL (v2 or later)
232
+
233
+ ---
234
+
235
+ ## Acknowledgements
236
+
237
+ This package was spun out of internal tooling I wrote at Revolution Medicines.
238
+ Thanks to the team there for supporting its open-source release.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "rpy-bridge"
3
- version = "0.4.0"
3
+ version = "0.5.0"
4
4
  description = "Python-to-R interoperability engine with environment management, type-safe conversions, data normalization, and safe R function execution."
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -0,0 +1,14 @@
1
+ """
2
+ Public API for the rpy-bridge package.
3
+
4
+ `RFunctionCaller` is the primary entry point for loading R scripts and calling
5
+ functions. Other helpers are re-exported for compatibility.
6
+ """
7
+
8
+ from .core import RFunctionCaller
9
+ from .renv import activate_renv
10
+
11
+ __all__ = [
12
+ "activate_renv",
13
+ "RFunctionCaller",
14
+ ]
@@ -0,0 +1,106 @@
1
+ """
2
+ DataFrame comparison helpers used to validate parity between R and Python outputs.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from .dataframe import fix_r_dataframe_types, fix_string_nans
13
+
14
+
15
def _stringify_preserving_na(series: pd.Series) -> pd.Series:
    """Cast non-missing values of *series* to ``str`` while keeping missing values missing."""
    # A plain ``astype(str)`` would turn NA/NaN/None into the literal strings
    # "<NA>", "nan" and "None", which then compare as ordinary (and mutually
    # unequal) text instead of as missing values.
    return series.astype(str).where(series.notna(), pd.NA)


def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return copies of *df1* and *df2* whose shared columns have comparable dtypes.

    For every column name present in both frames:

    * empty strings are replaced by ``pd.NA``;
    * a numeric/object pair is parsed with ``pd.to_numeric(errors="coerce")``
      on both sides (unparseable entries become NaN);
    * a numeric/numeric pair is cast to ``float64`` on both sides;
    * otherwise, if either side is object-typed, both sides are converted to
      strings, with missing values left missing.

    Columns that exist in only one frame are returned unchanged. The input
    frames are not modified.
    """
    # Work on copies so the caller's frames are never mutated.
    df1, df2 = df1.copy(), df2.copy()
    is_numeric = pd.api.types.is_numeric_dtype
    is_object = pd.api.types.is_object_dtype

    for col in df1.columns.intersection(df2.columns):
        df1[col] = df1[col].replace("", pd.NA)
        df2[col] = df2[col].replace("", pd.NA)
        s1, s2 = df1[col], df2[col]
        dtype1, dtype2 = s1.dtype, s2.dtype

        mixed_pair = (is_numeric(dtype1) and is_object(dtype2)) or (
            is_object(dtype1) and is_numeric(dtype2)
        )
        if mixed_pair:
            try:
                parsed1 = pd.to_numeric(s1, errors="coerce")
                parsed2 = pd.to_numeric(s2, errors="coerce")
            except (TypeError, ValueError, OverflowError):
                # Best effort: values that cannot even be coerced fall
                # through to the string comparison path below.
                pass
            else:
                # Assign only once both sides parsed, so a failure never
                # leaves one side converted and the other not.
                df1[col], df2[col] = parsed1, parsed2
                continue

        if is_numeric(dtype1) and is_numeric(dtype2):
            df1[col] = s1.astype("float64")
            df2[col] = s2.astype("float64")
            continue

        if is_object(dtype1) or is_object(dtype2):
            df1[col] = _stringify_preserving_na(s1)
            df2[col] = _stringify_preserving_na(s2)

    return df1, df2
38
+
39
+
40
def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return copies of *df1* and *df2* with numeric-looking shared columns as ``float64``.

    For every column name present in both frames, empty strings are replaced
    by ``pd.NA`` and both sides are parsed with
    ``pd.to_numeric(errors="coerce")``. If at least one side yields any
    non-missing number, BOTH sides are stored as ``float64``; entries that
    could not be parsed become NaN. If neither side contains a parseable
    number, the column keeps its values (with empty strings replaced).

    Columns that exist in only one frame are returned unchanged. The input
    frames are not modified.
    """
    # Work on copies so the caller's frames are never mutated.
    df1, df2 = df1.copy(), df2.copy()

    for col in df1.columns.intersection(df2.columns):
        s1 = df1[col].replace("", pd.NA)
        s2 = df2[col].replace("", pd.NA)
        try:
            s1_num = pd.to_numeric(s1, errors="coerce")
            s2_num = pd.to_numeric(s2, errors="coerce")
            any_numeric = not s1_num.isna().all() or not s2_num.isna().all()
            if any_numeric:
                # Cast both sides before assigning so a failed cast cannot
                # leave the pair half-converted.
                s1_float = s1_num.astype("float64")
                s2_float = s2_num.astype("float64")
        except (TypeError, ValueError, OverflowError):
            # Best effort: a column that cannot be coerced is kept as-is.
            any_numeric = False

        if any_numeric:
            df1[col], df2[col] = s1_float, s2_float
        else:
            df1[col], df2[col] = s1, s2

    return df1, df2
54
+
55
+
56
def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8) -> dict:
    """Compare two DataFrames and report where they differ.

    *df2* is first passed through ``fix_r_dataframe_types``; both frames then
    go through ``fix_string_nans``, ``normalize_dtypes`` and
    ``align_numeric_dtypes`` before any comparison is made.

    Args:
        df1: First frame (reported under the ``"df1"`` key of each diff).
        df2: Second frame (reported under the ``"df2"`` key of each diff).
        float_tol: Absolute tolerance passed to ``numpy.isclose`` as ``atol``.
            ``numpy.isclose`` also applies its default relative tolerance, so
            large values may be treated as equal even when they differ by
            more than ``float_tol``.

    Returns:
        A dict with the keys:

        * ``"shape_mismatch"`` - True when the shapes differ.
        * ``"columns_mismatch"`` - True when the sets of column names differ.
        * ``"index_mismatch"`` - True when the row indexes differ.
        * ``"numeric_diffs"`` - ``{column: DataFrame}`` of numeric values that
          are not close.
        * ``"non_numeric_diffs"`` - ``{column: DataFrame}`` of other values
          that are unequal and not both missing.

        Only columns present in both frames are compared value by value.
        Shape, column and index mismatches are also printed to stdout.
    """
    results: dict[str, Any] = {
        "shape_mismatch": False,
        "columns_mismatch": False,
        "index_mismatch": False,
        "numeric_diffs": {},
        "non_numeric_diffs": {},
    }
    df2 = fix_r_dataframe_types(df2)
    df1 = fix_string_nans(df1)
    df2 = fix_string_nans(df2)
    df1, df2 = normalize_dtypes(df1.copy(), df2.copy())
    df1, df2 = align_numeric_dtypes(df1, df2)

    if df1.shape != df2.shape:
        results["shape_mismatch"] = True
        print(f"[Warning] Shape mismatch: df1 {df1.shape} vs df2 {df2.shape}")

    if set(df1.columns) != set(df2.columns):
        results["columns_mismatch"] = True
        print("[Warning] Column mismatch:")
        print(f"  df1: {df1.columns}")
        print(f"  df2: {df2.columns}")
        common_cols = df1.columns.intersection(df2.columns)
    else:
        common_cols = df1.columns

    # The result advertises an "index_mismatch" flag, so actually compute it.
    if not df1.index.equals(df2.index):
        results["index_mismatch"] = True
        print(f"[Warning] Index mismatch: df1 has {len(df1.index)} rows vs df2 has {len(df2.index)} rows")

    df1_aligned, df2_aligned = df1.loc[:, common_cols], df2.loc[:, common_cols]
    is_numeric = pd.api.types.is_numeric_dtype

    for col in common_cols:
        col_py, col_r = df1_aligned[col], df2_aligned[col]
        # Decide the comparison mode from the original dtypes, then align on
        # the row index so boolean masks built below fit both columns even
        # when the frames have different rows.
        both_numeric = is_numeric(col_py) and is_numeric(col_r)
        col_py, col_r = col_py.align(col_r)

        if both_numeric:
            close = np.isclose(
                col_py.fillna(np.nan),
                col_r.fillna(np.nan),
                atol=float_tol,
                equal_nan=True,
            )
            if not close.all():
                results["numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[~close], "df2": col_r[~close]}
                )
        else:
            both_na = col_py.isna() & col_r.isna()
            # A value missing on both sides counts as equal.
            unequal = ~col_py.eq(col_r) & ~both_na
            if unequal.any():
                results["non_numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[unequal], "df2": col_r[unequal]}
                )

    return results
104
+
105
+
106
+ __all__ = ["normalize_dtypes", "align_numeric_dtypes", "compare_r_py_dataframes"]