rpy-bridge 0.1.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rpy_bridge/__init__.py +2 -2
- rpy_bridge/rpy2_utils.py +521 -257
- rpy_bridge-0.3.1.dist-info/METADATA +251 -0
- rpy_bridge-0.3.1.dist-info/RECORD +8 -0
- rpy_bridge-0.1.0.dist-info/METADATA +0 -205
- rpy_bridge-0.1.0.dist-info/RECORD +0 -8
- {rpy_bridge-0.1.0.dist-info → rpy_bridge-0.3.1.dist-info}/WHEEL +0 -0
- {rpy_bridge-0.1.0.dist-info → rpy_bridge-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {rpy_bridge-0.1.0.dist-info → rpy_bridge-0.3.1.dist-info}/top_level.txt +0 -0
rpy_bridge/rpy2_utils.py
CHANGED
|
@@ -16,23 +16,12 @@ import warnings
|
|
|
16
16
|
warnings.filterwarnings("ignore", message="Environment variable .* redefined by R")
|
|
17
17
|
|
|
18
18
|
from pathlib import Path
|
|
19
|
+
import sys
|
|
20
|
+
import subprocess
|
|
19
21
|
|
|
22
|
+
import math
|
|
20
23
|
import numpy as np
|
|
21
24
|
import pandas as pd
|
|
22
|
-
import rpy2.robjects as ro
|
|
23
|
-
from rpy2 import robjects
|
|
24
|
-
from rpy2.rinterface_lib.sexp import NULLType
|
|
25
|
-
from rpy2.rlike.container import NamedList
|
|
26
|
-
from rpy2.robjects import pandas2ri
|
|
27
|
-
from rpy2.robjects.conversion import localconverter
|
|
28
|
-
from rpy2.robjects.vectors import (
|
|
29
|
-
BoolVector,
|
|
30
|
-
FloatVector,
|
|
31
|
-
IntVector,
|
|
32
|
-
ListVector,
|
|
33
|
-
StrVector,
|
|
34
|
-
)
|
|
35
|
-
from typing import Optional
|
|
36
25
|
|
|
37
26
|
try:
|
|
38
27
|
from loguru import logger # type: ignore
|
|
@@ -43,358 +32,619 @@ except Exception:
|
|
|
43
32
|
logger = logging.getLogger("rpy-bridge")
|
|
44
33
|
|
|
45
34
|
|
|
46
|
-
#
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
35
|
+
# ---------------------------------------------------------------------
|
|
36
|
+
# R detection and rpy2 installation
|
|
37
|
+
# ---------------------------------------------------------------------
|
|
38
|
+
def ensure_rpy2_installed(r_home: str):
    """
    Make sure ``rpy2`` is importable, installing it with pip if necessary.

    Sets the ``R_HOME`` environment variable to ``r_home`` before the import
    is attempted. If the import fails, ``rpy2`` is (re)installed into the
    running interpreter's environment with ``pip install --force-reinstall``
    and imported again.

    Args:
        r_home: Path of the R installation to export as ``R_HOME``.

    Raises:
        subprocess.CalledProcessError: If the pip installation fails.
        ImportError: If ``rpy2`` still cannot be imported after installation.
    """
    import importlib

    os.environ["R_HOME"] = r_home
    try:
        import rpy2  # noqa: F401
    except ImportError:
        logger.info(
            f"[Info] rpy2 not installed or incompatible with R_HOME={r_home}. Installing..."
        )
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--force-reinstall", "rpy2"]
        )
        # The package was installed after the interpreter started; without
        # this the import system's directory caches may not see it yet.
        importlib.invalidate_caches()
        import rpy2  # noqa: F401
|
|
51
50
|
|
|
52
|
-
Accepts either:
|
|
53
|
-
- Direct path to renv directory (e.g., /path/to/renv)
|
|
54
|
-
- Parent directory containing renv/ folder (e.g., /path/to/repos where renv/ is inside)
|
|
55
|
-
"""
|
|
56
51
|
|
|
57
|
-
|
|
52
|
+
def find_r_home():
    """
    Locate the R installation directory.

    The ``R`` executable on ``PATH`` is asked first (``cat(R.home())``). If R
    cannot be started, exits with an error, or prints nothing, a list of
    fallback locations is probed instead: an ``R_HOME`` environment variable
    that is already set, the historical hard-coded locations, and any
    versioned Homebrew / Windows installation found by globbing.

    Returns:
        The R home directory as a string, or ``None`` when nothing was found.
    """
    import glob

    try:
        r_home = subprocess.check_output(
            ["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
            stderr=subprocess.PIPE,
            text=True,
        ).strip()
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing or non-executable R; CalledProcessError
        # covers an R that starts but exits with a non-zero status.
        r_home = ""
    else:
        # Drop a trailing prompt character if one was echoed after the path.
        if r_home.endswith(">"):
            r_home = r_home[:-1].strip()
    if r_home:
        return r_home

    candidates = []
    env_home = os.environ.get("R_HOME")
    if env_home:
        candidates.append(env_home)
    candidates += [
        "/usr/lib/R",
        "/usr/local/lib/R",
        "/opt/homebrew/Cellar/r/4.5.2/lib/R",  # Homebrew macOS
        "C:\\Program Files\\R\\R-4.5.2",  # Windows
    ]
    # Versioned installs: do not depend on one specific R release.
    for pattern in (
        "/opt/homebrew/Cellar/r/*/lib/R",
        "/usr/local/Cellar/r/*/lib/R",
        "C:\\Program Files\\R\\R-*",
    ):
        # Lexicographically highest directory name first.
        candidates.extend(sorted(glob.glob(pattern), reverse=True))
    candidates.append("/Library/Frameworks/R.framework/Resources")

    for p in candidates:
        if os.path.exists(p):
            return p
    return None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Import-time setup: locate R, export R_HOME and make sure rpy2 is available.
R_HOME = find_r_home()
if not R_HOME:
    raise RuntimeError("R not found. Please install R or add it to PATH.")

logger.info(f"R_HOME = {R_HOME}")
os.environ["R_HOME"] = R_HOME
ensure_rpy2_installed(R_HOME)

# Dynamic library search path for R's shared libraries (macOS / Linux only).
if sys.platform == "darwin":
    _lib_var = "DYLD_FALLBACK_LIBRARY_PATH"
elif sys.platform.startswith("linux"):
    _lib_var = "LD_LIBRARY_PATH"
else:
    _lib_var = None

if _lib_var is not None:
    lib_path = os.path.join(R_HOME, "lib")
    # Drop empty entries: on Linux an empty LD_LIBRARY_PATH element is read
    # as the current working directory.
    _entries = [p for p in os.environ.get(_lib_var, "").split(os.pathsep) if p]
    # Compare whole entries (not substrings) so the path is added exactly
    # once, even when the module is imported or reloaded repeatedly.
    if lib_path not in _entries:
        os.environ[_lib_var] = os.pathsep.join([lib_path, *_entries])
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------
|
|
97
|
+
# Lazy rpy2 import machinery
|
|
98
|
+
# ---------------------------------------------------------------------
|
|
99
|
+
_RPY2: dict | None = None
|
|
100
|
+
|
|
58
101
|
|
|
59
|
-
|
|
102
|
+
def _require_rpy2(raise_on_missing: bool = True) -> dict | None:
|
|
103
|
+
global _RPY2
|
|
104
|
+
if _RPY2 is not None:
|
|
105
|
+
return _RPY2
|
|
106
|
+
|
|
107
|
+
try:
|
|
108
|
+
import rpy2.robjects as ro
|
|
109
|
+
from rpy2 import robjects
|
|
110
|
+
from rpy2.robjects import pandas2ri
|
|
111
|
+
from rpy2.robjects.conversion import localconverter
|
|
112
|
+
from rpy2.robjects.vectors import (
|
|
113
|
+
BoolVector,
|
|
114
|
+
FloatVector,
|
|
115
|
+
IntVector,
|
|
116
|
+
ListVector,
|
|
117
|
+
StrVector,
|
|
118
|
+
)
|
|
119
|
+
from rpy2.rinterface_lib.sexp import NULLType
|
|
120
|
+
from rpy2.rlike.container import NamedList
|
|
121
|
+
|
|
122
|
+
_RPY2 = {
|
|
123
|
+
"ro": ro,
|
|
124
|
+
"robjects": robjects,
|
|
125
|
+
"pandas2ri": pandas2ri,
|
|
126
|
+
"localconverter": localconverter,
|
|
127
|
+
"BoolVector": BoolVector,
|
|
128
|
+
"FloatVector": FloatVector,
|
|
129
|
+
"IntVector": IntVector,
|
|
130
|
+
"ListVector": ListVector,
|
|
131
|
+
"StrVector": StrVector,
|
|
132
|
+
"NULLType": NULLType,
|
|
133
|
+
"NamedList": NamedList,
|
|
134
|
+
}
|
|
135
|
+
return _RPY2
|
|
136
|
+
|
|
137
|
+
except ImportError as e:
|
|
138
|
+
if raise_on_missing:
|
|
139
|
+
raise RuntimeError(
|
|
140
|
+
"R support requires optional dependency `rpy2`. Install with: pip install rpy-bridge[r]"
|
|
141
|
+
) from e
|
|
142
|
+
return None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _ensure_rpy2() -> dict:
    """
    Return the cached rpy2 name table, importing rpy2 first if needed.

    Raises:
        RuntimeError: Propagated from ``_require_rpy2`` when rpy2 is missing.
    """
    global _RPY2
    if _RPY2 is not None:
        return _RPY2
    _RPY2 = _require_rpy2()
    return _RPY2
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------
|
|
153
|
+
# Activate renv
|
|
154
|
+
# ---------------------------------------------------------------------
|
|
155
|
+
def activate_renv(path_to_renv: Path) -> None:
    """
    Activate an renv project inside the embedded R session.

    ``path_to_renv`` may point either at the ``renv`` directory itself (it
    must then contain ``activate.R``) or at the project directory that
    contains ``renv/``. The project's ``.Renviron`` is exported through
    ``R_ENVIRON_USER`` and its ``.Rprofile`` is sourced when present. The
    ``renv`` R package is installed into ``renv/library`` if it cannot be
    loaded, and finally ``renv::load()`` is called on the project directory.

    Args:
        path_to_renv: Path to the renv directory or to its parent project.

    Raises:
        FileNotFoundError: If ``renv/activate.R`` or ``renv.lock`` is missing.
        RuntimeError: Propagated when rpy2 is not available.
    """

    def _r_str(value) -> str:
        # Render a Python value as a double-quoted R string literal.
        return '"' + str(value).replace("\\", "\\\\").replace('"', '\\"') + '"'

    r = _ensure_rpy2()
    robjects = r["robjects"]

    path_to_renv = path_to_renv.resolve()
    if path_to_renv.name == "renv" and (path_to_renv / "activate.R").exists():
        renv_dir = path_to_renv
        project_dir = path_to_renv.parent
    else:
        renv_dir = path_to_renv / "renv"
        project_dir = path_to_renv

    renv_activate = renv_dir / "activate.R"
    renv_lock = project_dir / "renv.lock"

    if not renv_activate.exists() or not renv_lock.exists():
        raise FileNotFoundError(f"[Error] renv environment incomplete: {path_to_renv}")

    renviron_file = project_dir / ".Renviron"
    if renviron_file.is_file():
        os.environ["R_ENVIRON_USER"] = str(renviron_file)
        logger.info(f"R_ENVIRON_USER set to: {renviron_file}")

    rprofile_file = project_dir / ".Rprofile"
    if rprofile_file.is_file():
        robjects.r(f"source({_r_str(rprofile_file.as_posix())})")
        logger.info(f".Rprofile sourced: {rprofile_file}")

    try:
        robjects.r("suppressMessages(library(renv))")
    except Exception:
        logger.info("Installing renv package in project library...")
        renv_lib = renv_dir / "library"
        # install.packages() needs an existing target directory.
        renv_lib.mkdir(parents=True, exist_ok=True)
        # POSIX-style separators keep the R string literal valid on Windows.
        lib_literal = _r_str(renv_lib.as_posix())
        robjects.r(
            f'install.packages("renv", repos="https://cloud.r-project.org", lib={lib_literal})'
        )
        # The project library is not on .libPaths() yet, so point library()
        # at the directory renv was just installed into.
        robjects.r(f"library(renv, lib.loc={lib_literal})")

    robjects.r(f"renv::load({_r_str(project_dir.as_posix())})")
    logger.info(f"renv environment loaded for project: {project_dir}")
|
|
104
194
|
|
|
105
195
|
|
|
106
|
-
#
|
|
196
|
+
# ---------------------------------------------------------------------
|
|
197
|
+
# RFunctionCaller
|
|
198
|
+
# ---------------------------------------------------------------------
|
|
107
199
|
class RFunctionCaller:
|
|
108
200
|
"""
|
|
109
|
-
|
|
110
|
-
"""
|
|
201
|
+
Utility to load and call R functions from a script, lazily loading rpy2 and activating renv.
|
|
111
202
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
raise FileNotFoundError(f"R script not found: {script_path}")
|
|
203
|
+
Supports:
|
|
204
|
+
- Scripts with custom functions
|
|
205
|
+
- Base R functions
|
|
206
|
+
- Functions in installed packages
|
|
207
|
+
- Automatic conversion of Python types (lists, dicts, scalars, pandas DataFrames) to R objects
|
|
208
|
+
"""
|
|
119
209
|
|
|
210
|
+
def __init__(
|
|
211
|
+
self,
|
|
212
|
+
path_to_renv: Path | None = None,
|
|
213
|
+
script_path: Path | None = None,
|
|
214
|
+
packages: list[str] | None = None,
|
|
215
|
+
):
|
|
120
216
|
self.path_to_renv = path_to_renv.resolve() if path_to_renv else None
|
|
217
|
+
self.script_path = script_path.resolve() if script_path else None
|
|
218
|
+
self.packages = packages or None
|
|
219
|
+
|
|
220
|
+
# Lazy-loaded attributes
|
|
221
|
+
self._r = None
|
|
222
|
+
self.ro = None
|
|
223
|
+
self.robjects = None
|
|
224
|
+
self.pandas2ri = None
|
|
225
|
+
self.localconverter = None
|
|
226
|
+
self.IntVector = None
|
|
227
|
+
self.FloatVector = None
|
|
228
|
+
self.BoolVector = None
|
|
229
|
+
self.StrVector = None
|
|
230
|
+
self.ListVector = None
|
|
231
|
+
self.NamedList = None
|
|
232
|
+
|
|
233
|
+
if self.script_path and not self.script_path.exists():
|
|
234
|
+
raise FileNotFoundError(f"R script not found: {self.script_path}")
|
|
235
|
+
|
|
236
|
+
self.script_dir = self.script_path.parent if self.script_path else None
|
|
237
|
+
self._script_loaded = False
|
|
238
|
+
self._renv_activated = False
|
|
239
|
+
self._packages_loaded = False
|
|
240
|
+
|
|
241
|
+
# -----------------------------------------------------------------
|
|
242
|
+
# Internal: lazy R loading
|
|
243
|
+
# -----------------------------------------------------------------
|
|
244
|
+
def _ensure_r_loaded(self):
|
|
245
|
+
if self._r is None:
|
|
246
|
+
r = _require_rpy2(raise_on_missing=True)
|
|
247
|
+
self._r = r
|
|
248
|
+
self.ro = r["ro"]
|
|
249
|
+
self.robjects = r["robjects"]
|
|
250
|
+
self.pandas2ri = r["pandas2ri"]
|
|
251
|
+
self.localconverter = r["localconverter"]
|
|
252
|
+
self.IntVector = r["IntVector"]
|
|
253
|
+
self.FloatVector = r["FloatVector"]
|
|
254
|
+
self.BoolVector = r["BoolVector"]
|
|
255
|
+
self.StrVector = r["StrVector"]
|
|
256
|
+
self.ListVector = r["ListVector"]
|
|
257
|
+
self.NamedList = r["NamedList"]
|
|
258
|
+
|
|
259
|
+
# Activate renv
|
|
260
|
+
if self.path_to_renv and not self._renv_activated:
|
|
261
|
+
activate_renv(self.path_to_renv)
|
|
262
|
+
self._renv_activated = True
|
|
121
263
|
|
|
122
|
-
|
|
123
|
-
self.
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
264
|
+
# Load packages
|
|
265
|
+
if self.packages and not self._packages_loaded:
|
|
266
|
+
for pkg in self.packages:
|
|
267
|
+
try:
|
|
268
|
+
self.robjects.r(f'suppressMessages(library("{pkg}"))')
|
|
269
|
+
except Exception:
|
|
270
|
+
logger.info(f"Package '{pkg}' not found. Installing...")
|
|
271
|
+
self.robjects.r(
|
|
272
|
+
f'install.packages("{pkg}", repos="https://cloud.r-project.org")'
|
|
273
|
+
)
|
|
274
|
+
self.robjects.r(f'suppressMessages(library("{pkg}"))')
|
|
275
|
+
self._packages_loaded = True
|
|
276
|
+
|
|
277
|
+
# Source script
|
|
278
|
+
if self.script_path and not self._script_loaded:
|
|
279
|
+
self.robjects.r(f'setwd("{self.script_dir.as_posix()}")')
|
|
280
|
+
self.robjects.r(f'source("{self.script_path.as_posix()}")')
|
|
281
|
+
logger.info(f"R script sourced: {self.script_path.name}")
|
|
282
|
+
self._script_loaded = True
|
|
283
|
+
|
|
284
|
+
def _clean_scalar(self, x):
|
|
128
285
|
"""
|
|
129
|
-
|
|
286
|
+
Clean R-style missing values to pandas/NumPy equivalents.
|
|
287
|
+
Called inside _r2py on each vector element; atomic/scalar only.
|
|
130
288
|
"""
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
else:
|
|
134
|
-
logger.info("No renv path provided; using base or current environment.")
|
|
289
|
+
r = self._r
|
|
290
|
+
ro = r["robjects"]
|
|
135
291
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
robjects.r(f'source("{self.script_path.as_posix()}")')
|
|
139
|
-
logger.info("R script sourced: {}", self.script_path.name)
|
|
292
|
+
if x is None:
|
|
293
|
+
return None
|
|
140
294
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
295
|
+
if x in (
|
|
296
|
+
getattr(ro, "NA_Real", None),
|
|
297
|
+
getattr(ro, "NA_Integer", None),
|
|
298
|
+
getattr(ro, "NA_Logical", None),
|
|
299
|
+
):
|
|
300
|
+
return None
|
|
145
301
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
- NamedList or ListVector
|
|
149
|
-
- Nested lists with data.frames inside
|
|
150
|
-
"""
|
|
302
|
+
if x is getattr(ro, "NA_Character", None):
|
|
303
|
+
return None
|
|
151
304
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
if isinstance(obj, pd.DataFrame):
|
|
155
|
-
return postprocess_r_dataframe(obj)
|
|
305
|
+
if isinstance(x, float) and np.isnan(x):
|
|
306
|
+
return None
|
|
156
307
|
|
|
157
|
-
|
|
158
|
-
elif isinstance(obj, dict):
|
|
159
|
-
return {k: _recursive_postprocess(v) for k, v in obj.items()}
|
|
308
|
+
return x
|
|
160
309
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
310
|
+
# -----------------------------------------------------------------
|
|
311
|
+
# Python -> R conversion
|
|
312
|
+
# -----------------------------------------------------------------
|
|
313
|
+
def _py2r(self, obj):
|
|
314
|
+
"""
|
|
315
|
+
Convert Python objects to R objects robustly.
|
|
316
|
+
Handles scalars, None/pd.NA, lists, dicts, and pandas DataFrames.
|
|
317
|
+
"""
|
|
318
|
+
self._ensure_r_loaded()
|
|
319
|
+
robjects = self.robjects
|
|
320
|
+
pandas2ri = self.pandas2ri
|
|
321
|
+
IntVector = self.IntVector
|
|
322
|
+
FloatVector = self.FloatVector
|
|
323
|
+
BoolVector = self.BoolVector
|
|
324
|
+
StrVector = self.StrVector
|
|
325
|
+
ListVector = self.ListVector
|
|
326
|
+
localconverter = self.localconverter
|
|
327
|
+
import pandas as pd
|
|
328
|
+
import rpy2.robjects.vectors as rvec
|
|
329
|
+
|
|
330
|
+
# Pass through existing R objects
|
|
331
|
+
if isinstance(
|
|
332
|
+
obj,
|
|
333
|
+
(
|
|
334
|
+
rvec.IntVector,
|
|
335
|
+
rvec.FloatVector,
|
|
336
|
+
rvec.BoolVector,
|
|
337
|
+
rvec.StrVector,
|
|
338
|
+
rvec.ListVector,
|
|
339
|
+
robjects.DataFrame,
|
|
340
|
+
),
|
|
341
|
+
):
|
|
342
|
+
return obj
|
|
164
343
|
|
|
165
|
-
|
|
344
|
+
with localconverter(robjects.default_converter + pandas2ri.converter):
|
|
345
|
+
if obj is None or obj is pd.NA:
|
|
346
|
+
return robjects.NULL
|
|
166
347
|
|
|
348
|
+
# DataFrame → data.frame
|
|
349
|
+
if isinstance(obj, pd.DataFrame):
|
|
350
|
+
return pandas2ri.py2rpy(obj)
|
|
351
|
+
|
|
352
|
+
# Series → vector
|
|
353
|
+
if isinstance(obj, pd.Series):
|
|
354
|
+
return self._py2r(obj.tolist())
|
|
355
|
+
|
|
356
|
+
# Scalars
|
|
357
|
+
if isinstance(obj, (int, float, bool, str)):
|
|
358
|
+
return obj
|
|
359
|
+
|
|
360
|
+
# Lists
|
|
361
|
+
if isinstance(obj, list):
|
|
362
|
+
if len(obj) == 0:
|
|
363
|
+
return FloatVector([])
|
|
364
|
+
elif all(isinstance(x, (int, float)) or x is None for x in obj):
|
|
365
|
+
return FloatVector(
|
|
366
|
+
[robjects.NA_Real if x is None else float(x) for x in obj]
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
def is_na(x):
|
|
370
|
+
return (
|
|
371
|
+
x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
# Homogeneous numeric
|
|
375
|
+
if all(
|
|
376
|
+
isinstance(x, (int, float)) and not isinstance(x, bool) or is_na(x)
|
|
377
|
+
for x in obj
|
|
378
|
+
):
|
|
379
|
+
return FloatVector(
|
|
380
|
+
[robjects.NA_Real if is_na(x) else float(x) for x in obj]
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
# Homogeneous bool
|
|
384
|
+
if all(isinstance(x, bool) or is_na(x) for x in obj):
|
|
385
|
+
return BoolVector(
|
|
386
|
+
[robjects.NA_Logical if is_na(x) else x for x in obj]
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
# Homogeneous str
|
|
390
|
+
if all(isinstance(x, str) or is_na(x) for x in obj):
|
|
391
|
+
return StrVector(
|
|
392
|
+
[robjects.NA_Character if is_na(x) else x for x in obj]
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
# Mixed or nested list → ListVector with positional keys
|
|
396
|
+
return ListVector({str(i): self._py2r(v) for i, v in enumerate(obj)})
|
|
397
|
+
|
|
398
|
+
# Dict → NamedList
|
|
399
|
+
if isinstance(obj, dict):
|
|
400
|
+
return ListVector({k: self._py2r(v) for k, v in obj.items()})
|
|
401
|
+
|
|
402
|
+
raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
|
|
403
|
+
|
|
404
|
+
# -----------------------------------------------------------------
|
|
405
|
+
# R -> Python conversion
|
|
406
|
+
# -----------------------------------------------------------------
|
|
407
|
+
def _r2py(self, obj, top_level=True):
|
|
408
|
+
"""
|
|
409
|
+
Convert R objects to Python objects robustly.
|
|
410
|
+
Handles DataFrames, NamedList/ListVector, atomic vectors, and NULL.
|
|
411
|
+
"""
|
|
412
|
+
r = self._r
|
|
413
|
+
robjects = self.robjects
|
|
414
|
+
NamedList = self.NamedList
|
|
415
|
+
ListVector = self.ListVector
|
|
416
|
+
StrVector = self.StrVector
|
|
417
|
+
IntVector = self.IntVector
|
|
418
|
+
FloatVector = self.FloatVector
|
|
419
|
+
BoolVector = self.BoolVector
|
|
420
|
+
NULLType = r["NULLType"]
|
|
421
|
+
lc = self.localconverter
|
|
422
|
+
pandas2ri = self.pandas2ri
|
|
423
|
+
|
|
424
|
+
if isinstance(obj, NULLType):
|
|
425
|
+
return None
|
|
426
|
+
|
|
427
|
+
if isinstance(obj, robjects.DataFrame):
|
|
428
|
+
with lc(robjects.default_converter + pandas2ri.converter):
|
|
429
|
+
df = robjects.conversion.rpy2py(obj)
|
|
430
|
+
df = postprocess_r_dataframe(df)
|
|
431
|
+
df = clean_r_missing(df, caller=self)
|
|
432
|
+
return df
|
|
433
|
+
|
|
434
|
+
if isinstance(obj, (NamedList, ListVector)):
|
|
435
|
+
py_obj = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
|
|
436
|
+
# Auto-unpack single-element lists only at top-level
|
|
437
|
+
if isinstance(py_obj, list) and len(py_obj) == 1 and top_level:
|
|
438
|
+
return py_obj[0]
|
|
439
|
+
return py_obj
|
|
440
|
+
|
|
441
|
+
if isinstance(obj, (StrVector, IntVector, FloatVector, BoolVector)):
|
|
442
|
+
py_list = [self._clean_scalar(v) for v in obj]
|
|
443
|
+
if len(py_list) == 1 and top_level:
|
|
444
|
+
return py_list[0]
|
|
445
|
+
return py_list
|
|
446
|
+
|
|
447
|
+
return self._clean_scalar(obj)
|
|
448
|
+
|
|
449
|
+
# -----------------------------------------------------------------
|
|
450
|
+
# Public: ensure R package is available
|
|
451
|
+
# -----------------------------------------------------------------
|
|
452
|
+
def ensure_r_package(self, pkg_name: str):
|
|
453
|
+
r = self.robjects.r
|
|
167
454
|
try:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
r_kwargs = {k: robjects.conversion.py2rpy(v) for k, v in kwargs.items()}
|
|
173
|
-
result = r_func(*r_args, **r_kwargs)
|
|
455
|
+
r(f'suppressMessages(library("{pkg_name}", character.only=TRUE))')
|
|
456
|
+
except Exception:
|
|
457
|
+
r(f'install.packages("{pkg_name}", repos="https://cloud.r-project.org")')
|
|
458
|
+
r(f'suppressMessages(library("{pkg_name}", character.only=TRUE))')
|
|
174
459
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
460
|
+
# -----------------------------------------------------------------
|
|
461
|
+
# Public: call an R function
|
|
462
|
+
# -----------------------------------------------------------------
|
|
463
|
+
def call(self, func_name: str, *args, **kwargs):
|
|
464
|
+
"""
|
|
465
|
+
Call an R function safely. Supports:
|
|
466
|
+
- functions defined in scripts
|
|
467
|
+
- base R functions
|
|
468
|
+
- functions in loaded packages
|
|
469
|
+
"""
|
|
470
|
+
self._ensure_r_loaded()
|
|
178
471
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
472
|
+
# --- Find the function ---
|
|
473
|
+
try:
|
|
474
|
+
func = self.robjects.globalenv[func_name] # script-defined
|
|
475
|
+
except KeyError:
|
|
476
|
+
try:
|
|
477
|
+
func = self.robjects.r[func_name] # base or package function
|
|
478
|
+
except KeyError:
|
|
479
|
+
raise ValueError(f"R function '{func_name}' not found.")
|
|
182
480
|
|
|
183
|
-
|
|
184
|
-
|
|
481
|
+
# --- Convert Python args to R ---
|
|
482
|
+
r_args = [self._py2r(a) for a in args]
|
|
483
|
+
r_kwargs = {k: self._py2r(v) for k, v in kwargs.items()}
|
|
185
484
|
|
|
186
|
-
|
|
187
|
-
|
|
485
|
+
# --- Call safely ---
|
|
486
|
+
try:
|
|
487
|
+
result = func(*r_args, **r_kwargs)
|
|
188
488
|
except Exception as e:
|
|
189
|
-
raise RuntimeError(f"Error calling R function '{
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
cls,
|
|
194
|
-
repo: str,
|
|
195
|
-
file_path: str,
|
|
196
|
-
ref: str = "main",
|
|
197
|
-
token: Optional[str] = None,
|
|
198
|
-
cache_dir: Optional[Path] = None,
|
|
199
|
-
path_to_renv: Optional[Path] = None,
|
|
200
|
-
trust_remote_code: bool = False,
|
|
201
|
-
require_token: bool = False,
|
|
202
|
-
) -> "RFunctionCaller | Path":
|
|
203
|
-
"""
|
|
204
|
-
Download an R script from a GitHub repository and construct an RFunctionCaller.
|
|
205
|
-
|
|
206
|
-
Args:
|
|
207
|
-
repo: repository in the form "owner/repo".
|
|
208
|
-
file_path: path to the R script inside the repo (e.g. "scripts/my.R").
|
|
209
|
-
ref: branch name, tag or commit SHA. Defaults to "main".
|
|
210
|
-
token: optional GitHub token for private repos. If None, looks at
|
|
211
|
-
environment variables `GITHUB_TOKEN` or `GH_TOKEN`.
|
|
212
|
-
cache_dir: optional directory to cache downloaded files. Defaults to
|
|
213
|
-
`~/.cache/rpy-bridge`.
|
|
214
|
-
path_to_renv: optional path to renv or project directory to use.
|
|
215
|
-
trust_remote_code: MUST be True to execute remote code. If False,
|
|
216
|
-
the function will only return the local cached path.
|
|
217
|
-
|
|
218
|
-
Returns:
|
|
219
|
-
If `trust_remote_code` is True, returns an `RFunctionCaller` instance
|
|
220
|
-
ready to call functions from the downloaded script. Otherwise returns
|
|
221
|
-
the `Path` to the cached script so the caller can inspect it first.
|
|
222
|
-
"""
|
|
223
|
-
raise NotImplementedError(
|
|
224
|
-
"RFunctionCaller.from_github was removed. Clone repositories locally and pass a local script_path to RFunctionCaller instead."
|
|
225
|
-
)
|
|
489
|
+
raise RuntimeError(f"Error calling R function '{func_name}': {e}")
|
|
490
|
+
|
|
491
|
+
# --- Convert R result back to Python ---
|
|
492
|
+
return self._r2py(result)
|
|
226
493
|
|
|
227
494
|
|
|
228
495
|
# %%
|
|
229
|
-
|
|
496
|
+
# ------------------------------
|
|
497
|
+
# Utility functions for R ↔ Python
|
|
498
|
+
# ------------------------------
|
|
499
|
+
def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
    """
    Recursively convert an R NamedList or ListVector to a Python dict or list.

    Uses the caller._r2py method for nested conversions.

    - A list without names, or whose names are exactly "0", "1", ... (the
      positional keys produced when a Python list is sent to R), becomes a
      Python list.
    - Any other list becomes a dict keyed by element name; elements with a
      missing or empty name are keyed by their position.
    - Objects that are not R lists are handed to ``caller._r2py`` unchanged.

    Args:
        namedlist: The R object to convert.
        caller: RFunctionCaller whose ``_r2py`` converts the elements.
        top_level: Forwarded to ``caller._r2py`` for non-list inputs only;
            list elements are always converted with ``top_level=False``.
    """
    r = _ensure_rpy2()
    NamedList = r["NamedList"]
    ListVector = r["ListVector"]

    if not isinstance(namedlist, (NamedList, ListVector)):
        # Fallback: scalar/vector at the very top
        return caller._r2py(namedlist, top_level=top_level)

    names = namedlist.names if not callable(namedlist.names) else namedlist.names()
    # R NULL (no names attribute) is falsy; normalise it to an empty list.
    name_list = list(names) if names else []

    # Detect positional (unnamed) list
    positional = not name_list or all(
        str(i) == str(name) for i, name in enumerate(name_list)
    )
    if positional:
        # Nested elements are never top-level
        return [caller._r2py(v, top_level=False) for v in namedlist]

    # Otherwise dict
    result = {}
    for i, val in enumerate(namedlist):
        name = name_list[i] if i < len(name_list) else None
        # Partially named lists carry "" for unnamed elements; use the
        # position so those elements do not overwrite each other.
        key = str(name) if name is not None and str(name) != "" else str(i)
        result[key] = caller._r2py(val, top_level=False)  # nested elements
    return result
|
|
279
530
|
|
|
280
531
|
|
|
281
|
-
|
|
282
|
-
def clean_r_dataframe(r_df: object) -> object:
|
|
532
|
+
def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
    """
    Strip the ``.groups`` and ``.rows`` entries from ``r_df.attrs``.

    Missing entries, and objects without an ``attrs`` mapping, are silently
    ignored. The object is modified in place and returned.
    """
    for name in (".groups", ".rows"):
        try:
            r_df.attrs.pop(name)
        except (KeyError, AttributeError):
            continue
    return r_df
|
|
293
542
|
|
|
294
543
|
|
|
295
|
-
# %%
|
|
296
544
|
def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which textual NA markers are real missing values.

    The strings "nan", "NaN", "NA", "na" and the empty string are replaced
    with ``pd.NA``.
    """
    na_tokens = ["nan", "NaN", "NA", "na", ""]
    cleaned = df.replace(na_tokens, pd.NA)
    return cleaned
|
|
321
549
|
|
|
322
550
|
|
|
323
|
-
# %%
|
|
324
551
|
def normalize_single_df_dtypes(
    df: pd.DataFrame, min_numeric_fraction: float = 0.5
) -> pd.DataFrame:
    """
    Normalize dtypes in a single DataFrame after R conversion.

    Steps:
    1. Replace "", "nan", "NaN", "NA" and "na" with ``pd.NA`` (on a copy; the
       input frame is not modified).
    2. Convert an object column to numeric when at least
       ``min_numeric_fraction`` of its non-missing values parse as numbers.
       Values that do not parse become NaN, so with the default of 0.5 up to
       half of the non-missing entries may be lost; pass 1.0 to convert only
       columns that are entirely numeric.
    3. Cast integer columns that contain missing values to ``float64``.

    Args:
        df: DataFrame to normalize.
        min_numeric_fraction: Minimum share of non-missing values that must be
            numeric for an object column to be converted.

    Returns:
        The normalized DataFrame.
    """
    df = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)

    for col in df.columns:
        series = df[col]
        if pd.api.types.is_object_dtype(series):
            coerced = pd.to_numeric(series, errors="coerce")
            if coerced.notna().sum() >= series.notna().sum() * min_numeric_fraction:
                df[col] = coerced
        if pd.api.types.is_integer_dtype(df[col]) and df[col].isna().any():
            df[col] = df[col].astype("float64")
    return df
|
|
343
566
|
|
|
344
567
|
|
|
345
|
-
# %%
|
|
346
568
|
def fix_r_dataframe_types(
    df: pd.DataFrame,
    *,
    convert_dates: bool = True,
    date_range: tuple[int, int] = (10000, 40000),
) -> pd.DataFrame:
    """
    Post-process R DataFrame:
    - Convert R NA_integer_ sentinel (-2147483648) to pd.NA
    - Convert R-style numeric dates to datetime
    - Remove timezone from datetime columns

    The date conversion is a heuristic: a numeric column is treated as
    "days since 1970-01-01" when every non-missing value lies inside
    ``date_range``. Ordinary numbers in that range are converted as well, so
    pass ``convert_dates=False`` when that is not wanted.

    Args:
        df: DataFrame to fix. A copy is processed; the input is not modified.
        convert_dates: Enable the day-count to datetime heuristic.
        date_range: Inclusive (low, high) bounds used by the heuristic.

    Returns:
        The fixed DataFrame.
    """
    df = df.copy()
    low, high = date_range

    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = df[col].mask(df[col] == -2147483648, pd.NA)

        # Re-read the column so the checks below see the masked values
        # rather than the raw sentinel.
        series = df[col]

        if convert_dates and pd.api.types.is_numeric_dtype(series):
            values = series.dropna()
            if not values.empty and values.between(low, high).all():
                try:
                    df[col] = pd.to_datetime("1970-01-01") + pd.to_timedelta(
                        series, unit="D"
                    )
                except Exception:
                    # Best effort: leave the column numeric if it cannot be
                    # interpreted as day counts.
                    pass

        # isinstance check replaces the deprecated is_datetime64tz_dtype().
        if isinstance(df[col].dtype, pd.DatetimeTZDtype):
            df[col] = df[col].dt.tz_localize(None)

    return df
|
|
376
595
|
|
|
377
596
|
|
|
378
|
-
# %%
|
|
379
597
|
def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the standard clean-up pipeline on a DataFrame converted from R.

    The frame is passed, in order, through:
    - fix_r_dataframe_types (type corrections)
    - fix_string_nans (string NA normalization)
    - normalize_single_df_dtypes (dtype normalization)

    Afterwards, an object-dtype index whose labels parse as the integers
    1..len(df) is replaced with a zero-based RangeIndex. Any failure while
    inspecting the index is ignored and the index is left unchanged.
    """
    typed = fix_r_dataframe_types(df)
    cleaned = fix_string_nans(typed)
    df = normalize_single_df_dtypes(cleaned)

    if df.index.dtype != object:
        return df

    try:
        labels_as_int = df.index.astype(int)
        n_rows = len(df)
        one_based = np.arange(1, n_rows + 1)
        if (labels_as_int == one_based).all():
            df.index = pd.RangeIndex(start=0, stop=n_rows)
    except Exception:
        pass
    return df
|
|
393
616
|
|
|
394
617
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
618
|
+
def clean_r_missing(obj, caller: RFunctionCaller):
    """
    Post-process R return objects for downstream Python use.

    Recursively convert R-style missing values to pandas/NumPy:
    - NA_integer_, NA_real_, NA_logical_ → np.nan
    - NA_character_ → pd.NA

    DataFrames are processed cell by cell and modified in place; dicts and
    lists are rebuilt with their values/items converted; any other object
    is returned as-is unless it is one of the NA sentinels.

    `caller` is accepted for interface compatibility and is not used by
    the conversion itself.
    """
    ro = _ensure_rpy2()["robjects"]

    # Build the sentinel table once per top-level call instead of once per
    # visited element. Sentinels that this rpy2 build does not expose are
    # skipped so that a plain ``None`` leaf is never mistaken for an NA.
    na_map = {}
    for attr_name, replacement in (
        ("NA_Real", np.nan),
        ("NA_Integer", np.nan),
        ("NA_Logical", np.nan),
        ("NA_Character", pd.NA),
    ):
        sentinel = getattr(ro, attr_name, None)
        if sentinel is not None:
            na_map[sentinel] = replacement

    return _replace_r_missing(obj, na_map)


def _replace_r_missing(obj, na_map):
    """Recursively swap R NA sentinels found in `obj` using `na_map`."""
    if isinstance(obj, pd.DataFrame):
        for col in obj.columns:
            obj[col] = obj[col].apply(
                lambda cell: _replace_r_missing(cell, na_map)
            )
        return obj

    if isinstance(obj, dict):
        return {key: _replace_r_missing(val, na_map) for key, val in obj.items()}

    if isinstance(obj, list):
        return [_replace_r_missing(item, na_map) for item in obj]

    try:
        return na_map.get(obj, obj)
    except TypeError:
        # Unhashable leaves (arrays, sets, Series, ...) cannot be one of
        # the NA sentinels, so pass them through unchanged.
        return obj
|
|
398
648
|
|
|
399
649
|
|
|
400
650
|
# %%
|
|
@@ -404,7 +654,9 @@ def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
404
654
|
# -------------------------------------------
|
|
405
655
|
|
|
406
656
|
|
|
407
|
-
def normalize_dtypes(
|
|
657
|
+
def normalize_dtypes(
|
|
658
|
+
df1: pd.DataFrame, df2: pd.DataFrame
|
|
659
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
408
660
|
"""
|
|
409
661
|
Aligns column dtypes across two DataFrames for accurate comparison.
|
|
410
662
|
- Replaces empty strings with pd.NA.
|
|
@@ -420,8 +672,12 @@ def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame
|
|
|
420
672
|
dtype1, dtype2 = s1.dtype, s2.dtype
|
|
421
673
|
|
|
422
674
|
# If one is numeric and the other is object, try coercing both to numeric
|
|
423
|
-
if (
|
|
424
|
-
pd.api.types.
|
|
675
|
+
if (
|
|
676
|
+
pd.api.types.is_numeric_dtype(dtype1)
|
|
677
|
+
and pd.api.types.is_object_dtype(dtype2)
|
|
678
|
+
) or (
|
|
679
|
+
pd.api.types.is_object_dtype(dtype1)
|
|
680
|
+
and pd.api.types.is_numeric_dtype(dtype2)
|
|
425
681
|
):
|
|
426
682
|
try:
|
|
427
683
|
df1[col] = pd.to_numeric(s1, errors="coerce")
|
|
@@ -431,7 +687,9 @@ def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame
|
|
|
431
687
|
pass # fallback to next block if coercion fails
|
|
432
688
|
|
|
433
689
|
# If both are numeric but of different types (e.g., int vs float), unify to float64
|
|
434
|
-
if pd.api.types.is_numeric_dtype(dtype1) and pd.api.types.is_numeric_dtype(
|
|
690
|
+
if pd.api.types.is_numeric_dtype(dtype1) and pd.api.types.is_numeric_dtype(
|
|
691
|
+
dtype2
|
|
692
|
+
):
|
|
435
693
|
df1[col] = df1[col].astype("float64")
|
|
436
694
|
df2[col] = df2[col].astype("float64")
|
|
437
695
|
continue
|
|
@@ -445,7 +703,9 @@ def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame
|
|
|
445
703
|
|
|
446
704
|
|
|
447
705
|
# %%
|
|
448
|
-
def align_numeric_dtypes(
|
|
706
|
+
def align_numeric_dtypes(
|
|
707
|
+
df1: pd.DataFrame, df2: pd.DataFrame
|
|
708
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
449
709
|
"""
|
|
450
710
|
Ensure aligned numeric dtypes between two DataFrames for accurate comparison.
|
|
451
711
|
Converts between int, float, and numeric-looking strings where appropriate.
|
|
@@ -479,7 +739,9 @@ def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataF
|
|
|
479
739
|
|
|
480
740
|
|
|
481
741
|
# %%
|
|
482
|
-
def compare_r_py_dataframes(
|
|
742
|
+
def compare_r_py_dataframes(
|
|
743
|
+
df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8
|
|
744
|
+
) -> dict:
|
|
483
745
|
"""
|
|
484
746
|
Compare a Python DataFrame (df1) with an R DataFrame converted to pandas (df2).
|
|
485
747
|
|
|
@@ -530,7 +792,9 @@ def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: flo
|
|
|
530
792
|
col_py = df1_aligned[col]
|
|
531
793
|
col_r = df2_aligned[col]
|
|
532
794
|
|
|
533
|
-
if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(
|
|
795
|
+
if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(
|
|
796
|
+
col_r
|
|
797
|
+
):
|
|
534
798
|
col_py, col_r = col_py.align(col_r)
|
|
535
799
|
|
|
536
800
|
close = np.isclose(
|