rpy-bridge 0.3.9__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rpy_bridge/__init__.py +4 -28
- rpy_bridge/compare.py +106 -0
- rpy_bridge/convert.py +63 -0
- rpy_bridge/core.py +505 -0
- rpy_bridge/dataframe.py +74 -0
- rpy_bridge/env.py +108 -0
- rpy_bridge/logging.py +50 -0
- rpy_bridge/renv.py +149 -0
- rpy_bridge/rpy2_loader.py +71 -0
- rpy_bridge-0.5.0.dist-info/METADATA +297 -0
- rpy_bridge-0.5.0.dist-info/RECORD +15 -0
- rpy_bridge/rpy2_utils.py +0 -1052
- rpy_bridge-0.3.9.dist-info/METADATA +0 -258
- rpy_bridge-0.3.9.dist-info/RECORD +0 -8
- {rpy_bridge-0.3.9.dist-info → rpy_bridge-0.5.0.dist-info}/WHEEL +0 -0
- {rpy_bridge-0.3.9.dist-info → rpy_bridge-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {rpy_bridge-0.3.9.dist-info → rpy_bridge-0.5.0.dist-info}/top_level.txt +0 -0
rpy_bridge/rpy2_utils.py
DELETED
|
@@ -1,1052 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
R–Python Integration Utility
|
|
3
|
-
|
|
4
|
-
Provides tools to load R scripts, activate renv environments, and call R functions
|
|
5
|
-
directly from Python, with automatic conversion between R and Python data types.
|
|
6
|
-
|
|
7
|
-
----------
|
|
8
|
-
Requirements
|
|
9
|
-
----------
|
|
10
|
-
- R must be installed and accessible in your system environment.
|
|
11
|
-
- Ensure compatibility with your R project's renv setup (or any other R environment you use).
|
|
12
|
-
|
|
13
|
-
Features
|
|
14
|
-
----------
|
|
15
|
-
- Lazy loading of rpy2 and R runtime.
|
|
16
|
-
- Activation of renv environments for isolated R project dependencies.
|
|
17
|
-
- Support for sourcing individual R scripts or directories of scripts.
|
|
18
|
-
- Namespace-based access to R functions.
|
|
19
|
-
- Automatic conversion between R vectors, data frames, and Python types (pandas, lists, scalars).
|
|
20
|
-
- Utilities for cleaning and aligning data frames between R and Python.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
# ruff: noqa: E402
|
|
24
|
-
# %%
|
|
25
|
-
# Import libraries
|
|
26
|
-
import importlib.util
|
|
27
|
-
import os
|
|
28
|
-
import subprocess
|
|
29
|
-
import sys
|
|
30
|
-
import warnings
|
|
31
|
-
from pathlib import Path
|
|
32
|
-
from typing import TYPE_CHECKING, Any, Iterable
|
|
33
|
-
|
|
34
|
-
import numpy as np
|
|
35
|
-
import pandas as pd
|
|
36
|
-
|
|
37
|
-
warnings.filterwarnings("ignore", message="Environment variable .* redefined by R")
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
if TYPE_CHECKING:
|
|
41
|
-
import logging as logging_module
|
|
42
|
-
|
|
43
|
-
from loguru import Logger as LoguruLogger
|
|
44
|
-
|
|
45
|
-
LoggerType = LoguruLogger | logging_module.Logger
|
|
46
|
-
|
|
47
|
-
else:
|
|
48
|
-
LoggerType = None # runtime doesn’t need the type object
|
|
49
|
-
|
|
50
|
-
import logging
|
|
51
|
-
|
|
52
|
-
class _StdlibLoggerAdapter:
    """Expose the loguru calls used in this module on top of stdlib logging.

    Only ``opt``, ``bind``, ``remove``, ``add`` and the level methods are
    provided. Messages passed with positional arguments are formatted with
    ``str.format`` (``{}`` placeholders), as the call sites in this file expect.
    """

    def __init__(self, std_logger):
        self._std_logger = std_logger

    def opt(self, **_options):
        # loguru-specific options (depth, record, ...) have no stdlib equivalent.
        return self

    def bind(self, **_extra):
        return self

    def remove(self, *_args):
        return None

    def add(self, *_args, **_kwargs):
        return None

    def _emit(self, level, message, *args):
        if args:
            message = str(message).format(*args)
        # "%s" keeps stray '%' characters in the message from being interpreted.
        self._std_logger.log(level, "%s", message)

    def debug(self, message, *args):
        self._emit(logging.DEBUG, message, *args)

    def info(self, message, *args):
        self._emit(logging.INFO, message, *args)

    def warning(self, message, *args):
        self._emit(logging.WARNING, message, *args)

    def error(self, message, *args):
        self._emit(logging.ERROR, message, *args)


try:
    from loguru import logger as loguru_logger  # type: ignore
except ImportError:
    loguru_logger = None

if loguru_logger is not None:
    logger = loguru_logger

    # --- Remove default handler to override global default ---
    logger.remove()

    # --- Add a "sink" for RFunctionCaller logs ---
    _rfc_logger = logger.bind(tag="[RFunctionCaller]")
    _rfc_logger.add(
        sys.stderr,
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",  # Only show message
        level="INFO",
    )
else:
    # loguru is optional: previously this branch produced a plain
    # logging.Logger and the following .remove()/.bind()/.add() calls raised
    # AttributeError at import time. The adapter keeps every call site working.
    logging.basicConfig()
    logger = _StdlibLoggerAdapter(logging.getLogger("rpy-bridge"))
    _rfc_logger = logger.bind(tag="[RFunctionCaller]")
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def _log_r_call(func_name: str, source_info: str):
    """
    Log, at INFO level, that an R function was called.

    The emitted message has the form
    ``[rpy-bridge.RFunctionCaller] Called R function '<func_name>' from <source_info>``.

    Parameters
    ----------
    func_name : str
        Name of the R function that was invoked.
    source_info : str
        Human-readable description of where the function came from.
    """
    # depth=1 is forwarded to the logger's opt(); presumably so the record is
    # attributed to the caller of this helper rather than to the helper itself.
    call_logger = _rfc_logger.opt(depth=1, record=False)
    template = "[rpy-bridge.RFunctionCaller] Called R function '{}' from {}"
    call_logger.info(template, func_name, source_info)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# ---------------------------------------------------------------------
|
|
85
|
-
# Path resolution
|
|
86
|
-
# ---------------------------------------------------------------------
|
|
87
|
-
def _normalize_scripts(
|
|
88
|
-
scripts: str | Path | Iterable[str | Path] | None,
|
|
89
|
-
) -> list[Path]:
|
|
90
|
-
if scripts is None:
|
|
91
|
-
return []
|
|
92
|
-
if isinstance(scripts, (str, Path)):
|
|
93
|
-
return [Path(scripts).resolve()]
|
|
94
|
-
try:
|
|
95
|
-
return [Path(s).resolve() for s in scripts]
|
|
96
|
-
except TypeError:
|
|
97
|
-
raise TypeError(
|
|
98
|
-
f"Invalid type for 'scripts': {type(scripts)}. Must be str, Path, or list/iterable thereof."
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# ---------------------------------------------------------------------
|
|
103
|
-
# R detection and rpy2 installation
|
|
104
|
-
# ---------------------------------------------------------------------
|
|
105
|
-
def ensure_rpy2_available() -> None:
    """
    Verify that the ``rpy2`` package can be located.

    Nothing is installed on the fly. When ``rpy2`` cannot be found a
    ``RuntimeError`` carrying installation instructions is raised.
    """
    if importlib.util.find_spec("rpy2") is not None:
        return

    instructions = (
        "\n[Error] rpy2 is not installed. Please install it in your Python environment:\n",
        " pip install rpy2\n\n",
        "Make sure your Python environment can access your system R installation.\n",
        "On macOS with Homebrew: brew install r\n",
        "On Linux: apt install r-base (Debian/Ubuntu) or yum install R (CentOS/RHEL)\n",
        "On Windows: install R from https://cran.r-project.org\n",
    )
    raise RuntimeError("".join(instructions))
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def find_r_home() -> str | None:
|
|
122
|
-
"""
|
|
123
|
-
Detect system R installation.
|
|
124
|
-
"""
|
|
125
|
-
try:
|
|
126
|
-
r_home = subprocess.check_output(
|
|
127
|
-
["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
|
|
128
|
-
stderr=subprocess.PIPE,
|
|
129
|
-
text=True,
|
|
130
|
-
).strip()
|
|
131
|
-
if r_home.endswith(">"): # sometimes R console prints >
|
|
132
|
-
r_home = r_home[:-1].strip()
|
|
133
|
-
return r_home
|
|
134
|
-
except FileNotFoundError:
|
|
135
|
-
# fallback paths (Linux, macOS Homebrew, Windows)
|
|
136
|
-
possible_paths = [
|
|
137
|
-
"/usr/lib/R",
|
|
138
|
-
"/usr/local/lib/R",
|
|
139
|
-
"/opt/homebrew/Cellar/r/4.5.2/lib/R", # macOS Homebrew
|
|
140
|
-
"C:\\Program Files\\R\\R-4.5.2", # Windows
|
|
141
|
-
]
|
|
142
|
-
for p in possible_paths:
|
|
143
|
-
if os.path.exists(p):
|
|
144
|
-
return p
|
|
145
|
-
return None
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
# Determine if we're running in CI / testing
|
|
149
|
-
CI_TESTING = os.environ.get("GITHUB_ACTIONS") == "true" or os.environ.get("TESTING") == "1"


def _prepend_env_path(var_name: str, entry: str) -> None:
    """Put *entry* first in the ``os.pathsep``-separated variable *var_name*.

    Does nothing when *entry* is already one of the entries. No trailing
    separator is written when the variable was empty: an empty entry in a
    library search path is interpreted as the current directory.
    """
    existing = os.environ.get(var_name, "")
    if entry in existing.split(os.pathsep):
        return
    os.environ[var_name] = f"{entry}{os.pathsep}{existing}" if existing else entry


R_HOME = os.environ.get("R_HOME")
if not R_HOME:
    R_HOME = find_r_home()
    if R_HOME:
        # Export the detected location so code loaded later sees it.
        os.environ["R_HOME"] = R_HOME
    elif CI_TESTING:
        logger.warning("R not found; skipping all R-dependent setup in CI/testing environment.")
        R_HOME = None  # Explicitly None to signal "no R available"
    else:
        raise RuntimeError("R not found. Please install R or add it to PATH.")

logger.info(
    f"[rpy-bridge] R_HOME = {R_HOME if R_HOME else 'not detected; R-dependent code skipped'}"
)

# Only configure platform-specific library paths if R is available
if R_HOME:
    if sys.platform == "darwin":
        _prepend_env_path("DYLD_FALLBACK_LIBRARY_PATH", os.path.join(R_HOME, "lib"))
    elif sys.platform.startswith("linux"):
        _prepend_env_path("LD_LIBRARY_PATH", os.path.join(R_HOME, "lib"))
    elif sys.platform.startswith("win"):
        _prepend_env_path("PATH", os.path.join(R_HOME, "bin", "x64"))
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
# ---------------------------------------------------------------------
|
|
190
|
-
# Lazy rpy2 import machinery
|
|
191
|
-
# ---------------------------------------------------------------------
|
|
192
|
-
_RPY2: dict | None = None
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
def _require_rpy2(raise_on_missing: bool = True) -> dict | None:
|
|
196
|
-
global _RPY2
|
|
197
|
-
if _RPY2 is not None:
|
|
198
|
-
return _RPY2
|
|
199
|
-
|
|
200
|
-
try:
|
|
201
|
-
import rpy2.robjects as ro
|
|
202
|
-
from rpy2 import robjects
|
|
203
|
-
from rpy2.rinterface_lib.sexp import NULLType
|
|
204
|
-
from rpy2.rlike.container import NamedList
|
|
205
|
-
from rpy2.robjects import pandas2ri
|
|
206
|
-
from rpy2.robjects.conversion import localconverter
|
|
207
|
-
from rpy2.robjects.vectors import (
|
|
208
|
-
BoolVector,
|
|
209
|
-
FloatVector,
|
|
210
|
-
IntVector,
|
|
211
|
-
ListVector,
|
|
212
|
-
StrVector,
|
|
213
|
-
)
|
|
214
|
-
|
|
215
|
-
_RPY2 = {
|
|
216
|
-
"ro": ro,
|
|
217
|
-
"robjects": robjects,
|
|
218
|
-
"pandas2ri": pandas2ri,
|
|
219
|
-
"localconverter": localconverter,
|
|
220
|
-
"BoolVector": BoolVector,
|
|
221
|
-
"FloatVector": FloatVector,
|
|
222
|
-
"IntVector": IntVector,
|
|
223
|
-
"ListVector": ListVector,
|
|
224
|
-
"StrVector": StrVector,
|
|
225
|
-
"NULLType": NULLType,
|
|
226
|
-
"NamedList": NamedList,
|
|
227
|
-
}
|
|
228
|
-
return _RPY2
|
|
229
|
-
|
|
230
|
-
except ImportError as e:
|
|
231
|
-
if raise_on_missing:
|
|
232
|
-
raise RuntimeError(
|
|
233
|
-
"R support requires optional dependency `rpy2`. Install with: pip install rpy-bridge[r]"
|
|
234
|
-
) from e
|
|
235
|
-
return None
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def _ensure_rpy2() -> dict:
    """
    Return the cached rpy2 object mapping, importing rpy2 first if necessary.

    ``_require_rpy2()`` is called with its default arguments, so a missing
    rpy2 surfaces as the ``RuntimeError`` raised there.
    """
    global _RPY2
    _RPY2 = _RPY2 if _RPY2 is not None else _require_rpy2()
    assert _RPY2 is not None, "_require_rpy2() returned None"
    return _RPY2
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
# ---------------------------------------------------------------------
|
|
247
|
-
# Activate renv
|
|
248
|
-
# ---------------------------------------------------------------------
|
|
249
|
-
def activate_renv(path_to_renv: Path) -> None:
    """
    Load an renv project into the embedded R session.

    Parameters
    ----------
    path_to_renv : Path
        Either the project root or the project's ``renv/`` directory.
        A plain string is accepted as well.

    Raises
    ------
    FileNotFoundError
        If ``renv/activate.R`` or ``renv.lock`` is missing.
    """
    r = _ensure_rpy2()
    robjects = r["robjects"]

    def _r_str(path) -> str:
        """Render *path* as a double-quoted R string literal.

        Uses forward slashes and escapes backslashes and quotes so the path
        cannot break out of the literal.
        """
        text = Path(path).as_posix()
        return '"' + text.replace("\\", "\\\\").replace('"', '\\"') + '"'

    path_to_renv = Path(path_to_renv).resolve()
    if path_to_renv.name == "renv" and (path_to_renv / "activate.R").exists():
        renv_dir = path_to_renv
        project_dir = path_to_renv.parent
    else:
        renv_dir = path_to_renv / "renv"
        project_dir = path_to_renv

    renv_activate = renv_dir / "activate.R"
    renv_lock = project_dir / "renv.lock"

    if not renv_activate.exists() or not renv_lock.exists():
        raise FileNotFoundError(f"[Error] renv environment incomplete: {path_to_renv}")

    renviron_file = project_dir / ".Renviron"
    if renviron_file.is_file():
        os.environ["R_ENVIRON_USER"] = str(renviron_file)
        logger.info(f"[rpy-bridge] R_ENVIRON_USER set to: {renviron_file}")

    rprofile_file = project_dir / ".Rprofile"
    if rprofile_file.is_file():
        robjects.r(f"source({_r_str(rprofile_file)})")
        logger.info(f"[rpy-bridge] .Rprofile sourced: {rprofile_file}")

    try:
        robjects.r("suppressMessages(library(renv))")
    except Exception:
        logger.info("[rpy-bridge] Installing renv package in project library...")
        # The library path previously went through str(Path), which yields
        # backslashes on Windows; R parses those as escape sequences.
        robjects.r(
            f'install.packages("renv", repos="https://cloud.r-project.org", lib={_r_str(renv_dir / "library")})'
        )
        # NOTE(review): library() is called without lib.loc; confirm the
        # project library is on .libPaths() at this point.
        robjects.r("library(renv)")

    robjects.r(f"renv::load({_r_str(project_dir)})")
    logger.info(f"[rpy-bridge] renv environment loaded for project: {project_dir}")
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
# ---------------------------------------------------------------------
|
|
291
|
-
# NamespaceWrapper
|
|
292
|
-
# ---------------------------------------------------------------------
|
|
293
|
-
class NamespaceWrapper:
    """
    Wraps an R script namespace for Python attribute access.

    ``env`` is a mapping from object name to object; ``wrapper.name``
    returns ``env["name"]``.
    """

    def __init__(self, env):
        self._env = env

    def __getattr__(self, func_name):
        # __getattr__ runs only after normal lookup failed. Reading _env via
        # __dict__ avoids unbounded recursion when _env itself is missing,
        # e.g. while copy/pickle probe an instance created without __init__.
        env = self.__dict__.get("_env")
        if env is not None and func_name in env:
            return env[func_name]
        raise AttributeError(f"Function '{func_name}' not found in R namespace")

    def list_functions(self):
        """
        Return a list of callable functions in this namespace.
        """
        return [k for k, v in self._env.items() if callable(v)]
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
# ---------------------------------------------------------------------
|
|
314
|
-
# RFunctionCaller
|
|
315
|
-
# ---------------------------------------------------------------------
|
|
316
|
-
class RFunctionCaller:
|
|
317
|
-
"""
|
|
318
|
-
Primary interface for calling R functions from Python.
|
|
319
|
-
|
|
320
|
-
``RFunctionCaller`` loads one or more R scripts into isolated namespaces
|
|
321
|
-
and provides a unified ``call()`` method for executing:
|
|
322
|
-
|
|
323
|
-
* Functions defined in sourced R scripts
|
|
324
|
-
* Base R functions (e.g. ``sum``, ``mean``)
|
|
325
|
-
* Functions from installed R packages (via ``package::function``)
|
|
326
|
-
|
|
327
|
-
In most workflows, users only need to interact with this class.
|
|
328
|
-
|
|
329
|
-
Parameters
|
|
330
|
-
----------
|
|
331
|
-
path_to_renv : Path or None, optional
|
|
332
|
-
Path to an R project that uses ``renv``. This may be either the project
|
|
333
|
-
root or the ``renv/`` directory itself. If provided, the renv
|
|
334
|
-
environment is activated before any scripts are sourced.
|
|
335
|
-
|
|
336
|
-
scripts : str, Path, list[str | Path], or None, optional
|
|
337
|
-
One or more ``.R`` files or directories containing ``.R`` files.
|
|
338
|
-
Each script is sourced into its own namespace.
|
|
339
|
-
|
|
340
|
-
packages : str or list[str], optional
|
|
341
|
-
R packages to load (and install if missing) before calling functions.
|
|
342
|
-
|
|
343
|
-
Notes
|
|
344
|
-
-----
|
|
345
|
-
* Python objects are automatically converted to R objects.
|
|
346
|
-
* R return values are converted back to Python equivalents.
|
|
347
|
-
* Missing values (``None``, ``pd.NA``) are mapped to R ``NA``.
|
|
348
|
-
"""
|
|
349
|
-
|
|
350
|
-
def __init__(
    self,
    path_to_renv: str | Path | None = None,
    scripts: str | Path | list[str | Path] | None = None,
    packages: str | list[str] | None = None,
    **kwargs,  # catch unexpected keywords
):
    """
    Record configuration; R itself is not started here.

    Raises
    ------
    FileNotFoundError
        If any entry in ``scripts`` does not exist.
    TypeError
        If keyword arguments other than the deprecated ``script_path``
        are passed.
    """
    # Handle path_to_renv safely
    if path_to_renv is not None:
        self.path_to_renv = Path(path_to_renv).resolve()
    else:
        self.path_to_renv = None

    # --- Handle deprecated 'script_path' ---
    if "script_path" in kwargs:
        script_path_value = kwargs.pop("script_path")
        warnings.warn(
            "'script_path' argument is deprecated. "
            "Please use 'scripts' instead (accepts a Path or list of Paths).",
            DeprecationWarning,
            stacklevel=2,
        )
        if scripts is None:
            scripts = script_path_value
        else:
            # Both provided → prioritize scripts and ignore script_path
            logger.warning("'script_path' ignored because 'scripts' argument is also provided.")

    # Single normalization step. A second pass used to call .resolve() on
    # the raw entries, which failed for str inputs.
    self.scripts: list[Path] = _normalize_scripts(scripts)

    # --- Check all scripts exist immediately ---
    for script_path in self.scripts:
        if not script_path.exists():
            raise FileNotFoundError(f"R script path not found: {script_path}")

    # Raise error if other unexpected kwargs remain
    if kwargs:
        raise TypeError(
            f"RFunctionCaller.__init__() received unexpected keyword arguments: {list(kwargs.keys())}"
        )

    self._namespaces: dict[str, Any] = {}

    # Normalize packages to a list (copied so the caller's list is not aliased)
    if packages is None:
        self.packages: list[str] = []
    elif isinstance(packages, str):
        self.packages = [packages]
    else:
        self.packages = list(packages)

    # Lazy-loaded attributes
    self._r = None
    self.ro = None
    self.robjects = None
    self.pandas2ri = None
    self.localconverter = None
    self.IntVector = None
    self.FloatVector = None
    self.BoolVector = None
    self.StrVector = None
    self.ListVector = None
    self.NamedList = None

    # Internal state
    self._renv_activated = False
    self._packages_loaded = False
    self._scripts_loaded = [False] * len(self.scripts)
|
|
429
|
-
|
|
430
|
-
# -----------------------------------------------------------------
|
|
431
|
-
# Internal: lazy R loading
|
|
432
|
-
# -----------------------------------------------------------------
|
|
433
|
-
def _ensure_r_loaded(self) -> None:
    """
    Ensure R runtime is initialized and all configured R scripts
    are sourced exactly once, in isolated environments.

    On each call this:

    1. Imports rpy2 and caches its objects on the instance (first call only).
    2. Activates the renv project, if one was configured and is not yet active.
    3. Makes sure the R package ``withr`` is available.
    4. Sources every script not yet marked as loaded and registers its
       callables under a namespace named after the file stem.

    Raises
    ------
    RuntimeError
        If renv activation fails.
    ValueError
        If a script entry is neither a file nor a directory.
    """
    if self.robjects is None:
        rpy2_dict = _ensure_rpy2()
        self._RPY2 = rpy2_dict  # cache in instance
        self._r = rpy2_dict["ro"]
        self.ro = rpy2_dict["ro"]
        self.robjects = rpy2_dict["robjects"]
        self.pandas2ri = rpy2_dict["pandas2ri"]
        self.localconverter = rpy2_dict["localconverter"]
        self.IntVector = rpy2_dict["IntVector"]
        self.FloatVector = rpy2_dict["FloatVector"]
        self.BoolVector = rpy2_dict["BoolVector"]
        self.StrVector = rpy2_dict["StrVector"]
        self.ListVector = rpy2_dict["ListVector"]
        self.NamedList = rpy2_dict["NamedList"]

    # Activate renv once if requested
    if self.path_to_renv and not self._renv_activated:
        try:
            activate_renv(self.path_to_renv)
            self._renv_activated = True
            logger.info(
                f"[rpy-bridge.RFunctionCaller] renv activated for project: {self.path_to_renv}"
            )
        except Exception as e:
            raise RuntimeError(f"Failed to activate renv at {self.path_to_renv}: {e}") from e

    r = self.robjects.r

    # Ensure required R package
    self.ensure_r_package("withr")

    if not hasattr(self, "_namespaces"):
        self._namespaces: dict[str, dict[str, Any]] = {}

    # --- Iterate over scripts ---
    for idx, script_entry in enumerate(self.scripts):
        if self._scripts_loaded[idx]:
            continue

        script_entry = script_entry.resolve()

        if script_entry.is_file():
            r_files = [script_entry]
        elif script_entry.is_dir():
            r_files = sorted(script_entry.glob("*.R"))
            if not r_files:
                logger.warning(f"No .R files found in directory: {script_entry}")
                self._scripts_loaded[idx] = True
                continue
        else:
            raise ValueError(f"Invalid script path: {script_entry}")

        for script_path in r_files:
            ns_name = script_path.stem
            logger.opt(depth=2).info(
                "[rpy-bridge.RFunctionCaller] Loading R script '{}' as namespace '{}'",
                script_path.name,
                ns_name,
            )

            if ns_name in self._namespaces:
                # Two scripts with the same stem share one namespace name;
                # the later one wins. Say so instead of replacing silently.
                logger.warning(
                    f"[rpy-bridge.RFunctionCaller] Namespace '{ns_name}' already exists and will be replaced by {script_path}"
                )

            # Escape backslashes and quotes so the path stays inside the
            # R string literal.
            path_literal = script_path.as_posix().replace("\\", "\\\\").replace('"', '\\"')

            r("env <- new.env(parent=globalenv())")
            r(f'script_path <- "{path_literal}"')

            # Source relative to the script's own directory so relative
            # paths inside the script resolve.
            r(
                """
                withr::with_dir(
                dirname(script_path),
                sys.source(basename(script_path), envir=env)
                )
                """
            )

            env_obj = r("env")
            self._namespaces[ns_name] = {
                name: env_obj[name] for name in env_obj.keys() if callable(env_obj[name])
            }

            logger.info(
                f"[rpy-bridge.RFunctionCaller] Registered {len(self._namespaces[ns_name])} functions in namespace '{ns_name}'"
            )

        self._scripts_loaded[idx] = True
|
|
520
|
-
|
|
521
|
-
# -----------------------------------------------------------------
|
|
522
|
-
# Autocomplete-friendly attribute access for script namespaces
|
|
523
|
-
# -----------------------------------------------------------------
|
|
524
|
-
def __getattr__(self, name: str):
|
|
525
|
-
if "_namespaces" in self.__dict__ and name in self._namespaces:
|
|
526
|
-
ns_env = self._namespaces[name]
|
|
527
|
-
return NamespaceWrapper(ns_env)
|
|
528
|
-
raise AttributeError(f"'RFunctionCaller' object has no attribute '{name}'")
|
|
529
|
-
|
|
530
|
-
def _clean_scalar(self, x):
|
|
531
|
-
"""
|
|
532
|
-
Clean R-style missing values to pandas/NumPy equivalents.
|
|
533
|
-
Called inside _r2py on each vector element; atomic/scalar only.
|
|
534
|
-
"""
|
|
535
|
-
robjects = self.robjects
|
|
536
|
-
|
|
537
|
-
if x is None:
|
|
538
|
-
return None
|
|
539
|
-
|
|
540
|
-
if x in (
|
|
541
|
-
getattr(robjects, "NA_Real", None),
|
|
542
|
-
getattr(robjects, "NA_Integer", None),
|
|
543
|
-
getattr(robjects, "NA_Logical", None),
|
|
544
|
-
):
|
|
545
|
-
return None
|
|
546
|
-
|
|
547
|
-
if x is getattr(robjects, "NA_Character", None):
|
|
548
|
-
return None
|
|
549
|
-
|
|
550
|
-
if isinstance(x, float) and np.isnan(x):
|
|
551
|
-
return None
|
|
552
|
-
|
|
553
|
-
return x
|
|
554
|
-
|
|
555
|
-
def list_namespaces(self) -> list[str]:
    """
    List the script namespaces currently registered.

    Scripts are loaded first if that has not happened yet.

    Returns
    -------
    list[str]
        Namespace names, in registration order.
    """
    self._ensure_r_loaded()
    return [*self._namespaces.keys()]
|
|
566
|
-
|
|
567
|
-
def list_namespace_functions(self, namespace: str) -> list[str]:
    """
    List the callables registered under one script namespace.

    Raises
    ------
    ValueError
        If ``namespace`` is not a registered namespace.
    """
    self._ensure_r_loaded()
    registry = self._namespaces
    if namespace not in registry:
        raise ValueError(f"Namespace '{namespace}' not found")

    members = registry[namespace]
    return [name for name in members if callable(members[name])]
|
|
575
|
-
|
|
576
|
-
def _get_package_functions(self, pkg: str) -> list[str]:
|
|
577
|
-
"""
|
|
578
|
-
Return a list of callable functions from a loaded R package.
|
|
579
|
-
"""
|
|
580
|
-
r = self.robjects.r
|
|
581
|
-
try:
|
|
582
|
-
all_objs = list(r[f'ls("package:{pkg}")'])
|
|
583
|
-
funcs = [
|
|
584
|
-
name
|
|
585
|
-
for name in all_objs
|
|
586
|
-
if r(f'is.function(get("{name}", envir=asNamespace("{pkg}")))')[0]
|
|
587
|
-
]
|
|
588
|
-
return funcs
|
|
589
|
-
except Exception:
|
|
590
|
-
logger.warning(f"Failed to list functions for package '{pkg}'")
|
|
591
|
-
return []
|
|
592
|
-
|
|
593
|
-
def list_all_functions(self, include_packages: bool = False) -> dict[str, list[str]]:
    """
    Return all callable R functions grouped by script namespace and package.

    Parameters
    ----------
    include_packages : bool, default False
        Also add one entry per namespace reported by R's
        ``loadedNamespaces()``. A package whose functions cannot be listed
        gets a single placeholder string instead.

    Returns
    -------
    dict[str, list[str]]
        Namespace or package name mapped to function names.
    """
    self._ensure_r_loaded()

    # --- Script namespaces ---
    all_funcs: dict[str, list[str]] = {
        ns_name: [name for name, val in ns_env.items() if callable(val)]
        for ns_name, ns_env in self._namespaces.items()
    }

    # --- Loaded R packages ---
    if include_packages:
        try:
            pkgs = list(self.robjects.r("loadedNamespaces()"))
        except Exception as exc:
            # Still best effort, but no longer silent.
            logger.warning(
                f"[rpy-bridge.RFunctionCaller] Could not list loaded R packages: {exc}"
            )
            pkgs = []

        for pkg in pkgs:
            funcs = self._get_package_functions(pkg)
            if not funcs:
                # Add a placeholder note
                funcs = ["[See official documentation for functions, datasets, and objects]"]
            all_funcs[pkg] = funcs

    return all_funcs
|
|
622
|
-
|
|
623
|
-
def print_function_tree(self, include_packages: bool = False, max_display: int = 10):
    """
    Print the available R functions as a tree, one group per namespace.

    Parameters
    ----------
    include_packages : bool, default False
        Passed through to ``list_all_functions``.

    max_display : int, default 10
        Upper bound on the number of names printed per group. A trailing
        ``...`` line marks a truncated group.

    Notes
    -----
    Groups without functions are skipped. Names are printed in sorted order.
    """
    grouped = self.list_all_functions(include_packages=include_packages)

    for group, names in grouped.items():
        if names:
            print(f"{group}/")
            visible = sorted(names)[:max_display]
            for entry in visible:
                print(f" {entry}")
            truncated = len(names) > max_display
            if truncated:
                print(" ...")
|
|
649
|
-
|
|
650
|
-
# -----------------------------------------------------------------
|
|
651
|
-
# Python -> R conversion
|
|
652
|
-
# -----------------------------------------------------------------
|
|
653
|
-
def _py2r(self, obj):
    """
    Convert Python objects to R objects robustly.

    Conversion rules, in order:

    * rpy2 vectors and data frames are returned unchanged.
    * ``None``, ``pd.NA`` and float NaN become ``robjects.NULL``.
    * ``pd.DataFrame`` goes through ``pandas2ri.py2rpy``.
    * ``pd.Series`` is converted as a list of its values.
    * ``int``, ``float``, ``bool``, ``str`` are returned unchanged.
    * NumPy scalars are unwrapped with ``.item()`` and converted again.
    * Lists and tuples: empty → empty ``FloatVector``; all numeric →
      ``FloatVector``; all bool → ``BoolVector``; all str → ``StrVector``
      (missing elements become the matching R NA); otherwise a
      ``ListVector`` keyed by position.
    * ``dict`` becomes a ``ListVector`` with converted values.

    Raises
    ------
    NotImplementedError
        For any other type.
    """
    self._ensure_r_loaded()
    robjects = self.robjects
    pandas2ri = self.pandas2ri
    FloatVector = self.FloatVector
    BoolVector = self.BoolVector
    StrVector = self.StrVector
    ListVector = self.ListVector
    localconverter = self.localconverter

    r_types = (
        robjects.vectors.IntVector,
        robjects.vectors.FloatVector,
        robjects.vectors.BoolVector,
        robjects.vectors.StrVector,
        robjects.vectors.ListVector,
        robjects.DataFrame,
    )
    if isinstance(obj, r_types):
        return obj

    def is_na(x):
        return x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))

    with localconverter(robjects.default_converter + pandas2ri.converter):
        if is_na(obj):
            return robjects.NULL
        if isinstance(obj, pd.DataFrame):
            return pandas2ri.py2rpy(obj)
        if isinstance(obj, pd.Series):
            return self._py2r(obj.tolist())
        if isinstance(obj, (int, float, bool, str)):
            return obj
        if isinstance(obj, np.generic):
            # NumPy scalars such as np.int64 / np.bool_ are not int/bool
            # subclasses; unwrap to the plain Python value.
            return self._py2r(obj.item())
        if isinstance(obj, tuple):
            # Tuples follow the same rules as lists.
            obj = list(obj)
        if isinstance(obj, list):
            if len(obj) == 0:
                return FloatVector([])

            # Exact types are compared, so bool is not treated as int.
            types = set(type(x) for x in obj if not is_na(x))
            if types <= {int, float}:
                return FloatVector([robjects.NA_Real if is_na(x) else float(x) for x in obj])
            if types <= {bool}:
                return BoolVector([robjects.NA_Logical if is_na(x) else x for x in obj])
            if types <= {str}:
                return StrVector([robjects.NA_Character if is_na(x) else x for x in obj])
            return ListVector({str(i): self._py2r(v) for i, v in enumerate(obj)})
        if isinstance(obj, dict):
            return ListVector({k: self._py2r(v) for k, v in obj.items()})
        raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
|
|
705
|
-
|
|
706
|
-
# -----------------------------------------------------------------
|
|
707
|
-
# R -> Python conversion
|
|
708
|
-
# -----------------------------------------------------------------
|
|
709
|
-
def _r2py(self, obj, top_level=True):
    """
    Convert an R object returned by rpy2 into a Python object.

    Parameters
    ----------
    obj
        The R object to convert.
    top_level : bool, default True
        When true, a result of length one is unwrapped to its single element.

    Returns
    -------
    object
        ``None`` for R ``NULL``; a cleaned pandas DataFrame for R data
        frames; the result of ``r_namedlist_to_dict`` for lists; a list (or
        single value) of cleaned scalars for atomic vectors; otherwise the
        object passed through ``_clean_scalar``.
    """
    # Like _py2r, make sure the rpy2 objects are loaded before use. Guarded
    # so recursive conversions do not redo the load on every element.
    if self.robjects is None:
        self._ensure_r_loaded()

    robjects = self.robjects
    NamedList = self.NamedList
    ListVector = self.ListVector
    StrVector = self.StrVector
    IntVector = self.IntVector
    FloatVector = self.FloatVector
    BoolVector = self.BoolVector
    NULLType = self._RPY2["NULLType"]
    lc = self.localconverter
    pandas2ri = self.pandas2ri

    if isinstance(obj, NULLType):
        return None

    if isinstance(obj, robjects.DataFrame):
        with lc(robjects.default_converter + pandas2ri.converter):
            df = robjects.conversion.rpy2py(obj)
        df = postprocess_r_dataframe(df)
        return clean_r_missing(df, caller=self)

    if isinstance(obj, (NamedList, ListVector)):
        py_obj = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
        if isinstance(py_obj, list) and len(py_obj) == 1 and top_level:
            return py_obj[0]
        return py_obj

    if isinstance(obj, (StrVector, IntVector, FloatVector, BoolVector)):
        py_list = [self._clean_scalar(v) for v in obj]
        if len(py_list) == 1 and top_level:
            return py_list[0]
        return py_list

    return self._clean_scalar(obj)
|
|
743
|
-
|
|
744
|
-
# -----------------------------------------------------------------
|
|
745
|
-
# Public: ensure R package is available
|
|
746
|
-
# -----------------------------------------------------------------
|
|
747
|
-
def ensure_r_package(self, pkg: str):
    """
    Attach the R package ``pkg``, installing it first if loading fails.

    Parameters
    ----------
    pkg : str
        Name of the R package.

    Notes
    -----
    Installation uses ``https://cloud.r-project.org``. Any error raised by
    the second ``library()`` call propagates to the caller.
    """
    # This is a public method, so it may be called before anything started R.
    # The guard also prevents recursion: _ensure_r_loaded sets self.robjects
    # before it calls back into this method.
    if self.robjects is None:
        self._ensure_r_loaded()

    r = self.robjects.r
    load_cmd = f'suppressMessages(library("{pkg}", character.only=TRUE))'
    try:
        r(load_cmd)
    except Exception:
        logger.info(f"[rpy-bridge.RFunctionCaller] Package '{pkg}' not found.")
        logger.warning(f"[rpy-bridge.RFunctionCaller] Installing missing R package: {pkg}")
        r(f'install.packages("{pkg}", repos="https://cloud.r-project.org")')
        r(load_cmd)
|
|
756
|
-
|
|
757
|
-
# -----------------------------------------------------------------
|
|
758
|
-
# Public: call an R function
|
|
759
|
-
# -----------------------------------------------------------------
|
|
760
|
-
def call(self, func_name: str, *args, **kwargs):
    """
    Resolve an R function by name, invoke it, and convert the result.

    Resolution order:

    * ``package::function`` names: if ``package`` is one of the script
      namespaces registered on this caller, the function must exist there;
      otherwise the qualified name is evaluated in R.
    * plain names: each script namespace in turn, then the R global
      environment, then a lookup through ``robjects.r``.

    Parameters
    ----------
    func_name : str
        Name of the R function, optionally qualified as
        ``package::function``.

    *args
        Positional arguments; each one is passed through ``self._py2r``.

    **kwargs
        Named arguments; each value is passed through ``self._py2r``.

    Returns
    -------
    object
        The R result after conversion by ``self._r2py``.

    Raises
    ------
    ValueError
        If the function is missing from the named script namespace, or a
        plain name cannot be found anywhere.
    RuntimeError
        If a ``package::function`` name cannot be evaluated in R, or the R
        call itself fails.

    Examples
    --------
    >>> rfc.call("sum", [1, 2, 3])
    >>> rfc.call("dplyr::n_distinct", [1, 2, 2, 3])
    >>> rfc.call("add_and_scale", 2, 3, scale=10)
    """

    self._ensure_r_loaded()

    target = None
    origin = None

    if "::" not in func_name:
        # Plain name: script namespaces take precedence over everything else.
        for space, env in self._namespaces.items():
            if func_name in env:
                target = env[func_name]
                origin = f"script namespace '{space}'"
                break

        if target is None:
            try:
                target = self.robjects.globalenv[func_name]
                origin = "global environment"
            except KeyError:
                pass

        if target is None:
            try:
                target = self.robjects.r[func_name]
                origin = "base R / loaded package"
            except KeyError:
                raise ValueError(
                    f"R function '{func_name}' not found in any namespace, global env, or base R."
                )
    else:
        space, short_name = func_name.split("::", 1)
        if space not in self._namespaces:
            # Not a script namespace: let R resolve the qualified name.
            try:
                target = self.robjects.r(f"{space}::{short_name}")
                origin = f"R package '{space}'"
            except Exception as e:
                raise RuntimeError(f"Failed to resolve R function '{func_name}': {e}") from e
        else:
            env = self._namespaces[space]
            if short_name not in env:
                raise ValueError(
                    f"Function '{short_name}' not found in R script namespace '{space}'"
                )
            target = env[short_name]
            origin = f"script namespace '{space}'"

    positional = [self._py2r(value) for value in args]
    named = {key: self._py2r(value) for key, value in kwargs.items()}

    try:
        outcome = target(*positional, **named)
    except Exception as e:
        raise RuntimeError(
            f"Error calling R function '{func_name}' from {origin}: {e}"
        ) from e

    _log_r_call(func_name, origin)

    return self._r2py(outcome)
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
# %%
|
|
855
|
-
# ------------------------------
|
|
856
|
-
# Utility functions for R ↔ Python
|
|
857
|
-
# ------------------------------
|
|
858
|
-
def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
    """Convert an R ``NamedList``/``ListVector`` into a Python list or dict.

    If every name equals its own zero-based position (as strings), the
    elements are returned as a plain list. Otherwise a dict keyed by
    ``str(name)`` is returned, falling back to the position as key when no
    name is available. Elements are converted with ``caller._r2py`` using
    ``top_level=False``.

    Any other object is handed straight to ``caller._r2py`` with the given
    ``top_level`` flag.
    """
    rpy2_names = _ensure_rpy2()
    list_types = (rpy2_names["NamedList"], rpy2_names["ListVector"])

    if not isinstance(namedlist, list_types):
        return caller._r2py(namedlist, top_level=top_level)

    # ``names`` may be exposed either as an attribute or as a method.
    labels = namedlist.names() if callable(namedlist.names) else namedlist.names

    if labels and all(str(pos) == str(label) for pos, label in enumerate(labels)):
        return [caller._r2py(item, top_level=False) for item in namedlist]

    converted = {}
    for pos, item in enumerate(namedlist):
        if labels and pos < len(labels):
            label = labels[pos]
        else:
            label = str(pos)
        py_item = caller._r2py(item, top_level=False)
        converted[str(label)] = py_item
    return converted
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
    """Remove the ``.groups`` and ``.rows`` entries from ``r_df.attrs``.

    Entries that are absent, or an object with no usable ``attrs``, are
    ignored. The same object is returned after being modified in place.
    """
    for stale_key in (".groups", ".rows"):
        try:
            r_df.attrs.pop(stale_key)
        except (KeyError, AttributeError):
            continue
    return r_df
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with textual NA placeholders replaced by ``pd.NA``.

    The placeholders are the exact strings ``"nan"``, ``"NaN"``, ``"NA"``,
    ``"na"`` and the empty string.
    """
    placeholders = ["nan", "NaN", "NA", "na", ""]
    return df.replace(to_replace=placeholders, value=pd.NA)
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
def normalize_single_df_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Replace NA placeholder strings and coerce mostly-numeric object columns.

    Steps, applied to a replaced copy of ``df``:

    1. The strings ``""``, ``"nan"``, ``"NaN"``, ``"NA"`` and ``"na"``
       become ``pd.NA``.
    2. An object column is replaced by its ``pd.to_numeric(...,
       errors="coerce")`` version when at least half of its non-missing
       values parse as numbers. Values that do not parse become missing.
    3. An integer column that contains missing values is cast to
       ``float64``.
    """
    dtype_checks = pd.api.types
    df = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)

    for name in df.columns:
        column = df[name]
        if dtype_checks.is_object_dtype(column):
            as_numbers = pd.to_numeric(column, errors="coerce")
            parsed_count = as_numbers.notna().sum()
            present_count = column.notna().sum()
            if parsed_count >= present_count * 0.5:
                df[name] = as_numbers

        # Re-read the column: it may just have been replaced above.
        current = df[name]
        if dtype_checks.is_integer_dtype(current) and current.isna().any():
            df[name] = current.astype("float64")

    return df
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
    """Repair column types of a DataFrame that came from R.

    The frame is modified in place (columns are reassigned) and returned.
    For each column:

    * integer columns have the value ``-2147483648`` (R's ``NA_integer_``
      sentinel, i.e. ``INT_MIN``) masked to missing;
    * numeric columns whose non-missing values all lie in
      ``[10000, 40000]`` are interpreted as day counts since 1970-01-01 and
      converted to datetimes. NOTE(review): this is a heuristic and will
      also convert genuine numeric data that happens to fall in that range;
    * timezone-aware datetime columns are made timezone-naive.

    The date heuristic runs on the column *after* sentinel masking, so an
    integer day-count column containing NA values is still recognised.
    """
    r_na_integer = -2147483648
    epoch = pd.to_datetime("1970-01-01")

    for col in df.columns:
        series = df[col]

        if pd.api.types.is_integer_dtype(series):
            # Keep working with the masked values so the sentinel does not
            # defeat the range test below.
            series = series.mask(series == r_na_integer, pd.NA)
            df[col] = series

        if pd.api.types.is_numeric_dtype(series):
            present = series.dropna()
            if not present.empty and present.between(10000, 40000).all():
                try:
                    df[col] = epoch + pd.to_timedelta(series, unit="D")
                except Exception:
                    # Best effort: leave the column numeric if it cannot be
                    # converted.
                    pass

        # isinstance check replaces the deprecated
        # pd.api.types.is_datetime64tz_dtype.
        if isinstance(series.dtype, pd.DatetimeTZDtype):
            df[col] = series.dt.tz_localize(None)

    return df
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Run the standard clean-up pipeline on a DataFrame converted from R.

    Applies ``fix_r_dataframe_types``, ``fix_string_nans`` and
    ``normalize_single_df_dtypes`` in that order. Afterwards, an
    object-dtype index whose labels read ``1..n`` is replaced by a
    zero-based ``RangeIndex``; any failure while checking the index leaves
    it untouched.
    """
    frame = normalize_single_df_dtypes(fix_string_nans(fix_r_dataframe_types(df)))

    if frame.index.dtype == object:
        row_count = len(frame)
        try:
            labels_as_int = frame.index.astype(int)
            one_based = np.arange(row_count) + 1
            if (labels_as_int == one_based).all():
                frame.index = pd.RangeIndex(start=0, stop=row_count)
        except Exception:
            # Index labels are not integer-like; keep them as they are.
            pass

    return frame
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
def clean_r_missing(obj, caller: "RFunctionCaller"):
    """Replace R missing-value sentinels inside ``obj`` with Python ones.

    The sentinels are read from ``caller.robjects``: ``NA_Real``,
    ``NA_Integer`` and ``NA_Logical`` map to ``np.nan``; ``NA_Character``
    maps to ``pd.NA``. A sentinel attribute that does not exist on
    ``robjects`` is skipped.

    Parameters
    ----------
    obj
        A DataFrame (cleaned cell by cell, in place, and returned), a dict
        or list (rebuilt recursively), or any other value.
    caller : RFunctionCaller
        Only ``caller.robjects`` is used.

    Returns
    -------
    object
        The cleaned structure. Values that are not sentinels, including
        unhashable ones such as arrays, are returned unchanged.
    """
    robjects = caller.robjects

    # Build the lookup once for the whole traversal instead of once per
    # element visited.
    na_map = {}
    for attr, replacement in (
        ("NA_Real", np.nan),
        ("NA_Integer", np.nan),
        ("NA_Logical", np.nan),
        ("NA_Character", pd.NA),
    ):
        sentinel = getattr(robjects, attr, None)
        if sentinel is not None:
            na_map[sentinel] = replacement

    def _clean(value):
        if isinstance(value, pd.DataFrame):
            for col in value.columns:
                value[col] = value[col].apply(_clean)
            return value
        if isinstance(value, dict):
            return {key: _clean(item) for key, item in value.items()}
        if isinstance(value, list):
            return [_clean(item) for item in value]
        try:
            return na_map.get(value, value)
        except TypeError:
            # Unhashable values cannot be a sentinel; pass them through.
            return value

    return _clean(obj)
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
# ---------------------------------------------------------------------
|
|
962
|
-
# DataFrame comparison utilities
|
|
963
|
-
# ---------------------------------------------------------------------
|
|
964
|
-
def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Bring the shared columns of two frames to comparable dtypes.

    Both frames are modified in place (columns are reassigned) and
    returned. For each column present in both:

    * empty strings become ``pd.NA``;
    * if one side is numeric and the other is object, both are passed
      through ``pd.to_numeric(..., errors="coerce")``;
    * if both sides are numeric, both are cast to ``float64``;
    * otherwise, if either side is object, both are converted to strings.
      Missing values are kept as missing rather than being rendered as the
      text ``"nan"``, ``"<NA>"`` or ``"None"``.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_object = pd.api.types.is_object_dtype

    def _to_text(series):
        # astype(str) stringifies missing values too; restore them so they
        # are still recognised as missing downstream.
        return series.astype(str).mask(series.isna(), pd.NA)

    for col in df1.columns.intersection(df2.columns):
        df1[col] = df1[col].replace("", pd.NA)
        df2[col] = df2[col].replace("", pd.NA)
        s1, s2 = df1[col], df2[col]
        dtype1, dtype2 = s1.dtype, s2.dtype

        mixed = (is_numeric(dtype1) and is_object(dtype2)) or (
            is_object(dtype1) and is_numeric(dtype2)
        )
        if mixed:
            try:
                # Convert both before assigning either, so a failure never
                # leaves only one frame converted.
                numeric1 = pd.to_numeric(s1, errors="coerce")
                numeric2 = pd.to_numeric(s2, errors="coerce")
            except Exception:
                # Best effort: fall through to the remaining rules.
                pass
            else:
                df1[col] = numeric1
                df2[col] = numeric2
                continue

        if is_numeric(dtype1) and is_numeric(dtype2):
            df1[col] = df1[col].astype("float64")
            df2[col] = df2[col].astype("float64")
            continue

        if is_object(dtype1) or is_object(dtype2):
            df1[col] = _to_text(df1[col])
            df2[col] = _to_text(df2[col])

    return df1, df2
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cast shared columns to ``float64`` when either side holds numbers.

    Both frames are modified in place and returned. For each column present
    in both, empty strings are replaced by ``pd.NA`` and both sides are
    passed through ``pd.to_numeric(..., errors="coerce")``. If at least one
    side has a value that parses, both columns are stored as ``float64``,
    and entries that do not parse become NaN. Otherwise, or if the
    conversion raises, the columns are stored with only the empty-string
    replacement applied.
    """
    shared = df1.columns.intersection(df2.columns)
    for name in shared:
        left = df1[name].replace("", pd.NA)
        right = df2[name].replace("", pd.NA)

        coerced = False
        try:
            left_num = pd.to_numeric(left, errors="coerce")
            right_num = pd.to_numeric(right, errors="coerce")
            if not (left_num.isna().all() and right_num.isna().all()):
                df1[name] = left_num.astype("float64")
                df2[name] = right_num.astype("float64")
                coerced = True
        except Exception:
            pass

        if not coerced:
            df1[name] = left
            df2[name] = right

    return df1, df2
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8) -> dict:
    """Compare a Python-side frame (``df1``) with an R-side frame (``df2``).

    Both inputs are copied first, so the caller's frames are never
    modified. The copies are normalised with ``fix_r_dataframe_types``
    (``df2`` only), ``fix_string_nans``, ``normalize_dtypes`` and
    ``align_numeric_dtypes`` before comparison.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames to compare. Only columns present in both are compared, and
        rows are matched by index label.
    float_tol : float
        Absolute tolerance passed to ``np.isclose`` as ``atol``. The
        relative tolerance is left at NumPy's default.

    Returns
    -------
    dict
        ``shape_mismatch``, ``columns_mismatch`` and ``index_mismatch`` are
        booleans. ``numeric_diffs`` and ``non_numeric_diffs`` map a column
        name to a DataFrame with columns ``df1``/``df2`` holding the
        differing values. Entries missing on both sides count as equal.
    """
    results: dict[str, Any] = {
        "shape_mismatch": False,
        "columns_mismatch": False,
        "index_mismatch": False,
        "numeric_diffs": {},
        "non_numeric_diffs": {},
    }

    # Copy before normalising: fix_r_dataframe_types and the dtype
    # normalisers assign columns on the frame they are given.
    df1 = fix_string_nans(df1.copy())
    df2 = fix_string_nans(fix_r_dataframe_types(df2.copy()))
    df1, df2 = normalize_dtypes(df1, df2)
    df1, df2 = align_numeric_dtypes(df1, df2)

    if df1.shape != df2.shape:
        results["shape_mismatch"] = True
        print(f"[Warning] Shape mismatch: df1 {df1.shape} vs df2 {df2.shape}")

    # Populate the flag declared in the result dictionary.
    results["index_mismatch"] = not df1.index.equals(df2.index)

    if set(df1.columns) != set(df2.columns):
        results["columns_mismatch"] = True
        print("[Warning] Column mismatch:")
        print(f" df1: {df1.columns}")
        print(f" df2: {df2.columns}")
        common_cols = df1.columns.intersection(df2.columns)
    else:
        common_cols = df1.columns

    df1_aligned, df2_aligned = df1.loc[:, common_cols], df2.loc[:, common_cols]

    for col in common_cols:
        col_py, col_r = df1_aligned[col], df2_aligned[col]
        if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(col_r):
            # Align on index labels so the boolean mask applies to both sides.
            col_py, col_r = col_py.align(col_r)
            close = np.isclose(
                col_py.fillna(np.nan),
                col_r.fillna(np.nan),
                atol=float_tol,
                equal_nan=True,
            )
            if not close.all():
                results["numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[~close], "df2": col_r[~close]}
                )
        else:
            unequal = ~col_py.eq(col_r)
            both_na = col_py.isna() & col_r.isna()
            unequal = unequal & ~both_na
            if unequal.any():
                results["non_numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[unequal], "df2": col_r[unequal]}
                )

    return results
|