rpy-bridge 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rpy_bridge/__init__.py +2 -0
- rpy_bridge/rpy2_utils.py +532 -322
- {rpy_bridge-0.3.3.dist-info → rpy_bridge-0.3.5.dist-info}/METADATA +4 -4
- rpy_bridge-0.3.5.dist-info/RECORD +8 -0
- rpy_bridge-0.3.3.dist-info/RECORD +0 -8
- {rpy_bridge-0.3.3.dist-info → rpy_bridge-0.3.5.dist-info}/WHEEL +0 -0
- {rpy_bridge-0.3.3.dist-info → rpy_bridge-0.3.5.dist-info}/licenses/LICENSE +0 -0
- {rpy_bridge-0.3.3.dist-info → rpy_bridge-0.3.5.dist-info}/top_level.txt +0 -0
rpy_bridge/__init__.py
CHANGED
|
@@ -6,6 +6,7 @@ continue importing directly from ``rpy_bridge``.
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
from .rpy2_utils import (
|
|
9
|
+
NamespaceWrapper,
|
|
9
10
|
RFunctionCaller,
|
|
10
11
|
activate_renv,
|
|
11
12
|
align_numeric_dtypes,
|
|
@@ -23,6 +24,7 @@ from .rpy2_utils import (
|
|
|
23
24
|
__all__ = [
|
|
24
25
|
"activate_renv",
|
|
25
26
|
"RFunctionCaller",
|
|
27
|
+
"NamespaceWrapper",
|
|
26
28
|
"r_namedlist_to_dict",
|
|
27
29
|
"clean_r_dataframe",
|
|
28
30
|
"fix_string_nans",
|
rpy_bridge/rpy2_utils.py
CHANGED
|
@@ -1,10 +1,23 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
R–Python Integration Utility
|
|
3
3
|
|
|
4
|
+
Provides tools to load R scripts, activate renv environments, and call R functions
|
|
5
|
+
directly from Python, with automatic conversion between R and Python data types.
|
|
6
|
+
|
|
7
|
+
----------
|
|
8
|
+
Requirements
|
|
4
9
|
----------
|
|
5
|
-
|
|
6
|
-
Ensure compatibility with your R project's renv setup (or other
|
|
10
|
+
- R must be installed and accessible in your system environment.
|
|
11
|
+
- Ensure compatibility with your R project's renv setup (or any other R environment you use).
|
|
12
|
+
|
|
13
|
+
Features
|
|
7
14
|
----------
|
|
15
|
+
- Lazy loading of rpy2 and R runtime.
|
|
16
|
+
- Activation of renv environments for isolated R project dependencies.
|
|
17
|
+
- Support for sourcing individual R scripts or directories of scripts.
|
|
18
|
+
- Namespace-based access to R functions.
|
|
19
|
+
- Automatic conversion between R vectors, data frames, and Python types (pandas, lists, scalars).
|
|
20
|
+
- Utilities for cleaning and aligning data frames between R and Python.
|
|
8
21
|
"""
|
|
9
22
|
|
|
10
23
|
# ruff: noqa: E402
|
|
@@ -16,7 +29,7 @@ import subprocess
|
|
|
16
29
|
import sys
|
|
17
30
|
import warnings
|
|
18
31
|
from pathlib import Path
|
|
19
|
-
from typing import TYPE_CHECKING, Any, Union
|
|
32
|
+
from typing import TYPE_CHECKING, Any, Iterable, Union
|
|
20
33
|
|
|
21
34
|
import numpy as np
|
|
22
35
|
import pandas as pd
|
|
@@ -29,7 +42,8 @@ if TYPE_CHECKING:
|
|
|
29
42
|
|
|
30
43
|
from loguru import Logger as LoguruLogger
|
|
31
44
|
|
|
32
|
-
LoggerType =
|
|
45
|
+
LoggerType = Union[LoguruLogger, logging_module.Logger]
|
|
46
|
+
|
|
33
47
|
else:
|
|
34
48
|
LoggerType = None # runtime doesn’t need the type object
|
|
35
49
|
|
|
@@ -44,6 +58,47 @@ except ImportError:
|
|
|
44
58
|
logger = logging.getLogger("rpy-bridge")
|
|
45
59
|
|
|
46
60
|
|
|
61
|
+
# --- Logging setup for RFunctionCaller messages ---
# ``logger`` is loguru's logger when loguru could be imported and a stdlib
# ``logging.Logger`` otherwise. remove()/bind()/add()/opt() exist only on the
# loguru logger, so the sink configuration is skipped for the stdlib fallback
# instead of raising AttributeError at import time.
_LOGURU_ACTIVE = hasattr(logger, "bind") and hasattr(logger, "opt")

if _LOGURU_ACTIVE:
    # --- Remove default handler to override global default ---
    logger.remove()

    # --- Add a "sink" for RFunctionCaller logs ---
    _rfc_logger = logger.bind(tag="[RFunctionCaller]")
    _rfc_logger.add(
        sys.stderr,
        # Timestamp, level and message only (no module/function/line columns).
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
        level="INFO",
    )
else:
    _rfc_logger = logger


def _log_r_call(func_name: str, source_info: str) -> None:
    """
    Log that an R function was called.

    Parameters
    ----------
    func_name : str
        Name of the R function as requested by the caller.
    source_info : str
        Human-readable description of where the function was resolved from.

    Notes
    -----
    The emitted message is
    ``[rpy-bridge.RFunctionCaller] Called R function '<name>' from <source>``.
    """
    if _LOGURU_ACTIVE:
        # depth=1 attributes the record to the caller of this helper.
        _rfc_logger.opt(depth=1, record=False).info(
            "[rpy-bridge.RFunctionCaller] Called R function '{}' from {}",
            func_name,
            source_info,
        )
    else:
        # stdlib logging uses %-style lazy formatting.
        _rfc_logger.info(
            "[rpy-bridge.RFunctionCaller] Called R function '%s' from %s",
            func_name,
            source_info,
        )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------
|
|
85
|
+
# Path resolution
|
|
86
|
+
# ---------------------------------------------------------------------
|
|
87
|
+
def _normalize_scripts(
    scripts: Union[str, Path, Iterable[Union[str, Path]], None],
) -> list[Path]:
    """
    Normalize the ``scripts`` argument into a list of resolved paths.

    Parameters
    ----------
    scripts : str, Path, iterable of str/Path, or None
        A single path, several paths, or ``None`` for "no scripts".

    Returns
    -------
    list[Path]
        One ``Path.resolve()``-d entry per input path, in input order;
        an empty list when ``scripts`` is ``None``.

    Raises
    ------
    TypeError
        If ``scripts`` is not iterable or contains an element that ``Path``
        cannot accept. The underlying error is kept as ``__cause__``.
    """
    if scripts is None:
        return []
    # A str is itself iterable, so single paths must be handled before the
    # generic iteration below (otherwise it would be split into characters).
    if isinstance(scripts, (str, Path)):
        return [Path(scripts).resolve()]
    try:
        return [Path(s).resolve() for s in scripts]
    except TypeError as exc:
        raise TypeError(
            f"Invalid type for 'scripts': {type(scripts)}. Must be str, Path, or list/iterable thereof."
        ) from exc
|
|
100
|
+
|
|
101
|
+
|
|
47
102
|
# ---------------------------------------------------------------------
|
|
48
103
|
# R detection and rpy2 installation
|
|
49
104
|
# ---------------------------------------------------------------------
|
|
@@ -64,7 +119,9 @@ def ensure_rpy2_available() -> None:
|
|
|
64
119
|
|
|
65
120
|
|
|
66
121
|
def find_r_home() -> str | None:
|
|
67
|
-
"""
|
|
122
|
+
"""
|
|
123
|
+
Detect system R installation.
|
|
124
|
+
"""
|
|
68
125
|
try:
|
|
69
126
|
r_home = subprocess.check_output(
|
|
70
127
|
["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
|
|
@@ -88,26 +145,50 @@ def find_r_home() -> str | None:
|
|
|
88
145
|
return None
|
|
89
146
|
|
|
90
147
|
|
|
91
|
-
|
|
148
|
+
def _prepend_env_path(var_name: str, entry: str, sep: str = os.pathsep) -> None:
    """
    Prepend ``entry`` to the path list stored in ``os.environ[var_name]``.

    Nothing is changed when ``entry`` is already one of the ``sep``-separated
    components. When the variable is unset or empty the result is just
    ``entry`` (no trailing separator, which would add an empty search-path
    component).
    """
    current = os.environ.get(var_name, "")
    if entry in current.split(sep):
        return
    os.environ[var_name] = f"{entry}{sep}{current}" if current else entry


# Determine if we're running in CI / testing
CI_TESTING = (
    os.environ.get("GITHUB_ACTIONS") == "true" or os.environ.get("TESTING") == "1"
)

# Prefer an explicitly configured R_HOME; otherwise ask the R executable.
R_HOME = os.environ.get("R_HOME")
if not R_HOME:
    R_HOME = find_r_home()
    if not R_HOME:
        if CI_TESTING:
            logger.warning(
                "R not found; skipping all R-dependent setup in CI/testing environment."
            )
            R_HOME = None  # Explicitly None to signal "no R available"
        else:
            raise RuntimeError("R not found. Please install R or add it to PATH.")
    else:
        os.environ["R_HOME"] = R_HOME

logger.info(
    f"[rpy-bridge] R_HOME = {R_HOME if R_HOME else 'not detected; R-dependent code skipped'}"
)

# Only configure platform-specific library paths if R is available.
# All three platforms use the same exact-component membership test.
if R_HOME:
    if sys.platform == "darwin":
        _prepend_env_path(
            "DYLD_FALLBACK_LIBRARY_PATH", os.path.join(R_HOME, "lib"), ":"
        )

    elif sys.platform.startswith("linux"):
        _prepend_env_path("LD_LIBRARY_PATH", os.path.join(R_HOME, "lib"), ":")

    elif sys.platform.startswith("win"):
        _prepend_env_path("PATH", os.path.join(R_HOME, "bin", "x64"), os.pathsep)
|
|
106
191
|
|
|
107
|
-
elif sys.platform.startswith("linux"):
|
|
108
|
-
lib_path = os.path.join(R_HOME, "lib")
|
|
109
|
-
ld_path = os.environ.get("LD_LIBRARY_PATH", "")
|
|
110
|
-
os.environ["LD_LIBRARY_PATH"] = f"{lib_path}:{ld_path}"
|
|
111
192
|
|
|
112
193
|
# ---------------------------------------------------------------------
|
|
113
194
|
# Lazy rpy2 import machinery
|
|
@@ -190,24 +271,47 @@ def activate_renv(path_to_renv: Path) -> None:
|
|
|
190
271
|
renviron_file = project_dir / ".Renviron"
|
|
191
272
|
if renviron_file.is_file():
|
|
192
273
|
os.environ["R_ENVIRON_USER"] = str(renviron_file)
|
|
193
|
-
logger.info(f"R_ENVIRON_USER set to: {renviron_file}")
|
|
274
|
+
logger.info(f"[rpy-bridge] R_ENVIRON_USER set to: {renviron_file}")
|
|
194
275
|
|
|
195
276
|
rprofile_file = project_dir / ".Rprofile"
|
|
196
277
|
if rprofile_file.is_file():
|
|
197
278
|
robjects.r(f'source("{rprofile_file.as_posix()}")')
|
|
198
|
-
logger.info(f".Rprofile sourced: {rprofile_file}")
|
|
279
|
+
logger.info(f"[rpy-bridge] .Rprofile sourced: {rprofile_file}")
|
|
199
280
|
|
|
200
281
|
try:
|
|
201
282
|
robjects.r("suppressMessages(library(renv))")
|
|
202
283
|
except Exception:
|
|
203
|
-
logger.info("Installing renv package in project library...")
|
|
284
|
+
logger.info("[rpy-bridge] Installing renv package in project library...")
|
|
204
285
|
robjects.r(
|
|
205
286
|
f'install.packages("renv", repos="https://cloud.r-project.org", lib="{renv_dir / "library"}")'
|
|
206
287
|
)
|
|
207
288
|
robjects.r("library(renv)")
|
|
208
289
|
|
|
209
290
|
robjects.r(f'renv::load("{project_dir.as_posix()}")')
|
|
210
|
-
logger.info(f"renv environment loaded for project: {project_dir}")
|
|
291
|
+
logger.info(f"[rpy-bridge] renv environment loaded for project: {project_dir}")
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# ---------------------------------------------------------------------
|
|
295
|
+
# NamespaceWrapper
|
|
296
|
+
# ---------------------------------------------------------------------
|
|
297
|
+
class NamespaceWrapper:
    """
    Wraps an R script namespace for Python attribute access.

    Parameters
    ----------
    env : mapping
        Mapping of object name to object (as stored per script namespace).
        Entries are exposed as attributes of the wrapper.
    """

    def __init__(self, env):
        self._env = env

    def __getattr__(self, func_name):
        """
        Return the namespace entry called ``func_name``.

        Raises
        ------
        AttributeError
            If the namespace has no such entry.
        """
        # __getattr__ only runs after normal lookup failed. ``_env`` is read
        # from the instance dict directly: going through ``self._env`` would
        # re-enter __getattr__ without end whenever ``_env`` is not set yet
        # (e.g. while copy/pickle rebuild the object without calling __init__).
        env = self.__dict__.get("_env")
        if env is not None and func_name in env:
            return env[func_name]
        raise AttributeError(f"Function '{func_name}' not found in R namespace")

    def __dir__(self):
        """
        Include namespace entries in ``dir()`` so interactive completion sees them.
        """
        env = self.__dict__.get("_env") or {}
        return sorted(set(super().__dir__()) | {str(name) for name in env})

    def list_functions(self):
        """
        Return a list of callable functions in this namespace.
        """
        return [k for k, v in self._env.items() if callable(v)]
|
|
211
315
|
|
|
212
316
|
|
|
213
317
|
# ---------------------------------------------------------------------
|
|
@@ -215,24 +319,93 @@ def activate_renv(path_to_renv: Path) -> None:
|
|
|
215
319
|
# ---------------------------------------------------------------------
|
|
216
320
|
class RFunctionCaller:
|
|
217
321
|
"""
|
|
218
|
-
|
|
322
|
+
Primary interface for calling R functions from Python.
|
|
323
|
+
|
|
324
|
+
``RFunctionCaller`` loads one or more R scripts into isolated namespaces
|
|
325
|
+
and provides a unified ``call()`` method for executing:
|
|
326
|
+
|
|
327
|
+
* Functions defined in sourced R scripts
|
|
328
|
+
* Base R functions (e.g. ``sum``, ``mean``)
|
|
329
|
+
* Functions from installed R packages (via ``package::function``)
|
|
219
330
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
331
|
+
In most workflows, users only need to interact with this class.
|
|
332
|
+
|
|
333
|
+
Parameters
|
|
334
|
+
----------
|
|
335
|
+
path_to_renv : Path or None, optional
|
|
336
|
+
Path to an R project that uses ``renv``. This may be either the project
|
|
337
|
+
root or the ``renv/`` directory itself. If provided, the renv
|
|
338
|
+
environment is activated before any scripts are sourced.
|
|
339
|
+
|
|
340
|
+
scripts : str, Path, list[str | Path], or None, optional
|
|
341
|
+
One or more ``.R`` files or directories containing ``.R`` files.
|
|
342
|
+
Each script is sourced into its own namespace.
|
|
343
|
+
|
|
344
|
+
packages : str or list[str], optional
|
|
345
|
+
R packages to load (and install if missing) before calling functions.
|
|
346
|
+
|
|
347
|
+
Notes
|
|
348
|
+
-----
|
|
349
|
+
* Python objects are automatically converted to R objects.
|
|
350
|
+
* R return values are converted back to Python equivalents.
|
|
351
|
+
* Missing values (``None``, ``pd.NA``) are mapped to R ``NA``.
|
|
225
352
|
"""
|
|
226
353
|
|
|
227
354
|
def __init__(
|
|
228
355
|
self,
|
|
229
356
|
path_to_renv: Path | None = None,
|
|
230
|
-
|
|
231
|
-
packages: list[str] | None = None,
|
|
357
|
+
scripts: str | Path | list[str | Path] | None = None,
|
|
358
|
+
packages: str | list[str] | None = None,
|
|
359
|
+
**kwargs, # catch unexpected keywords
|
|
232
360
|
):
|
|
361
|
+
# --- Handle deprecated 'script_path' ---
|
|
362
|
+
if "script_path" in kwargs:
|
|
363
|
+
script_path_value = kwargs.pop("script_path")
|
|
364
|
+
warnings.warn(
|
|
365
|
+
"'script_path' argument is deprecated. "
|
|
366
|
+
"Please use 'scripts' instead (accepts a Path or list of Paths).",
|
|
367
|
+
DeprecationWarning,
|
|
368
|
+
stacklevel=2,
|
|
369
|
+
)
|
|
370
|
+
if scripts is None:
|
|
371
|
+
scripts = script_path_value
|
|
372
|
+
else:
|
|
373
|
+
# Both provided → prioritize scripts and ignore script_path
|
|
374
|
+
logger.warning(
|
|
375
|
+
"'script_path' ignored because 'scripts' argument is also provided."
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
self.scripts = _normalize_scripts(scripts)
|
|
379
|
+
|
|
380
|
+
# --- Check all scripts exist immediately ---
|
|
381
|
+
for script_path in self.scripts:
|
|
382
|
+
if not script_path.exists():
|
|
383
|
+
raise FileNotFoundError(f"R script path not found: {script_path}")
|
|
384
|
+
|
|
385
|
+
# Raise error if other unexpected kwargs remain
|
|
386
|
+
if kwargs:
|
|
387
|
+
raise TypeError(
|
|
388
|
+
f"RFunctionCaller.__init__() received unexpected keyword arguments: {list(kwargs.keys())}"
|
|
389
|
+
)
|
|
390
|
+
|
|
233
391
|
self.path_to_renv = path_to_renv.resolve() if path_to_renv else None
|
|
234
|
-
self.
|
|
235
|
-
|
|
392
|
+
self._namespaces: dict[str, Any] = {}
|
|
393
|
+
|
|
394
|
+
# Normalize scripts to a list
|
|
395
|
+
if scripts is None:
|
|
396
|
+
self.scripts: list[Path] = []
|
|
397
|
+
elif isinstance(scripts, Path):
|
|
398
|
+
self.scripts = [scripts.resolve()]
|
|
399
|
+
else:
|
|
400
|
+
self.scripts = [s.resolve() for s in scripts]
|
|
401
|
+
|
|
402
|
+
# Normalize packages to a list
|
|
403
|
+
if packages is None:
|
|
404
|
+
self.packages: list[str] = []
|
|
405
|
+
elif isinstance(packages, str):
|
|
406
|
+
self.packages = [packages]
|
|
407
|
+
else:
|
|
408
|
+
self.packages = packages
|
|
236
409
|
|
|
237
410
|
# Lazy-loaded attributes
|
|
238
411
|
self._r = None
|
|
@@ -247,76 +420,120 @@ class RFunctionCaller:
|
|
|
247
420
|
self.ListVector = None
|
|
248
421
|
self.NamedList = None
|
|
249
422
|
|
|
250
|
-
|
|
251
|
-
raise FileNotFoundError(f"R script not found: {self.script_path}")
|
|
252
|
-
|
|
253
|
-
self.script_dir = self.script_path.parent if self.script_path else None
|
|
254
|
-
self._script_loaded = False
|
|
423
|
+
# Internal state
|
|
255
424
|
self._renv_activated = False
|
|
256
425
|
self._packages_loaded = False
|
|
426
|
+
self._scripts_loaded = [False] * len(self.scripts)
|
|
257
427
|
|
|
258
428
|
# -----------------------------------------------------------------
|
|
259
429
|
# Internal: lazy R loading
|
|
260
430
|
# -----------------------------------------------------------------
|
|
261
|
-
def _ensure_r_loaded(self):
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
self.
|
|
269
|
-
self.
|
|
270
|
-
self.
|
|
271
|
-
self.
|
|
272
|
-
self.
|
|
273
|
-
self.
|
|
274
|
-
self.
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
self.
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
431
|
+
def _ensure_r_loaded(self) -> None:
    """
    Ensure R runtime is initialized and all configured R scripts
    are sourced exactly once, in isolated environments.

    Each ``.R`` file is sourced with ``sys.source`` into a fresh R environment
    whose parent is the global environment, while ``withr::with_dir`` sets the
    working directory to the file's directory. Callables found in that
    environment are stored in ``self._namespaces[<file stem>]``. A directory
    entry contributes every ``*.R`` file directly inside it, in sorted order.

    Raises
    ------
    ValueError
        If a script entry is neither a file nor a directory.
    """
    if self.robjects is None:
        rpy2_dict = _ensure_rpy2()
        self._RPY2 = rpy2_dict  # cache in instance
        self._r = rpy2_dict["ro"]
        self.ro = rpy2_dict["robjects"]
        self.robjects = rpy2_dict["robjects"]
        self.pandas2ri = rpy2_dict["pandas2ri"]
        self.localconverter = rpy2_dict["localconverter"]
        self.IntVector = rpy2_dict["IntVector"]
        self.FloatVector = rpy2_dict["FloatVector"]
        self.BoolVector = rpy2_dict["BoolVector"]
        self.StrVector = rpy2_dict["StrVector"]
        self.ListVector = rpy2_dict["ListVector"]
        self.NamedList = rpy2_dict["NamedList"]

    if not hasattr(self, "_namespaces"):
        self._namespaces = {}

    # This method runs before every call(); when nothing is left to source,
    # skip the withr check so R's library() is not re-evaluated each time.
    if all(self._scripts_loaded):
        return

    r = self.robjects.r

    # Ensure required R package (withr is only needed for sourcing scripts)
    self.ensure_r_package("withr")

    # opt() is loguru-only; fall back to the plain logger otherwise.
    load_logger = logger.opt(depth=2) if hasattr(logger, "opt") else logger

    # --- Iterate over scripts ---
    for idx, script_entry in enumerate(self.scripts):
        if self._scripts_loaded[idx]:
            continue

        script_entry = script_entry.resolve()

        if script_entry.is_file():
            r_files = [script_entry]
        elif script_entry.is_dir():
            r_files = sorted(script_entry.glob("*.R"))
            if not r_files:
                logger.warning(f"No .R files found in directory: {script_entry}")
                self._scripts_loaded[idx] = True
                continue
        else:
            raise ValueError(f"Invalid script path: {script_entry}")

        for script_path in r_files:
            ns_name = script_path.stem
            load_logger.info(
                f"[rpy-bridge.RFunctionCaller] Loading R script '{script_path.name}' as namespace '{ns_name}'"
            )

            if ns_name in self._namespaces:
                # Namespaces are keyed by file stem only, so same-named
                # scripts from different directories replace each other.
                logger.warning(
                    f"[rpy-bridge.RFunctionCaller] Namespace '{ns_name}' already exists; "
                    f"it is replaced by functions from {script_path}"
                )

            # Escape backslashes and double quotes so the path stays a valid
            # R string literal whatever characters it contains.
            r_literal = (
                script_path.as_posix().replace("\\", "\\\\").replace('"', '\\"')
            )

            r("env <- new.env(parent=globalenv())")
            r(f'script_path <- "{r_literal}"')

            r(
                """
                withr::with_dir(
                    dirname(script_path),
                    sys.source(basename(script_path), envir=env)
                )
                """
            )

            env_obj = r("env")
            self._namespaces[ns_name] = {
                name: env_obj[name]
                for name in env_obj.keys()
                if callable(env_obj[name])
            }

            logger.info(
                f"[rpy-bridge.RFunctionCaller] Registered {len(self._namespaces[ns_name])} functions in namespace '{ns_name}'"
            )

        self._scripts_loaded[idx] = True
|
|
509
|
+
|
|
510
|
+
# -----------------------------------------------------------------
|
|
511
|
+
# Autocomplete-friendly attribute access for script namespaces
|
|
512
|
+
# -----------------------------------------------------------------
|
|
513
|
+
def __getattr__(self, name: str):
    """
    Expose a loaded script namespace as an attribute.

    Only invoked when regular attribute lookup fails. Returns a
    ``NamespaceWrapper`` around the namespace registered under ``name``;
    raises ``AttributeError`` for anything else.
    """
    # Check the instance dict first so that a missing ``_namespaces`` cannot
    # trigger this method recursively.
    registry_ready = "_namespaces" in self.__dict__
    if not registry_ready or name not in self._namespaces:
        raise AttributeError(f"'RFunctionCaller' object has no attribute '{name}'")
    return NamespaceWrapper(self._namespaces[name])
|
|
300
518
|
|
|
301
519
|
def _clean_scalar(self, x):
|
|
302
520
|
"""
|
|
303
521
|
Clean R-style missing values to pandas/NumPy equivalents.
|
|
304
522
|
Called inside _r2py on each vector element; atomic/scalar only.
|
|
305
523
|
"""
|
|
306
|
-
|
|
307
|
-
ro = r["robjects"]
|
|
524
|
+
robjects = self.robjects
|
|
308
525
|
|
|
309
526
|
if x is None:
|
|
310
527
|
return None
|
|
311
528
|
|
|
312
529
|
if x in (
|
|
313
|
-
getattr(
|
|
314
|
-
getattr(
|
|
315
|
-
getattr(
|
|
530
|
+
getattr(robjects, "NA_Real", None),
|
|
531
|
+
getattr(robjects, "NA_Integer", None),
|
|
532
|
+
getattr(robjects, "NA_Logical", None),
|
|
316
533
|
):
|
|
317
534
|
return None
|
|
318
535
|
|
|
319
|
-
if x is getattr(
|
|
536
|
+
if x is getattr(robjects, "NA_Character", None):
|
|
320
537
|
return None
|
|
321
538
|
|
|
322
539
|
if isinstance(x, float) and np.isnan(x):
|
|
@@ -324,6 +541,105 @@ class RFunctionCaller:
|
|
|
324
541
|
|
|
325
542
|
return x
|
|
326
543
|
|
|
544
|
+
def list_namespaces(self) -> list[str]:
    """
    List the script namespaces that are currently registered.

    Scripts are sourced first if that has not happened yet.

    Returns
    -------
    list[str]
        Namespace names, in registration order.
    """
    self._ensure_r_loaded()
    return [ns_name for ns_name in self._namespaces]
|
|
555
|
+
|
|
556
|
+
def list_namespace_functions(self, namespace: str) -> list[str]:
    """
    List the callable entries of one script namespace.

    Raises
    ------
    ValueError
        If ``namespace`` is not a registered namespace.
    """
    self._ensure_r_loaded()
    registry = self._namespaces
    if namespace in registry:
        return [fn for fn, obj in registry[namespace].items() if callable(obj)]
    raise ValueError(f"Namespace '{namespace}' not found")
|
|
564
|
+
|
|
565
|
+
def _get_package_functions(self, pkg: str) -> list[str]:
    """
    Return a list of callable functions from a loaded R package.

    Parameters
    ----------
    pkg : str
        Package name; its objects are listed via R's ``ls("package:<pkg>")``.

    Returns
    -------
    list[str]
        Names of objects for which R's ``is.function`` is true. An empty
        list is returned (and a warning logged) if listing fails.
    """
    r = self.robjects.r
    try:
        # ``r["ls"]`` fetches the R function and calls it with the string as
        # an argument. Indexing ``r`` with a whole expression would look for
        # an R object literally named like the expression and always fail.
        names = [str(name) for name in r["ls"](f"package:{pkg}")]
        if not names:
            return []
        # Names are passed as arguments rather than spliced into R source,
        # so names containing quotes or backslashes cannot break the code.
        is_function = r(
            "function(name, pkg) is.function(get(name, envir = asNamespace(pkg)))"
        )
        return [name for name in names if is_function(name, pkg)[0]]
    except Exception:
        logger.warning(f"Failed to list functions for package '{pkg}'")
        return []
|
|
581
|
+
|
|
582
|
+
def list_all_functions(
    self, include_packages: bool = False
) -> dict[str, list[str]]:
    """
    Return all callable R functions grouped by script namespace and package.

    Parameters
    ----------
    include_packages : bool, default False
        Also add one entry per namespace reported by R's
        ``loadedNamespaces()``. A package for which no functions could be
        listed gets a single placeholder string instead.

    Returns
    -------
    dict[str, list[str]]
        Mapping of namespace or package name to function names. A package
        with the same name as a script namespace replaces that entry.
    """
    self._ensure_r_loaded()

    # --- Script namespaces ---
    all_funcs = {
        ns_name: [name for name, val in ns_env.items() if callable(val)]
        for ns_name, ns_env in self._namespaces.items()
    }

    # --- Loaded R packages ---
    if include_packages:
        r = self.robjects.r
        try:
            pkgs = r("loadedNamespaces()")
            for pkg in pkgs:
                funcs = self._get_package_functions(pkg)
                if not funcs:
                    # Add a placeholder note
                    funcs = [
                        "[See official documentation for functions, datasets, and objects]"
                    ]
                all_funcs[pkg] = funcs
        except Exception as exc:
            # Best effort: keep what was collected, but report the failure
            # instead of hiding it.
            logger.warning(
                f"[rpy-bridge.RFunctionCaller] Could not list loaded R packages: {exc}"
            )

    return all_funcs
|
|
613
|
+
|
|
614
|
+
def print_function_tree(
    self, include_packages: bool = False, max_display: int = 10
):
    """
    Print the available R functions as a simple tree, one group per namespace.

    Parameters
    ----------
    include_packages : bool, default False
        Forwarded to ``list_all_functions``.

    max_display : int, default 10
        Upper bound on the number of (sorted) function names printed per
        group; an ellipsis line marks a truncated group.

    Notes
    -----
    Groups without any functions are not printed. Intended for interactive
    exploration and debugging.
    """
    tree = self.list_all_functions(include_packages=include_packages)

    for ns_name, funcs in tree.items():
        if funcs:
            lines = [f"{ns_name}/"]
            lines.extend(f" {f}" for f in sorted(funcs)[:max_display])
            if len(funcs) > max_display:
                lines.append(" ...")
            print("\n".join(lines))
|
|
642
|
+
|
|
327
643
|
# -----------------------------------------------------------------
|
|
328
644
|
# Python -> R conversion
|
|
329
645
|
# -----------------------------------------------------------------
|
|
@@ -340,92 +656,56 @@ class RFunctionCaller:
|
|
|
340
656
|
StrVector = self.StrVector
|
|
341
657
|
ListVector = self.ListVector
|
|
342
658
|
localconverter = self.localconverter
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
rvec.StrVector,
|
|
354
|
-
rvec.ListVector,
|
|
355
|
-
robjects.DataFrame,
|
|
356
|
-
),
|
|
357
|
-
):
|
|
659
|
+
|
|
660
|
+
r_types = (
|
|
661
|
+
robjects.vectors.IntVector,
|
|
662
|
+
robjects.vectors.FloatVector,
|
|
663
|
+
robjects.vectors.BoolVector,
|
|
664
|
+
robjects.vectors.StrVector,
|
|
665
|
+
robjects.vectors.ListVector,
|
|
666
|
+
robjects.DataFrame,
|
|
667
|
+
)
|
|
668
|
+
if isinstance(obj, r_types):
|
|
358
669
|
return obj
|
|
359
670
|
|
|
671
|
+
def is_na(x):
|
|
672
|
+
return x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))
|
|
673
|
+
|
|
360
674
|
with localconverter(robjects.default_converter + pandas2ri.converter):
|
|
361
|
-
if obj
|
|
675
|
+
if is_na(obj):
|
|
362
676
|
return robjects.NULL
|
|
363
|
-
|
|
364
|
-
# DataFrame → data.frame
|
|
365
677
|
if isinstance(obj, pd.DataFrame):
|
|
366
678
|
return pandas2ri.py2rpy(obj)
|
|
367
|
-
|
|
368
|
-
# Series → vector
|
|
369
679
|
if isinstance(obj, pd.Series):
|
|
370
680
|
return self._py2r(obj.tolist())
|
|
371
|
-
|
|
372
|
-
# Scalars
|
|
373
681
|
if isinstance(obj, (int, float, bool, str)):
|
|
374
682
|
return obj
|
|
375
|
-
|
|
376
|
-
# Lists
|
|
377
683
|
if isinstance(obj, list):
|
|
378
684
|
if len(obj) == 0:
|
|
379
685
|
return FloatVector([])
|
|
380
|
-
elif all(isinstance(x, (int, float)) or x is None for x in obj):
|
|
381
|
-
return FloatVector(
|
|
382
|
-
[robjects.NA_Real if x is None else float(x) for x in obj]
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
def is_na(x):
|
|
386
|
-
return (
|
|
387
|
-
x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))
|
|
388
|
-
)
|
|
389
686
|
|
|
390
|
-
|
|
391
|
-
if
|
|
392
|
-
isinstance(x, (int, float)) and not isinstance(x, bool) or is_na(x)
|
|
393
|
-
for x in obj
|
|
394
|
-
):
|
|
687
|
+
types = set(type(x) for x in obj if not is_na(x))
|
|
688
|
+
if types <= {int, float}:
|
|
395
689
|
return FloatVector(
|
|
396
690
|
[robjects.NA_Real if is_na(x) else float(x) for x in obj]
|
|
397
691
|
)
|
|
398
|
-
|
|
399
|
-
# Homogeneous bool
|
|
400
|
-
if all(isinstance(x, bool) or is_na(x) for x in obj):
|
|
692
|
+
if types <= {bool}:
|
|
401
693
|
return BoolVector(
|
|
402
694
|
[robjects.NA_Logical if is_na(x) else x for x in obj]
|
|
403
695
|
)
|
|
404
|
-
|
|
405
|
-
# Homogeneous str
|
|
406
|
-
if all(isinstance(x, str) or is_na(x) for x in obj):
|
|
696
|
+
if types <= {str}:
|
|
407
697
|
return StrVector(
|
|
408
698
|
[robjects.NA_Character if is_na(x) else x for x in obj]
|
|
409
699
|
)
|
|
410
|
-
|
|
411
|
-
# Mixed or nested list → ListVector with positional keys
|
|
412
700
|
return ListVector({str(i): self._py2r(v) for i, v in enumerate(obj)})
|
|
413
|
-
|
|
414
|
-
# Dict → NamedList
|
|
415
701
|
if isinstance(obj, dict):
|
|
416
702
|
return ListVector({k: self._py2r(v) for k, v in obj.items()})
|
|
417
|
-
|
|
418
703
|
raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
|
|
419
704
|
|
|
420
705
|
# -----------------------------------------------------------------
|
|
421
706
|
# R -> Python conversion
|
|
422
707
|
# -----------------------------------------------------------------
|
|
423
708
|
def _r2py(self, obj, top_level=True):
|
|
424
|
-
"""
|
|
425
|
-
Convert R objects to Python objects robustly.
|
|
426
|
-
Handles DataFrames, NamedList/ListVector, atomic vectors, and NULL.
|
|
427
|
-
"""
|
|
428
|
-
r = self._r
|
|
429
709
|
robjects = self.robjects
|
|
430
710
|
NamedList = self.NamedList
|
|
431
711
|
ListVector = self.ListVector
|
|
@@ -433,7 +713,7 @@ class RFunctionCaller:
|
|
|
433
713
|
IntVector = self.IntVector
|
|
434
714
|
FloatVector = self.FloatVector
|
|
435
715
|
BoolVector = self.BoolVector
|
|
436
|
-
NULLType =
|
|
716
|
+
NULLType = self._RPY2["NULLType"]
|
|
437
717
|
lc = self.localconverter
|
|
438
718
|
pandas2ri = self.pandas2ri
|
|
439
719
|
|
|
@@ -444,12 +724,10 @@ class RFunctionCaller:
|
|
|
444
724
|
with lc(robjects.default_converter + pandas2ri.converter):
|
|
445
725
|
df = robjects.conversion.rpy2py(obj)
|
|
446
726
|
df = postprocess_r_dataframe(df)
|
|
447
|
-
|
|
448
|
-
return df
|
|
727
|
+
return clean_r_missing(df, caller=self)
|
|
449
728
|
|
|
450
729
|
if isinstance(obj, (NamedList, ListVector)):
|
|
451
730
|
py_obj = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
|
|
452
|
-
# Auto-unpack single-element lists only at top-level
|
|
453
731
|
if isinstance(py_obj, list) and len(py_obj) == 1 and top_level:
|
|
454
732
|
return py_obj[0]
|
|
455
733
|
return py_obj
|
|
@@ -465,58 +743,114 @@ class RFunctionCaller:
|
|
|
465
743
|
# -----------------------------------------------------------------
|
|
466
744
|
# Public: ensure R package is available
|
|
467
745
|
# -----------------------------------------------------------------
|
|
468
|
-
def ensure_r_package(self,
|
|
746
|
+
def ensure_r_package(self, pkg: str):
    """
    Load an R package, installing it from CRAN first if loading fails.

    Parameters
    ----------
    pkg : str
        R package name. It must follow R's naming rules (ASCII letters,
        digits and dots; at least two characters; starts with a letter; does
        not end with a dot) because it is embedded in R source code.

    Raises
    ------
    ValueError
        If ``pkg`` is not a syntactically valid R package name.
    """
    # The name is spliced into R code below; rejecting anything outside the
    # package-name alphabet keeps quotes or other syntax from altering it.
    valid_name = (
        isinstance(pkg, str)
        and len(pkg) >= 2
        and pkg.isascii()
        and pkg[0].isalpha()
        and not pkg.endswith(".")
        and all(ch.isalnum() or ch == "." for ch in pkg)
    )
    if not valid_name:
        raise ValueError(f"Invalid R package name: {pkg!r}")

    r = self.robjects.r
    try:
        r(f'suppressMessages(library("{pkg}", character.only=TRUE))')
    except Exception:
        # Loading failed: assume the package is missing, install and retry.
        logger.info(f"[rpy-bridge.RFunctionCaller] Package '{pkg}' not found.")
        logger.warning(
            f"[rpy-bridge.RFunctionCaller] Installing missing R package: {pkg}"
        )
        r(f'install.packages("{pkg}", repos="https://cloud.r-project.org")')
        r(f'suppressMessages(library("{pkg}", character.only=TRUE))')
|
|
475
757
|
|
|
476
758
|
# -----------------------------------------------------------------
|
|
477
759
|
# Public: call an R function
|
|
478
760
|
# -----------------------------------------------------------------
|
|
479
761
|
def call(self, func_name: str, *args, **kwargs):
    """
    Resolve an R function by name, call it, and return the converted result.

    Resolution order for an unqualified name: script namespaces (first
    match wins), then R's global environment, then lookup through
    ``robjects.r``. A ``qualifier::name`` is looked up in the script
    namespace ``qualifier`` if one exists, otherwise evaluated in R as a
    package reference.

    Parameters
    ----------
    func_name : str
        Name of the R function, optionally written as ``package::function``
        or ``script_namespace::function``.

    *args
        Positional arguments; each is converted with ``_py2r``.

    **kwargs
        Named arguments; each value is converted with ``_py2r``.

    Returns
    -------
    object
        The R result converted with ``_r2py``.

    Raises
    ------
    ValueError
        If the function cannot be found.
    RuntimeError
        If a ``package::function`` reference cannot be resolved, or the R
        call itself fails.

    Examples
    --------
    >>> rfc.call("sum", [1, 2, 3])
    >>> rfc.call("dplyr::n_distinct", [1, 2, 2, 3])
    >>> rfc.call("add_and_scale", 2, 3, scale=10)
    """
    self._ensure_r_loaded()

    func = None
    source_info = None

    qualifier, separator, member = func_name.partition("::")
    if separator:
        if qualifier in self._namespaces:
            members = self._namespaces[qualifier]
            if member not in members:
                raise ValueError(
                    f"Function '{member}' not found in R script namespace '{qualifier}'"
                )
            func = members[member]
            source_info = f"script namespace '{qualifier}'"
        else:
            # Not a script namespace: let R resolve it as package::function.
            try:
                func = self.robjects.r(f"{qualifier}::{member}")
            except Exception as e:
                raise RuntimeError(
                    f"Failed to resolve R function '{func_name}': {e}"
                ) from e
            source_info = f"R package '{qualifier}'"
    else:
        for candidate, members in self._namespaces.items():
            if func_name in members:
                func = members[func_name]
                source_info = f"script namespace '{candidate}'"
                break

    if func is None:
        try:
            func = self.robjects.globalenv[func_name]
            source_info = "global environment"
        except KeyError:
            pass

    if func is None:
        try:
            func = self.robjects.r[func_name]
            source_info = "base R / loaded package"
        except KeyError:
            raise ValueError(
                f"R function '{func_name}' not found in any namespace, global env, or base R."
            )

    converted_args = [self._py2r(value) for value in args]
    converted_kwargs = {key: self._py2r(value) for key, value in kwargs.items()}

    try:
        result = func(*converted_args, **converted_kwargs)
    except Exception as e:
        raise RuntimeError(
            f"Error calling R function '{func_name}' from {source_info}: {e}"
        ) from e

    _log_r_call(func_name, source_info)

    return self._r2py(result)
|
|
521
855
|
|
|
522
856
|
|
|
@@ -525,10 +859,6 @@ class RFunctionCaller:
|
|
|
525
859
|
# Utility functions for R ↔ Python
|
|
526
860
|
# ------------------------------
|
|
527
861
|
def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
|
|
528
|
-
"""
|
|
529
|
-
Recursively convert an R NamedList or ListVector to a Python dictionary.
|
|
530
|
-
Uses the caller._r2py method for nested conversions.
|
|
531
|
-
"""
|
|
532
862
|
r = _ensure_rpy2()
|
|
533
863
|
NamedList = r["NamedList"]
|
|
534
864
|
ListVector = r["ListVector"]
|
|
@@ -536,31 +866,24 @@ def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
|
|
|
536
866
|
if isinstance(namedlist, (NamedList, ListVector)):
|
|
537
867
|
names = namedlist.names if not callable(namedlist.names) else namedlist.names()
|
|
538
868
|
|
|
539
|
-
# Detect positional (unnamed) list
|
|
540
869
|
if names and all(str(i) == str(name) for i, name in enumerate(names)):
|
|
541
870
|
out = []
|
|
542
871
|
for v in namedlist:
|
|
543
|
-
# Nested elements are never top-level
|
|
544
872
|
val = caller._r2py(v, top_level=False)
|
|
545
873
|
out.append(val)
|
|
546
874
|
return out
|
|
547
875
|
|
|
548
|
-
# Otherwise dict
|
|
549
876
|
result = {}
|
|
550
877
|
for i, val in enumerate(namedlist):
|
|
551
878
|
key = names[i] if names and i < len(names) else str(i)
|
|
552
|
-
v_py = caller._r2py(val, top_level=False)
|
|
879
|
+
v_py = caller._r2py(val, top_level=False)
|
|
553
880
|
result[str(key)] = v_py
|
|
554
881
|
return result
|
|
555
882
|
|
|
556
|
-
# Fallback: scalar/vector at the very top
|
|
557
883
|
return caller._r2py(namedlist, top_level=top_level)
|
|
558
884
|
|
|
559
885
|
|
|
560
886
|
def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
|
|
561
|
-
"""
|
|
562
|
-
Clean an R data.frame by removing non-structural attributes like .groups and .rows.
|
|
563
|
-
"""
|
|
564
887
|
for attr in [".groups", ".rows"]:
|
|
565
888
|
try:
|
|
566
889
|
del r_df.attrs[attr]
|
|
@@ -570,18 +893,11 @@ def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
|
|
|
570
893
|
|
|
571
894
|
|
|
572
895
|
def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
|
|
573
|
-
"""
|
|
574
|
-
Replace string NAs or empty strings with pd.NA.
|
|
575
|
-
"""
|
|
576
896
|
return df.replace(["nan", "NaN", "NA", "na", ""], pd.NA)
|
|
577
897
|
|
|
578
898
|
|
|
579
899
|
def normalize_single_df_dtypes(df: pd.DataFrame) -> pd.DataFrame:
|
|
580
|
-
"""
|
|
581
|
-
Normalize dtypes in a single DataFrame after R conversion.
|
|
582
|
-
"""
|
|
583
900
|
df = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)
|
|
584
|
-
|
|
585
901
|
for col in df.columns:
|
|
586
902
|
series = df[col]
|
|
587
903
|
if pd.api.types.is_object_dtype(series):
|
|
@@ -594,18 +910,10 @@ def normalize_single_df_dtypes(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
594
910
|
|
|
595
911
|
|
|
596
912
|
def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
|
|
597
|
-
"""
|
|
598
|
-
Post-process R DataFrame:
|
|
599
|
-
- Convert R NA_integer_ sentinel (-2147483648) to pd.NA
|
|
600
|
-
- Convert R-style numeric dates to datetime
|
|
601
|
-
- Remove timezone from datetime columns
|
|
602
|
-
"""
|
|
603
913
|
for col in df.columns:
|
|
604
914
|
series = df[col]
|
|
605
|
-
|
|
606
915
|
if pd.api.types.is_integer_dtype(series):
|
|
607
916
|
df[col] = series.mask(series == -2147483648, pd.NA)
|
|
608
|
-
|
|
609
917
|
if pd.api.types.is_numeric_dtype(series):
|
|
610
918
|
values = series.dropna()
|
|
611
919
|
if not values.empty and values.between(10000, 40000).all():
|
|
@@ -615,24 +923,15 @@ def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
615
923
|
)
|
|
616
924
|
except Exception:
|
|
617
925
|
pass
|
|
618
|
-
|
|
619
926
|
if pd.api.types.is_datetime64tz_dtype(series):
|
|
620
927
|
df[col] = series.dt.tz_localize(None)
|
|
621
|
-
|
|
622
928
|
return df
|
|
623
929
|
|
|
624
930
|
|
|
625
931
|
def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
626
|
-
"""
|
|
627
|
-
Apply a series of fixes to a DataFrame converted from R:
|
|
628
|
-
- Type corrections
|
|
629
|
-
- String NA normalization
|
|
630
|
-
- Index normalization
|
|
631
|
-
"""
|
|
632
932
|
df = fix_r_dataframe_types(df)
|
|
633
933
|
df = fix_string_nans(df)
|
|
634
934
|
df = normalize_single_df_dtypes(df)
|
|
635
|
-
|
|
636
935
|
if df.index.dtype == object:
|
|
637
936
|
try:
|
|
638
937
|
int_index = df.index.astype(int)
|
|
@@ -644,62 +943,37 @@ def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
644
943
|
|
|
645
944
|
|
|
646
945
|
def clean_r_missing(obj, caller: RFunctionCaller):
|
|
647
|
-
|
|
648
|
-
Post-process R return objects for downstream Python use.
|
|
649
|
-
Recursively convert R-style missing values to pandas/NumPy:
|
|
650
|
-
- NA_integer_, NA_real_, NA_logical_ → np.nan
|
|
651
|
-
- NA_character_ → pd.NA
|
|
652
|
-
"""
|
|
653
|
-
r = _ensure_rpy2()
|
|
654
|
-
ro = r["robjects"]
|
|
655
|
-
|
|
946
|
+
robjects = caller.robjects
|
|
656
947
|
NA_MAP = {
|
|
657
|
-
getattr(
|
|
658
|
-
getattr(
|
|
659
|
-
getattr(
|
|
660
|
-
getattr(
|
|
948
|
+
getattr(robjects, "NA_Real", None): np.nan,
|
|
949
|
+
getattr(robjects, "NA_Integer", None): np.nan,
|
|
950
|
+
getattr(robjects, "NA_Logical", None): np.nan,
|
|
951
|
+
getattr(robjects, "NA_Character", None): pd.NA,
|
|
661
952
|
}
|
|
662
953
|
|
|
663
954
|
if isinstance(obj, pd.DataFrame):
|
|
664
955
|
for col in obj.columns:
|
|
665
956
|
obj[col] = obj[col].apply(lambda x: clean_r_missing(x, caller))
|
|
666
957
|
return obj
|
|
667
|
-
|
|
668
958
|
elif isinstance(obj, dict):
|
|
669
959
|
return {k: clean_r_missing(v, caller) for k, v in obj.items()}
|
|
670
|
-
|
|
671
960
|
elif isinstance(obj, list):
|
|
672
961
|
return [clean_r_missing(v, caller) for v in obj]
|
|
673
|
-
|
|
674
962
|
else:
|
|
675
963
|
return NA_MAP.get(obj, obj)
|
|
676
964
|
|
|
677
965
|
|
|
678
|
-
#
|
|
679
|
-
#
|
|
680
|
-
#
|
|
681
|
-
# for comparing R and Python DataFrames.
|
|
682
|
-
# -------------------------------------------
|
|
683
|
-
|
|
684
|
-
|
|
966
|
+
# ---------------------------------------------------------------------
|
|
967
|
+
# DataFrame comparison utilities
|
|
968
|
+
# ---------------------------------------------------------------------
|
|
685
969
|
def normalize_dtypes(
|
|
686
970
|
df1: pd.DataFrame, df2: pd.DataFrame
|
|
687
971
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
688
|
-
"""
|
|
689
|
-
Aligns column dtypes across two DataFrames for accurate comparison.
|
|
690
|
-
- Replaces empty strings with pd.NA.
|
|
691
|
-
- Attempts to coerce strings to numeric where applicable.
|
|
692
|
-
- Aligns dtypes between matching columns (e.g. float64 vs int64).
|
|
693
|
-
"""
|
|
694
972
|
for col in df1.columns.intersection(df2.columns):
|
|
695
|
-
# Replace empty strings with NA
|
|
696
973
|
df1[col] = df1[col].replace("", pd.NA)
|
|
697
974
|
df2[col] = df2[col].replace("", pd.NA)
|
|
698
|
-
|
|
699
975
|
s1, s2 = df1[col], df2[col]
|
|
700
976
|
dtype1, dtype2 = s1.dtype, s2.dtype
|
|
701
|
-
|
|
702
|
-
# If one is numeric and the other is object, try coercing both to numeric
|
|
703
977
|
if (
|
|
704
978
|
pd.api.types.is_numeric_dtype(dtype1)
|
|
705
979
|
and pd.api.types.is_object_dtype(dtype2)
|
|
@@ -710,98 +984,57 @@ def normalize_dtypes(
|
|
|
710
984
|
try:
|
|
711
985
|
df1[col] = pd.to_numeric(s1, errors="coerce")
|
|
712
986
|
df2[col] = pd.to_numeric(s2, errors="coerce")
|
|
713
|
-
continue
|
|
987
|
+
continue
|
|
714
988
|
except Exception:
|
|
715
|
-
pass
|
|
716
|
-
|
|
717
|
-
# If both are numeric but of different types (e.g., int vs float), unify to float64
|
|
989
|
+
pass
|
|
718
990
|
if pd.api.types.is_numeric_dtype(dtype1) and pd.api.types.is_numeric_dtype(
|
|
719
991
|
dtype2
|
|
720
992
|
):
|
|
721
993
|
df1[col] = df1[col].astype("float64")
|
|
722
994
|
df2[col] = df2[col].astype("float64")
|
|
723
995
|
continue
|
|
724
|
-
|
|
725
|
-
# If both are objects or strings, convert both to str for equality comparison
|
|
726
996
|
if pd.api.types.is_object_dtype(dtype1) or pd.api.types.is_object_dtype(dtype2):
|
|
727
997
|
df1[col] = df1[col].astype(str)
|
|
728
998
|
df2[col] = df2[col].astype(str)
|
|
729
|
-
|
|
730
999
|
return df1, df2
|
|
731
1000
|
|
|
732
1001
|
|
|
733
|
-
# %%
|
|
734
1002
|
def align_numeric_dtypes(
|
|
735
1003
|
df1: pd.DataFrame, df2: pd.DataFrame
|
|
736
1004
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
737
|
-
"""
|
|
738
|
-
Ensure aligned numeric dtypes between two DataFrames for accurate comparison.
|
|
739
|
-
Converts between int, float, and numeric-looking strings where appropriate.
|
|
740
|
-
Also handles NA and empty string normalization.
|
|
741
|
-
"""
|
|
742
1005
|
for col in df1.columns.intersection(df2.columns):
|
|
743
|
-
s1, s2 = df1[col], df2[col]
|
|
744
|
-
|
|
745
|
-
# Replace empty strings with NA to avoid type promotion issues
|
|
746
|
-
s1 = s1.replace("", pd.NA)
|
|
747
|
-
s2 = s2.replace("", pd.NA)
|
|
748
|
-
|
|
749
|
-
# Try to coerce both to numeric (non-destructive)
|
|
1006
|
+
s1, s2 = df1[col].replace("", pd.NA), df2[col].replace("", pd.NA)
|
|
750
1007
|
try:
|
|
751
1008
|
s1_num = pd.to_numeric(s1, errors="coerce")
|
|
752
1009
|
s2_num = pd.to_numeric(s2, errors="coerce")
|
|
753
|
-
|
|
754
|
-
# If at least one successfully converts and it's not all NaN
|
|
755
1010
|
if not s1_num.isna().all() or not s2_num.isna().all():
|
|
756
1011
|
df1[col] = s1_num.astype("float64")
|
|
757
1012
|
df2[col] = s2_num.astype("float64")
|
|
758
|
-
continue
|
|
1013
|
+
continue
|
|
759
1014
|
except Exception:
|
|
760
1015
|
pass
|
|
761
|
-
|
|
762
|
-
# Otherwise, fall back to original values
|
|
763
|
-
df1[col] = s1
|
|
764
|
-
df2[col] = s2
|
|
765
|
-
|
|
1016
|
+
df1[col], df2[col] = s1, s2
|
|
766
1017
|
return df1, df2
|
|
767
1018
|
|
|
768
1019
|
|
|
769
|
-
# %%
|
|
770
1020
|
def compare_r_py_dataframes(
|
|
771
1021
|
df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8
|
|
772
1022
|
) -> dict:
|
|
773
|
-
"""
|
|
774
|
-
Compare a Python DataFrame (df1) with an R DataFrame converted to pandas (df2).
|
|
775
|
-
|
|
776
|
-
Returns:
|
|
777
|
-
dict with mismatch diagnostics, preserving original indices in diffs.
|
|
778
|
-
"""
|
|
779
|
-
|
|
780
1023
|
results: dict[str, Any] = {
|
|
781
1024
|
"shape_mismatch": False,
|
|
782
1025
|
"columns_mismatch": False,
|
|
783
1026
|
"index_mismatch": False,
|
|
784
|
-
"numeric_diffs": {},
|
|
785
|
-
"non_numeric_diffs": {},
|
|
1027
|
+
"numeric_diffs": {},
|
|
1028
|
+
"non_numeric_diffs": {},
|
|
786
1029
|
}
|
|
787
|
-
|
|
788
|
-
# --- Preprocessing: fix R-specific issues ---
|
|
789
1030
|
df2 = fix_r_dataframe_types(df2)
|
|
790
|
-
|
|
791
|
-
# --- Replace common string NAs with proper pd.NA ---
|
|
792
1031
|
df1 = fix_string_nans(df1)
|
|
793
1032
|
df2 = fix_string_nans(df2)
|
|
794
|
-
|
|
795
|
-
# --- Normalize and align dtypes ---
|
|
796
1033
|
df1, df2 = normalize_dtypes(df1.copy(), df2.copy())
|
|
797
1034
|
df1, df2 = align_numeric_dtypes(df1, df2)
|
|
798
|
-
|
|
799
|
-
# --- Check shape ---
|
|
800
1035
|
if df1.shape != df2.shape:
|
|
801
1036
|
results["shape_mismatch"] = True
|
|
802
1037
|
print(f"[Warning] Shape mismatch: df1 {df1.shape} vs df2 {df2.shape}")
|
|
803
|
-
|
|
804
|
-
# --- Check columns ---
|
|
805
1038
|
if set(df1.columns) != set(df2.columns):
|
|
806
1039
|
results["columns_mismatch"] = True
|
|
807
1040
|
print("[Warning] Column mismatch:")
|
|
@@ -810,21 +1043,13 @@ def compare_r_py_dataframes(
|
|
|
810
1043
|
common_cols = df1.columns.intersection(df2.columns)
|
|
811
1044
|
else:
|
|
812
1045
|
common_cols = df1.columns
|
|
813
|
-
|
|
814
|
-
# --- Ensure columns are the same order ---
|
|
815
|
-
df1_aligned = df1.loc[:, common_cols]
|
|
816
|
-
df2_aligned = df2.loc[:, common_cols]
|
|
817
|
-
|
|
818
|
-
# --- Compare values column by column ---
|
|
1046
|
+
df1_aligned, df2_aligned = df1.loc[:, common_cols], df2.loc[:, common_cols]
|
|
819
1047
|
for col in common_cols:
|
|
820
|
-
col_py = df1_aligned[col]
|
|
821
|
-
col_r = df2_aligned[col]
|
|
822
|
-
|
|
1048
|
+
col_py, col_r = df1_aligned[col], df2_aligned[col]
|
|
823
1049
|
if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(
|
|
824
1050
|
col_r
|
|
825
1051
|
):
|
|
826
1052
|
col_py, col_r = col_py.align(col_r)
|
|
827
|
-
|
|
828
1053
|
close = np.isclose(
|
|
829
1054
|
col_py.fillna(np.nan),
|
|
830
1055
|
col_r.fillna(np.nan),
|
|
@@ -832,30 +1057,15 @@ def compare_r_py_dataframes(
|
|
|
832
1057
|
equal_nan=True,
|
|
833
1058
|
)
|
|
834
1059
|
if not close.all():
|
|
835
|
-
|
|
836
|
-
{
|
|
837
|
-
"df1": col_py[~close],
|
|
838
|
-
"df2": col_r[~close],
|
|
839
|
-
}
|
|
1060
|
+
results["numeric_diffs"][col] = pd.DataFrame(
|
|
1061
|
+
{"df1": col_py[~close], "df2": col_r[~close]}
|
|
840
1062
|
)
|
|
841
|
-
results["numeric_diffs"][col] = diffs
|
|
842
|
-
|
|
843
1063
|
else:
|
|
844
|
-
# Treat missing values as equal: create mask where values differ excluding matching NAs
|
|
845
1064
|
unequal = ~col_py.eq(col_r)
|
|
846
1065
|
both_na = col_py.isna() & col_r.isna()
|
|
847
1066
|
unequal = unequal & ~both_na
|
|
848
|
-
|
|
849
1067
|
if unequal.any():
|
|
850
|
-
|
|
851
|
-
{
|
|
852
|
-
"df1": col_py[unequal],
|
|
853
|
-
"df2": col_r[unequal],
|
|
854
|
-
}
|
|
1068
|
+
results["non_numeric_diffs"][col] = pd.DataFrame(
|
|
1069
|
+
{"df1": col_py[unequal], "df2": col_r[unequal]}
|
|
855
1070
|
)
|
|
856
|
-
results["non_numeric_diffs"][col] = diffs
|
|
857
|
-
|
|
858
1071
|
return results
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
# %%
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rpy-bridge
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.5
|
|
4
4
|
Summary: Python-to-R interoperability engine with environment management, type-safe conversions, data normalization, and safe R function execution.
|
|
5
5
|
Author-email: Victoria Cheung <victoriakcheung@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -150,12 +150,12 @@ uv sync
|
|
|
150
150
|
from pathlib import Path
|
|
151
151
|
from rpy_bridge import RFunctionCaller
|
|
152
152
|
|
|
153
|
-
|
|
153
|
+
rfc = RFunctionCaller(
|
|
154
154
|
path_to_renv=Path("/path/to/project"),
|
|
155
|
-
|
|
155
|
+
script=Path("/path/to/script.R"),
|
|
156
156
|
)
|
|
157
157
|
|
|
158
|
-
summary_df =
|
|
158
|
+
summary_df = rfc.call("summarize_cohort", cohort_df)
|
|
159
159
|
```
|
|
160
160
|
|
|
161
161
|
---
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
rpy_bridge/__init__.py,sha256=VDCx-CiTBJO0cMp59v-gyJGBVYHjLjATTIdtYxBsK5Q,875
|
|
2
|
+
rpy_bridge/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
rpy_bridge/rpy2_utils.py,sha256=1W1Lgt0HI3TGs20GugHceFih1uLTTilz_pmkzNkPujY,37516
|
|
4
|
+
rpy_bridge-0.3.5.dist-info/licenses/LICENSE,sha256=JwbWVcSfeoLfZ2M_ZiyygKVDvhBDW3zbqTWwXOJwmrA,1276
|
|
5
|
+
rpy_bridge-0.3.5.dist-info/METADATA,sha256=uZBsfC-lyYhYQfVvLJPGGi2XCwfq-8cSbILCFgmPAFs,9580
|
|
6
|
+
rpy_bridge-0.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
rpy_bridge-0.3.5.dist-info/top_level.txt,sha256=z9UZ77ZuUPoLqMDQEpP4btstsaM1IpXb9Cn9yBVaHmU,11
|
|
8
|
+
rpy_bridge-0.3.5.dist-info/RECORD,,
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
rpy_bridge/__init__.py,sha256=1cyWVzhVnSqMRY6OkSo8RYjTKWjmaV9WR-otu4Y5dJc,829
|
|
2
|
-
rpy_bridge/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
rpy_bridge/rpy2_utils.py,sha256=n58oSoqkZRv320dtgxEW597G8PrzCO8jCeGPZQH_5t8,29234
|
|
4
|
-
rpy_bridge-0.3.3.dist-info/licenses/LICENSE,sha256=JwbWVcSfeoLfZ2M_ZiyygKVDvhBDW3zbqTWwXOJwmrA,1276
|
|
5
|
-
rpy_bridge-0.3.3.dist-info/METADATA,sha256=Frw8qT49nSrWClRKCMKfU8cvLQVVKvpz3By99rTB_3A,9591
|
|
6
|
-
rpy_bridge-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
rpy_bridge-0.3.3.dist-info/top_level.txt,sha256=z9UZ77ZuUPoLqMDQEpP4btstsaM1IpXb9Cn9yBVaHmU,11
|
|
8
|
-
rpy_bridge-0.3.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|