rpy-bridge 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rpy_bridge/rpy2_utils.py CHANGED
@@ -16,23 +16,12 @@ import warnings
16
16
  warnings.filterwarnings("ignore", message="Environment variable .* redefined by R")
17
17
 
18
18
  from pathlib import Path
19
+ import sys
20
+ import subprocess
19
21
 
22
+ import math
20
23
  import numpy as np
21
24
  import pandas as pd
22
- import rpy2.robjects as ro
23
- from rpy2 import robjects
24
- from rpy2.rinterface_lib.sexp import NULLType
25
- from rpy2.rlike.container import NamedList
26
- from rpy2.robjects import pandas2ri
27
- from rpy2.robjects.conversion import localconverter
28
- from rpy2.robjects.vectors import (
29
- BoolVector,
30
- FloatVector,
31
- IntVector,
32
- ListVector,
33
- StrVector,
34
- )
35
- from typing import Optional
36
25
 
37
26
  try:
38
27
  from loguru import logger # type: ignore
@@ -43,358 +32,619 @@ except Exception:
43
32
  logger = logging.getLogger("rpy-bridge")
44
33
 
45
34
 
46
- # %%
47
- def activate_renv(path_to_renv: Path) -> None:
48
- """
49
- Activates the renv environment using renv::load() to ensure the correct project is loaded.
50
- This avoids sourcing activate.R directly and avoids accidentally initializing a new environment.
35
+ # ---------------------------------------------------------------------
36
+ # R detection and rpy2 installation
37
+ # ---------------------------------------------------------------------
38
+ def ensure_rpy2_installed(r_home: str):
39
+ os.environ["R_HOME"] = r_home
40
+ try:
41
+ import rpy2 # noqa: F401
42
+ except ImportError:
43
+ logger.info(
44
+ f"[Info] rpy2 not installed or incompatible with R_HOME={r_home}. Installing..."
45
+ )
46
+ subprocess.check_call(
47
+ [sys.executable, "-m", "pip", "install", "--force-reinstall", "rpy2"]
48
+ )
49
+ import rpy2 # noqa: F401
51
50
 
52
- Accepts either:
53
- - Direct path to renv directory (e.g., /path/to/renv)
54
- - Parent directory containing renv/ folder (e.g., /path/to/repos where renv/ is inside)
55
- """
56
51
 
57
- path_to_renv = path_to_renv.resolve()
52
+ def find_r_home():
53
+ try:
54
+ r_home = subprocess.check_output(
55
+ ["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
56
+ stderr=subprocess.PIPE,
57
+ text=True,
58
+ ).strip()
59
+ if r_home.endswith(">"):
60
+ r_home = r_home[:-1].strip()
61
+ return r_home
62
+ except FileNotFoundError:
63
+ possible_paths = [
64
+ "/usr/lib/R",
65
+ "/usr/local/lib/R",
66
+ "/opt/homebrew/Cellar/r/4.5.2/lib/R", # Homebrew macOS
67
+ "C:\\Program Files\\R\\R-4.5.2", # Windows
68
+ ]
69
+ for p in possible_paths:
70
+ if os.path.exists(p):
71
+ return p
72
+ return None
73
+
74
+
75
+ R_HOME = find_r_home()
76
+ if not R_HOME:
77
+ raise RuntimeError("R not found. Please install R or add it to PATH.")
78
+
79
+ logger.info(f"R_HOME = {R_HOME}")
80
+ os.environ["R_HOME"] = R_HOME
81
+ ensure_rpy2_installed(R_HOME)
82
+
83
+ # macOS dynamic library path
84
+ if sys.platform == "darwin":
85
+ lib_path = os.path.join(R_HOME, "lib")
86
+ if lib_path not in os.environ.get("DYLD_FALLBACK_LIBRARY_PATH", ""):
87
+ os.environ["DYLD_FALLBACK_LIBRARY_PATH"] = (
88
+ f"{lib_path}:{os.environ.get('DYLD_FALLBACK_LIBRARY_PATH','')}"
89
+ )
90
+
91
+ elif sys.platform.startswith("linux"):
92
+ lib_path = os.path.join(R_HOME, "lib")
93
+ ld_path = os.environ.get("LD_LIBRARY_PATH", "")
94
+ os.environ["LD_LIBRARY_PATH"] = f"{lib_path}:{ld_path}"
95
+
96
+ # ---------------------------------------------------------------------
97
+ # Lazy rpy2 import machinery
98
+ # ---------------------------------------------------------------------
99
+ _RPY2: dict | None = None
100
+
58
101
 
59
- # Determine if path_to_renv is the renv directory itself or its parent
102
+ def _require_rpy2(raise_on_missing: bool = True) -> dict | None:
103
+ global _RPY2
104
+ if _RPY2 is not None:
105
+ return _RPY2
106
+
107
+ try:
108
+ import rpy2.robjects as ro
109
+ from rpy2 import robjects
110
+ from rpy2.robjects import pandas2ri
111
+ from rpy2.robjects.conversion import localconverter
112
+ from rpy2.robjects.vectors import (
113
+ BoolVector,
114
+ FloatVector,
115
+ IntVector,
116
+ ListVector,
117
+ StrVector,
118
+ )
119
+ from rpy2.rinterface_lib.sexp import NULLType
120
+ from rpy2.rlike.container import NamedList
121
+
122
+ _RPY2 = {
123
+ "ro": ro,
124
+ "robjects": robjects,
125
+ "pandas2ri": pandas2ri,
126
+ "localconverter": localconverter,
127
+ "BoolVector": BoolVector,
128
+ "FloatVector": FloatVector,
129
+ "IntVector": IntVector,
130
+ "ListVector": ListVector,
131
+ "StrVector": StrVector,
132
+ "NULLType": NULLType,
133
+ "NamedList": NamedList,
134
+ }
135
+ return _RPY2
136
+
137
+ except ImportError as e:
138
+ if raise_on_missing:
139
+ raise RuntimeError(
140
+ "R support requires optional dependency `rpy2`. Install with: pip install rpy-bridge[r]"
141
+ ) from e
142
+ return None
143
+
144
+
145
+ def _ensure_rpy2() -> dict:
146
+ global _RPY2
147
+ if _RPY2 is None:
148
+ _RPY2 = _require_rpy2()
149
+ return _RPY2
150
+
151
+
152
+ # ---------------------------------------------------------------------
153
+ # Activate renv
154
+ # ---------------------------------------------------------------------
155
+ def activate_renv(path_to_renv: Path) -> None:
156
+ r = _ensure_rpy2()
157
+ robjects = r["robjects"]
158
+
159
+ path_to_renv = path_to_renv.resolve()
60
160
  if path_to_renv.name == "renv" and (path_to_renv / "activate.R").exists():
61
- # Path points directly to renv directory
62
161
  renv_dir = path_to_renv
63
- renv_project_dir = path_to_renv.parent
162
+ project_dir = path_to_renv.parent
64
163
  else:
65
- # Path points to parent directory containing renv/
66
164
  renv_dir = path_to_renv / "renv"
67
- renv_project_dir = path_to_renv
165
+ project_dir = path_to_renv
68
166
 
69
167
  renv_activate = renv_dir / "activate.R"
70
- renv_lock = renv_project_dir / "renv.lock"
168
+ renv_lock = project_dir / "renv.lock"
71
169
 
72
170
  if not renv_activate.exists() or not renv_lock.exists():
73
- raise FileNotFoundError(
74
- f"[Error] renv environment not found or incomplete.\n"
75
- f" Expected activate.R at: {renv_activate}\n"
76
- f" Expected renv.lock at: {renv_lock}\n"
77
- f" Provided path: {path_to_renv}"
78
- )
171
+ raise FileNotFoundError(f"[Error] renv environment incomplete: {path_to_renv}")
79
172
 
80
- # Optional: set R_ENVIRON_USER if .Renviron exists
81
- renviron_file = renv_project_dir / ".Renviron"
173
+ renviron_file = project_dir / ".Renviron"
82
174
  if renviron_file.is_file():
83
175
  os.environ["R_ENVIRON_USER"] = str(renviron_file)
84
- logger.info("R_ENVIRON_USER set to: {}", renviron_file)
176
+ logger.info(f"R_ENVIRON_USER set to: {renviron_file}")
177
+
178
+ rprofile_file = project_dir / ".Rprofile"
179
+ if rprofile_file.is_file():
180
+ robjects.r(f'source("{rprofile_file.as_posix()}")')
181
+ logger.info(f".Rprofile sourced: {rprofile_file}")
85
182
 
86
- # Load the renv package
87
183
  try:
88
- robjects.r("library(renv)")
184
+ robjects.r("suppressMessages(library(renv))")
89
185
  except Exception:
90
- print("[Info] renv package not found in R. Attempting to install...")
91
- robjects.r('install.packages("renv", repos="https://cloud.r-project.org")')
92
- # Try loading again after installation
186
+ logger.info("Installing renv package in project library...")
187
+ robjects.r(
188
+ f'install.packages("renv", repos="https://cloud.r-project.org", lib="{renv_dir / "library"}")'
189
+ )
93
190
  robjects.r("library(renv)")
94
191
 
95
- # Load the renv environment using renv::load(path)
96
- try:
97
- logger.info("Using R at: {}", robjects.r("R.home()")[0])
98
- robjects.r(f'renv::load("{renv_project_dir.as_posix()}")')
99
- logger.info("renv environment loaded for project: {}", renv_project_dir)
100
- except Exception as e:
101
- raise RuntimeError(f"[Error] Failed to load renv environment: {e}")
102
-
103
- logger.debug(".libPaths(): {}", robjects.r(".libPaths()"))
192
+ robjects.r(f'renv::load("{project_dir.as_posix()}")')
193
+ logger.info(f"renv environment loaded for project: {project_dir}")
104
194
 
105
195
 
106
- # %%
196
+ # ---------------------------------------------------------------------
197
+ # RFunctionCaller
198
+ # ---------------------------------------------------------------------
107
199
  class RFunctionCaller:
108
200
  """
109
- A utility class to load and execute R functions from a specified R script using rpy2.
110
- """
201
+ Utility to load and call R functions from a script, lazily loading rpy2 and activating renv.
111
202
 
112
- def __init__(self, path_to_renv: Path | None, script_path: Path):
113
- """
114
- Initialize the RFunctionCaller with the path to the renv environment and the R script.
115
- Set path_to_renv to None if no renv is used.
116
- """
117
- if not script_path.exists():
118
- raise FileNotFoundError(f"R script not found: {script_path}")
203
+ Supports:
204
+ - Scripts with custom functions
205
+ - Base R functions
206
+ - Functions in installed packages
207
+ - Automatic conversion of Python types (lists, dicts, scalars, pandas DataFrames) to R objects
208
+ """
119
209
 
210
+ def __init__(
211
+ self,
212
+ path_to_renv: Path | None = None,
213
+ script_path: Path | None = None,
214
+ packages: list[str] | None = None,
215
+ ):
120
216
  self.path_to_renv = path_to_renv.resolve() if path_to_renv else None
217
+ self.script_path = script_path.resolve() if script_path else None
218
+ self.packages = packages or None
219
+
220
+ # Lazy-loaded attributes
221
+ self._r = None
222
+ self.ro = None
223
+ self.robjects = None
224
+ self.pandas2ri = None
225
+ self.localconverter = None
226
+ self.IntVector = None
227
+ self.FloatVector = None
228
+ self.BoolVector = None
229
+ self.StrVector = None
230
+ self.ListVector = None
231
+ self.NamedList = None
232
+
233
+ if self.script_path and not self.script_path.exists():
234
+ raise FileNotFoundError(f"R script not found: {self.script_path}")
235
+
236
+ self.script_dir = self.script_path.parent if self.script_path else None
237
+ self._script_loaded = False
238
+ self._renv_activated = False
239
+ self._packages_loaded = False
240
+
241
+ # -----------------------------------------------------------------
242
+ # Internal: lazy R loading
243
+ # -----------------------------------------------------------------
244
+ def _ensure_r_loaded(self):
245
+ if self._r is None:
246
+ r = _require_rpy2(raise_on_missing=True)
247
+ self._r = r
248
+ self.ro = r["ro"]
249
+ self.robjects = r["robjects"]
250
+ self.pandas2ri = r["pandas2ri"]
251
+ self.localconverter = r["localconverter"]
252
+ self.IntVector = r["IntVector"]
253
+ self.FloatVector = r["FloatVector"]
254
+ self.BoolVector = r["BoolVector"]
255
+ self.StrVector = r["StrVector"]
256
+ self.ListVector = r["ListVector"]
257
+ self.NamedList = r["NamedList"]
258
+
259
+ # Activate renv
260
+ if self.path_to_renv and not self._renv_activated:
261
+ activate_renv(self.path_to_renv)
262
+ self._renv_activated = True
121
263
 
122
- self.script_path = script_path.resolve()
123
- self.script_dir = self.script_path.parent
124
-
125
- self._load_script()
126
-
127
- def _load_script(self):
264
+ # Load packages
265
+ if self.packages and not self._packages_loaded:
266
+ for pkg in self.packages:
267
+ try:
268
+ self.robjects.r(f'suppressMessages(library("{pkg}"))')
269
+ except Exception:
270
+ logger.info(f"Package '{pkg}' not found. Installing...")
271
+ self.robjects.r(
272
+ f'install.packages("{pkg}", repos="https://cloud.r-project.org")'
273
+ )
274
+ self.robjects.r(f'suppressMessages(library("{pkg}"))')
275
+ self._packages_loaded = True
276
+
277
+ # Source script
278
+ if self.script_path and not self._script_loaded:
279
+ self.robjects.r(f'setwd("{self.script_dir.as_posix()}")')
280
+ self.robjects.r(f'source("{self.script_path.as_posix()}")')
281
+ logger.info(f"R script sourced: {self.script_path.name}")
282
+ self._script_loaded = True
283
+
284
+ def _clean_scalar(self, x):
128
285
  """
129
- Set the R working directory and source the R script.
286
+ Clean R-style missing values to pandas/NumPy equivalents.
287
+ Called inside _r2py on each vector element; atomic/scalar only.
130
288
  """
131
- if self.path_to_renv:
132
- activate_renv(self.path_to_renv)
133
- else:
134
- logger.info("No renv path provided; using base or current environment.")
289
+ r = self._r
290
+ ro = r["robjects"]
135
291
 
136
- # Set the working directory to the script's directory
137
- robjects.r(f'setwd("{self.script_dir.as_posix()}")')
138
- robjects.r(f'source("{self.script_path.as_posix()}")')
139
- logger.info("R script sourced: {}", self.script_path.name)
292
+ if x is None:
293
+ return None
140
294
 
141
- def call(self, function_name: str, *args: object, **kwargs: object) -> object:
142
- """
143
- Call an R function from the sourced script, and recursively convert &
144
- post-process the result.
295
+ if x in (
296
+ getattr(ro, "NA_Real", None),
297
+ getattr(ro, "NA_Integer", None),
298
+ getattr(ro, "NA_Logical", None),
299
+ ):
300
+ return None
145
301
 
146
- Handles:
147
- - Direct data.frame
148
- - NamedList or ListVector
149
- - Nested lists with data.frames inside
150
- """
302
+ if x is getattr(ro, "NA_Character", None):
303
+ return None
151
304
 
152
- def _recursive_postprocess(obj):
153
- # Handle single DataFrame
154
- if isinstance(obj, pd.DataFrame):
155
- return postprocess_r_dataframe(obj)
305
+ if isinstance(x, float) and np.isnan(x):
306
+ return None
156
307
 
157
- # Handle dictionary (e.g. NamedList converted)
158
- elif isinstance(obj, dict):
159
- return {k: _recursive_postprocess(v) for k, v in obj.items()}
308
+ return x
160
309
 
161
- # Handle list of items
162
- elif isinstance(obj, list):
163
- return [_recursive_postprocess(item) for item in obj]
310
+ # -----------------------------------------------------------------
311
+ # Python -> R conversion
312
+ # -----------------------------------------------------------------
313
+ def _py2r(self, obj):
314
+ """
315
+ Convert Python objects to R objects robustly.
316
+ Handles scalars, None/pd.NA, lists, dicts, and pandas DataFrames.
317
+ """
318
+ self._ensure_r_loaded()
319
+ robjects = self.robjects
320
+ pandas2ri = self.pandas2ri
321
+ IntVector = self.IntVector
322
+ FloatVector = self.FloatVector
323
+ BoolVector = self.BoolVector
324
+ StrVector = self.StrVector
325
+ ListVector = self.ListVector
326
+ localconverter = self.localconverter
327
+ import pandas as pd
328
+ import rpy2.robjects.vectors as rvec
329
+
330
+ # Pass through existing R objects
331
+ if isinstance(
332
+ obj,
333
+ (
334
+ rvec.IntVector,
335
+ rvec.FloatVector,
336
+ rvec.BoolVector,
337
+ rvec.StrVector,
338
+ rvec.ListVector,
339
+ robjects.DataFrame,
340
+ ),
341
+ ):
342
+ return obj
164
343
 
165
- return obj # Primitive values stay as-is
344
+ with localconverter(robjects.default_converter + pandas2ri.converter):
345
+ if obj is None or obj is pd.NA:
346
+ return robjects.NULL
166
347
 
348
+ # DataFrame → data.frame
349
+ if isinstance(obj, pd.DataFrame):
350
+ return pandas2ri.py2rpy(obj)
351
+
352
+ # Series → vector
353
+ if isinstance(obj, pd.Series):
354
+ return self._py2r(obj.tolist())
355
+
356
+ # Scalars
357
+ if isinstance(obj, (int, float, bool, str)):
358
+ return obj
359
+
360
+ # Lists
361
+ if isinstance(obj, list):
362
+ if len(obj) == 0:
363
+ return FloatVector([])
364
+ elif all(isinstance(x, (int, float)) or x is None for x in obj):
365
+ return FloatVector(
366
+ [robjects.NA_Real if x is None else float(x) for x in obj]
367
+ )
368
+
369
+ def is_na(x):
370
+ return (
371
+ x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))
372
+ )
373
+
374
+ # Homogeneous numeric
375
+ if all(
376
+ isinstance(x, (int, float)) and not isinstance(x, bool) or is_na(x)
377
+ for x in obj
378
+ ):
379
+ return FloatVector(
380
+ [robjects.NA_Real if is_na(x) else float(x) for x in obj]
381
+ )
382
+
383
+ # Homogeneous bool
384
+ if all(isinstance(x, bool) or is_na(x) for x in obj):
385
+ return BoolVector(
386
+ [robjects.NA_Logical if is_na(x) else x for x in obj]
387
+ )
388
+
389
+ # Homogeneous str
390
+ if all(isinstance(x, str) or is_na(x) for x in obj):
391
+ return StrVector(
392
+ [robjects.NA_Character if is_na(x) else x for x in obj]
393
+ )
394
+
395
+ # Mixed or nested list → ListVector with positional keys
396
+ return ListVector({str(i): self._py2r(v) for i, v in enumerate(obj)})
397
+
398
+ # Dict → NamedList
399
+ if isinstance(obj, dict):
400
+ return ListVector({k: self._py2r(v) for k, v in obj.items()})
401
+
402
+ raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
403
+
404
+ # -----------------------------------------------------------------
405
+ # R -> Python conversion
406
+ # -----------------------------------------------------------------
407
+ def _r2py(self, obj, top_level=True):
408
+ """
409
+ Convert R objects to Python objects robustly.
410
+ Handles DataFrames, NamedList/ListVector, atomic vectors, and NULL.
411
+ """
412
+ r = self._r
413
+ robjects = self.robjects
414
+ NamedList = self.NamedList
415
+ ListVector = self.ListVector
416
+ StrVector = self.StrVector
417
+ IntVector = self.IntVector
418
+ FloatVector = self.FloatVector
419
+ BoolVector = self.BoolVector
420
+ NULLType = r["NULLType"]
421
+ lc = self.localconverter
422
+ pandas2ri = self.pandas2ri
423
+
424
+ if isinstance(obj, NULLType):
425
+ return None
426
+
427
+ if isinstance(obj, robjects.DataFrame):
428
+ with lc(robjects.default_converter + pandas2ri.converter):
429
+ df = robjects.conversion.rpy2py(obj)
430
+ df = postprocess_r_dataframe(df)
431
+ df = clean_r_missing(df, caller=self)
432
+ return df
433
+
434
+ if isinstance(obj, (NamedList, ListVector)):
435
+ py_obj = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
436
+ # Auto-unpack single-element lists only at top-level
437
+ if isinstance(py_obj, list) and len(py_obj) == 1 and top_level:
438
+ return py_obj[0]
439
+ return py_obj
440
+
441
+ if isinstance(obj, (StrVector, IntVector, FloatVector, BoolVector)):
442
+ py_list = [self._clean_scalar(v) for v in obj]
443
+ if len(py_list) == 1 and top_level:
444
+ return py_list[0]
445
+ return py_list
446
+
447
+ return self._clean_scalar(obj)
448
+
449
+ # -----------------------------------------------------------------
450
+ # Public: ensure R package is available
451
+ # -----------------------------------------------------------------
452
+ def ensure_r_package(self, pkg_name: str):
453
+ r = self.robjects.r
167
454
  try:
168
- r_func = robjects.globalenv[function_name]
169
-
170
- with localconverter(robjects.default_converter + pandas2ri.converter):
171
- r_args = [robjects.conversion.py2rpy(arg) for arg in args]
172
- r_kwargs = {k: robjects.conversion.py2rpy(v) for k, v in kwargs.items()}
173
- result = r_func(*r_args, **r_kwargs)
455
+ r(f'suppressMessages(library("{pkg_name}", character.only=TRUE))')
456
+ except Exception:
457
+ r(f'install.packages("{pkg_name}", repos="https://cloud.r-project.org")')
458
+ r(f'suppressMessages(library("{pkg_name}", character.only=TRUE))')
174
459
 
175
- # Step 1: Try direct conversion
176
- with localconverter(robjects.default_converter + pandas2ri.converter):
177
- py_result = robjects.conversion.rpy2py(result)
460
+ # -----------------------------------------------------------------
461
+ # Public: call an R function
462
+ # -----------------------------------------------------------------
463
+ def call(self, func_name: str, *args, **kwargs):
464
+ """
465
+ Call an R function safely. Supports:
466
+ - functions defined in scripts
467
+ - base R functions
468
+ - functions in loaded packages
469
+ """
470
+ self._ensure_r_loaded()
178
471
 
179
- # Step 2: If it's still an R container, convert it
180
- if isinstance(py_result, (NamedList, ListVector)):
181
- py_result = r_namedlist_to_dict(py_result)
472
+ # --- Find the function ---
473
+ try:
474
+ func = self.robjects.globalenv[func_name] # script-defined
475
+ except KeyError:
476
+ try:
477
+ func = self.robjects.r[func_name] # base or package function
478
+ except KeyError:
479
+ raise ValueError(f"R function '{func_name}' not found.")
182
480
 
183
- # Step 3: Recursively process any nested frames
184
- return replace_r_na(_recursive_postprocess(py_result))
481
+ # --- Convert Python args to R ---
482
+ r_args = [self._py2r(a) for a in args]
483
+ r_kwargs = {k: self._py2r(v) for k, v in kwargs.items()}
185
484
 
186
- except KeyError:
187
- raise ValueError(f"Function '{function_name}' not found in the R script.")
485
+ # --- Call safely ---
486
+ try:
487
+ result = func(*r_args, **r_kwargs)
188
488
  except Exception as e:
189
- raise RuntimeError(f"Error calling R function '{function_name}': {e}")
190
-
191
- @classmethod
192
- def from_github(
193
- cls,
194
- repo: str,
195
- file_path: str,
196
- ref: str = "main",
197
- token: Optional[str] = None,
198
- cache_dir: Optional[Path] = None,
199
- path_to_renv: Optional[Path] = None,
200
- trust_remote_code: bool = False,
201
- require_token: bool = False,
202
- ) -> "RFunctionCaller | Path":
203
- """
204
- Download an R script from a GitHub repository and construct an RFunctionCaller.
205
-
206
- Args:
207
- repo: repository in the form "owner/repo".
208
- file_path: path to the R script inside the repo (e.g. "scripts/my.R").
209
- ref: branch name, tag or commit SHA. Defaults to "main".
210
- token: optional GitHub token for private repos. If None, looks at
211
- environment variables `GITHUB_TOKEN` or `GH_TOKEN`.
212
- cache_dir: optional directory to cache downloaded files. Defaults to
213
- `~/.cache/rpy-bridge`.
214
- path_to_renv: optional path to renv or project directory to use.
215
- trust_remote_code: MUST be True to execute remote code. If False,
216
- the function will only return the local cached path.
217
-
218
- Returns:
219
- If `trust_remote_code` is True, returns an `RFunctionCaller` instance
220
- ready to call functions from the downloaded script. Otherwise returns
221
- the `Path` to the cached script so the caller can inspect it first.
222
- """
223
- raise NotImplementedError(
224
- "RFunctionCaller.from_github was removed. Clone repositories locally and pass a local script_path to RFunctionCaller instead."
225
- )
489
+ raise RuntimeError(f"Error calling R function '{func_name}': {e}")
490
+
491
+ # --- Convert R result back to Python ---
492
+ return self._r2py(result)
226
493
 
227
494
 
228
495
  # %%
229
- def r_namedlist_to_dict(namedlist: object) -> object:
496
+ # ------------------------------
497
+ # Utility functions for R ↔ Python
498
+ # ------------------------------
499
+ def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
230
500
  """
231
501
  Recursively convert an R NamedList or ListVector to a Python dictionary.
232
- - Unwrap atomic R vectors (StrVector, IntVector, etc.) into Python lists or dicts if named.
233
- - Convert data.frames to pandas DataFrames.
234
- - Handles NULL or unnamed cases gracefully.
502
+ Uses the caller._r2py method for nested conversions.
235
503
  """
504
+ r = _ensure_rpy2()
505
+ NamedList = r["NamedList"]
506
+ ListVector = r["ListVector"]
236
507
 
237
- # -------------------------------------------
238
- # Handle named lists (NamedList or ListVector)
239
- # -------------------------------------------
240
508
  if isinstance(namedlist, (NamedList, ListVector)):
241
509
  names = namedlist.names if not callable(namedlist.names) else namedlist.names()
510
+
511
+ # Detect positional (unnamed) list
512
+ if names and all(str(i) == str(name) for i, name in enumerate(names)):
513
+ out = []
514
+ for v in namedlist:
515
+ # Nested elements are never top-level
516
+ val = caller._r2py(v, top_level=False)
517
+ out.append(val)
518
+ return out
519
+
520
+ # Otherwise dict
242
521
  result = {}
522
+ for i, val in enumerate(namedlist):
523
+ key = names[i] if names and i < len(names) else str(i)
524
+ v_py = caller._r2py(val, top_level=False) # nested elements
525
+ result[str(key)] = v_py
526
+ return result
243
527
 
244
- # Only iterate if names is not NULL
245
- if not isinstance(names, NULLType):
246
- for key, value in zip(names, namedlist):
247
- key_str = str(key) if key is not None and not isinstance(key, NULLType) else None
248
- if key_str:
249
- result[key_str] = r_namedlist_to_dict(value)
250
- return result
251
-
252
- # If no names, fallback to a list
253
- return [r_namedlist_to_dict(value) for value in namedlist]
254
-
255
- # -------------------------------------------
256
- # Handle atomic vectors (StrVector, IntVector, etc.)
257
- # These may have names (e.g., c(a = 1, b = 2)) — if so, return a dict.
258
- # Otherwise, convert to plain Python list.
259
- # -------------------------------------------
260
- if isinstance(namedlist, (StrVector, IntVector, FloatVector, BoolVector)):
261
- names = namedlist.names if not callable(namedlist.names) else namedlist.names()
262
- if not isinstance(names, NULLType):
263
- return {
264
- str(n): v
265
- for n, v in zip(names, list(namedlist))
266
- if n is not None and not isinstance(n, NULLType)
267
- }
268
- return list(namedlist)
269
-
270
- # -------------------------------------------
271
- # Attempt conversion via pandas2ri — works for data.frames, tibbles, etc.
272
- # If it fails, fall back to returning the original R object.
273
- # -------------------------------------------
274
- with localconverter(robjects.default_converter + pandas2ri.converter):
275
- try:
276
- return robjects.conversion.rpy2py(namedlist)
277
- except Exception:
278
- return namedlist
528
+ # Fallback: scalar/vector at the very top
529
+ return caller._r2py(namedlist, top_level=top_level)
279
530
 
280
531
 
281
- # %%
282
- def clean_r_dataframe(r_df: object) -> object:
532
+ def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
283
533
  """
284
- Clean an R data.frame object by removing common non-structural attributes
285
- like .groups and .rows.
534
+ Clean an R data.frame by removing non-structural attributes like .groups and .rows.
286
535
  """
287
536
  for attr in [".groups", ".rows"]:
288
537
  try:
289
- del r_df.attr[attr]
538
+ del r_df.attrs[attr]
290
539
  except (KeyError, AttributeError):
291
540
  pass
292
541
  return r_df
293
542
 
294
543
 
295
- # %%
296
544
  def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
297
- # Replace common string versions of NA/NaN with actual pd.NA
298
- return df.replace(["nan", "NaN", "NA", "na", ""], pd.NA)
299
-
300
-
301
- # %%
302
- def replace_r_na(obj: object) -> object:
303
545
  """
304
- Recursively replace R NA_Character with np.nan in any structure.
546
+ Replace string NAs or empty strings with pd.NA.
305
547
  """
306
- # Handle DataFrame
307
- if isinstance(obj, pd.DataFrame):
308
- return (
309
- obj.replace({ro.NA_Character: np.nan}, regex=False)
310
- if hasattr(ro, "NA_Character")
311
- else obj
312
- )
313
- elif isinstance(obj, dict):
314
- return {k: replace_r_na(v) for k, v in obj.items()}
315
- elif isinstance(obj, list):
316
- return [replace_r_na(item) for item in obj]
317
- elif hasattr(ro, "NA_Character") and obj is ro.NA_Character:
318
- return np.nan
319
- else:
320
- return obj
548
+ return df.replace(["nan", "NaN", "NA", "na", ""], pd.NA)
321
549
 
322
550
 
323
- # %%
324
551
  def normalize_single_df_dtypes(df: pd.DataFrame) -> pd.DataFrame:
552
+ """
553
+ Normalize dtypes in a single DataFrame after R conversion.
554
+ """
325
555
  df = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)
326
556
 
327
557
  for col in df.columns:
328
558
  series = df[col]
329
-
330
- # Try converting object/string columns to numeric if possible
331
559
  if pd.api.types.is_object_dtype(series):
332
560
  coerced = pd.to_numeric(series, errors="coerce")
333
- # Replace column if conversion produced fewer NaNs (meaning more numeric)
334
561
  if coerced.notna().sum() >= series.notna().sum() * 0.5:
335
562
  df[col] = coerced
336
-
337
- # Cast integer columns with NA to float to accommodate pd.NA
338
- if pd.api.types.is_integer_dtype(df[col]):
339
- if df[col].isna().any():
340
- df[col] = df[col].astype("float64")
341
-
563
+ if pd.api.types.is_integer_dtype(df[col]) and df[col].isna().any():
564
+ df[col] = df[col].astype("float64")
342
565
  return df
343
566
 
344
567
 
345
- # %%
346
568
  def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
347
569
  """
348
- Post-process a DataFrame converted from R via rpy2:
349
- - Converts numeric columns that represent R dates into datetime
350
- - Converts timezone-aware datetimes to naive datetimes
351
- - Replaces R's NA_integer_ sentinel (-2147483648) with pd.NA
570
+ Post-process R DataFrame:
571
+ - Convert R NA_integer_ sentinel (-2147483648) to pd.NA
572
+ - Convert R-style numeric dates to datetime
573
+ - Remove timezone from datetime columns
352
574
  """
353
575
  for col in df.columns:
354
576
  series = df[col]
355
577
 
356
- # Fix R's NA_integer_ sentinel (-2147483648)
357
578
  if pd.api.types.is_integer_dtype(series):
358
- if (series == -2147483648).any():
359
- df[col] = series.mask(series == -2147483648, pd.NA)
579
+ df[col] = series.mask(series == -2147483648, pd.NA)
360
580
 
361
- # Convert R-style date columns (days since 1970) to datetime
362
581
  if pd.api.types.is_numeric_dtype(series):
363
582
  values = series.dropna()
364
583
  if not values.empty and values.between(10000, 40000).all():
365
584
  try:
366
- # "1970-01-01" is the reference date for Unix Epoch
367
- df[col] = pd.to_datetime("1970-01-01") + pd.to_timedelta(series, unit="D")
585
+ df[col] = pd.to_datetime("1970-01-01") + pd.to_timedelta(
586
+ series, unit="D"
587
+ )
368
588
  except Exception:
369
589
  pass
370
590
 
371
- # Remove timezone from datetime columns (e.g., POSIXct with tz)
372
591
  if pd.api.types.is_datetime64tz_dtype(series):
373
592
  df[col] = series.dt.tz_localize(None)
374
593
 
375
594
  return df
376
595
 
377
596
 
378
- # %%
379
597
  def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
598
+ """
599
+ Apply a series of fixes to a DataFrame converted from R:
600
+ - Type corrections
601
+ - String NA normalization
602
+ - Index normalization
603
+ """
380
604
  df = fix_r_dataframe_types(df)
381
605
  df = fix_string_nans(df)
382
606
  df = normalize_single_df_dtypes(df)
383
607
 
384
- # Normalize R-style string index starting from "1"
385
608
  if df.index.dtype == object:
386
609
  try:
387
610
  int_index = df.index.astype(int)
388
- if (int_index == (np.arange(len(df)) + 1)).all():
611
+ if (int_index == np.arange(len(df)) + 1).all():
389
612
  df.index = pd.RangeIndex(start=0, stop=len(df))
390
613
  except Exception:
391
- pass # leave index as-is if not convertible
614
+ pass
392
615
  return df
393
616
 
394
617
 
395
- # Note: GitHub fetch helpers were removed to keep the API focused on
396
- # local script invocation. If you need to run remote scripts, clone the
397
- # repository locally and pass the local `script_path` to `RFunctionCaller`.
618
+ def clean_r_missing(obj, caller: RFunctionCaller):
619
+ """
620
+ Post-process R return objects for downstream Python use.
621
+ Recursively convert R-style missing values to pandas/NumPy:
622
+ - NA_integer_, NA_real_, NA_logical_ → np.nan
623
+ - NA_character_ → pd.NA
624
+ """
625
+ r = _ensure_rpy2()
626
+ ro = r["robjects"]
627
+
628
+ NA_MAP = {
629
+ getattr(ro, "NA_Real", None): np.nan,
630
+ getattr(ro, "NA_Integer", None): np.nan,
631
+ getattr(ro, "NA_Logical", None): np.nan,
632
+ getattr(ro, "NA_Character", None): pd.NA,
633
+ }
634
+
635
+ if isinstance(obj, pd.DataFrame):
636
+ for col in obj.columns:
637
+ obj[col] = obj[col].apply(lambda x: clean_r_missing(x, caller))
638
+ return obj
639
+
640
+ elif isinstance(obj, dict):
641
+ return {k: clean_r_missing(v, caller) for k, v in obj.items()}
642
+
643
+ elif isinstance(obj, list):
644
+ return [clean_r_missing(v, caller) for v in obj]
645
+
646
+ else:
647
+ return NA_MAP.get(obj, obj)
398
648
 
399
649
 
400
650
  # %%
@@ -404,7 +654,9 @@ def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
404
654
  # -------------------------------------------
405
655
 
406
656
 
407
- def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
657
+ def normalize_dtypes(
658
+ df1: pd.DataFrame, df2: pd.DataFrame
659
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
408
660
  """
409
661
  Aligns column dtypes across two DataFrames for accurate comparison.
410
662
  - Replaces empty strings with pd.NA.
@@ -420,8 +672,12 @@ def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame
420
672
  dtype1, dtype2 = s1.dtype, s2.dtype
421
673
 
422
674
  # If one is numeric and the other is object, try coercing both to numeric
423
- if (pd.api.types.is_numeric_dtype(dtype1) and pd.api.types.is_object_dtype(dtype2)) or (
424
- pd.api.types.is_object_dtype(dtype1) and pd.api.types.is_numeric_dtype(dtype2)
675
+ if (
676
+ pd.api.types.is_numeric_dtype(dtype1)
677
+ and pd.api.types.is_object_dtype(dtype2)
678
+ ) or (
679
+ pd.api.types.is_object_dtype(dtype1)
680
+ and pd.api.types.is_numeric_dtype(dtype2)
425
681
  ):
426
682
  try:
427
683
  df1[col] = pd.to_numeric(s1, errors="coerce")
@@ -431,7 +687,9 @@ def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame
431
687
  pass # fallback to next block if coercion fails
432
688
 
433
689
  # If both are numeric but of different types (e.g., int vs float), unify to float64
434
- if pd.api.types.is_numeric_dtype(dtype1) and pd.api.types.is_numeric_dtype(dtype2):
690
+ if pd.api.types.is_numeric_dtype(dtype1) and pd.api.types.is_numeric_dtype(
691
+ dtype2
692
+ ):
435
693
  df1[col] = df1[col].astype("float64")
436
694
  df2[col] = df2[col].astype("float64")
437
695
  continue
@@ -445,7 +703,9 @@ def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame
445
703
 
446
704
 
447
705
  # %%
448
- def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
706
+ def align_numeric_dtypes(
707
+ df1: pd.DataFrame, df2: pd.DataFrame
708
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
449
709
  """
450
710
  Ensure aligned numeric dtypes between two DataFrames for accurate comparison.
451
711
  Converts between int, float, and numeric-looking strings where appropriate.
@@ -479,7 +739,9 @@ def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataF
479
739
 
480
740
 
481
741
  # %%
482
- def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8) -> dict:
742
+ def compare_r_py_dataframes(
743
+ df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8
744
+ ) -> dict:
483
745
  """
484
746
  Compare a Python DataFrame (df1) with an R DataFrame converted to pandas (df2).
485
747
 
@@ -530,7 +792,9 @@ def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: flo
530
792
  col_py = df1_aligned[col]
531
793
  col_r = df2_aligned[col]
532
794
 
533
- if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(col_r):
795
+ if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(
796
+ col_r
797
+ ):
534
798
  col_py, col_r = col_py.align(col_r)
535
799
 
536
800
  close = np.isclose(