rpy-bridge 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rpy_bridge/rpy2_utils.py DELETED
@@ -1,1221 +0,0 @@
1
- """
2
- R–Python Integration Utility
3
-
4
- Provides tools to load R scripts, activate renv environments, and call R functions
5
- directly from Python, with automatic conversion between R and Python data types.
6
-
7
- ----------
8
- Requirements
9
- ----------
10
- - R must be installed and accessible in your system environment.
11
- - Ensure compatibility with your R project's renv setup (or any other R environment you use).
12
-
13
- Features
14
- ----------
15
- - Lazy loading of rpy2 and R runtime.
16
- - Activation of renv environments for isolated R project dependencies.
17
- - Support for sourcing individual R scripts or directories of scripts.
18
- - Namespace-based access to R functions.
19
- - Automatic conversion between R vectors, data frames, and Python types (pandas, lists, scalars).
20
- - Utilities for cleaning and aligning data frames between R and Python.
21
- """
22
-
23
- # ruff: noqa: E402
24
- # %%
25
- # Import libraries
26
- import importlib.util
27
- import os
28
- import subprocess
29
- import sys
30
- import warnings
31
- from pathlib import Path
32
- from typing import TYPE_CHECKING, Any, Iterable
33
-
34
- import numpy as np
35
- import pandas as pd
36
-
37
# Silence the warning whose message matches "Environment variable .* redefined by R".
warnings.filterwarnings("ignore", message="Environment variable .* redefined by R")


if TYPE_CHECKING:
    import logging as logging_module

    from loguru import Logger as LoguruLogger

    LoggerType = LoguruLogger | logging_module.Logger

else:
    LoggerType = None  # runtime does not need the type object

import logging

# Prefer loguru when it is installed; otherwise fall back to the standard library.
try:
    from loguru import logger as loguru_logger  # type: ignore

    logger = loguru_logger
    _HAS_LOGURU = True
except ImportError:
    logging.basicConfig()
    logger = logging.getLogger("rpy-bridge")
    _HAS_LOGURU = False


if _HAS_LOGURU:
    # --- Remove default handler to override global default ---
    logger.remove()

    # --- Add a "sink" for RFunctionCaller logs ---
    _rfc_logger = logger.bind(tag="[RFunctionCaller]")
    _rfc_logger.add(
        sys.stderr,
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",  # Only show message
        level="INFO",
    )
else:
    # remove()/bind()/add() exist only on loguru loggers; calling them on a
    # logging.Logger raised AttributeError at import time. The stdlib logger
    # configured by basicConfig() is used as-is instead.
    _rfc_logger = logger
71
-
72
-
73
def _log_r_call(func_name: str, source_info: str):
    """
    Log an R function call, showing only '[RFunctionCaller] Called ...'

    Parameters
    ----------
    func_name : str
        Name of the R function that was called.
    source_info : str
        Human-readable description of where the function was resolved from.
    """
    if hasattr(_rfc_logger, "opt"):
        # loguru logger: brace-style placeholders; depth=1 attributes the
        # record to the caller of this helper.
        _rfc_logger.opt(depth=1, record=False).info(
            "[rpy-bridge.RFunctionCaller] Called R function '{}' from {}",
            func_name,
            source_info,
        )
    else:
        # Standard-library logger (used when loguru is not installed): it has
        # no .opt() and expects %-style placeholders.
        _rfc_logger.info(
            "[rpy-bridge.RFunctionCaller] Called R function '%s' from %s",
            func_name,
            source_info,
        )
82
-
83
-
84
- # ---------------------------------------------------------------------
85
- # Path resolution
86
- # ---------------------------------------------------------------------
87
- def _normalize_scripts(
88
- scripts: str | Path | Iterable[str | Path] | None,
89
- ) -> list[Path]:
90
- if scripts is None:
91
- return []
92
- if isinstance(scripts, (str, Path)):
93
- return [Path(scripts).resolve()]
94
- try:
95
- return [Path(s).resolve() for s in scripts]
96
- except TypeError:
97
- raise TypeError(
98
- f"Invalid type for 'scripts': {type(scripts)}. Must be str, Path, or list/iterable thereof."
99
- )
100
-
101
-
102
- # ---------------------------------------------------------------------
103
- # R detection and rpy2 installation
104
- # ---------------------------------------------------------------------
105
def ensure_rpy2_available() -> None:
    """
    Verify that the ``rpy2`` package can be located.

    Nothing is installed automatically.

    Raises
    ------
    RuntimeError
        If ``importlib.util.find_spec("rpy2")`` finds no module; the message
        contains installation instructions.
    """
    if importlib.util.find_spec("rpy2") is not None:
        return

    instructions = (
        "\n[Error] rpy2 is not installed. Please install it in your Python environment:\n",
        " pip install rpy2\n\n",
        "Make sure your Python environment can access your system R installation.\n",
        "On macOS with Homebrew: brew install r\n",
        "On Linux: apt install r-base (Debian/Ubuntu) or yum install R (CentOS/RHEL)\n",
        "On Windows: install R from https://cran.r-project.org\n",
    )
    raise RuntimeError("".join(instructions))
119
-
120
-
121
- def find_r_home() -> str | None:
122
- """
123
- Detect system R installation.
124
- """
125
- try:
126
- r_home = subprocess.check_output(
127
- ["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
128
- stderr=subprocess.PIPE,
129
- text=True,
130
- ).strip()
131
- if r_home.endswith(">"): # sometimes R console prints >
132
- r_home = r_home[:-1].strip()
133
- return r_home
134
- except FileNotFoundError:
135
- # fallback paths (Linux, macOS Homebrew, Windows)
136
- possible_paths = [
137
- "/usr/lib/R",
138
- "/usr/local/lib/R",
139
- "/opt/homebrew/Cellar/r/4.5.2/lib/R", # macOS Homebrew
140
- "C:\\Program Files\\R\\R-4.5.2", # Windows
141
- ]
142
- for p in possible_paths:
143
- if os.path.exists(p):
144
- return p
145
- return None
146
-
147
-
148
- # Determine if we're running in CI / testing
149
def _prepend_env_path(var_name: str, entry: str) -> None:
    """Put *entry* first in the ``os.pathsep``-separated variable *var_name* unless already listed."""
    existing = [p for p in os.environ.get(var_name, "").split(os.pathsep) if p]
    if entry not in existing:
        # Joining only non-empty entries avoids a trailing separator, which
        # would add an empty entry to the search path.
        os.environ[var_name] = os.pathsep.join([entry, *existing])


CI_TESTING = os.environ.get("GITHUB_ACTIONS") == "true" or os.environ.get("TESTING") == "1"

# Resolve R_HOME: an explicit environment variable wins, otherwise auto-detect.
R_HOME = os.environ.get("R_HOME")
if not R_HOME:
    R_HOME = find_r_home()
    if not R_HOME:
        if CI_TESTING:
            logger.warning("R not found; skipping all R-dependent setup in CI/testing environment.")
            R_HOME = None  # Explicitly None to signal "no R available"
        else:
            raise RuntimeError("R not found. Please install R or add it to PATH.")

if R_HOME:
    # Export the resolved value (a no-op when it already came from the environment).
    os.environ["R_HOME"] = R_HOME

logger.info(
    f"[rpy-bridge] R_HOME = {R_HOME if R_HOME else 'not detected; R-dependent code skipped'}"
)

# Only configure platform-specific library paths if R is available
if R_HOME:
    if sys.platform == "darwin":
        _prepend_env_path("DYLD_FALLBACK_LIBRARY_PATH", os.path.join(R_HOME, "lib"))

    elif sys.platform.startswith("linux"):
        _prepend_env_path("LD_LIBRARY_PATH", os.path.join(R_HOME, "lib"))

    elif sys.platform.startswith("win"):
        _prepend_env_path("PATH", os.path.join(R_HOME, "bin", "x64"))
187
-
188
-
189
- # ---------------------------------------------------------------------
190
- # Lazy rpy2 import machinery
191
- # ---------------------------------------------------------------------
192
- _RPY2: dict | None = None
193
-
194
-
195
- def _require_rpy2(raise_on_missing: bool = True) -> dict | None:
196
- global _RPY2
197
- if _RPY2 is not None:
198
- return _RPY2
199
-
200
- try:
201
- import rpy2.robjects as ro
202
- from rpy2 import robjects
203
- from rpy2.rinterface_lib.sexp import NULLType
204
- from rpy2.rlike.container import NamedList
205
- from rpy2.robjects import pandas2ri
206
- from rpy2.robjects.conversion import localconverter
207
- from rpy2.robjects.vectors import (
208
- BoolVector,
209
- FloatVector,
210
- IntVector,
211
- ListVector,
212
- StrVector,
213
- )
214
-
215
- _RPY2 = {
216
- "ro": ro,
217
- "robjects": robjects,
218
- "pandas2ri": pandas2ri,
219
- "localconverter": localconverter,
220
- "BoolVector": BoolVector,
221
- "FloatVector": FloatVector,
222
- "IntVector": IntVector,
223
- "ListVector": ListVector,
224
- "StrVector": StrVector,
225
- "NULLType": NULLType,
226
- "NamedList": NamedList,
227
- }
228
- return _RPY2
229
-
230
- except ImportError as e:
231
- if raise_on_missing:
232
- raise RuntimeError(
233
- "R support requires rpy2; install it in your Python env (e.g., pip install rpy2)"
234
- ) from e
235
- return None
236
-
237
-
238
def _ensure_rpy2() -> dict:
    """
    Return the cached rpy2 object dict, importing rpy2 first if necessary.

    Unlike ``_require_rpy2``, the return value is never ``None``.
    """
    global _RPY2

    loaded = _RPY2
    if loaded is None:
        loaded = _require_rpy2()
        _RPY2 = loaded
    assert loaded is not None, "_require_rpy2() returned None"
    return loaded
244
-
245
-
246
- # ---------------------------------------------------------------------
247
- # Project root discovery (for this.path / working dir)
248
- # ---------------------------------------------------------------------
249
def _candidate_project_dirs(base: Path, depth: int = 3) -> list[Path]:
    """Return ``base`` followed by its first ``depth`` parent directories, nearest first."""
    ancestors = list(base.parents)
    return [base, *ancestors[:depth]]
251
-
252
-
253
def _has_root_marker(path: Path) -> bool:
    """
    Tell whether ``path`` contains a project-root marker.

    Markers checked: ``.git``, any ``*.Rproj`` file, ``.here``,
    ``DESCRIPTION`` and ``renv.lock``.
    """
    if (path / ".git").exists() or any(path.glob("*.Rproj")):
        return True
    return any((path / marker).exists() for marker in (".here", "DESCRIPTION", "renv.lock"))
265
-
266
-
267
def _find_project_root(path_to_renv: Path | None, scripts: list[Path]) -> Path | None:
    """
    Locate the nearest directory carrying a project-root marker.

    Candidates derived from the first script's directory are examined before
    candidates derived from ``path_to_renv``. Returns the first resolved
    candidate for which ``_has_root_marker`` is true, or ``None``.
    """
    search_order: list[Path] = []
    if scripts:
        search_order += _candidate_project_dirs(scripts[0].parent)
    if path_to_renv is not None:
        search_order += _candidate_project_dirs(path_to_renv)

    visited = set()
    for directory in (entry.resolve() for entry in search_order):
        if directory in visited:
            continue  # the same directory can be reached from both starting points
        visited.add(directory)
        if _has_root_marker(directory):
            return directory
    return None
284
-
285
-
286
- # ---------------------------------------------------------------------
287
- # Activate renv
288
- # ---------------------------------------------------------------------
289
def activate_renv(path_to_renv: Path) -> None:
    """
    Locate an renv project near ``path_to_renv`` and load it into the R session.

    ``path_to_renv`` may be the project root (with ``renv.lock`` and
    ``renv/``), the ``renv`` directory itself, or a directory below the
    project; the directory and up to three parents are searched.

    Steps performed once a project is found:

    1. Point ``R_ENVIRON_USER`` at the project's ``.Renviron`` if present.
    2. Source the project's ``.Rprofile`` if present (failure is logged only).
    3. Load the ``renv`` R package, installing it into the project's
       ``renv/library`` if loading fails.
    4. Call ``renv::load()`` on the project directory.

    Raises
    ------
    FileNotFoundError
        If no candidate has both ``activate.R`` and ``renv.lock``.
    """
    r = _ensure_rpy2()
    robjects = r["robjects"]

    path_to_renv = path_to_renv.resolve()

    project_dir = None
    renv_dir = None

    # Search the given directory, then up to 3 parents, for renv assets.
    for cand in [path_to_renv, *list(path_to_renv.parents)[:3]]:
        # If the candidate *is* a renv dir with activate.R, treat its parent as project
        if cand.name == "renv" and (cand / "activate.R").exists():
            cand_renv, cand_project = cand, cand.parent
        else:
            cand_renv, cand_project = cand / "renv", cand

        # The lock file may sit in the project root or inside renv/.
        has_lock = (cand_project / "renv.lock").exists() or (cand_renv / "renv.lock").exists()
        if (cand_renv / "activate.R").exists() and has_lock:
            project_dir, renv_dir = cand_project, cand_renv
            break

    if project_dir is None or renv_dir is None:
        raise FileNotFoundError(
            f"[Error] renv environment incomplete: activate.R or renv.lock not found near {path_to_renv}"
        )

    renviron_file = project_dir / ".Renviron"
    if renviron_file.is_file():
        os.environ["R_ENVIRON_USER"] = str(renviron_file)
        logger.info(f"[rpy-bridge] R_ENVIRON_USER set to: {renviron_file}")

    rprofile_file = project_dir / ".Rprofile"
    if rprofile_file.is_file():
        # Source .Rprofile from the project root so any relative paths (e.g. renv/activate.R)
        # are resolved correctly even when the current R working directory is elsewhere.
        # NOTE(review): on.exit() is evaluated at R top level here, so the previous
        # working directory may not actually be restored - confirm before relying on it.
        try:
            robjects.r(
                f'old_wd <- getwd(); setwd("{project_dir.as_posix()}"); '
                f"on.exit(setwd(old_wd), add = TRUE); "
                f'source("{rprofile_file.as_posix()}")'
            )
            logger.info(f"[rpy-bridge] .Rprofile sourced: {rprofile_file}")
        except Exception as e:  # pragma: no cover - defensive fallback
            # Interpolate eagerly: loguru does not expand %-style placeholders,
            # so the original "%s" message never showed the error.
            logger.warning(
                f"[rpy-bridge] Failed to source .Rprofile; falling back to renv::activate(): {e}"
            )

    # If .Rprofile was absent or failed, ensure renv is loaded directly.
    try:
        robjects.r("suppressMessages(library(renv))")
    except Exception:
        logger.info("[rpy-bridge] Installing renv package in project library...")
        # Forward slashes keep the path a valid R string literal on Windows too.
        library_dir = (renv_dir / "library").as_posix()
        robjects.r(
            f'install.packages("renv", repos="https://cloud.r-project.org", lib="{library_dir}")'
        )
        robjects.r("library(renv)")

    # Activate renv explicitly in case .Rprofile did not already do it (or failed).
    robjects.r(f'renv::load("{project_dir.as_posix()}")')
    logger.info(f"[rpy-bridge] renv environment loaded for project: {project_dir}")
373
-
374
-
375
- # ---------------------------------------------------------------------
376
- # NamespaceWrapper
377
- # ---------------------------------------------------------------------
378
class NamespaceWrapper:
    """
    Wraps an R script namespace for Python attribute access.

    ``env`` is a mapping from object name to object; each entry becomes
    reachable as an attribute of the wrapper.
    """

    def __init__(self, env):
        self._env = env

    def __getattr__(self, func_name):
        # __getattr__ runs only after normal lookup fails. Read the mapping via
        # __dict__ so a missing ``_env`` (bare __new__, copy, unpickling) gives
        # AttributeError instead of recursing into __getattr__ forever.
        env = self.__dict__.get("_env")
        if env is not None and func_name in env:
            return env[func_name]
        raise AttributeError(f"Function '{func_name}' not found in R namespace")

    def __dir__(self):
        """Include the namespace's entries so interactive completion can see them."""
        names = set(super().__dir__())
        names.update(str(key) for key in (self.__dict__.get("_env") or ()))
        return sorted(names)

    def list_functions(self):
        """
        Return a list of callable functions in this namespace.
        """
        return [k for k, v in self._env.items() if callable(v)]
396
-
397
-
398
- # ---------------------------------------------------------------------
399
- # RFunctionCaller
400
- # ---------------------------------------------------------------------
401
- class RFunctionCaller:
402
- """
403
- Primary interface for calling R functions from Python.
404
-
405
- ``RFunctionCaller`` loads one or more R scripts into isolated namespaces
406
- and provides a unified ``call()`` method for executing:
407
-
408
- * Functions defined in sourced R scripts
409
- * Base R functions (e.g. ``sum``, ``mean``)
410
- * Functions from installed R packages (via ``package::function``)
411
-
412
- In most workflows, users only need to interact with this class.
413
-
414
- Parameters
415
- ----------
416
- path_to_renv : Path or None, optional
417
- Path to an R project that uses ``renv``. This may be either the project
418
- root or the ``renv/`` directory itself. If provided, the renv
419
- environment is activated before any scripts are sourced.
420
-
421
- scripts : str, Path, list[str | Path], or None, optional
422
- One or more ``.R`` files or directories containing ``.R`` files.
423
- Each script is sourced into its own namespace.
424
-
425
- packages : str or list[str], optional
426
- R packages to load (and install if missing) before calling functions.
427
-
428
- Notes
429
- -----
430
- * Python objects are automatically converted to R objects.
431
- * R return values are converted back to Python equivalents.
432
- * Missing values (``None``, ``pd.NA``) are mapped to R ``NA``.
433
- """
434
-
435
def __init__(
    self,
    path_to_renv: str | Path | None = None,
    scripts: str | Path | list[str | Path] | None = None,
    packages: str | list[str] | None = None,
    headless: bool = True,
    skip_renv_if_no_r: bool = True,
    **kwargs,  # catch unexpected keywords
):
    """
    Record configuration; R itself is initialised lazily on first use.

    Parameters
    ----------
    path_to_renv : str, Path or None
        Location used later to find and activate an renv project.
    scripts : str, Path, list of these, or None
        Script files/directories; every path must exist.
    packages : str, list[str] or None
        R package names, stored as a list.
    headless : bool, default True
        Stored flag; when true, headless environment defaults are applied
        before R is loaded.
    skip_renv_if_no_r : bool, default True
        Stored flag consulted when deciding whether to activate renv.
    **kwargs
        Only the deprecated ``script_path`` is accepted.

    Raises
    ------
    FileNotFoundError
        If a script path does not exist.
    TypeError
        If ``scripts`` has an invalid type or unknown keywords are passed.
    """
    # --- Handle deprecated 'script_path' ---
    if "script_path" in kwargs:
        script_path_value = kwargs.pop("script_path")
        warnings.warn(
            "'script_path' argument is deprecated. "
            "Please use 'scripts' instead (accepts a Path or list of Paths).",
            DeprecationWarning,
            stacklevel=2,
        )
        if scripts is None:
            scripts = script_path_value
        else:
            # Both provided -> prioritize scripts and ignore script_path
            logger.warning("'script_path' ignored because 'scripts' argument is also provided.")

    # Normalise exactly once. A second normalisation pass used to overwrite
    # this result and called .resolve() on str items, so plain string paths
    # raised AttributeError.
    self.scripts: list[Path] = _normalize_scripts(scripts)

    # --- Check all scripts exist immediately ---
    for script_path in self.scripts:
        if not script_path.exists():
            raise FileNotFoundError(f"R script path not found: {script_path}")

    # Raise error if other unexpected kwargs remain
    if kwargs:
        raise TypeError(
            f"RFunctionCaller.__init__() received unexpected keyword arguments: {list(kwargs.keys())}"
        )

    self.path_to_renv = Path(path_to_renv).resolve() if path_to_renv is not None else None
    self._namespaces: dict[str, Any] = {}
    self._namespace_roots: dict[str, Path] = {}

    # Normalize packages to a list (copied so the caller's list is not aliased)
    if packages is None:
        self.packages: list[str] = []
    elif isinstance(packages, str):
        self.packages = [packages]
    else:
        self.packages = list(packages)

    # Headless mode guards (avoid GUI probing in non-interactive runs)
    self.headless = headless
    self.skip_renv_if_no_r = skip_renv_if_no_r

    # Lazy-loaded attributes
    self._r = None
    self.ro = None
    self.robjects = None
    self.pandas2ri = None
    self.localconverter = None
    self.IntVector = None
    self.FloatVector = None
    self.BoolVector = None
    self.StrVector = None
    self.ListVector = None
    self.NamedList = None

    # Internal state
    self._renv_activated = False
    self._packages_loaded = False
    self._scripts_loaded = [False] * len(self.scripts)
521
-
522
def _should_activate_renv(self) -> bool:
    """
    Decide whether renv activation should be attempted.

    Returns ``False`` when no renv path is configured, when
    ``RPY_BRIDGE_SKIP_RENV`` is ``1``/``true``/``TRUE``, or when running in
    CI without R and skipping is allowed.

    Raises
    ------
    RuntimeError
        If R_HOME is unknown and none of the skip conditions apply.
    """
    if not self.path_to_renv:
        return False

    # Explicit opt-out via environment variable.
    skip_flag = os.environ.get("RPY_BRIDGE_SKIP_RENV")
    if skip_flag in {"1", "true", "TRUE"}:
        logger.info("[rpy-bridge] Skipping renv activation: RPY_BRIDGE_SKIP_RENV set")
        return False

    if R_HOME is None:
        # Without R: tolerated only in CI when the caller allows skipping.
        if CI_TESTING and self.skip_renv_if_no_r:
            logger.info("[rpy-bridge] Skipping renv activation in CI: R_HOME not detected")
            return False
        raise RuntimeError(
            "R_HOME not detected; cannot activate renv. Install R or set R_HOME."
        )

    return True
544
-
545
def _ensure_headless_env(self) -> None:
    """
    Apply headless defaults to the process environment.

    Does nothing unless ``self.headless`` is true. Variables that are already
    set are left untouched.
    """
    if not self.headless:
        return

    headless_defaults = (
        ("R_DEFAULT_DEVICE", "png"),
        ("R_INTERACTIVE", "false"),
        ("R_GUI_APP_VERSION", "0"),
        ("RSTUDIO", "0"),
    )
    for name, fallback in headless_defaults:
        if name not in os.environ:
            os.environ[name] = fallback
557
-
558
- # -----------------------------------------------------------------
559
- # Internal: lazy R loading
560
- # -----------------------------------------------------------------
561
def _ensure_r_loaded(self) -> None:
    """
    Ensure R runtime is initialized and all configured R scripts
    are sourced exactly once, in isolated environments.

    Order of operations: apply headless environment defaults, bind the rpy2
    objects to the instance, activate renv (if configured and allowed),
    set ``this.path`` options, make sure the ``withr`` package is available,
    then source every script not yet marked as loaded.

    Raises
    ------
    RuntimeError
        If renv activation fails.
    ValueError
        If a script entry is neither a file nor a directory.
    """
    # Ensure headless-safe env before rpy2 initializes R
    self._ensure_headless_env()

    # First call on this instance: copy the shared rpy2 objects onto self.
    if self.robjects is None:
        rpy2_dict = _ensure_rpy2()
        self._RPY2 = rpy2_dict  # cache in instance
        self._r = rpy2_dict["ro"]
        self.ro = rpy2_dict["robjects"]
        self.robjects = rpy2_dict["robjects"]
        self.pandas2ri = rpy2_dict["pandas2ri"]
        self.localconverter = rpy2_dict["localconverter"]
        self.IntVector = rpy2_dict["IntVector"]
        self.FloatVector = rpy2_dict["FloatVector"]
        self.BoolVector = rpy2_dict["BoolVector"]
        self.StrVector = rpy2_dict["StrVector"]
        self.ListVector = rpy2_dict["ListVector"]
        self.NamedList = rpy2_dict["NamedList"]

    # Activate renv once if requested and allowed
    if not self._renv_activated and self._should_activate_renv():
        try:
            activate_renv(self.path_to_renv)
            self._renv_activated = True
            logger.info(
                f"[rpy-bridge.RFunctionCaller] renv activated for project: {self.path_to_renv}"
            )
        except Exception as e:
            raise RuntimeError(f"Failed to activate renv at {self.path_to_renv}: {e}") from e

    r = self.robjects.r

    # Configure this.path to avoid GUI detection errors in embedded/headless R (e.g., rpy2)
    # Best effort: any failure in this block is deliberately ignored.
    try:
        r('options(this.path.gui = "httpd")')
        r("options(this.path.verbose = FALSE)")
        # Patch this.path::.gui_path to avoid GUI detection errors in headless/rpy2 contexts.
        r(
            """
            if (requireNamespace("this.path", quietly = TRUE)) {
                try({
                    assignInNamespace(".gui_path", function(...) "httpd", ns = "this.path")
                }, silent = TRUE)
            }
            """
        )
    except Exception:
        pass

    # Ensure required R package
    # (withr::with_dir is used below when sourcing scripts)
    self.ensure_r_package("withr")

    if not hasattr(self, "_namespaces"):
        self._namespaces: dict[str, dict[str, Any]] = {}

    # --- Iterate over scripts ---
    for idx, script_entry in enumerate(self.scripts):
        # Entries already sourced by an earlier call are skipped.
        if self._scripts_loaded[idx]:
            continue

        script_entry = script_entry.resolve()

        # A file is sourced directly; a directory contributes its *.R files
        # in sorted order (non-recursive glob).
        if script_entry.is_file():
            r_files = [script_entry]
        elif script_entry.is_dir():
            r_files = sorted(script_entry.glob("*.R"))
            if not r_files:
                logger.warning(f"No .R files found in directory: {script_entry}")
                self._scripts_loaded[idx] = True
                continue
        else:
            raise ValueError(f"Invalid script path: {script_entry}")

        for script_path in r_files:
            # The namespace name is the file name without extension, so two
            # files with the same stem share (and overwrite) one entry.
            ns_name = script_path.stem
            logger.opt(depth=2).info(
                "[rpy-bridge.RFunctionCaller] Loading R script '{}' as namespace '{}'",
                script_path.name,
                ns_name,
            )

            # Fresh R environment for this script; `env` and `script_path`
            # are assigned as variables on the R side.
            r("env <- new.env(parent=globalenv())")
            r(f'script_path <- "{script_path.as_posix()}"')

            # Determine a root for this script: prefer a discovered project root; else script dir.
            script_root = _find_project_root(self.path_to_renv, [script_path])
            # Prefer script-local roots; if none, fall back to script directory.
            if script_root is None:
                script_root = script_path.parent.resolve()
            script_root_arg = f'"{script_root.as_posix()}"'

            # Source the script into `env` with the working directory
            # temporarily set to the chosen root.
            r(
                f"""
                withr::with_dir(
                    {script_root_arg},
                    sys.source(script_path, envir=env, chdir = TRUE)
                )
                """
            )

            # Keep only the callable objects the script defined.
            env_obj = r("env")
            self._namespaces[ns_name] = {
                name: env_obj[name] for name in env_obj.keys() if callable(env_obj[name])
            }
            self._namespace_roots[ns_name] = script_root

            logger.info(
                f"[rpy-bridge.RFunctionCaller] Registered {len(self._namespaces[ns_name])} functions in namespace '{ns_name}'"
            )

        self._scripts_loaded[idx] = True
676
-
677
- # -----------------------------------------------------------------
678
- # Autocomplete-friendly attribute access for script namespaces
679
- # -----------------------------------------------------------------
680
def __getattr__(self, name: str):
    """
    Expose a loaded script namespace as an attribute.

    Returns a ``NamespaceWrapper`` around the namespace called ``name``;
    raises ``AttributeError`` for any other name.
    """
    state = self.__dict__
    # Look in __dict__ directly so a missing ``_namespaces`` cannot re-enter
    # this method.
    if "_namespaces" not in state or name not in state["_namespaces"]:
        raise AttributeError(f"'RFunctionCaller' object has no attribute '{name}'")
    return NamespaceWrapper(state["_namespaces"][name])
685
-
686
def _clean_scalar(self, x):
    """
    Map a single R-style missing value to ``None``.

    ``None``, the rpy2 NA sentinels (real, integer, logical, character) and
    float NaN all become ``None``; every other value is returned unchanged.
    Intended for atomic/scalar values only.
    """
    robjects = self.robjects

    if x is None:
        return None

    # Sentinels that are absent from ``robjects`` resolve to None here.
    numeric_na = tuple(
        getattr(robjects, attr, None) for attr in ("NA_Real", "NA_Integer", "NA_Logical")
    )
    if x in numeric_na or x is getattr(robjects, "NA_Character", None):
        return None

    is_nan = isinstance(x, float) and np.isnan(x)
    return None if is_nan else x
710
-
711
def list_namespaces(self) -> list[str]:
    """
    List the script namespaces that have been loaded.

    Returns
    -------
    list[str]
        Namespace names, in registration order.
    """
    self._ensure_r_loaded()
    return [*self._namespaces]
722
-
723
def list_namespace_functions(self, namespace: str) -> list[str]:
    """
    List the callable entries of one script namespace.

    Raises
    ------
    ValueError
        If ``namespace`` has not been loaded.
    """
    self._ensure_r_loaded()
    if namespace not in self._namespaces:
        raise ValueError(f"Namespace '{namespace}' not found")
    members = self._namespaces[namespace]
    return [name for name, value in members.items() if callable(value)]
731
-
732
def _get_package_functions(self, pkg: str) -> list[str]:
    """
    Return a list of callable functions from a loaded R package.

    Names come from ``ls("package:<pkg>")``; each is kept only if
    ``is.function`` is true for the object in the package namespace.
    Returns an empty list (and logs a warning) if any R call fails.
    """
    r = self.robjects.r
    try:
        # Evaluate the R expression by *calling* r; indexing r with the code
        # string looks for an R object of that name and always failed.
        all_objs = list(r(f'ls("package:{pkg}")'))
        funcs = [
            name
            for name in all_objs
            if r(f'is.function(get("{name}", envir=asNamespace("{pkg}")))')[0]
        ]
        return funcs
    except Exception:
        logger.warning(f"Failed to list functions for package '{pkg}'")
        return []
748
-
749
def list_all_functions(self, include_packages: bool = False) -> dict[str, list[str]]:
    """
    Return all callable R functions grouped by script namespace and package.

    Parameters
    ----------
    include_packages : bool, default False
        Also add one entry per namespace reported by R's
        ``loadedNamespaces()``. A package with no listable functions gets a
        single placeholder string.

    Returns
    -------
    dict[str, list[str]]
        Mapping of namespace/package name to function names.
    """
    self._ensure_r_loaded()

    # --- Script namespaces ---
    all_funcs = {
        ns_name: [name for name, val in ns_env.items() if callable(val)]
        for ns_name, ns_env in self._namespaces.items()
    }

    # --- Loaded R packages ---
    if include_packages:
        r = self.robjects.r
        try:
            for pkg in r("loadedNamespaces()"):
                funcs = self._get_package_functions(pkg)
                if not funcs:
                    # Add a placeholder note
                    funcs = [
                        "[See official documentation for functions, datasets, and objects]"
                    ]
                all_funcs[pkg] = funcs
        except Exception as exc:
            # Still best effort, but no longer silent: keep whatever was
            # collected and report why package listing stopped.
            logger.warning(
                f"[rpy-bridge.RFunctionCaller] Failed to list loaded R packages: {exc}"
            )

    return all_funcs
778
-
779
def print_function_tree(self, include_packages: bool = False, max_display: int = 10):
    """
    Print the available R functions as a simple tree, one group per namespace.

    Parameters
    ----------
    include_packages : bool, default False
        Forwarded to ``list_all_functions``.

    max_display : int, default 10
        Upper bound on the number of (alphabetically sorted) function names
        printed per namespace; an ellipsis line marks truncation.

    Notes
    -----
    Namespaces without functions are omitted.
    """
    tree = self.list_all_functions(include_packages=include_packages)

    for ns_name, funcs in tree.items():
        if not funcs:
            continue
        lines = [f"{ns_name}/"]
        lines.extend(f" {f}" for f in sorted(funcs)[:max_display])
        if len(funcs) > max_display:
            lines.append(" ...")
        print("\n".join(lines))
805
-
806
- # -----------------------------------------------------------------
807
- # Python -> R conversion
808
- # -----------------------------------------------------------------
809
def _py2r(self, obj):
    """
    Convert Python objects to R objects robustly.

    Handles scalars, None/pd.NA, lists, tuples, dicts, pandas Series and
    DataFrames, numpy scalars and numpy arrays.

    Conversion rules
    ----------------
    * Existing rpy2 vectors / data frames are returned unchanged.
    * ``None``, ``pd.NA`` and float NaN at the top level become R ``NULL``.
    * ``int``/``float``/``bool``/``str`` scalars are returned as-is.
    * numpy scalars are unwrapped with ``.item()`` and converted again.
    * tuples and numpy arrays are converted like the equivalent list.
    * A list whose non-missing items are all numbers, all bools or all
      strings becomes a Float/Bool/Str vector with NA for missing items;
      any other list becomes a ListVector keyed by position.
    * A dict becomes a ListVector with converted values.

    Raises
    ------
    NotImplementedError
        For any other object type.
    """
    self._ensure_r_loaded()
    robjects = self.robjects
    pandas2ri = self.pandas2ri
    FloatVector = self.FloatVector
    BoolVector = self.BoolVector
    StrVector = self.StrVector
    ListVector = self.ListVector
    localconverter = self.localconverter

    r_types = (
        robjects.vectors.IntVector,
        robjects.vectors.FloatVector,
        robjects.vectors.BoolVector,
        robjects.vectors.StrVector,
        robjects.vectors.ListVector,
        robjects.DataFrame,
    )
    if isinstance(obj, r_types):
        return obj

    def is_na(x):
        return x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))

    with localconverter(robjects.default_converter + pandas2ri.converter):
        if is_na(obj):
            return robjects.NULL
        if isinstance(obj, pd.DataFrame):
            return pandas2ri.py2rpy(obj)
        if isinstance(obj, pd.Series):
            return self._py2r(obj.tolist())
        if isinstance(obj, (int, float, bool, str)):
            return obj
        if isinstance(obj, np.generic):
            # numpy scalar (e.g. np.int64): unwrap to the native Python value.
            native = obj.item()
            if isinstance(native, np.generic):
                # No native equivalent exists; avoid recursing forever.
                raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
            return self._py2r(native)
        if isinstance(obj, np.ndarray):
            return self._py2r(obj.tolist())
        if isinstance(obj, tuple):
            return self._py2r(list(obj))
        if isinstance(obj, list):
            if len(obj) == 0:
                return FloatVector([])

            # Exact types of the non-missing items decide the vector kind
            # (bool is deliberately not treated as a number here).
            types = set(type(x) for x in obj if not is_na(x))
            if types <= {int, float}:
                return FloatVector([robjects.NA_Real if is_na(x) else float(x) for x in obj])
            if types <= {bool}:
                return BoolVector([robjects.NA_Logical if is_na(x) else x for x in obj])
            if types <= {str}:
                return StrVector([robjects.NA_Character if is_na(x) else x for x in obj])
            return ListVector({str(i): self._py2r(v) for i, v in enumerate(obj)})
        if isinstance(obj, dict):
            return ListVector({k: self._py2r(v) for k, v in obj.items()})
        raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
861
-
862
- # -----------------------------------------------------------------
863
- # R -> Python conversion
864
- # -----------------------------------------------------------------
865
def _r2py(self, obj, top_level=True):
    """
    Convert an R object returned by rpy2 into a Python object.

    * R ``NULL`` becomes ``None``.
    * R data frames are converted with the pandas converter, then passed
      through ``postprocess_r_dataframe`` and ``clean_r_missing``.
    * Named lists / list vectors go through ``r_namedlist_to_dict``.
    * Atomic vectors become lists with each element cleaned by
      ``_clean_scalar``.
    * Anything else is passed to ``_clean_scalar``.

    When ``top_level`` is true, a one-element list result is unwrapped to its
    single element.
    """
    robjects = self.robjects
    list_types = (self.NamedList, self.ListVector)
    atomic_types = (self.StrVector, self.IntVector, self.FloatVector, self.BoolVector)
    null_type = self._RPY2["NULLType"]
    converter_ctx = self.localconverter
    pandas2ri = self.pandas2ri

    if isinstance(obj, null_type):
        return None

    if isinstance(obj, robjects.DataFrame):
        with converter_ctx(robjects.default_converter + pandas2ri.converter):
            frame = robjects.conversion.rpy2py(obj)
        frame = postprocess_r_dataframe(frame)
        return clean_r_missing(frame, caller=self)

    if isinstance(obj, list_types):
        converted = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
        unwrap = isinstance(converted, list) and len(converted) == 1 and top_level
        return converted[0] if unwrap else converted

    if isinstance(obj, atomic_types):
        values = [self._clean_scalar(item) for item in obj]
        return values[0] if (len(values) == 1 and top_level) else values

    return self._clean_scalar(obj)
899
-
900
- # -----------------------------------------------------------------
901
- # Public: ensure R package is available
902
- # -----------------------------------------------------------------
903
def ensure_r_package(self, pkg: str):
    """
    Load the R package ``pkg``, installing it first if loading fails.

    The package is attached with ``library()``; on any error it is installed
    from https://cloud.r-project.org and attached again. Errors from that
    second attempt propagate to the caller.
    """
    run_r = self.robjects.r
    attach_cmd = f'suppressMessages(library("{pkg}", character.only=TRUE))'
    try:
        run_r(attach_cmd)
    except Exception:
        logger.info(f"[rpy-bridge.RFunctionCaller] Package '{pkg}' not found.")
        logger.warning(f"[rpy-bridge.RFunctionCaller] Installing missing R package: {pkg}")
        install_cmd = f'install.packages("{pkg}", repos="https://cloud.r-project.org")'
        run_r(install_cmd)
        run_r(attach_cmd)
912
-
913
- # -----------------------------------------------------------------
914
- # Public: call an R function
915
- # -----------------------------------------------------------------
916
def call(self, func_name: str, *args, **kwargs):
    """
    Call an R function.

    The function may be defined in:
      * a sourced R script
      * an installed R package (using ``package::function`` syntax)
      * base R

    Lookup order for an unqualified name is: registered script namespaces,
    then the R global environment, then ``robjects.r[...]``. For a
    ``ns::name`` reference, a registered script namespace called ``ns`` takes
    precedence over an R package of the same name.

    Parameters
    ----------
    func_name : str
        Name of the R function to call. Package functions should be specified
        as ``package::function``.

    *args
        Positional arguments passed to the R function.

    **kwargs
        Named arguments passed to the R function.

    Returns
    -------
    object
        The result of the R function, converted to a Python object.

    Raises
    ------
    ValueError
        If the function cannot be found in the requested script namespace,
        or (for unqualified names) anywhere in the lookup chain.
    RuntimeError
        If a ``package::function`` reference cannot be resolved, or if the
        R call itself fails.

    Examples
    --------
    >>> rfc.call("sum", [1, 2, 3])
    >>> rfc.call("dplyr::n_distinct", [1, 2, 2, 3])
    >>> rfc.call("add_and_scale", 2, 3, scale=10)
    """

    self._ensure_r_loaded()

    func = None
    source_info = None
    # Only set when the function comes from a sourced script namespace; bound
    # up front so later code never depends on which branch happened to run.
    namespace_root = None

    if "::" in func_name:
        ns_name, fname = func_name.split("::", 1)
        if ns_name in self._namespaces:
            ns_env = self._namespaces[ns_name]
            if fname not in ns_env:
                raise ValueError(
                    f"Function '{fname}' not found in R script namespace '{ns_name}'"
                )
            func = ns_env[fname]
            source_info = f"script namespace '{ns_name}'"
            namespace_root = self._namespace_roots.get(ns_name)
        else:
            try:
                func = self.robjects.r(f"{ns_name}::{fname}")
                source_info = f"R package '{ns_name}'"
            except Exception as e:
                raise RuntimeError(f"Failed to resolve R function '{func_name}': {e}") from e

    else:
        for ns_name, ns_env in self._namespaces.items():
            if func_name in ns_env:
                func = ns_env[func_name]
                source_info = f"script namespace '{ns_name}'"
                namespace_root = self._namespace_roots.get(ns_name)
                break

        if func is None:
            try:
                func = self.robjects.globalenv[func_name]
                source_info = "global environment"
            except KeyError:
                pass

        if func is None:
            try:
                func = self.robjects.r[func_name]
                source_info = "base R / loaded package"
            except KeyError:
                # The KeyError carries no extra information for the caller.
                raise ValueError(
                    f"R function '{func_name}' not found in any namespace, global env, or base R."
                ) from None

    r_args = [self._py2r(a) for a in args]
    r_kwargs = {k: self._py2r(v) for k, v in kwargs.items()}

    try:
        if namespace_root:
            # Script functions run with the script's root as the R working
            # directory. The R functions are called directly (instead of
            # evaluating a formatted R source string) so the path needs no
            # quoting and no helper variable is left in R's global environment.
            r = self.robjects.r
            old_wd = r["getwd"]()
            try:
                r["setwd"](namespace_root.as_posix())
                result = func(*r_args, **r_kwargs)
            finally:
                try:
                    r["setwd"](old_wd)
                except Exception as restore_err:
                    # Best effort: never mask the outcome of the R call itself.
                    logger.warning(
                        f"[rpy-bridge.RFunctionCaller] Could not restore R working directory: {restore_err}"
                    )
        else:
            result = func(*r_args, **r_kwargs)
    except Exception as e:
        raise RuntimeError(
            f"Error calling R function '{func_name}' from {source_info}: {e}"
        ) from e

    _log_r_call(func_name, source_info)

    return self._r2py(result)
1021
-
1022
-
1023
- # %%
1024
- # ------------------------------
1025
- # Utility functions for R ↔ Python
1026
- # ------------------------------
1027
def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
    """
    Convert an R list into a Python ``list`` or ``dict``.

    Anything that is not a ``NamedList`` / ``ListVector`` is handed straight
    to ``caller._r2py`` with the given ``top_level`` flag. For list inputs,
    the result is a plain list when the names are exactly "0", "1", "2", ...
    in order; otherwise it is a dict keyed by the stringified names, falling
    back to the element position when no name is available. Elements are
    converted with ``caller._r2py(..., top_level=False)``.
    """
    rpy2_types = _ensure_rpy2()
    list_types = (rpy2_types["NamedList"], rpy2_types["ListVector"])

    if not isinstance(namedlist, list_types):
        return caller._r2py(namedlist, top_level=top_level)

    names_attr = namedlist.names
    names = names_attr() if callable(names_attr) else names_attr

    def convert(item):
        return caller._r2py(item, top_level=False)

    # Names that merely repeat the positional index carry no information.
    if names and all(str(pos) == str(label) for pos, label in enumerate(names)):
        return [convert(item) for item in namedlist]

    converted = {}
    for pos, item in enumerate(namedlist):
        label = names[pos] if names and pos < len(names) else str(pos)
        converted[str(label)] = convert(item)
    return converted
1050
-
1051
-
1052
def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the ``.groups`` and ``.rows`` entries from ``r_df.attrs``.

    The object is modified in place and returned. Missing entries, or an
    object without a usable ``attrs`` mapping, are silently ignored.
    """
    for key in (".groups", ".rows"):
        try:
            r_df.attrs.pop(key)
        except (KeyError, AttributeError):
            continue
    return r_df
1059
-
1060
-
1061
def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame in which textual missing-value markers become ``pd.NA``."""
    # Exact cell values that are treated as missing.
    missing_markers = ["nan", "NaN", "NA", "na", ""]
    return df.replace(missing_markers, pd.NA)
1063
-
1064
-
1065
def normalize_single_df_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with missing markers and dtypes normalised.

    Textual missing markers are replaced with ``pd.NA``. An object column is
    converted to numeric when at least half of its non-missing values parse
    as numbers (the rest become NaN). Integer columns that contain missing
    values are cast to ``float64``.
    """
    cleaned = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)
    dtype_checks = pd.api.types
    for name in cleaned.columns:
        column = cleaned[name]
        if dtype_checks.is_object_dtype(column):
            as_numbers = pd.to_numeric(column, errors="coerce")
            parsed_count = as_numbers.notna().sum()
            present_count = column.notna().sum()
            if parsed_count >= present_count * 0.5:
                cleaned[name] = as_numbers
        if dtype_checks.is_integer_dtype(cleaned[name]) and cleaned[name].isna().any():
            cleaned[name] = cleaned[name].astype("float64")
    return cleaned
1076
-
1077
-
1078
def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Repair column types of a data frame that came from R.

    For every column, based on the column as it was on entry:

    * integer columns: cells equal to ``-2147483648`` are masked with ``pd.NA``;
    * numeric columns whose non-missing values all lie in ``[10000, 40000]``
      are interpreted as day counts since 1970-01-01 and replaced by
      datetimes (a failed conversion leaves the column as it is);
    * timezone-aware datetime columns are made timezone-naive.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to repair. Columns are reassigned on this object.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, modified in place.
    """
    for col in df.columns:
        # All checks deliberately inspect this snapshot, not the column as
        # rewritten by an earlier step of the same iteration.
        series = df[col]
        if pd.api.types.is_integer_dtype(series):
            df[col] = series.mask(series == -2147483648, pd.NA)
        if pd.api.types.is_numeric_dtype(series):
            values = series.dropna()
            if not values.empty and values.between(10000, 40000).all():
                try:
                    df[col] = pd.to_datetime("1970-01-01") + pd.to_timedelta(series, unit="D")
                except Exception:
                    pass
        # ``is_datetime64tz_dtype`` is deprecated in recent pandas; checking
        # the dtype class directly is the documented replacement. ``getattr``
        # keeps the check harmless when ``df[col]`` is not a single Series.
        if isinstance(getattr(series, "dtype", None), pd.DatetimeTZDtype):
            df[col] = series.dt.tz_localize(None)
    return df
1093
-
1094
-
1095
def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the standard clean-up pipeline on a data frame converted from R.

    Applies ``fix_r_dataframe_types``, ``fix_string_nans`` and
    ``normalize_single_df_dtypes`` in that order. Afterwards, an object-typed
    index whose labels are exactly 1..n is replaced by a zero-based
    ``RangeIndex``; any failure while checking the index leaves it unchanged.
    """
    for step in (fix_r_dataframe_types, fix_string_nans, normalize_single_df_dtypes):
        df = step(df)

    if df.index.dtype != object:
        return df

    row_count = len(df)
    try:
        labels_as_int = df.index.astype(int)
        one_based = np.arange(row_count) + 1
        if (labels_as_int == one_based).all():
            df.index = pd.RangeIndex(start=0, stop=row_count)
    except Exception:
        pass
    return df
1107
-
1108
-
1109
def clean_r_missing(obj, caller: "RFunctionCaller"):
    """
    Replace R missing-value sentinels inside ``obj`` with Python equivalents.

    ``NA_Real``, ``NA_Integer`` and ``NA_Logical`` (looked up on
    ``caller.robjects``) become ``np.nan``; ``NA_Character`` becomes ``pd.NA``.

    Parameters
    ----------
    obj : object
        A DataFrame (cells are replaced in place, column by column), a dict
        or list (rebuilt recursively), or any other value (looked up directly).
    caller : RFunctionCaller
        Object whose ``robjects`` attribute exposes the ``NA_*`` sentinels.

    Returns
    -------
    object
        The cleaned structure. A DataFrame input is returned as the same object.
    """
    # Build the lookup once per call instead of once per visited element.
    return _replace_r_missing(obj, _r_na_lookup(caller.robjects))


def _r_na_lookup(robjects) -> dict:
    """Map each NA sentinel available on ``robjects`` to its Python replacement."""
    replacements = (
        ("NA_Real", np.nan),
        ("NA_Integer", np.nan),
        ("NA_Logical", np.nan),
        ("NA_Character", pd.NA),
    )
    lookup = {}
    for attr, py_value in replacements:
        sentinel = getattr(robjects, attr, None)
        # A sentinel that is not available must not register ``None`` as a
        # key, otherwise genuine ``None`` values would be rewritten.
        if sentinel is not None:
            lookup[sentinel] = py_value
    return lookup


def _replace_r_missing(obj, na_map: dict):
    """Recursively apply ``na_map`` to DataFrames, dicts, lists and scalars."""
    if isinstance(obj, pd.DataFrame):
        for col in obj.columns:
            obj[col] = obj[col].apply(lambda cell: _replace_r_missing(cell, na_map))
        return obj
    if isinstance(obj, dict):
        return {k: _replace_r_missing(v, na_map) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_replace_r_missing(v, na_map) for v in obj]
    try:
        return na_map.get(obj, obj)
    except TypeError:
        # Unhashable leaves (sets, arrays, ...) cannot be a sentinel.
        return obj
1128
-
1129
-
1130
- # ---------------------------------------------------------------------
1131
- # DataFrame comparison utilities
1132
- # ---------------------------------------------------------------------
1133
def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Bring the shared columns of two frames to comparable dtypes, in place.

    For each column present in both frames, empty strings become ``pd.NA``
    and then:

    * one side numeric, the other object -> both are coerced to numeric;
    * both sides numeric -> both are cast to ``float64``;
    * either side object -> both are cast to ``str``.

    Both input frames are modified and returned as ``(df1, df2)``.
    """
    is_num = pd.api.types.is_numeric_dtype
    is_obj = pd.api.types.is_object_dtype

    for name in df1.columns.intersection(df2.columns):
        df1[name] = df1[name].replace("", pd.NA)
        df2[name] = df2[name].replace("", pd.NA)
        left, right = df1[name], df2[name]

        # Classification is based on the dtypes before any coercion below.
        left_num, right_num = is_num(left.dtype), is_num(right.dtype)
        left_obj, right_obj = is_obj(left.dtype), is_obj(right.dtype)

        if (left_num and right_obj) or (left_obj and right_num):
            try:
                df1[name] = pd.to_numeric(left, errors="coerce")
                df2[name] = pd.to_numeric(right, errors="coerce")
            except Exception:
                pass
            else:
                continue

        if left_num and right_num:
            df1[name] = df1[name].astype("float64")
            df2[name] = df2[name].astype("float64")
        elif left_obj or right_obj:
            df1[name] = df1[name].astype(str)
            df2[name] = df2[name].astype(str)
    return df1, df2
1156
-
1157
-
1158
def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Cast shared columns to ``float64`` when either side holds numeric data.

    For each column present in both frames, empty strings become ``pd.NA``
    and both sides are coerced to numeric. If at least one side has a value
    that parsed as a number, both columns are stored as ``float64``;
    otherwise (or if the conversion raises) the NA-cleaned originals are
    stored. Both input frames are modified and returned as ``(df1, df2)``.
    """
    shared = df1.columns.intersection(df2.columns)
    for name in shared:
        left = df1[name].replace("", pd.NA)
        right = df2[name].replace("", pd.NA)
        converted = False
        try:
            left_num = pd.to_numeric(left, errors="coerce")
            right_num = pd.to_numeric(right, errors="coerce")
            if not (left_num.isna().all() and right_num.isna().all()):
                df1[name] = left_num.astype("float64")
                df2[name] = right_num.astype("float64")
                converted = True
        except Exception:
            pass
        if not converted:
            df1[name], df2[name] = left, right
    return df1, df2
1172
-
1173
-
1174
def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8) -> dict:
    """
    Compare two data frames after normalising R-specific quirks.

    ``df2`` is first passed through ``fix_r_dataframe_types``; both frames
    then go through ``fix_string_nans``, ``normalize_dtypes`` and
    ``align_numeric_dtypes`` before being compared column by column. The
    caller's frames are not modified.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames to compare.
    float_tol : float
        Passed to ``np.isclose`` as ``atol``. ``np.isclose`` also applies its
        default relative tolerance, so large values may be considered close
        even when they differ by more than ``float_tol``.

    Returns
    -------
    dict
        ``shape_mismatch``, ``columns_mismatch`` and ``index_mismatch`` are
        booleans. ``numeric_diffs`` and ``non_numeric_diffs`` map a column
        name to a DataFrame with columns ``df1`` / ``df2`` holding the
        differing cells. Shape and column mismatches are also printed.
    """
    results: dict[str, Any] = {
        "shape_mismatch": False,
        "columns_mismatch": False,
        "index_mismatch": False,
        "numeric_diffs": {},
        "non_numeric_diffs": {},
    }
    # fix_r_dataframe_types reassigns columns on the frame it receives, so
    # give it a copy rather than the caller's object.
    df2 = fix_r_dataframe_types(df2.copy())
    df1 = fix_string_nans(df1)
    df2 = fix_string_nans(df2)
    df1, df2 = normalize_dtypes(df1.copy(), df2.copy())
    df1, df2 = align_numeric_dtypes(df1, df2)
    if df1.shape != df2.shape:
        results["shape_mismatch"] = True
        print(f"[Warning] Shape mismatch: df1 {df1.shape} vs df2 {df2.shape}")
    if set(df1.columns) != set(df2.columns):
        results["columns_mismatch"] = True
        print("[Warning] Column mismatch:")
        print(f"  df1: {df1.columns}")
        print(f"  df2: {df2.columns}")
        common_cols = df1.columns.intersection(df2.columns)
    else:
        common_cols = df1.columns
    # Report differing row labels through the key that is already part of
    # the result dict.
    if not df1.index.equals(df2.index):
        results["index_mismatch"] = True
    df1_aligned, df2_aligned = df1.loc[:, common_cols], df2.loc[:, common_cols]
    for col in common_cols:
        col_py, col_r = df1_aligned[col], df2_aligned[col]
        if pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(col_r):
            # Align on the index so np.isclose compares matching rows.
            col_py, col_r = col_py.align(col_r)
            close = np.isclose(
                col_py.fillna(np.nan),
                col_r.fillna(np.nan),
                atol=float_tol,
                equal_nan=True,
            )
            if not close.all():
                results["numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[~close], "df2": col_r[~close]}
                )
        else:
            unequal = ~col_py.eq(col_r)
            # Cells missing on both sides count as equal.
            both_na = col_py.isna() & col_r.isna()
            unequal = unequal & ~both_na
            if unequal.any():
                results["non_numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[unequal], "df2": col_r[unequal]}
                )
    return results