rpy-bridge 0.3.9__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rpy_bridge/rpy2_utils.py DELETED
@@ -1,1052 +0,0 @@
1
- """
2
- R–Python Integration Utility
3
-
4
- Provides tools to load R scripts, activate renv environments, and call R functions
5
- directly from Python, with automatic conversion between R and Python data types.
6
-
7
- ----------
8
- Requirements
9
- ----------
10
- - R must be installed and accessible in your system environment.
11
- - Ensure compatibility with your R project's renv setup (or any other R environment you use).
12
-
13
- Features
14
- ----------
15
- - Lazy loading of rpy2 and R runtime.
16
- - Activation of renv environments for isolated R project dependencies.
17
- - Support for sourcing individual R scripts or directories of scripts.
18
- - Namespace-based access to R functions.
19
- - Automatic conversion between R vectors, data frames, and Python types (pandas, lists, scalars).
20
- - Utilities for cleaning and aligning data frames between R and Python.
21
- """
22
-
23
- # ruff: noqa: E402
24
- # %%
25
- # Import libraries
26
- import importlib.util
27
- import os
28
- import subprocess
29
- import sys
30
- import warnings
31
- from pathlib import Path
32
- from typing import TYPE_CHECKING, Any, Iterable
33
-
34
- import numpy as np
35
- import pandas as pd
36
-
37
- warnings.filterwarnings("ignore", message="Environment variable .* redefined by R")
38
-
39
-
40
- if TYPE_CHECKING:
41
- import logging as logging_module
42
-
43
- from loguru import Logger as LoguruLogger
44
-
45
- LoggerType = LoguruLogger | logging_module.Logger
46
-
47
- else:
48
- LoggerType = None # runtime doesn’t need the type object
49
-
50
- import logging
51
-
52
# Logger selection.
#
# loguru is preferred. When it is not installed we fall back to the standard
# library logger. The loguru-only configuration calls (remove / bind / add)
# are confined to the loguru branch so that the fallback does not fail at
# import time with AttributeError.
try:
    from loguru import logger as loguru_logger  # type: ignore
except ImportError:
    logging.basicConfig()
    logger = logging.getLogger("rpy-bridge")
    # The stdlib logger has no bind(); the same logger is reused for
    # RFunctionCaller call logs.
    _rfc_logger = logger
else:
    logger = loguru_logger

    # --- Remove default handler to override global default ---
    logger.remove()

    # --- Add a "sink" for RFunctionCaller logs ---
    _rfc_logger = logger.bind(tag="[RFunctionCaller]")
    _rfc_logger.add(
        sys.stderr,
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",  # Only show message
        level="INFO",
    )
71
-
72
-
73
def _log_r_call(func_name: str, source_info: str):
    """
    Log a successful R function call at INFO level.

    The emitted message is
    ``[rpy-bridge.RFunctionCaller] Called R function '<func_name>' from <source_info>``.

    Parameters
    ----------
    func_name : str
        Name of the R function that was called.
    source_info : str
        Human-readable description of where the function was resolved from.
    """
    if hasattr(_rfc_logger, "opt"):
        # loguru: depth=1 attributes the record to the caller of this helper.
        _rfc_logger.opt(depth=1, record=False).info(
            "[rpy-bridge.RFunctionCaller] Called R function '{}' from {}",
            func_name,
            source_info,
        )
    else:
        # Standard-library logger: it has no opt() and uses %-style lazy
        # formatting instead of '{}' placeholders.
        _rfc_logger.info(
            "[rpy-bridge.RFunctionCaller] Called R function '%s' from %s",
            func_name,
            source_info,
        )
82
-
83
-
84
- # ---------------------------------------------------------------------
85
- # Path resolution
86
- # ---------------------------------------------------------------------
87
- def _normalize_scripts(
88
- scripts: str | Path | Iterable[str | Path] | None,
89
- ) -> list[Path]:
90
- if scripts is None:
91
- return []
92
- if isinstance(scripts, (str, Path)):
93
- return [Path(scripts).resolve()]
94
- try:
95
- return [Path(s).resolve() for s in scripts]
96
- except TypeError:
97
- raise TypeError(
98
- f"Invalid type for 'scripts': {type(scripts)}. Must be str, Path, or list/iterable thereof."
99
- )
100
-
101
-
102
- # ---------------------------------------------------------------------
103
- # R detection and rpy2 installation
104
- # ---------------------------------------------------------------------
105
def ensure_rpy2_available() -> None:
    """
    Verify that the ``rpy2`` package can be imported.

    Nothing is installed on the fly: when ``rpy2`` cannot be located a
    ``RuntimeError`` carrying installation instructions is raised instead.
    """
    if importlib.util.find_spec("rpy2") is not None:
        return

    instructions = "".join(
        [
            "\n[Error] rpy2 is not installed. Please install it in your Python environment:\n",
            " pip install rpy2\n\n",
            "Make sure your Python environment can access your system R installation.\n",
            "On macOS with Homebrew: brew install r\n",
            "On Linux: apt install r-base (Debian/Ubuntu) or yum install R (CentOS/RHEL)\n",
            "On Windows: install R from https://cran.r-project.org\n",
        ]
    )
    raise RuntimeError(instructions)
119
-
120
-
121
- def find_r_home() -> str | None:
122
- """
123
- Detect system R installation.
124
- """
125
- try:
126
- r_home = subprocess.check_output(
127
- ["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
128
- stderr=subprocess.PIPE,
129
- text=True,
130
- ).strip()
131
- if r_home.endswith(">"): # sometimes R console prints >
132
- r_home = r_home[:-1].strip()
133
- return r_home
134
- except FileNotFoundError:
135
- # fallback paths (Linux, macOS Homebrew, Windows)
136
- possible_paths = [
137
- "/usr/lib/R",
138
- "/usr/local/lib/R",
139
- "/opt/homebrew/Cellar/r/4.5.2/lib/R", # macOS Homebrew
140
- "C:\\Program Files\\R\\R-4.5.2", # Windows
141
- ]
142
- for p in possible_paths:
143
- if os.path.exists(p):
144
- return p
145
- return None
146
-
147
-
148
- # Determine if we're running in CI / testing
149
def _prepend_env_path(var_name: str, entry: str, sep: str = os.pathsep) -> None:
    """Put ``entry`` at the front of the path-list variable ``var_name``.

    Nothing is changed when ``entry`` is already one of the listed paths.
    When the variable is unset or empty it becomes exactly ``entry``: no
    trailing separator is left behind, since an empty entry in a library
    search path is interpreted as the current directory.
    """
    current = os.environ.get(var_name, "")
    if entry in current.split(sep):
        return
    os.environ[var_name] = f"{entry}{sep}{current}" if current else entry


# Determine if we're running in CI / testing
CI_TESTING = os.environ.get("GITHUB_ACTIONS") == "true" or os.environ.get("TESTING") == "1"

R_HOME = os.environ.get("R_HOME")
if not R_HOME:
    R_HOME = find_r_home()
    if not R_HOME:
        if CI_TESTING:
            logger.warning("R not found; skipping all R-dependent setup in CI/testing environment.")
            R_HOME = None  # Explicitly None to signal "no R available"
        else:
            raise RuntimeError("R not found. Please install R or add it to PATH.")

if R_HOME:
    # Export the resolved location (a no-op when it was already set).
    os.environ["R_HOME"] = R_HOME

logger.info(
    f"[rpy-bridge] R_HOME = {R_HOME if R_HOME else 'not detected; R-dependent code skipped'}"
)

# Only configure platform-specific library paths if R is available
if R_HOME:
    if sys.platform == "darwin":
        _prepend_env_path("DYLD_FALLBACK_LIBRARY_PATH", os.path.join(R_HOME, "lib"), sep=":")

    elif sys.platform.startswith("linux"):
        _prepend_env_path("LD_LIBRARY_PATH", os.path.join(R_HOME, "lib"), sep=":")

    elif sys.platform.startswith("win"):
        _prepend_env_path("PATH", os.path.join(R_HOME, "bin", "x64"))
187
-
188
-
189
- # ---------------------------------------------------------------------
190
- # Lazy rpy2 import machinery
191
- # ---------------------------------------------------------------------
192
- _RPY2: dict | None = None
193
-
194
-
195
- def _require_rpy2(raise_on_missing: bool = True) -> dict | None:
196
- global _RPY2
197
- if _RPY2 is not None:
198
- return _RPY2
199
-
200
- try:
201
- import rpy2.robjects as ro
202
- from rpy2 import robjects
203
- from rpy2.rinterface_lib.sexp import NULLType
204
- from rpy2.rlike.container import NamedList
205
- from rpy2.robjects import pandas2ri
206
- from rpy2.robjects.conversion import localconverter
207
- from rpy2.robjects.vectors import (
208
- BoolVector,
209
- FloatVector,
210
- IntVector,
211
- ListVector,
212
- StrVector,
213
- )
214
-
215
- _RPY2 = {
216
- "ro": ro,
217
- "robjects": robjects,
218
- "pandas2ri": pandas2ri,
219
- "localconverter": localconverter,
220
- "BoolVector": BoolVector,
221
- "FloatVector": FloatVector,
222
- "IntVector": IntVector,
223
- "ListVector": ListVector,
224
- "StrVector": StrVector,
225
- "NULLType": NULLType,
226
- "NamedList": NamedList,
227
- }
228
- return _RPY2
229
-
230
- except ImportError as e:
231
- if raise_on_missing:
232
- raise RuntimeError(
233
- "R support requires optional dependency `rpy2`. Install with: pip install rpy-bridge[r]"
234
- ) from e
235
- return None
236
-
237
-
238
def _ensure_rpy2() -> dict:
    """
    Return the cached rpy2 object dict, loading it through
    ``_require_rpy2()`` when the cache is still empty.
    """
    global _RPY2

    loaded = _RPY2 if _RPY2 is not None else _require_rpy2()
    _RPY2 = loaded
    # Narrows the Optional for type checkers; _require_rpy2() raises by default.
    assert _RPY2 is not None, "_require_rpy2() returned None"
    return _RPY2
244
-
245
-
246
- # ---------------------------------------------------------------------
247
- # Activate renv
248
- # ---------------------------------------------------------------------
249
def activate_renv(path_to_renv: Path) -> None:
    """
    Activate the renv environment of an R project inside the embedded R session.

    Parameters
    ----------
    path_to_renv : Path or str
        Either the project root (containing ``renv/`` and ``renv.lock``) or
        the ``renv/`` directory itself.

    Raises
    ------
    FileNotFoundError
        If ``renv/activate.R`` or ``renv.lock`` is missing.
    """
    r = _ensure_rpy2()
    robjects = r["robjects"]

    def _r_string(path: Path) -> str:
        # Render a path as a double-quoted R string literal: forward slashes
        # (so Windows separators are not read as escapes), with backslashes
        # and double quotes escaped.
        escaped = Path(path).as_posix().replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    path_to_renv = Path(path_to_renv).resolve()
    if path_to_renv.name == "renv" and (path_to_renv / "activate.R").exists():
        renv_dir = path_to_renv
        project_dir = path_to_renv.parent
    else:
        renv_dir = path_to_renv / "renv"
        project_dir = path_to_renv

    renv_activate = renv_dir / "activate.R"
    renv_lock = project_dir / "renv.lock"

    if not renv_activate.exists() or not renv_lock.exists():
        raise FileNotFoundError(f"[Error] renv environment incomplete: {path_to_renv}")

    renviron_file = project_dir / ".Renviron"
    if renviron_file.is_file():
        os.environ["R_ENVIRON_USER"] = str(renviron_file)
        logger.info(f"[rpy-bridge] R_ENVIRON_USER set to: {renviron_file}")

    rprofile_file = project_dir / ".Rprofile"
    if rprofile_file.is_file():
        robjects.r(f"source({_r_string(rprofile_file)})")
        logger.info(f"[rpy-bridge] .Rprofile sourced: {rprofile_file}")

    try:
        robjects.r("suppressMessages(library(renv))")
    except Exception:
        # renv could not be loaded: install it into the project library.
        logger.info("[rpy-bridge] Installing renv package in project library...")
        robjects.r(
            'install.packages("renv", repos="https://cloud.r-project.org", '
            f"lib={_r_string(renv_dir / 'library')})"
        )
        robjects.r("library(renv)")

    robjects.r(f"renv::load({_r_string(project_dir)})")
    logger.info(f"[rpy-bridge] renv environment loaded for project: {project_dir}")
288
-
289
-
290
- # ---------------------------------------------------------------------
291
- # NamespaceWrapper
292
- # ---------------------------------------------------------------------
293
class NamespaceWrapper:
    """
    Wraps an R script namespace for Python attribute access.

    ``env`` is a mapping from object name to the object itself; entries are
    exposed as attributes of the wrapper.
    """

    def __init__(self, env):
        self._env = env

    def __getattr__(self, func_name):
        # Read _env through __dict__: __getattr__ also runs while the
        # instance is only half built (copy, unpickling, __new__), and
        # touching self._env there would recurse without end.
        env = self.__dict__.get("_env")
        if env is not None and func_name in env:
            return env[func_name]
        raise AttributeError(f"Function '{func_name}' not found in R namespace")

    def __dir__(self):
        # Include namespace entries so interactive completion can see them.
        env = self.__dict__.get("_env")
        names = set(super().__dir__())
        if env is not None:
            names.update(str(key) for key in env)
        return sorted(names)

    def list_functions(self):
        """
        Return a list of callable functions in this namespace.
        """
        return [k for k, v in self._env.items() if callable(v)]
311
-
312
-
313
- # ---------------------------------------------------------------------
314
- # RFunctionCaller
315
- # ---------------------------------------------------------------------
316
- class RFunctionCaller:
317
- """
318
- Primary interface for calling R functions from Python.
319
-
320
- ``RFunctionCaller`` loads one or more R scripts into isolated namespaces
321
- and provides a unified ``call()`` method for executing:
322
-
323
- * Functions defined in sourced R scripts
324
- * Base R functions (e.g. ``sum``, ``mean``)
325
- * Functions from installed R packages (via ``package::function``)
326
-
327
- In most workflows, users only need to interact with this class.
328
-
329
- Parameters
330
- ----------
331
- path_to_renv : Path or None, optional
332
- Path to an R project that uses ``renv``. This may be either the project
333
- root or the ``renv/`` directory itself. If provided, the renv
334
- environment is activated before any scripts are sourced.
335
-
336
- scripts : str, Path, list[str | Path], or None, optional
337
- One or more ``.R`` files or directories containing ``.R`` files.
338
- Each script is sourced into its own namespace.
339
-
340
- packages : str or list[str], optional
341
- R packages to load (and install if missing) before calling functions.
342
-
343
- Notes
344
- -----
345
- * Python objects are automatically converted to R objects.
346
- * R return values are converted back to Python equivalents.
347
- * Missing values (``None``, ``pd.NA``) are mapped to R ``NA``.
348
- """
349
-
350
- def __init__(
351
- self,
352
- path_to_renv: str | Path | None = None,
353
- scripts: str | Path | list[str | Path] | None = None,
354
- packages: str | list[str] | None = None,
355
- **kwargs, # catch unexpected keywords
356
- ):
357
- # Handle path_to_renv safely
358
- if path_to_renv is not None:
359
- if not isinstance(path_to_renv, Path):
360
- path_to_renv = Path(path_to_renv)
361
- self.path_to_renv = path_to_renv.resolve()
362
- else:
363
- self.path_to_renv = None
364
-
365
- # --- Handle deprecated 'script_path' ---
366
- if "script_path" in kwargs:
367
- script_path_value = kwargs.pop("script_path")
368
- warnings.warn(
369
- "'script_path' argument is deprecated. "
370
- "Please use 'scripts' instead (accepts a Path or list of Paths).",
371
- DeprecationWarning,
372
- stacklevel=2,
373
- )
374
- if scripts is None:
375
- scripts = script_path_value
376
- else:
377
- # Both provided → prioritize scripts and ignore script_path
378
- logger.warning("'script_path' ignored because 'scripts' argument is also provided.")
379
-
380
- self.scripts = _normalize_scripts(scripts)
381
-
382
- # --- Check all scripts exist immediately ---
383
- for script_path in self.scripts:
384
- if not script_path.exists():
385
- raise FileNotFoundError(f"R script path not found: {script_path}")
386
-
387
- # Raise error if other unexpected kwargs remain
388
- if kwargs:
389
- raise TypeError(
390
- f"RFunctionCaller.__init__() received unexpected keyword arguments: {list(kwargs.keys())}"
391
- )
392
-
393
- self.path_to_renv = path_to_renv.resolve() if path_to_renv else None
394
- self._namespaces: dict[str, Any] = {}
395
-
396
- # Normalize scripts to a list
397
- if scripts is None:
398
- self.scripts: list[Path] = []
399
- elif isinstance(scripts, Path):
400
- self.scripts = [scripts.resolve()]
401
- else:
402
- self.scripts = [s.resolve() for s in scripts]
403
-
404
- # Normalize packages to a list
405
- if packages is None:
406
- self.packages: list[str] = []
407
- elif isinstance(packages, str):
408
- self.packages = [packages]
409
- else:
410
- self.packages = packages
411
-
412
- # Lazy-loaded attributes
413
- self._r = None
414
- self.ro = None
415
- self.robjects = None
416
- self.pandas2ri = None
417
- self.localconverter = None
418
- self.IntVector = None
419
- self.FloatVector = None
420
- self.BoolVector = None
421
- self.StrVector = None
422
- self.ListVector = None
423
- self.NamedList = None
424
-
425
- # Internal state
426
- self._renv_activated = False
427
- self._packages_loaded = False
428
- self._scripts_loaded = [False] * len(self.scripts)
429
-
430
- # -----------------------------------------------------------------
431
- # Internal: lazy R loading
432
- # -----------------------------------------------------------------
433
- def _ensure_r_loaded(self) -> None:
434
- """
435
- Ensure R runtime is initialized and all configured R scripts
436
- are sourced exactly once, in isolated environments.
437
- """
438
- if self.robjects is None:
439
- rpy2_dict = _ensure_rpy2()
440
- self._RPY2 = rpy2_dict # cache in instance
441
- self._r = rpy2_dict["ro"]
442
- self.ro = rpy2_dict["robjects"]
443
- self.robjects = rpy2_dict["robjects"]
444
- self.pandas2ri = rpy2_dict["pandas2ri"]
445
- self.localconverter = rpy2_dict["localconverter"]
446
- self.IntVector = rpy2_dict["IntVector"]
447
- self.FloatVector = rpy2_dict["FloatVector"]
448
- self.BoolVector = rpy2_dict["BoolVector"]
449
- self.StrVector = rpy2_dict["StrVector"]
450
- self.ListVector = rpy2_dict["ListVector"]
451
- self.NamedList = rpy2_dict["NamedList"]
452
-
453
- # Activate renv once if requested
454
- if self.path_to_renv and not self._renv_activated:
455
- try:
456
- activate_renv(self.path_to_renv)
457
- self._renv_activated = True
458
- logger.info(
459
- f"[rpy-bridge.RFunctionCaller] renv activated for project: {self.path_to_renv}"
460
- )
461
- except Exception as e:
462
- raise RuntimeError(f"Failed to activate renv at {self.path_to_renv}: {e}") from e
463
-
464
- r = self.robjects.r
465
-
466
- # Ensure required R package
467
- self.ensure_r_package("withr")
468
-
469
- if not hasattr(self, "_namespaces"):
470
- self._namespaces: dict[str, dict[str, Any]] = {}
471
-
472
- # --- Iterate over scripts ---
473
- for idx, script_entry in enumerate(self.scripts):
474
- if self._scripts_loaded[idx]:
475
- continue
476
-
477
- script_entry = script_entry.resolve()
478
-
479
- if script_entry.is_file():
480
- r_files = [script_entry]
481
- elif script_entry.is_dir():
482
- r_files = sorted(script_entry.glob("*.R"))
483
- if not r_files:
484
- logger.warning(f"No .R files found in directory: {script_entry}")
485
- self._scripts_loaded[idx] = True
486
- continue
487
- else:
488
- raise ValueError(f"Invalid script path: {script_entry}")
489
-
490
- for script_path in r_files:
491
- ns_name = script_path.stem
492
- logger.opt(depth=2).info(
493
- "[rpy-bridge.RFunctionCaller] Loading R script '{}' as namespace '{}'",
494
- script_path.name,
495
- ns_name,
496
- )
497
-
498
- r("env <- new.env(parent=globalenv())")
499
- r(f'script_path <- "{script_path.as_posix()}"')
500
-
501
- r(
502
- """
503
- withr::with_dir(
504
- dirname(script_path),
505
- sys.source(basename(script_path), envir=env)
506
- )
507
- """
508
- )
509
-
510
- env_obj = r("env")
511
- self._namespaces[ns_name] = {
512
- name: env_obj[name] for name in env_obj.keys() if callable(env_obj[name])
513
- }
514
-
515
- logger.info(
516
- f"[rpy-bridge.RFunctionCaller] Registered {len(self._namespaces[ns_name])} functions in namespace '{ns_name}'"
517
- )
518
-
519
- self._scripts_loaded[idx] = True
520
-
521
- # -----------------------------------------------------------------
522
- # Autocomplete-friendly attribute access for script namespaces
523
- # -----------------------------------------------------------------
524
- def __getattr__(self, name: str):
525
- if "_namespaces" in self.__dict__ and name in self._namespaces:
526
- ns_env = self._namespaces[name]
527
- return NamespaceWrapper(ns_env)
528
- raise AttributeError(f"'RFunctionCaller' object has no attribute '{name}'")
529
-
530
- def _clean_scalar(self, x):
531
- """
532
- Clean R-style missing values to pandas/NumPy equivalents.
533
- Called inside _r2py on each vector element; atomic/scalar only.
534
- """
535
- robjects = self.robjects
536
-
537
- if x is None:
538
- return None
539
-
540
- if x in (
541
- getattr(robjects, "NA_Real", None),
542
- getattr(robjects, "NA_Integer", None),
543
- getattr(robjects, "NA_Logical", None),
544
- ):
545
- return None
546
-
547
- if x is getattr(robjects, "NA_Character", None):
548
- return None
549
-
550
- if isinstance(x, float) and np.isnan(x):
551
- return None
552
-
553
- return x
554
-
555
- def list_namespaces(self) -> list[str]:
556
- """
557
- Return the names of all loaded script namespaces.
558
-
559
- Returns
560
- -------
561
- list[str]
562
- Names of sourced R script namespaces.
563
- """
564
- self._ensure_r_loaded()
565
- return list(self._namespaces.keys())
566
-
567
- def list_namespace_functions(self, namespace: str) -> list[str]:
568
- """
569
- Return all callable functions in a specific namespace.
570
- """
571
- self._ensure_r_loaded()
572
- if namespace not in self._namespaces:
573
- raise ValueError(f"Namespace '{namespace}' not found")
574
- return [k for k, v in self._namespaces[namespace].items() if callable(v)]
575
-
576
- def _get_package_functions(self, pkg: str) -> list[str]:
577
- """
578
- Return a list of callable functions from a loaded R package.
579
- """
580
- r = self.robjects.r
581
- try:
582
- all_objs = list(r[f'ls("package:{pkg}")'])
583
- funcs = [
584
- name
585
- for name in all_objs
586
- if r(f'is.function(get("{name}", envir=asNamespace("{pkg}")))')[0]
587
- ]
588
- return funcs
589
- except Exception:
590
- logger.warning(f"Failed to list functions for package '{pkg}'")
591
- return []
592
-
593
- def list_all_functions(self, include_packages: bool = False) -> dict[str, list[str]]:
594
- """
595
- Return all callable R functions grouped by script namespace and package.
596
- """
597
- self._ensure_r_loaded()
598
- all_funcs = {}
599
-
600
- # --- Script namespaces ---
601
- for ns_name, ns_env in self._namespaces.items():
602
- funcs = [name for name, val in ns_env.items() if callable(val)]
603
- all_funcs[ns_name] = funcs
604
-
605
- # --- Loaded R packages ---
606
- if include_packages:
607
- r = self.robjects.r
608
- try:
609
- pkgs = r("loadedNamespaces()")
610
- for pkg in pkgs:
611
- funcs = self._get_package_functions(pkg)
612
- if not funcs:
613
- # Add a placeholder note
614
- funcs = [
615
- "[See official documentation for functions, datasets, and objects]"
616
- ]
617
- all_funcs[pkg] = funcs
618
- except Exception:
619
- pass
620
-
621
- return all_funcs
622
-
623
- def print_function_tree(self, include_packages: bool = False, max_display: int = 10):
624
- """
625
- Pretty-print available R functions grouped by namespace.
626
-
627
- Parameters
628
- ----------
629
- include_packages : bool, default False
630
- Whether to include functions from loaded R packages.
631
-
632
- max_display : int, default 10
633
- Maximum number of functions displayed per namespace.
634
-
635
- Notes
636
- -----
637
- This method is intended for interactive exploration and debugging.
638
- """
639
- all_funcs = self.list_all_functions(include_packages=include_packages)
640
-
641
- for ns_name, funcs in all_funcs.items():
642
- if not funcs:
643
- continue
644
- print(f"{ns_name}/")
645
- for f in sorted(funcs)[:max_display]:
646
- print(f" {f}")
647
- if len(funcs) > max_display:
648
- print(" ...")
649
-
650
- # -----------------------------------------------------------------
651
- # Python -> R conversion
652
- # -----------------------------------------------------------------
653
- def _py2r(self, obj):
654
- """
655
- Convert Python objects to R objects robustly.
656
- Handles scalars, None/pd.NA, lists, dicts, and pandas DataFrames.
657
- """
658
- self._ensure_r_loaded()
659
- robjects = self.robjects
660
- pandas2ri = self.pandas2ri
661
- FloatVector = self.FloatVector
662
- BoolVector = self.BoolVector
663
- StrVector = self.StrVector
664
- ListVector = self.ListVector
665
- localconverter = self.localconverter
666
-
667
- r_types = (
668
- robjects.vectors.IntVector,
669
- robjects.vectors.FloatVector,
670
- robjects.vectors.BoolVector,
671
- robjects.vectors.StrVector,
672
- robjects.vectors.ListVector,
673
- robjects.DataFrame,
674
- )
675
- if isinstance(obj, r_types):
676
- return obj
677
-
678
- def is_na(x):
679
- return x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))
680
-
681
- with localconverter(robjects.default_converter + pandas2ri.converter):
682
- if is_na(obj):
683
- return robjects.NULL
684
- if isinstance(obj, pd.DataFrame):
685
- return pandas2ri.py2rpy(obj)
686
- if isinstance(obj, pd.Series):
687
- return self._py2r(obj.tolist())
688
- if isinstance(obj, (int, float, bool, str)):
689
- return obj
690
- if isinstance(obj, list):
691
- if len(obj) == 0:
692
- return FloatVector([])
693
-
694
- types = set(type(x) for x in obj if not is_na(x))
695
- if types <= {int, float}:
696
- return FloatVector([robjects.NA_Real if is_na(x) else float(x) for x in obj])
697
- if types <= {bool}:
698
- return BoolVector([robjects.NA_Logical if is_na(x) else x for x in obj])
699
- if types <= {str}:
700
- return StrVector([robjects.NA_Character if is_na(x) else x for x in obj])
701
- return ListVector({str(i): self._py2r(v) for i, v in enumerate(obj)})
702
- if isinstance(obj, dict):
703
- return ListVector({k: self._py2r(v) for k, v in obj.items()})
704
- raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
705
-
706
- # -----------------------------------------------------------------
707
- # R -> Python conversion
708
- # -----------------------------------------------------------------
709
- def _r2py(self, obj, top_level=True):
710
- robjects = self.robjects
711
- NamedList = self.NamedList
712
- ListVector = self.ListVector
713
- StrVector = self.StrVector
714
- IntVector = self.IntVector
715
- FloatVector = self.FloatVector
716
- BoolVector = self.BoolVector
717
- NULLType = self._RPY2["NULLType"]
718
- lc = self.localconverter
719
- pandas2ri = self.pandas2ri
720
-
721
- if isinstance(obj, NULLType):
722
- return None
723
-
724
- if isinstance(obj, robjects.DataFrame):
725
- with lc(robjects.default_converter + pandas2ri.converter):
726
- df = robjects.conversion.rpy2py(obj)
727
- df = postprocess_r_dataframe(df)
728
- return clean_r_missing(df, caller=self)
729
-
730
- if isinstance(obj, (NamedList, ListVector)):
731
- py_obj = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
732
- if isinstance(py_obj, list) and len(py_obj) == 1 and top_level:
733
- return py_obj[0]
734
- return py_obj
735
-
736
- if isinstance(obj, (StrVector, IntVector, FloatVector, BoolVector)):
737
- py_list = [self._clean_scalar(v) for v in obj]
738
- if len(py_list) == 1 and top_level:
739
- return py_list[0]
740
- return py_list
741
-
742
- return self._clean_scalar(obj)
743
-
744
- # -----------------------------------------------------------------
745
- # Public: ensure R package is available
746
- # -----------------------------------------------------------------
747
- def ensure_r_package(self, pkg: str):
748
- r = self.robjects.r
749
- try:
750
- r(f'suppressMessages(library("{pkg}", character.only=TRUE))')
751
- except Exception:
752
- logger.info(f"[rpy-bridge.RFunctionCaller] Package '{pkg}' not found.")
753
- logger.warning(f"[rpy-bridge.RFunctionCaller] Installing missing R package: {pkg}")
754
- r(f'install.packages("{pkg}", repos="https://cloud.r-project.org")')
755
- r(f'suppressMessages(library("{pkg}", character.only=TRUE))')
756
-
757
- # -----------------------------------------------------------------
758
- # Public: call an R function
759
- # -----------------------------------------------------------------
760
- def call(self, func_name: str, *args, **kwargs):
761
- """
762
- Call an R function.
763
-
764
- The function may be defined in:
765
- * a sourced R script
766
- * an installed R package (using ``package::function`` syntax)
767
- * base R
768
-
769
- Parameters
770
- ----------
771
- func_name : str
772
- Name of the R function to call. Package functions should be specified
773
- as ``package::function``.
774
-
775
- *args
776
- Positional arguments passed to the R function.
777
-
778
- **kwargs
779
- Named arguments passed to the R function.
780
-
781
- Returns
782
- -------
783
- object
784
- The result of the R function, converted to a Python object.
785
-
786
- Examples
787
- --------
788
- >>> rfc.call("sum", [1, 2, 3])
789
- >>> rfc.call("dplyr::n_distinct", [1, 2, 2, 3])
790
- >>> rfc.call("add_and_scale", 2, 3, scale=10)
791
- """
792
-
793
- self._ensure_r_loaded()
794
-
795
- func = None
796
- source_info = None
797
-
798
- if "::" in func_name:
799
- ns_name, fname = func_name.split("::", 1)
800
- if ns_name in self._namespaces:
801
- ns_env = self._namespaces[ns_name]
802
- if fname in ns_env:
803
- func = ns_env[fname]
804
- source_info = f"script namespace '{ns_name}'"
805
- else:
806
- raise ValueError(
807
- f"Function '{fname}' not found in R script namespace '{ns_name}'"
808
- )
809
- else:
810
- try:
811
- func = self.robjects.r(f"{ns_name}::{fname}")
812
- source_info = f"R package '{ns_name}'"
813
- except Exception as e:
814
- raise RuntimeError(f"Failed to resolve R function '{func_name}': {e}") from e
815
-
816
- else:
817
- for ns_name, ns_env in self._namespaces.items():
818
- if func_name in ns_env:
819
- func = ns_env[func_name]
820
- source_info = f"script namespace '{ns_name}'"
821
- break
822
-
823
- if func is None:
824
- try:
825
- func = self.robjects.globalenv[func_name]
826
- source_info = "global environment"
827
- except KeyError:
828
- pass
829
-
830
- if func is None:
831
- try:
832
- func = self.robjects.r[func_name]
833
- source_info = "base R / loaded package"
834
- except KeyError:
835
- raise ValueError(
836
- f"R function '{func_name}' not found in any namespace, global env, or base R."
837
- )
838
-
839
- r_args = [self._py2r(a) for a in args]
840
- r_kwargs = {k: self._py2r(v) for k, v in kwargs.items()}
841
-
842
- try:
843
- result = func(*r_args, **r_kwargs)
844
- except Exception as e:
845
- raise RuntimeError(
846
- f"Error calling R function '{func_name}' from {source_info}: {e}"
847
- ) from e
848
-
849
- _log_r_call(func_name, source_info)
850
-
851
- return self._r2py(result)
852
-
853
-
854
- # %%
855
- # ------------------------------
856
- # Utility functions for R ↔ Python
857
- # ------------------------------
858
def r_namedlist_to_dict(namedlist, caller: "RFunctionCaller", top_level=False):
    """
    Convert an R list (``NamedList`` / ``ListVector``) to a Python dict or list.

    * If the names are exactly ``"0", "1", ...`` (the keys ``_py2r`` assigns
      to Python lists), a Python list is returned.
    * Otherwise a dict keyed by element name is returned. Elements without a
      usable name (missing or empty) are keyed by their 0-based position, so
      several unnamed elements cannot overwrite one another.
    * Any other object is handed to ``caller._r2py`` unchanged.

    Parameters
    ----------
    namedlist : object
        The R object to convert.
    caller : RFunctionCaller
        Provides ``_r2py`` for converting the individual elements.
    top_level : bool, default False
        Forwarded to ``caller._r2py`` for non-list inputs only.
    """
    r = _ensure_rpy2()
    list_types = (r["NamedList"], r["ListVector"])

    if not isinstance(namedlist, list_types):
        return caller._r2py(namedlist, top_level=top_level)

    # `names` is a method on some container types and an attribute on others.
    names = namedlist.names() if callable(namedlist.names) else namedlist.names

    if names and all(str(i) == str(name) for i, name in enumerate(names)):
        return [caller._r2py(v, top_level=False) for v in namedlist]

    result = {}
    for i, val in enumerate(namedlist):
        name = names[i] if names and i < len(names) else None
        # Partially named R lists carry "" for their unnamed elements.
        key = str(i) if name is None or str(name) == "" else str(name)
        result[key] = caller._r2py(val, top_level=False)
    return result
881
-
882
-
883
def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the ``.groups`` and ``.rows`` entries from ``r_df.attrs`` if present.

    The frame is modified in place and also returned.
    """
    try:
        metadata = r_df.attrs
    except AttributeError:
        # No attrs mapping available: nothing to strip.
        return r_df

    for key in (".groups", ".rows"):
        metadata.pop(key, None)
    return r_df
890
-
891
-
892
def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which the strings ``"nan"``, ``"NaN"``,
    ``"NA"``, ``"na"`` and ``""`` are replaced by ``pd.NA``.
    """
    missing_tokens = ["nan", "NaN", "NA", "na", ""]
    return df.replace(to_replace=missing_tokens, value=pd.NA)
894
-
895
-
896
def normalize_single_df_dtypes(
    df: pd.DataFrame, min_numeric_fraction: float = 0.5
) -> pd.DataFrame:
    """
    Normalize missing-value markers and coerce mostly-numeric object columns.

    The strings ``""``, ``"nan"``, ``"NaN"``, ``"NA"`` and ``"na"`` are
    replaced by ``pd.NA``. Each object column is then converted with
    ``pd.to_numeric(errors="coerce")`` when enough of its non-missing values
    survive the conversion. Values that do not parse are lost (become NaN)
    in a converted column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified, a new frame is returned.
    min_numeric_fraction : float, default 0.5
        Minimum share of the non-missing values that must parse as numbers
        for a column to be converted. ``1.0`` converts only columns that
        parse completely.

    Returns
    -------
    pd.DataFrame
        The normalized frame.
    """
    df = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)
    for col in df.columns:
        series = df[col]
        if pd.api.types.is_object_dtype(series):
            coerced = pd.to_numeric(series, errors="coerce")
            if coerced.notna().sum() >= series.notna().sum() * min_numeric_fraction:
                df[col] = coerced
        # Integer columns that contain missing values are widened to float64.
        if pd.api.types.is_integer_dtype(df[col]) and df[col].isna().any():
            df[col] = df[col].astype("float64")
    return df
907
-
908
-
909
def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Repair column types of a data frame that came from R.

    For every column, based on its dtype on entry:

    * integer columns: the value ``-2147483648`` (R's integer NA sentinel)
      is masked to ``pd.NA``;
    * numeric columns whose non-missing values all lie in ``[10000, 40000]``
      are interpreted as days since 1970-01-01 and converted to datetimes
      (a heuristic: any numeric column in that range is affected);
    * timezone-aware datetime columns are made timezone-naive.

    The frame is modified in place and also returned.
    """
    for col in df.columns:
        series = df[col]
        if pd.api.types.is_integer_dtype(series):
            df[col] = series.mask(series == -2147483648, pd.NA)
        if pd.api.types.is_numeric_dtype(series):
            values = series.dropna()
            if not values.empty and values.between(10000, 40000).all():
                try:
                    df[col] = pd.to_datetime("1970-01-01") + pd.to_timedelta(series, unit="D")
                except Exception:
                    # Best effort: leave the column as it is.
                    pass
        # isinstance check replaces the deprecated
        # pd.api.types.is_datetime64tz_dtype.
        if isinstance(series.dtype, pd.DatetimeTZDtype):
            df[col] = series.dt.tz_localize(None)
    return df
924
-
925
-
926
def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the standard clean-up pipeline on a data frame converted from R.

    Applies ``fix_r_dataframe_types``, ``fix_string_nans`` and
    ``normalize_single_df_dtypes`` in that order. Afterwards an object index
    holding exactly ``1..n`` is replaced by a 0-based ``RangeIndex``.
    """
    for step in (fix_r_dataframe_types, fix_string_nans, normalize_single_df_dtypes):
        df = step(df)

    if df.index.dtype != object:
        return df

    try:
        numeric_index = df.index.astype(int)
        one_based = np.arange(len(df)) + 1
        if (numeric_index == one_based).all():
            df.index = pd.RangeIndex(start=0, stop=len(df))
    except Exception:
        # Index labels that are not integers are left untouched.
        pass
    return df
938
-
939
-
940
def clean_r_missing(obj, caller: "RFunctionCaller"):
    """
    Replace R missing-value sentinels by their pandas/NumPy equivalents.

    ``NA_Real``, ``NA_Integer`` and ``NA_Logical`` (as found on
    ``caller.robjects``) become ``np.nan``; ``NA_Character`` becomes
    ``pd.NA``. DataFrames (cell by cell, in place), dicts and lists are
    processed recursively; any other value is looked up directly and
    returned unchanged when it is not a sentinel.

    Parameters
    ----------
    obj : object
        Value or container to clean.
    caller : RFunctionCaller
        Only its ``robjects`` attribute is read.
    """
    robjects = caller.robjects
    # Built once per top-level call instead of once per cell.
    na_map = {
        getattr(robjects, "NA_Real", None): np.nan,
        getattr(robjects, "NA_Integer", None): np.nan,
        getattr(robjects, "NA_Logical", None): np.nan,
        getattr(robjects, "NA_Character", None): pd.NA,
    }

    def _clean(value):
        if isinstance(value, pd.DataFrame):
            for col in value.columns:
                value[col] = value[col].apply(_clean)
            return value
        if isinstance(value, dict):
            return {k: _clean(v) for k, v in value.items()}
        if isinstance(value, list):
            return [_clean(v) for v in value]
        try:
            return na_map.get(value, value)
        except TypeError:
            # Unhashable values (arrays, sets, ...) cannot be NA sentinels.
            return value

    return _clean(obj)
959
-
960
-
961
- # ---------------------------------------------------------------------
962
- # DataFrame comparison utilities
963
- # ---------------------------------------------------------------------
964
def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Bring the shared columns of two frames to comparable dtypes.

    For every column present in both frames, empty strings become ``pd.NA``
    and then exactly one of the following is applied:

    * numeric vs. object: both sides are parsed with
      ``pd.to_numeric(errors="coerce")``;
    * numeric vs. numeric: both sides are cast to ``float64``;
    * otherwise, if either side is object dtype: both sides are converted to
      strings, with missing cells kept missing instead of being turned into
      text such as ``"nan"`` or ``"None"``.

    Args:
        df1: First frame. Modified in place; pass a copy to keep the original.
        df2: Second frame. Modified in place; pass a copy to keep the original.

    Returns:
        The two (mutated) frames as ``(df1, df2)``.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_object = pd.api.types.is_object_dtype

    for col in df1.columns.intersection(df2.columns):
        df1[col] = df1[col].replace("", pd.NA)
        df2[col] = df2[col].replace("", pd.NA)
        s1, s2 = df1[col], df2[col]
        dtype1, dtype2 = s1.dtype, s2.dtype

        mixed = (is_numeric(dtype1) and is_object(dtype2)) or (
            is_object(dtype1) and is_numeric(dtype2)
        )
        if mixed:
            try:
                coerced1 = pd.to_numeric(s1, errors="coerce")
                coerced2 = pd.to_numeric(s2, errors="coerce")
            except (TypeError, ValueError):
                # Not parseable (e.g. nested containers); fall through to the
                # string comparison below with both frames still untouched.
                pass
            else:
                # Assign only after both sides parsed, so the frames never
                # end up half-converted.
                df1[col], df2[col] = coerced1, coerced2
                continue

        if is_numeric(dtype1) and is_numeric(dtype2):
            df1[col] = s1.astype("float64")
            df2[col] = s2.astype("float64")
            continue

        if is_object(dtype1) or is_object(dtype2):
            df1[col] = _stringify_keep_missing(s1)
            df2[col] = _stringify_keep_missing(s2)
    return df1, df2


def _stringify_keep_missing(series: pd.Series) -> pd.Series:
    """Convert non-missing cells of ``series`` to ``str``; leave missing cells as ``pd.NA``."""
    return series.astype(str).where(series.notna(), pd.NA)
987
-
988
-
989
def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cast shared columns to ``float64`` when either side holds numbers.

    For each column common to both frames, empty strings become ``pd.NA`` and
    both sides are parsed with ``pd.to_numeric(errors="coerce")``. If at least
    one side yields any non-missing number, both sides are stored as
    ``float64`` (unparsable cells become NaN). Otherwise, or if parsing
    raises, the blank-normalised original values are stored back.

    Both frames are modified in place and returned as ``(df1, df2)``.
    """
    shared_columns = df1.columns.intersection(df2.columns)
    for name in shared_columns:
        left = df1[name].replace("", pd.NA)
        right = df2[name].replace("", pd.NA)
        try:
            left_num = pd.to_numeric(left, errors="coerce")
            right_num = pd.to_numeric(right, errors="coerce")
            has_numbers = left_num.notna().any() or right_num.notna().any()
            if has_numbers:
                df1[name] = left_num.astype("float64")
                df2[name] = right_num.astype("float64")
                continue
        except Exception:
            pass
        # Nothing numeric here (or parsing failed): keep the cleaned originals.
        df1[name], df2[name] = left, right
    return df1, df2
1003
-
1004
-
1005
def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8) -> dict:
    """Compare a Python-side frame with an R-side frame and report differences.

    Both inputs are normalised on copies (R type fixes on ``df2``, textual
    missing markers, dtype alignment) and then compared column by column over
    the columns they share. Neither input frame is modified.

    Args:
        df1: Frame produced on the Python side.
        df2: Frame produced on the R side.
        float_tol: Absolute tolerance passed to ``np.isclose`` as ``atol``.
            ``np.isclose`` also applies its default relative tolerance.

    Returns:
        A dict with the keys

        * ``shape_mismatch``, ``columns_mismatch``, ``index_mismatch``: bools;
        * ``numeric_diffs``: column name -> DataFrame (``df1``/``df2``) of the
          rows that are not close;
        * ``non_numeric_diffs``: column name -> DataFrame (``df1``/``df2``) of
          the rows that are unequal and not both missing.

        Mismatches are additionally printed as ``[Warning]`` lines.
    """
    results: dict[str, Any] = {
        "shape_mismatch": False,
        "columns_mismatch": False,
        "index_mismatch": False,
        "numeric_diffs": {},
        "non_numeric_diffs": {},
    }

    # fix_r_dataframe_types assigns into the frame it is given, so hand it a
    # copy to keep the caller's df2 intact.
    df2 = fix_r_dataframe_types(df2.copy())
    df1 = fix_string_nans(df1)
    df2 = fix_string_nans(df2)
    df1, df2 = normalize_dtypes(df1.copy(), df2.copy())
    df1, df2 = align_numeric_dtypes(df1, df2)

    if df1.shape != df2.shape:
        results["shape_mismatch"] = True
        print(f"[Warning] Shape mismatch: df1 {df1.shape} vs df2 {df2.shape}")

    if set(df1.columns) != set(df2.columns):
        results["columns_mismatch"] = True
        print("[Warning] Column mismatch:")
        print(f" df1: {df1.columns}")
        print(f" df2: {df2.columns}")
        common_cols = df1.columns.intersection(df2.columns)
    else:
        common_cols = df1.columns

    if not df1.index.equals(df2.index):
        results["index_mismatch"] = True
        print("[Warning] Index mismatch between df1 and df2")

    df1_aligned, df2_aligned = df1.loc[:, common_cols], df2.loc[:, common_cols]
    for col in common_cols:
        col_py, col_r = df1_aligned[col], df2_aligned[col]
        both_numeric = pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(
            col_r
        )
        # Put both columns on the union index so the boolean masks computed
        # below can index either of them, even when the row labels differ.
        col_py, col_r = col_py.align(col_r)

        if both_numeric:
            close = np.isclose(
                col_py.fillna(np.nan),
                col_r.fillna(np.nan),
                atol=float_tol,
                equal_nan=True,
            )
            if not close.all():
                results["numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[~close], "df2": col_r[~close]}
                )
        else:
            both_na = col_py.isna() & col_r.isna()
            unequal = ~col_py.eq(col_r) & ~both_na
            if unequal.any():
                results["non_numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[unequal], "df2": col_r[unequal]}
                )
    return results