rpy-bridge 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rpy_bridge/__init__.py +4 -28
- rpy_bridge/compare.py +106 -0
- rpy_bridge/convert.py +63 -0
- rpy_bridge/core.py +505 -0
- rpy_bridge/dataframe.py +74 -0
- rpy_bridge/env.py +108 -0
- rpy_bridge/logging.py +50 -0
- rpy_bridge/renv.py +149 -0
- rpy_bridge/rpy2_loader.py +71 -0
- rpy_bridge-0.5.1.dist-info/METADATA +291 -0
- rpy_bridge-0.5.1.dist-info/RECORD +15 -0
- rpy_bridge/rpy2_utils.py +0 -1221
- rpy_bridge-0.4.0.dist-info/METADATA +0 -258
- rpy_bridge-0.4.0.dist-info/RECORD +0 -8
- {rpy_bridge-0.4.0.dist-info → rpy_bridge-0.5.1.dist-info}/WHEEL +0 -0
- {rpy_bridge-0.4.0.dist-info → rpy_bridge-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {rpy_bridge-0.4.0.dist-info → rpy_bridge-0.5.1.dist-info}/top_level.txt +0 -0
rpy_bridge/rpy2_utils.py
DELETED
|
@@ -1,1221 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
R–Python Integration Utility
|
|
3
|
-
|
|
4
|
-
Provides tools to load R scripts, activate renv environments, and call R functions
|
|
5
|
-
directly from Python, with automatic conversion between R and Python data types.
|
|
6
|
-
|
|
7
|
-
----------
|
|
8
|
-
Requirements
|
|
9
|
-
----------
|
|
10
|
-
- R must be installed and accessible in your system environment.
|
|
11
|
-
- Ensure compatibility with your R project's renv setup (or any other R environment you use).
|
|
12
|
-
|
|
13
|
-
Features
|
|
14
|
-
----------
|
|
15
|
-
- Lazy loading of rpy2 and R runtime.
|
|
16
|
-
- Activation of renv environments for isolated R project dependencies.
|
|
17
|
-
- Support for sourcing individual R scripts or directories of scripts.
|
|
18
|
-
- Namespace-based access to R functions.
|
|
19
|
-
- Automatic conversion between R vectors, data frames, and Python types (pandas, lists, scalars).
|
|
20
|
-
- Utilities for cleaning and aligning data frames between R and Python.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
# ruff: noqa: E402
|
|
24
|
-
# %%
|
|
25
|
-
# Import libraries
|
|
26
|
-
import importlib.util
|
|
27
|
-
import os
|
|
28
|
-
import subprocess
|
|
29
|
-
import sys
|
|
30
|
-
import warnings
|
|
31
|
-
from pathlib import Path
|
|
32
|
-
from typing import TYPE_CHECKING, Any, Iterable
|
|
33
|
-
|
|
34
|
-
import numpy as np
|
|
35
|
-
import pandas as pd
|
|
36
|
-
|
|
37
|
-
warnings.filterwarnings("ignore", message="Environment variable .* redefined by R")
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
if TYPE_CHECKING:
|
|
41
|
-
import logging as logging_module
|
|
42
|
-
|
|
43
|
-
from loguru import Logger as LoguruLogger
|
|
44
|
-
|
|
45
|
-
LoggerType = LoguruLogger | logging_module.Logger
|
|
46
|
-
|
|
47
|
-
else:
|
|
48
|
-
LoggerType = None # runtime doesn’t need the type object
|
|
49
|
-
|
|
50
|
-
import logging
|
|
51
|
-
|
|
52
|
-
try:
|
|
53
|
-
from loguru import logger as loguru_logger # type: ignore
|
|
54
|
-
|
|
55
|
-
logger = loguru_logger
|
|
56
|
-
except ImportError:
|
|
57
|
-
logging.basicConfig()
|
|
58
|
-
logger = logging.getLogger("rpy-bridge")
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
# --- Remove default handler to override global default ---
|
|
62
|
-
logger.remove()
|
|
63
|
-
|
|
64
|
-
# --- Add a "sink" for RFunctionCaller logs ---
|
|
65
|
-
_rfc_logger = logger.bind(tag="[RFunctionCaller]")
|
|
66
|
-
_rfc_logger.add(
|
|
67
|
-
sys.stderr,
|
|
68
|
-
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}", # Only show message
|
|
69
|
-
level="INFO",
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def _log_r_call(func_name: str, source_info: str):
    """
    Emit one INFO record stating which R function was called and from where.

    The record goes through the module's ``_rfc_logger`` with ``depth=1`` so
    the reported call site is the caller of this helper.
    """
    call_site_logger = _rfc_logger.opt(depth=1, record=False)
    template = "[rpy-bridge.RFunctionCaller] Called R function '{}' from {}"
    call_site_logger.info(template, func_name, source_info)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# ---------------------------------------------------------------------
|
|
85
|
-
# Path resolution
|
|
86
|
-
# ---------------------------------------------------------------------
|
|
87
|
-
def _normalize_scripts(
|
|
88
|
-
scripts: str | Path | Iterable[str | Path] | None,
|
|
89
|
-
) -> list[Path]:
|
|
90
|
-
if scripts is None:
|
|
91
|
-
return []
|
|
92
|
-
if isinstance(scripts, (str, Path)):
|
|
93
|
-
return [Path(scripts).resolve()]
|
|
94
|
-
try:
|
|
95
|
-
return [Path(s).resolve() for s in scripts]
|
|
96
|
-
except TypeError:
|
|
97
|
-
raise TypeError(
|
|
98
|
-
f"Invalid type for 'scripts': {type(scripts)}. Must be str, Path, or list/iterable thereof."
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# ---------------------------------------------------------------------
|
|
103
|
-
# R detection and rpy2 installation
|
|
104
|
-
# ---------------------------------------------------------------------
|
|
105
|
-
def ensure_rpy2_available() -> None:
    """
    Check that the ``rpy2`` package can be located.

    Nothing is installed on the user's behalf. When ``rpy2`` cannot be found a
    ``RuntimeError`` carrying installation hints is raised.
    """
    rpy2_spec = importlib.util.find_spec("rpy2")
    if rpy2_spec is not None:
        return

    instruction_lines = (
        "\n[Error] rpy2 is not installed. Please install it in your Python environment:\n",
        " pip install rpy2\n\n",
        "Make sure your Python environment can access your system R installation.\n",
        "On macOS with Homebrew: brew install r\n",
        "On Linux: apt install r-base (Debian/Ubuntu) or yum install R (CentOS/RHEL)\n",
        "On Windows: install R from https://cran.r-project.org\n",
    )
    raise RuntimeError("".join(instruction_lines))
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def find_r_home() -> str | None:
|
|
122
|
-
"""
|
|
123
|
-
Detect system R installation.
|
|
124
|
-
"""
|
|
125
|
-
try:
|
|
126
|
-
r_home = subprocess.check_output(
|
|
127
|
-
["R", "--vanilla", "--slave", "-e", "cat(R.home())"],
|
|
128
|
-
stderr=subprocess.PIPE,
|
|
129
|
-
text=True,
|
|
130
|
-
).strip()
|
|
131
|
-
if r_home.endswith(">"): # sometimes R console prints >
|
|
132
|
-
r_home = r_home[:-1].strip()
|
|
133
|
-
return r_home
|
|
134
|
-
except FileNotFoundError:
|
|
135
|
-
# fallback paths (Linux, macOS Homebrew, Windows)
|
|
136
|
-
possible_paths = [
|
|
137
|
-
"/usr/lib/R",
|
|
138
|
-
"/usr/local/lib/R",
|
|
139
|
-
"/opt/homebrew/Cellar/r/4.5.2/lib/R", # macOS Homebrew
|
|
140
|
-
"C:\\Program Files\\R\\R-4.5.2", # Windows
|
|
141
|
-
]
|
|
142
|
-
for p in possible_paths:
|
|
143
|
-
if os.path.exists(p):
|
|
144
|
-
return p
|
|
145
|
-
return None
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
# Determine if we're running in CI / testing
|
|
149
|
-
def _prepend_env_path(var: str, entry: str, sep: str = os.pathsep) -> None:
    """Put *entry* first in the path-list variable *var* unless already listed.

    Entries are compared as whole path components, and empty components are
    dropped so the resulting value never has a leading/trailing separator.
    """
    existing = [part for part in os.environ.get(var, "").split(sep) if part]
    if entry not in existing:
        os.environ[var] = sep.join([entry, *existing])


# Determine if we're running in CI / testing
CI_TESTING = os.environ.get("GITHUB_ACTIONS") == "true" or os.environ.get("TESTING") == "1"

R_HOME = os.environ.get("R_HOME")
if not R_HOME:
    R_HOME = find_r_home()
    if not R_HOME:
        if CI_TESTING:
            logger.warning("R not found; skipping all R-dependent setup in CI/testing environment.")
            R_HOME = None  # Explicitly None to signal "no R available"
        else:
            raise RuntimeError("R not found. Please install R or add it to PATH.")

if R_HOME:
    # Export the value (a no-op when it came from the environment already).
    os.environ["R_HOME"] = R_HOME

logger.info(
    f"[rpy-bridge] R_HOME = {R_HOME if R_HOME else 'not detected; R-dependent code skipped'}"
)

# Only configure platform-specific library paths if R is available
if R_HOME:
    if sys.platform == "darwin":
        _prepend_env_path("DYLD_FALLBACK_LIBRARY_PATH", os.path.join(R_HOME, "lib"), ":")

    elif sys.platform.startswith("linux"):
        _prepend_env_path("LD_LIBRARY_PATH", os.path.join(R_HOME, "lib"), ":")

    elif sys.platform.startswith("win"):
        _prepend_env_path("PATH", os.path.join(R_HOME, "bin", "x64"), os.pathsep)
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
# ---------------------------------------------------------------------
|
|
190
|
-
# Lazy rpy2 import machinery
|
|
191
|
-
# ---------------------------------------------------------------------
|
|
192
|
-
_RPY2: dict | None = None
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
def _require_rpy2(raise_on_missing: bool = True) -> dict | None:
|
|
196
|
-
global _RPY2
|
|
197
|
-
if _RPY2 is not None:
|
|
198
|
-
return _RPY2
|
|
199
|
-
|
|
200
|
-
try:
|
|
201
|
-
import rpy2.robjects as ro
|
|
202
|
-
from rpy2 import robjects
|
|
203
|
-
from rpy2.rinterface_lib.sexp import NULLType
|
|
204
|
-
from rpy2.rlike.container import NamedList
|
|
205
|
-
from rpy2.robjects import pandas2ri
|
|
206
|
-
from rpy2.robjects.conversion import localconverter
|
|
207
|
-
from rpy2.robjects.vectors import (
|
|
208
|
-
BoolVector,
|
|
209
|
-
FloatVector,
|
|
210
|
-
IntVector,
|
|
211
|
-
ListVector,
|
|
212
|
-
StrVector,
|
|
213
|
-
)
|
|
214
|
-
|
|
215
|
-
_RPY2 = {
|
|
216
|
-
"ro": ro,
|
|
217
|
-
"robjects": robjects,
|
|
218
|
-
"pandas2ri": pandas2ri,
|
|
219
|
-
"localconverter": localconverter,
|
|
220
|
-
"BoolVector": BoolVector,
|
|
221
|
-
"FloatVector": FloatVector,
|
|
222
|
-
"IntVector": IntVector,
|
|
223
|
-
"ListVector": ListVector,
|
|
224
|
-
"StrVector": StrVector,
|
|
225
|
-
"NULLType": NULLType,
|
|
226
|
-
"NamedList": NamedList,
|
|
227
|
-
}
|
|
228
|
-
return _RPY2
|
|
229
|
-
|
|
230
|
-
except ImportError as e:
|
|
231
|
-
if raise_on_missing:
|
|
232
|
-
raise RuntimeError(
|
|
233
|
-
"R support requires rpy2; install it in your Python env (e.g., pip install rpy2)"
|
|
234
|
-
) from e
|
|
235
|
-
return None
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
def _ensure_rpy2() -> dict:
    """Return the cached rpy2 bundle, loading it through ``_require_rpy2`` if needed."""
    global _RPY2
    bundle = _RPY2
    if bundle is None:
        bundle = _RPY2 = _require_rpy2()
    assert bundle is not None, "_require_rpy2() returned None"
    return bundle
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
# ---------------------------------------------------------------------
|
|
247
|
-
# Project root discovery (for this.path / working dir)
|
|
248
|
-
# ---------------------------------------------------------------------
|
|
249
|
-
def _candidate_project_dirs(base: Path, depth: int = 3) -> list[Path]:
|
|
250
|
-
return [base] + list(base.parents)[:depth]
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def _has_root_marker(path: Path) -> bool:
|
|
254
|
-
if (path / ".git").exists():
|
|
255
|
-
return True
|
|
256
|
-
if any(path.glob("*.Rproj")):
|
|
257
|
-
return True
|
|
258
|
-
if (path / ".here").exists():
|
|
259
|
-
return True
|
|
260
|
-
if (path / "DESCRIPTION").exists():
|
|
261
|
-
return True
|
|
262
|
-
if (path / "renv.lock").exists():
|
|
263
|
-
return True
|
|
264
|
-
return False
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
def _find_project_root(path_to_renv: Path | None, scripts: list[Path]) -> Path | None:
    """
    Locate the nearest directory that carries a project-root marker.

    The directory of the first script (and its ancestors) is searched before
    ``path_to_renv`` (and its ancestors). Returns the first resolved directory
    for which ``_has_root_marker`` is true, or ``None`` if there is none.
    """
    starting_points: list[Path] = []
    if scripts:
        starting_points.append(scripts[0].parent)
    if path_to_renv is not None:
        starting_points.append(path_to_renv)

    visited = set()
    for start in starting_points:
        for directory in _candidate_project_dirs(start):
            resolved = directory.resolve()
            if resolved in visited:
                continue
            visited.add(resolved)
            if _has_root_marker(resolved):
                return resolved
    return None
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
# ---------------------------------------------------------------------
|
|
287
|
-
# Activate renv
|
|
288
|
-
# ---------------------------------------------------------------------
|
|
289
|
-
def activate_renv(path_to_renv: Path) -> None:
    """
    Load the renv environment found at or above ``path_to_renv`` into R.

    ``path_to_renv`` may be the project root (holding ``renv.lock`` and
    ``renv/``), the ``renv`` directory itself, or a directory up to three
    levels below either; the search walks upwards from the given path.

    Steps, in order: point ``R_ENVIRON_USER`` at the project's ``.Renviron``
    if present, source the project's ``.Rprofile`` if present, make sure the
    ``renv`` package can be loaded (installing it into ``renv/library`` when
    it cannot), then call ``renv::load()`` on the project directory.

    Raises
    ------
    FileNotFoundError
        If no directory with both ``activate.R`` and ``renv.lock`` is found.
    """
    r = _ensure_rpy2()
    robjects = r["robjects"]

    path_to_renv = path_to_renv.resolve()

    project_dir = None
    renv_dir = None

    # Search base, then parents up to 3 levels for renv assets
    for cand in _candidate_project_dirs(path_to_renv, depth=3):
        # If the candidate *is* a renv dir with activate.R, treat its parent as project
        if cand.name == "renv" and (cand / "activate.R").exists():
            cand_renv, cand_project = cand, cand.parent
        else:
            cand_renv, cand_project = cand / "renv", cand

        lock_path = cand_project / "renv.lock"
        if not lock_path.exists():
            # Some layouts keep the lockfile inside renv/ instead.
            lock_path = cand_renv / "renv.lock"

        if (cand_renv / "activate.R").exists() and lock_path.exists():
            project_dir, renv_dir = cand_project, cand_renv
            break

    if project_dir is None or renv_dir is None:
        raise FileNotFoundError(
            f"[Error] renv environment incomplete: activate.R or renv.lock not found near {path_to_renv}"
        )

    renviron_file = project_dir / ".Renviron"
    if renviron_file.is_file():
        os.environ["R_ENVIRON_USER"] = str(renviron_file)
        logger.info(f"[rpy-bridge] R_ENVIRON_USER set to: {renviron_file}")

    rprofile_file = project_dir / ".Rprofile"
    if rprofile_file.is_file():
        # Source .Rprofile from the project root so any relative paths (e.g. renv/activate.R)
        # are resolved correctly even when the current R working directory is elsewhere.
        # NOTE(review): on.exit() is issued at R top level here; confirm the
        # previous working directory is actually restored afterwards.
        try:
            robjects.r(
                f'old_wd <- getwd(); setwd("{project_dir.as_posix()}"); '
                f"on.exit(setwd(old_wd), add = TRUE); "
                f'source("{rprofile_file.as_posix()}")'
            )
            logger.info(f"[rpy-bridge] .Rprofile sourced: {rprofile_file}")
        except Exception as e:  # pragma: no cover - defensive fallback
            # Format eagerly: a "%s" placeholder is not interpolated by
            # loguru, which would drop the exception text from the record.
            logger.warning(
                f"[rpy-bridge] Failed to source .Rprofile; falling back to renv::activate(): {e}"
            )

    # If .Rprofile was absent or failed, ensure renv is loaded directly.
    try:
        robjects.r("suppressMessages(library(renv))")
    except Exception:
        logger.info("[rpy-bridge] Installing renv package in project library...")
        # as_posix() keeps Windows backslashes out of the R string literal,
        # matching how every other path in this function is passed to R.
        library_dir = (renv_dir / "library").as_posix()
        robjects.r(
            f'install.packages("renv", repos="https://cloud.r-project.org", lib="{library_dir}")'
        )
        robjects.r("library(renv)")

    # Activate renv explicitly in case .Rprofile did not already do it (or failed).
    robjects.r(f'renv::load("{project_dir.as_posix()}")')
    logger.info(f"[rpy-bridge] renv environment loaded for project: {project_dir}")
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
# ---------------------------------------------------------------------
|
|
376
|
-
# NamespaceWrapper
|
|
377
|
-
# ---------------------------------------------------------------------
|
|
378
|
-
class NamespaceWrapper:
    """
    Wraps an R script namespace for Python attribute access.

    ``env`` is a mapping from names to objects; ``wrapper.name`` returns
    ``env["name"]``.
    """

    def __init__(self, env):
        self._env = env

    def __getattr__(self, func_name):
        # __getattr__ only runs after normal lookup fails. Read _env from the
        # instance dict so an instance without _env (e.g. one made via
        # __new__ or mid-copy) raises AttributeError instead of recursing.
        env = self.__dict__.get("_env")
        if env is not None and func_name in env:
            return env[func_name]
        raise AttributeError(f"Function '{func_name}' not found in R namespace")

    def __dir__(self):
        """Include the namespace's string keys so they show up in completion."""
        env = self.__dict__.get("_env")
        names = set(super().__dir__())
        if env is not None:
            names.update(key for key in env if isinstance(key, str))
        return sorted(names)

    def list_functions(self):
        """
        Return a list of callable functions in this namespace.
        """
        return [k for k, v in self._env.items() if callable(v)]
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
# ---------------------------------------------------------------------
|
|
399
|
-
# RFunctionCaller
|
|
400
|
-
# ---------------------------------------------------------------------
|
|
401
|
-
class RFunctionCaller:
|
|
402
|
-
"""
|
|
403
|
-
Primary interface for calling R functions from Python.
|
|
404
|
-
|
|
405
|
-
``RFunctionCaller`` loads one or more R scripts into isolated namespaces
|
|
406
|
-
and provides a unified ``call()`` method for executing:
|
|
407
|
-
|
|
408
|
-
* Functions defined in sourced R scripts
|
|
409
|
-
* Base R functions (e.g. ``sum``, ``mean``)
|
|
410
|
-
* Functions from installed R packages (via ``package::function``)
|
|
411
|
-
|
|
412
|
-
In most workflows, users only need to interact with this class.
|
|
413
|
-
|
|
414
|
-
Parameters
|
|
415
|
-
----------
|
|
416
|
-
path_to_renv : Path or None, optional
|
|
417
|
-
Path to an R project that uses ``renv``. This may be either the project
|
|
418
|
-
root or the ``renv/`` directory itself. If provided, the renv
|
|
419
|
-
environment is activated before any scripts are sourced.
|
|
420
|
-
|
|
421
|
-
scripts : str, Path, list[str | Path], or None, optional
|
|
422
|
-
One or more ``.R`` files or directories containing ``.R`` files.
|
|
423
|
-
Each script is sourced into its own namespace.
|
|
424
|
-
|
|
425
|
-
packages : str or list[str], optional
|
|
426
|
-
R packages to load (and install if missing) before calling functions.
|
|
427
|
-
|
|
428
|
-
Notes
|
|
429
|
-
-----
|
|
430
|
-
* Python objects are automatically converted to R objects.
|
|
431
|
-
* R return values are converted back to Python equivalents.
|
|
432
|
-
* Missing values (``None``, ``pd.NA``) are mapped to R ``NA``.
|
|
433
|
-
"""
|
|
434
|
-
|
|
435
|
-
def __init__(
    self,
    path_to_renv: str | Path | None = None,
    scripts: str | Path | list[str | Path] | None = None,
    packages: str | list[str] | None = None,
    headless: bool = True,
    skip_renv_if_no_r: bool = True,
    **kwargs,  # catch unexpected keywords
):
    """
    Record configuration only; R itself is loaded later, on first use.

    Raises
    ------
    FileNotFoundError
        If any entry of ``scripts`` does not exist.
    TypeError
        If ``scripts`` has an unsupported type, or unknown keyword arguments
        (other than the deprecated ``script_path``) are passed.
    """
    # Accept str or Path; always store a resolved Path (or None).
    self.path_to_renv = Path(path_to_renv).resolve() if path_to_renv is not None else None

    # --- Handle deprecated 'script_path' ---
    if "script_path" in kwargs:
        script_path_value = kwargs.pop("script_path")
        warnings.warn(
            "'script_path' argument is deprecated. "
            "Please use 'scripts' instead (accepts a Path or list of Paths).",
            DeprecationWarning,
            stacklevel=2,
        )
        if scripts is None:
            scripts = script_path_value
        else:
            # Both provided → prioritize scripts and ignore script_path
            logger.warning("'script_path' ignored because 'scripts' argument is also provided.")

    # Single normalization step. A second pass used to call .resolve() on the
    # raw argument, which failed for str and list[str] input.
    self.scripts: list[Path] = _normalize_scripts(scripts)

    # --- Check all scripts exist immediately ---
    for script_path in self.scripts:
        if not script_path.exists():
            raise FileNotFoundError(f"R script path not found: {script_path}")

    # Raise error if other unexpected kwargs remain
    if kwargs:
        raise TypeError(
            f"RFunctionCaller.__init__() received unexpected keyword arguments: {list(kwargs.keys())}"
        )

    self._namespaces: dict[str, Any] = {}
    self._namespace_roots: dict[str, Path] = {}

    # Normalize packages to a list (copied so the caller's list is not shared)
    if packages is None:
        self.packages: list[str] = []
    elif isinstance(packages, str):
        self.packages = [packages]
    else:
        self.packages = list(packages)

    # Headless mode guards (avoid GUI probing in non-interactive runs)
    self.headless = headless
    self.skip_renv_if_no_r = skip_renv_if_no_r

    # Lazy-loaded attributes
    self._r = None
    self.ro = None
    self.robjects = None
    self.pandas2ri = None
    self.localconverter = None
    self.IntVector = None
    self.FloatVector = None
    self.BoolVector = None
    self.StrVector = None
    self.ListVector = None
    self.NamedList = None

    # Internal state
    self._renv_activated = False
    self._packages_loaded = False
    self._scripts_loaded = [False] * len(self.scripts)
|
|
521
|
-
|
|
522
|
-
def _should_activate_renv(self) -> bool:
|
|
523
|
-
"""Determine if renv activation should run, honoring CI/override knobs."""
|
|
524
|
-
if not self.path_to_renv:
|
|
525
|
-
return False
|
|
526
|
-
|
|
527
|
-
# Explicit opt-out (e.g., CI jobs that only run pure-Python tests)
|
|
528
|
-
if os.environ.get("RPY_BRIDGE_SKIP_RENV") in {"1", "true", "TRUE"}:
|
|
529
|
-
logger.info("[rpy-bridge] Skipping renv activation: RPY_BRIDGE_SKIP_RENV set")
|
|
530
|
-
return False
|
|
531
|
-
|
|
532
|
-
# CI without R available: skip if allowed
|
|
533
|
-
if CI_TESTING and R_HOME is None and self.skip_renv_if_no_r:
|
|
534
|
-
logger.info("[rpy-bridge] Skipping renv activation in CI: R_HOME not detected")
|
|
535
|
-
return False
|
|
536
|
-
|
|
537
|
-
# Require R_HOME in non-CI runs
|
|
538
|
-
if R_HOME is None:
|
|
539
|
-
raise RuntimeError(
|
|
540
|
-
"R_HOME not detected; cannot activate renv. Install R or set R_HOME."
|
|
541
|
-
)
|
|
542
|
-
|
|
543
|
-
return True
|
|
544
|
-
|
|
545
|
-
def _ensure_headless_env(self) -> None:
|
|
546
|
-
"""Set defaults that prevent R GUI probing (e.g., this.path:::.gui_path)."""
|
|
547
|
-
if not self.headless:
|
|
548
|
-
return
|
|
549
|
-
defaults = {
|
|
550
|
-
"R_DEFAULT_DEVICE": "png",
|
|
551
|
-
"R_INTERACTIVE": "false",
|
|
552
|
-
"R_GUI_APP_VERSION": "0",
|
|
553
|
-
"RSTUDIO": "0",
|
|
554
|
-
}
|
|
555
|
-
for key, val in defaults.items():
|
|
556
|
-
os.environ.setdefault(key, val)
|
|
557
|
-
|
|
558
|
-
# -----------------------------------------------------------------
|
|
559
|
-
# Internal: lazy R loading
|
|
560
|
-
# -----------------------------------------------------------------
|
|
561
|
-
def _ensure_r_loaded(self) -> None:
    """
    Ensure R runtime is initialized and all configured R scripts
    are sourced exactly once, in isolated environments.

    On first call the rpy2 handles are copied onto the instance and, if
    requested and allowed, renv is activated. Each script entry (a file, or a
    directory whose ``*.R`` files are used) is then sourced into a fresh R
    environment; its callable members are stored in ``self._namespaces``
    under the file's stem.

    Raises
    ------
    RuntimeError
        If renv activation fails.
    ValueError
        If a script entry is neither a file nor a directory.
    """
    # Ensure headless-safe env before rpy2 initializes R
    self._ensure_headless_env()

    if self.robjects is None:
        rpy2_dict = _ensure_rpy2()
        self._RPY2 = rpy2_dict  # cache in instance
        self._r = rpy2_dict["ro"]
        self.ro = rpy2_dict["robjects"]
        self.robjects = rpy2_dict["robjects"]
        self.pandas2ri = rpy2_dict["pandas2ri"]
        self.localconverter = rpy2_dict["localconverter"]
        self.IntVector = rpy2_dict["IntVector"]
        self.FloatVector = rpy2_dict["FloatVector"]
        self.BoolVector = rpy2_dict["BoolVector"]
        self.StrVector = rpy2_dict["StrVector"]
        self.ListVector = rpy2_dict["ListVector"]
        self.NamedList = rpy2_dict["NamedList"]

    # Activate renv once if requested and allowed
    if not self._renv_activated and self._should_activate_renv():
        try:
            activate_renv(self.path_to_renv)
            self._renv_activated = True
            logger.info(
                f"[rpy-bridge.RFunctionCaller] renv activated for project: {self.path_to_renv}"
            )
        except Exception as e:
            raise RuntimeError(f"Failed to activate renv at {self.path_to_renv}: {e}") from e

    r = self.robjects.r

    # Configure this.path to avoid GUI detection errors in embedded/headless R (e.g., rpy2)
    try:
        r('options(this.path.gui = "httpd")')
        r("options(this.path.verbose = FALSE)")
        # Patch this.path::.gui_path to avoid GUI detection errors in headless/rpy2 contexts.
        r(
            """
            if (requireNamespace("this.path", quietly = TRUE)) {
                try({
                    assignInNamespace(".gui_path", function(...) "httpd", ns = "this.path")
                }, silent = TRUE)
            }
            """
        )
    except Exception as e:
        # Best effort only, but leave a trace instead of failing silently.
        logger.debug(
            f"[rpy-bridge.RFunctionCaller] this.path headless configuration skipped: {e}"
        )

    # Ensure required R package
    self.ensure_r_package("withr")

    if not hasattr(self, "_namespaces"):
        self._namespaces: dict[str, dict[str, Any]] = {}

    # --- Iterate over scripts ---
    for idx, script_entry in enumerate(self.scripts):
        if self._scripts_loaded[idx]:
            continue

        script_entry = script_entry.resolve()

        if script_entry.is_file():
            r_files = [script_entry]
        elif script_entry.is_dir():
            r_files = sorted(script_entry.glob("*.R"))
            if not r_files:
                logger.warning(f"No .R files found in directory: {script_entry}")
                self._scripts_loaded[idx] = True
                continue
        else:
            raise ValueError(f"Invalid script path: {script_entry}")

        for script_path in r_files:
            ns_name = script_path.stem
            logger.opt(depth=2).info(
                "[rpy-bridge.RFunctionCaller] Loading R script '{}' as namespace '{}'",
                script_path.name,
                ns_name,
            )

            r("env <- new.env(parent=globalenv())")
            r(f'script_path <- "{script_path.as_posix()}"')

            # Determine a root for this script: prefer a discovered project root; else script dir.
            script_root = _find_project_root(self.path_to_renv, [script_path])
            if script_root is None:
                script_root = script_path.parent.resolve()
            script_root_arg = f'"{script_root.as_posix()}"'

            r(
                f"""
                withr::with_dir(
                    {script_root_arg},
                    sys.source(script_path, envir=env, chdir = TRUE)
                )
                """
            )

            env_obj = r("env")
            # Fetch each member once, then keep only the callables.
            functions: dict[str, Any] = {}
            for name in env_obj.keys():
                member = env_obj[name]
                if callable(member):
                    functions[name] = member
            self._namespaces[ns_name] = functions
            self._namespace_roots[ns_name] = script_root

            logger.info(
                f"[rpy-bridge.RFunctionCaller] Registered {len(self._namespaces[ns_name])} functions in namespace '{ns_name}'"
            )

        self._scripts_loaded[idx] = True
|
|
676
|
-
|
|
677
|
-
# -----------------------------------------------------------------
|
|
678
|
-
# Autocomplete-friendly attribute access for script namespaces
|
|
679
|
-
# -----------------------------------------------------------------
|
|
680
|
-
def __getattr__(self, name: str):
|
|
681
|
-
if "_namespaces" in self.__dict__ and name in self._namespaces:
|
|
682
|
-
ns_env = self._namespaces[name]
|
|
683
|
-
return NamespaceWrapper(ns_env)
|
|
684
|
-
raise AttributeError(f"'RFunctionCaller' object has no attribute '{name}'")
|
|
685
|
-
|
|
686
|
-
def _clean_scalar(self, x):
|
|
687
|
-
"""
|
|
688
|
-
Clean R-style missing values to pandas/NumPy equivalents.
|
|
689
|
-
Called inside _r2py on each vector element; atomic/scalar only.
|
|
690
|
-
"""
|
|
691
|
-
robjects = self.robjects
|
|
692
|
-
|
|
693
|
-
if x is None:
|
|
694
|
-
return None
|
|
695
|
-
|
|
696
|
-
if x in (
|
|
697
|
-
getattr(robjects, "NA_Real", None),
|
|
698
|
-
getattr(robjects, "NA_Integer", None),
|
|
699
|
-
getattr(robjects, "NA_Logical", None),
|
|
700
|
-
):
|
|
701
|
-
return None
|
|
702
|
-
|
|
703
|
-
if x is getattr(robjects, "NA_Character", None):
|
|
704
|
-
return None
|
|
705
|
-
|
|
706
|
-
if isinstance(x, float) and np.isnan(x):
|
|
707
|
-
return None
|
|
708
|
-
|
|
709
|
-
return x
|
|
710
|
-
|
|
711
|
-
def list_namespaces(self) -> list[str]:
    """
    List the namespaces created from sourced R scripts.

    Triggers loading of R and the configured scripts first.

    Returns
    -------
    list[str]
        Namespace names, in the order they were registered.
    """
    self._ensure_r_loaded()
    return [ns_name for ns_name in self._namespaces]
|
|
722
|
-
|
|
723
|
-
def list_namespace_functions(self, namespace: str) -> list[str]:
    """
    List the callable members of one script namespace.

    Triggers loading of R and the configured scripts first. Raises
    ``ValueError`` if ``namespace`` is not a loaded namespace.
    """
    self._ensure_r_loaded()
    registry = self._namespaces
    if namespace not in registry:
        raise ValueError(f"Namespace '{namespace}' not found")

    callable_names = []
    for member_name, member in registry[namespace].items():
        if callable(member):
            callable_names.append(member_name)
    return callable_names
|
|
731
|
-
|
|
732
|
-
def _get_package_functions(self, pkg: str) -> list[str]:
|
|
733
|
-
"""
|
|
734
|
-
Return a list of callable functions from a loaded R package.
|
|
735
|
-
"""
|
|
736
|
-
r = self.robjects.r
|
|
737
|
-
try:
|
|
738
|
-
all_objs = list(r[f'ls("package:{pkg}")'])
|
|
739
|
-
funcs = [
|
|
740
|
-
name
|
|
741
|
-
for name in all_objs
|
|
742
|
-
if r(f'is.function(get("{name}", envir=asNamespace("{pkg}")))')[0]
|
|
743
|
-
]
|
|
744
|
-
return funcs
|
|
745
|
-
except Exception:
|
|
746
|
-
logger.warning(f"Failed to list functions for package '{pkg}'")
|
|
747
|
-
return []
|
|
748
|
-
|
|
749
|
-
def list_all_functions(self, include_packages: bool = False) -> dict[str, list[str]]:
    """
    Collect callable R function names, keyed by script namespace.

    With ``include_packages=True`` each entry of R's ``loadedNamespaces()``
    is added as well; a package for which no functions could be listed gets
    a single placeholder entry. Errors while listing packages are ignored.
    """
    self._ensure_r_loaded()

    # --- Script namespaces ---
    grouped = {
        ns_name: [name for name, val in ns_env.items() if callable(val)]
        for ns_name, ns_env in self._namespaces.items()
    }
    if not include_packages:
        return grouped

    # --- Loaded R packages ---
    r = self.robjects.r
    try:
        for pkg in r("loadedNamespaces()"):
            listed = self._get_package_functions(pkg)
            # Add a placeholder note when nothing could be listed
            grouped[pkg] = listed or [
                "[See official documentation for functions, datasets, and objects]"
            ]
    except Exception:
        pass

    return grouped
|
|
778
|
-
|
|
779
|
-
def print_function_tree(self, include_packages: bool = False, max_display: int = 10):
    """
    Print the available R functions as a simple tree, one group per namespace.

    Parameters
    ----------
    include_packages : bool, default False
        Forwarded to ``list_all_functions``; when True, loaded R packages are
        listed in addition to script namespaces.

    max_display : int, default 10
        Upper bound on the number of (alphabetically sorted) names printed
        for each namespace. A trailing ``...`` line marks truncated groups.

    Notes
    -----
    Output goes to stdout via ``print``; namespaces without any function are
    omitted. Intended for interactive exploration and debugging.
    """
    tree = self.list_all_functions(include_packages=include_packages)

    for namespace, names in tree.items():
        if names:
            print(f"{namespace}/")
            shown = sorted(names)[:max_display]
            for entry in shown:
                print(f" {entry}")
            if len(names) > max_display:
                print(" ...")
|
|
805
|
-
|
|
806
|
-
# -----------------------------------------------------------------
|
|
807
|
-
# Python -> R conversion
|
|
808
|
-
# -----------------------------------------------------------------
|
|
809
|
-
def _py2r(self, obj):
    """
    Convert Python objects to R objects robustly.

    Handles scalars, None/pd.NA, lists, tuples, dicts, numpy scalars,
    pandas Series and pandas DataFrames.

    Parameters
    ----------
    obj : object
        Value to convert. Objects that already are rpy2 vectors or data
        frames are returned unchanged.

    Returns
    -------
    object
        * ``robjects.NULL`` for ``None``, ``pd.NA`` and float NaN;
        * the value itself for ``int``/``float``/``bool``/``str`` scalars;
        * a Float/Bool/Str vector for homogeneous lists (missing entries
          become the matching R ``NA``), an empty FloatVector for ``[]``;
        * a ListVector for dicts and for heterogeneous lists (the latter
          named ``"0"``, ``"1"``, ... by position).

    Raises
    ------
    NotImplementedError
        If the object's type is not supported.
    """
    self._ensure_r_loaded()
    robjects = self.robjects
    pandas2ri = self.pandas2ri
    FloatVector = self.FloatVector
    BoolVector = self.BoolVector
    StrVector = self.StrVector
    ListVector = self.ListVector
    localconverter = self.localconverter

    r_types = (
        robjects.vectors.IntVector,
        robjects.vectors.FloatVector,
        robjects.vectors.BoolVector,
        robjects.vectors.StrVector,
        robjects.vectors.ListVector,
        robjects.DataFrame,
    )
    if isinstance(obj, r_types):
        return obj

    def is_na(x):
        return x is None or x is pd.NA or (isinstance(x, float) and pd.isna(x))

    def unbox(x):
        # numpy scalars (np.int64, np.float64, np.bool_, ...) are not
        # instances of the exact builtin types checked below; convert them
        # to their Python equivalents first.
        return x.item() if isinstance(x, np.generic) else x

    obj = unbox(obj)
    if isinstance(obj, tuple):
        # Tuples are converted exactly like lists.
        obj = list(obj)

    with localconverter(robjects.default_converter + pandas2ri.converter):
        if is_na(obj):
            return robjects.NULL
        if isinstance(obj, pd.DataFrame):
            return pandas2ri.py2rpy(obj)
        if isinstance(obj, pd.Series):
            return self._py2r(obj.tolist())
        if isinstance(obj, (int, float, bool, str)):
            return obj
        if isinstance(obj, list):
            if not obj:
                return FloatVector([])

            items = [unbox(x) for x in obj]
            # Exact types on purpose: bool is a subclass of int, and a
            # [True, 1] mix must not be collapsed into a numeric vector.
            kinds = {type(x) for x in items if not is_na(x)}
            if kinds <= {int, float}:
                return FloatVector([robjects.NA_Real if is_na(x) else float(x) for x in items])
            if kinds <= {bool}:
                return BoolVector([robjects.NA_Logical if is_na(x) else x for x in items])
            if kinds <= {str}:
                return StrVector([robjects.NA_Character if is_na(x) else x for x in items])
            return ListVector({str(i): self._py2r(v) for i, v in enumerate(items)})
        if isinstance(obj, dict):
            return ListVector({k: self._py2r(v) for k, v in obj.items()})
        raise NotImplementedError(f"Cannot convert Python object to R: {type(obj)}")
|
|
861
|
-
|
|
862
|
-
# -----------------------------------------------------------------
|
|
863
|
-
# R -> Python conversion
|
|
864
|
-
# -----------------------------------------------------------------
|
|
865
|
-
def _r2py(self, obj, top_level=True):
    """
    Convert an rpy2 object into a plain Python object.

    Parameters
    ----------
    obj : object
        Value returned by R.
    top_level : bool, default True
        When True, a result that converts to a one-element list is unwrapped
        to that single element. Nested conversions pass False.

    Returns
    -------
    object
        ``None`` for R NULL; a post-processed pandas DataFrame for R data
        frames; the output of ``r_namedlist_to_dict`` for named lists / list
        vectors; a list of cleaned scalars (or a single scalar, see
        ``top_level``) for atomic vectors; otherwise the result of
        ``self._clean_scalar(obj)``.
    """
    rob = self.robjects
    named_list_cls = self.NamedList
    list_vector_cls = self.ListVector
    str_vector_cls = self.StrVector
    int_vector_cls = self.IntVector
    float_vector_cls = self.FloatVector
    bool_vector_cls = self.BoolVector
    null_cls = self._RPY2["NULLType"]
    converter_ctx = self.localconverter
    p2r = self.pandas2ri

    if isinstance(obj, null_cls):
        return None

    # Data frames are tested before the generic list types
    # (presumably because an R data frame is also a list -- TODO confirm).
    if isinstance(obj, rob.DataFrame):
        with converter_ctx(rob.default_converter + p2r.converter):
            frame = rob.conversion.rpy2py(obj)
        frame = postprocess_r_dataframe(frame)
        return clean_r_missing(frame, caller=self)

    if isinstance(obj, (named_list_cls, list_vector_cls)):
        converted = r_namedlist_to_dict(obj, caller=self, top_level=top_level)
        is_singleton = isinstance(converted, list) and len(converted) == 1
        return converted[0] if is_singleton and top_level else converted

    atomic_types = (str_vector_cls, int_vector_cls, float_vector_cls, bool_vector_cls)
    if isinstance(obj, atomic_types):
        values = [self._clean_scalar(item) for item in obj]
        return values[0] if len(values) == 1 and top_level else values

    return self._clean_scalar(obj)
|
|
899
|
-
|
|
900
|
-
# -----------------------------------------------------------------
|
|
901
|
-
# Public: ensure R package is available
|
|
902
|
-
# -----------------------------------------------------------------
|
|
903
|
-
def ensure_r_package(self, pkg: str):
    """
    Attach an R package, installing it first if attaching fails.

    Parameters
    ----------
    pkg : str
        Name of the R package.

    Notes
    -----
    ``library(pkg)`` is tried first. If that raises, the package is installed
    from ``https://cloud.r-project.org`` and attached again; an error from
    the installation or from the second attach propagates to the caller.
    """
    # Make sure the R runtime is initialised before touching ``robjects``,
    # as the other public entry points of this class do.
    self._ensure_r_loaded()
    r = self.robjects.r
    attach_cmd = f'suppressMessages(library("{pkg}", character.only=TRUE))'
    try:
        r(attach_cmd)
    except Exception:
        logger.info(f"[rpy-bridge.RFunctionCaller] Package '{pkg}' not found.")
        logger.warning(f"[rpy-bridge.RFunctionCaller] Installing missing R package: {pkg}")
        r(f'install.packages("{pkg}", repos="https://cloud.r-project.org")')
        r(attach_cmd)
|
|
912
|
-
|
|
913
|
-
# -----------------------------------------------------------------
|
|
914
|
-
# Public: call an R function
|
|
915
|
-
# -----------------------------------------------------------------
|
|
916
|
-
def call(self, func_name: str, *args, **kwargs):
    """
    Call an R function.

    The function may be defined in:
    * a sourced R script
    * an installed R package (using ``package::function`` syntax)
    * base R

    Parameters
    ----------
    func_name : str
        Name of the R function to call. Package functions should be specified
        as ``package::function``. A ``namespace::function`` name whose prefix
        matches a sourced script namespace is resolved there instead.

    *args
        Positional arguments passed to the R function.

    **kwargs
        Named arguments passed to the R function.

    Returns
    -------
    object
        The result of the R function, converted to a Python object.

    Raises
    ------
    ValueError
        If the function cannot be found in the requested script namespace,
        or (for unqualified names) in any script namespace, the R global
        environment or base R / loaded packages.
    RuntimeError
        If a ``package::function`` name cannot be resolved, or if the R call
        itself fails.

    Examples
    --------
    >>> rfc.call("sum", [1, 2, 3])
    >>> rfc.call("dplyr::n_distinct", [1, 2, 2, 3])
    >>> rfc.call("add_and_scale", 2, 3, scale=10)
    """

    self._ensure_r_loaded()

    func = None
    source_info = None
    # Root directory of the script namespace the function came from; stays
    # None for package, global-environment and base-R functions.
    namespace_root = None

    if "::" in func_name:
        ns_name, fname = func_name.split("::", 1)
        if ns_name in self._namespaces:
            ns_env = self._namespaces[ns_name]
            if fname not in ns_env:
                raise ValueError(
                    f"Function '{fname}' not found in R script namespace '{ns_name}'"
                )
            func = ns_env[fname]
            source_info = f"script namespace '{ns_name}'"
            namespace_root = self._namespace_roots.get(ns_name)
        else:
            try:
                func = self.robjects.r(f"{ns_name}::{fname}")
                source_info = f"R package '{ns_name}'"
            except Exception as e:
                raise RuntimeError(f"Failed to resolve R function '{func_name}': {e}") from e

    else:
        # Lookup order: script namespaces, then the R global environment,
        # then whatever R itself can find (base R / loaded packages).
        for ns_name, ns_env in self._namespaces.items():
            if func_name in ns_env:
                func = ns_env[func_name]
                source_info = f"script namespace '{ns_name}'"
                namespace_root = self._namespace_roots.get(ns_name)
                break

        if func is None:
            try:
                func = self.robjects.globalenv[func_name]
                source_info = "global environment"
            except KeyError:
                pass

        if func is None:
            try:
                func = self.robjects.r[func_name]
                source_info = "base R / loaded package"
            except KeyError:
                raise ValueError(
                    f"R function '{func_name}' not found in any namespace, global env, or base R."
                )

    r_args = [self._py2r(a) for a in args]
    r_kwargs = {k: self._py2r(v) for k, v in kwargs.items()}

    try:
        if namespace_root:
            # Script functions run with R's working directory set to the
            # namespace root. The previous directory is kept on the Python
            # side so no helper variable is written into R's global
            # environment and the path is never spliced into R source code.
            r = self.robjects.r
            set_wd = r["setwd"]
            previous_wd = r("getwd()")
            try:
                set_wd(namespace_root.as_posix())
                result = func(*r_args, **r_kwargs)
            finally:
                try:
                    set_wd(previous_wd)
                except Exception as restore_err:
                    # Best effort: never mask the outcome of the call itself.
                    logger.warning(
                        f"Could not restore R working directory after '{func_name}': {restore_err}"
                    )
        else:
            result = func(*r_args, **r_kwargs)
    except Exception as e:
        raise RuntimeError(
            f"Error calling R function '{func_name}' from {source_info}: {e}"
        ) from e

    _log_r_call(func_name, source_info)

    return self._r2py(result)
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
# %%
|
|
1024
|
-
# ------------------------------
|
|
1025
|
-
# Utility functions for R ↔ Python
|
|
1026
|
-
# ------------------------------
|
|
1027
|
-
def r_namedlist_to_dict(namedlist, caller: RFunctionCaller, top_level=False):
    """
    Convert an rpy2 NamedList / ListVector into a Python list or dict.

    Parameters
    ----------
    namedlist : object
        Value to convert. Anything that is not a NamedList or ListVector is
        handed back to ``caller._r2py``.
    caller : RFunctionCaller
        Object whose ``_r2py`` is used to convert the elements.
    top_level : bool, default False
        Only forwarded to ``caller._r2py`` for non-list inputs; list elements
        are always converted with ``top_level=False``.

    Returns
    -------
    list | dict | object
        A list when the element names are exactly ``"0"``, ``"1"``, ... in
        order; otherwise a dict keyed by ``str(name)``, using the position as
        key for elements without a name entry.
    """
    rpy2_objects = _ensure_rpy2()
    list_types = (rpy2_objects["NamedList"], rpy2_objects["ListVector"])

    if not isinstance(namedlist, list_types):
        return caller._r2py(namedlist, top_level=top_level)

    # ``names`` is a method on some of these types and an attribute on others.
    names = namedlist.names() if callable(namedlist.names) else namedlist.names

    # Names equal to their own position mark a positional list.
    if names and all(str(pos) == str(label) for pos, label in enumerate(names)):
        return [caller._r2py(item, top_level=False) for item in namedlist]

    mapping = {}
    for pos, item in enumerate(namedlist):
        label = names[pos] if names and pos < len(names) else str(pos)
        mapping[str(label)] = caller._r2py(item, top_level=False)
    return mapping
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
def clean_r_dataframe(r_df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the ``.groups`` and ``.rows`` entries from ``r_df.attrs``.

    The frame is modified in place and returned. Entries that are absent,
    or an object without ``attrs``, are ignored.
    """
    for key in (".groups", ".rows"):
        try:
            r_df.attrs.pop(key)
        except (KeyError, AttributeError):
            continue
    return r_df
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
def fix_string_nans(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which textual missing-value markers
    (``"nan"``, ``"NaN"``, ``"NA"``, ``"na"`` and the empty string) are
    replaced by ``pd.NA``.
    """
    missing_markers = ["nan", "NaN", "NA", "na", ""]
    return df.replace(to_replace=missing_markers, value=pd.NA)
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
def normalize_single_df_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalise column dtypes of a single DataFrame.

    Steps, applied to a replaced copy of ``df``:

    1. Textual missing-value markers become ``pd.NA``.
    2. An object column is replaced by its numeric coercion when at least
       half of its non-missing values parse as numbers; values that do not
       parse become NaN in that case.
    3. Integer columns that contain missing values are cast to ``float64``.
    """
    is_object = pd.api.types.is_object_dtype
    is_integer = pd.api.types.is_integer_dtype

    df = df.replace(["", "nan", "NaN", "NA", "na"], pd.NA)
    for name in df.columns:
        column = df[name]
        if is_object(column):
            as_numbers = pd.to_numeric(column, errors="coerce")
            parsed_count = as_numbers.notna().sum()
            present_count = column.notna().sum()
            if parsed_count >= present_count * 0.5:
                df[name] = as_numbers
        # Re-read the column: it may just have been replaced above.
        if is_integer(df[name]) and df[name].isna().any():
            df[name] = df[name].astype("float64")
    return df
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
def fix_r_dataframe_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Repair column types of a DataFrame that was converted from R.

    ``df`` is modified in place and returned. For every column:

    * integer columns: the value ``-2147483648`` (R's integer NA sentinel)
      is masked with ``pd.NA``;
    * numeric columns whose non-missing values all lie in ``[10000, 40000]``
      are interpreted as day counts since 1970-01-01 and converted to
      datetimes (a heuristic: any numeric column in that range is affected);
    * timezone-aware datetime columns are made timezone-naive.

    All three checks inspect the column as it was on entry to the iteration.
    """
    r_na_integer = -2147483648
    for col in df.columns:
        series = df[col]
        if pd.api.types.is_integer_dtype(series):
            df[col] = series.mask(series == r_na_integer, pd.NA)
        if pd.api.types.is_numeric_dtype(series):
            values = series.dropna()
            if not values.empty and values.between(10000, 40000).all():
                try:
                    df[col] = pd.to_datetime("1970-01-01") + pd.to_timedelta(series, unit="D")
                except Exception:
                    # Best effort: leave the column untouched if it cannot
                    # be expressed as dates.
                    pass
        # isinstance check replaces the deprecated
        # ``pd.api.types.is_datetime64tz_dtype``. ``getattr`` keeps the check
        # harmless when ``df[col]`` is not a Series (duplicate column labels).
        if isinstance(getattr(series, "dtype", None), pd.DatetimeTZDtype):
            df[col] = series.dt.tz_localize(None)
    return df
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
def postprocess_r_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the standard clean-up pipeline on a DataFrame converted from R.

    Applies ``fix_r_dataframe_types``, ``fix_string_nans`` and
    ``normalize_single_df_dtypes`` in that order. Afterwards, an object-dtype
    index whose labels are the integers ``1..len(df)`` is replaced by a
    zero-based ``RangeIndex``; any failure while checking the index leaves
    it unchanged.
    """
    for step in (fix_r_dataframe_types, fix_string_nans, normalize_single_df_dtypes):
        df = step(df)

    if df.index.dtype != object:
        return df

    try:
        labels = df.index.astype(int)
        one_based = np.arange(len(df)) + 1
        if (labels == one_based).all():
            df.index = pd.RangeIndex(start=0, stop=len(df))
    except Exception:
        pass
    return df
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
def clean_r_missing(obj, caller: "RFunctionCaller"):
    """
    Replace R missing-value markers inside ``obj`` with Python equivalents.

    Parameters
    ----------
    obj : object
        A DataFrame, dict, list or scalar. DataFrames are cleaned cell by
        cell and modified in place; dicts and lists are rebuilt recursively.
    caller : RFunctionCaller
        Provides ``robjects``, from which the markers ``NA_Real``,
        ``NA_Integer``, ``NA_Logical`` (mapped to ``np.nan``) and
        ``NA_Character`` (mapped to ``pd.NA``) are read.

    Returns
    -------
    object
        The cleaned object. Values that are not markers, including
        unhashable ones, are returned unchanged.
    """
    robjects = caller.robjects
    marker_targets = (
        ("NA_Real", np.nan),
        ("NA_Integer", np.nan),
        ("NA_Logical", np.nan),
        ("NA_Character", pd.NA),
    )
    # Built once per top-level call instead of once per cell. Markers that
    # ``robjects`` does not provide are skipped so that ``None`` never
    # becomes a key by accident.
    na_map = {}
    for attr, replacement in marker_targets:
        marker = getattr(robjects, attr, None)
        if marker is not None:
            na_map[marker] = replacement

    def _clean(value):
        if isinstance(value, pd.DataFrame):
            for col in value.columns:
                value[col] = value[col].apply(_clean)
            return value
        if isinstance(value, dict):
            return {k: _clean(v) for k, v in value.items()}
        if isinstance(value, list):
            return [_clean(v) for v in value]
        try:
            return na_map.get(value, value)
        except TypeError:
            # Unhashable values (sets, arrays, ...) cannot be a marker.
            return value

    return _clean(obj)
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
# ---------------------------------------------------------------------
|
|
1131
|
-
# DataFrame comparison utilities
|
|
1132
|
-
# ---------------------------------------------------------------------
|
|
1133
|
-
def normalize_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Bring the shared columns of two DataFrames to comparable dtypes.

    Both frames are modified in place and returned. For every column present
    in both frames, after replacing empty strings with ``pd.NA``:

    * numeric vs. object: both sides are coerced with ``pd.to_numeric``
      (unparseable values become NaN);
    * numeric vs. numeric: both sides are cast to ``float64``;
    * otherwise, if either side is object: both sides are cast to ``str``,
      with missing values kept missing rather than turned into the strings
      ``"nan"``, ``"None"`` or ``"<NA>"``.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_object = pd.api.types.is_object_dtype

    def _as_str_keep_na(series):
        # ``astype(str)`` would stringify missing values; restore them so a
        # later comparison can still recognise "missing on both sides".
        return series.astype(str).where(series.notna(), pd.NA)

    for col in df1.columns.intersection(df2.columns):
        df1[col] = df1[col].replace("", pd.NA)
        df2[col] = df2[col].replace("", pd.NA)
        s1, s2 = df1[col], df2[col]
        dtype1, dtype2 = s1.dtype, s2.dtype

        mixed = (is_numeric(dtype1) and is_object(dtype2)) or (
            is_object(dtype1) and is_numeric(dtype2)
        )
        if mixed:
            try:
                # Convert both sides before assigning so that a failure
                # cannot leave only one of the frames converted.
                num1 = pd.to_numeric(s1, errors="coerce")
                num2 = pd.to_numeric(s2, errors="coerce")
            except Exception:
                pass  # fall through to the string comparison below
            else:
                df1[col], df2[col] = num1, num2
                continue

        if is_numeric(dtype1) and is_numeric(dtype2):
            df1[col] = df1[col].astype("float64")
            df2[col] = df2[col].astype("float64")
            continue

        if is_object(dtype1) or is_object(dtype2):
            df1[col] = _as_str_keep_na(df1[col])
            df2[col] = _as_str_keep_na(df2[col])
    return df1, df2
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
def align_numeric_dtypes(df1: pd.DataFrame, df2: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Cast shared columns of both frames to ``float64`` where numbers are found.

    Both frames are modified in place and returned. For each shared column,
    empty strings are replaced by ``pd.NA`` and both sides are coerced with
    ``pd.to_numeric``. If at least one side yields any number, both sides are
    stored as ``float64`` (non-numeric values become NaN); otherwise, or if
    the coercion raises, the columns are stored with only the empty-string
    replacement applied.
    """
    shared = df1.columns.intersection(df2.columns)
    for name in shared:
        left = df1[name].replace("", pd.NA)
        right = df2[name].replace("", pd.NA)
        try:
            left_num = pd.to_numeric(left, errors="coerce")
            right_num = pd.to_numeric(right, errors="coerce")
            nothing_numeric = left_num.isna().all() and right_num.isna().all()
            if not nothing_numeric:
                df1[name] = left_num.astype("float64")
                df2[name] = right_num.astype("float64")
                continue
        except Exception:
            pass
        df1[name], df2[name] = left, right
    return df1, df2
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
def compare_r_py_dataframes(df1: pd.DataFrame, df2: pd.DataFrame, float_tol: float = 1e-8) -> dict:
    """
    Compare two DataFrames column by column after normalising both.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames to compare. ``df2`` additionally goes through
        ``fix_r_dataframe_types``. Neither input is modified.
    float_tol : float, default 1e-8
        Absolute tolerance passed to ``np.isclose`` as ``atol``. The relative
        tolerance of ``np.isclose`` is left at its default.

    Returns
    -------
    dict
        ``shape_mismatch``, ``columns_mismatch`` and ``index_mismatch`` flags,
        plus ``numeric_diffs`` and ``non_numeric_diffs``: per-column
        DataFrames (columns ``df1`` / ``df2``) holding the differing rows.
        Only columns present in both frames are compared; values missing on
        both sides count as equal.

    Notes
    -----
    Mismatch warnings are printed to stdout.
    """
    results: dict[str, Any] = {
        "shape_mismatch": False,
        "columns_mismatch": False,
        "index_mismatch": False,
        "numeric_diffs": {},
        "non_numeric_diffs": {},
    }
    # ``fix_r_dataframe_types`` assigns columns in place, so give it a copy
    # rather than the caller's frame.
    df2 = fix_r_dataframe_types(df2.copy())
    df1 = fix_string_nans(df1)
    df2 = fix_string_nans(df2)
    df1, df2 = normalize_dtypes(df1.copy(), df2.copy())
    df1, df2 = align_numeric_dtypes(df1, df2)

    if df1.shape != df2.shape:
        results["shape_mismatch"] = True
        print(f"[Warning] Shape mismatch: df1 {df1.shape} vs df2 {df2.shape}")

    if set(df1.columns) != set(df2.columns):
        results["columns_mismatch"] = True
        print("[Warning] Column mismatch:")
        print(f" df1: {df1.columns}")
        print(f" df2: {df2.columns}")
        common_cols = df1.columns.intersection(df2.columns)
    else:
        common_cols = df1.columns

    if not df1.index.equals(df2.index):
        results["index_mismatch"] = True
        print("[Warning] Index mismatch between df1 and df2")

    df1_aligned, df2_aligned = df1.loc[:, common_cols], df2.loc[:, common_cols]
    for col in common_cols:
        col_py, col_r = df1_aligned[col], df2_aligned[col]
        both_numeric = pd.api.types.is_numeric_dtype(col_py) and pd.api.types.is_numeric_dtype(
            col_r
        )
        # Align on the index in both branches so the boolean masks below
        # always match the Series they are applied to.
        col_py, col_r = col_py.align(col_r)
        if both_numeric:
            close = np.isclose(
                col_py.fillna(np.nan),
                col_r.fillna(np.nan),
                atol=float_tol,
                equal_nan=True,
            )
            if not close.all():
                results["numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[~close], "df2": col_r[~close]}
                )
        else:
            unequal = ~col_py.eq(col_r)
            both_na = col_py.isna() & col_r.isna()
            unequal = unequal & ~both_na
            if unequal.any():
                results["non_numeric_diffs"][col] = pd.DataFrame(
                    {"df1": col_py[unequal], "df2": col_r[unequal]}
                )
    return results
|