bioguider 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic. Click here for more details.

@@ -1,368 +1,549 @@
1
- import re
2
1
  import os
3
- from typing import List, Tuple, Optional
2
+ import re
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Tuple
5
+
6
@dataclass
class RSymbol:
    """A function, method or class located in an R source file.

    All line numbers are 1-based.
    """

    name: str  # symbol name as reported to callers (e.g. "f", "print.foo", "Cls$method")
    parent: Optional[str]  # enclosing function / class / generic name, or None
    start_line: int  # line where the body (or, for classes, the declaration) starts
    end_line: int  # line where the body / declaring call ends
    docstring: Optional[str]  # roxygen (#') block directly above the definition, if any
    params: List[str]  # parameter names without default values


class RFileHandler:
    """Extract functions, S3/S4/R6 definitions and package imports from an R file.

    The file is read once in ``__init__``. Parsing is regex + bracket matching,
    not a real R parser: definition heads are located with the class-level
    patterns below and their extents are resolved by scanning for the matching
    bracket while ignoring quoted strings and ``#`` comments.
    """

    # only up to "function("
    FUNC_DEF_HEAD_RE = re.compile(
        r'(?P<name>[A-Za-z.][\w.]*)\s*<-\s*function\s*\(',
        re.MULTILINE,
    )

    S3_METHOD_HEAD_RE = re.compile(
        r'(?P<generic>[A-Za-z.][\w.]*)\.(?P<class>[A-Za-z.][\w.]*)\s*<-\s*function\s*\(',
        re.MULTILINE,
    )

    # R6 method head: "name = function("
    R6_METHOD_HEAD_RE = re.compile(
        r'(?P<mname>[A-Za-z.][\w.]*)\s*=\s*function\s*\(',
        re.MULTILINE,
    )

    # S4 method head inside setMethod(... function(
    S4_METHOD_HEAD_RE = re.compile(
        r'setMethod\s*\(\s*["\'](?P<generic>[^"\']+)["\']\s*,.*?function\s*\(',
        re.MULTILINE | re.DOTALL,
    )

    # The full-definition patterns below are not used by the parsers in this
    # class; they are kept so existing references to them keep working.
    FUNC_DEF_RE = re.compile(
        # name <- function( ... ) {   with multi-line args allowed
        r'(?P<name>[A-Za-z.][\w.]*)\s*<-\s*function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE,
    )
    S3_METHOD_RE = re.compile(
        r'(?P<generic>[A-Za-z.][\w.]*)\.(?P<class>[A-Za-z.][\w.]*)\s*<-\s*function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE,
    )
    R6_CLASS_RE = re.compile(
        r'(?P<varname>[A-Za-z.][\w.]*)\s*<-\s*R6Class\s*\(\s*["\'](?P<classname>[^"\']+)["\']',
        re.MULTILINE | re.DOTALL,
    )
    R6_METHOD_RE = re.compile(
        r'(?P<mname>[A-Za-z.][\w.]*)\s*=\s*function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE,
    )
    S4_CLASS_RE = re.compile(
        r'setClass\s*\(\s*["\'](?P<classname>[^"\']+)["\']',
        re.MULTILINE,
    )
    S4_METHOD_RE = re.compile(
        r'setMethod\s*\(\s*["\'](?P<generic>[^"\']+)["\']\s*,.*?function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE | re.DOTALL,
    )
    S4_SIG_CLASS_RE = re.compile(
        r'signature\s*=\s*(?:list\s*\(|\()\s*(?:[^)]*class\s*=\s*["\'](?P<classname>[^"\']+)["\']|["\'](?P<classname2>[^"\']+)["\'])',
        re.MULTILINE,
    )
    # library(pkg) / require(pkg); the package name may be quoted.
    LIB_REQUIRE_RE = re.compile(
        r'\b(?:library|require)\s*\(\s*["\']?([A-Za-z.][\w.]*)["\']?\s*\)',
        re.MULTILINE,
    )
    NS_USE_RE = re.compile(
        r'(?P<pkg>[A-Za-z.][\w.]*):::{0,2}(?P<sym>[A-Za-z.][\w.]*)',
        re.MULTILINE,
    )

    def __init__(self, file_path: str):
        """Read *file_path* and pre-compute the lookup tables used while parsing.

        Undecodable bytes are dropped (``errors='ignore'``).
        """
        self.file_path = file_path
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            self.text = f.read()
        # Split on '\n' only: _pos_to_line counts '\n', whereas str.splitlines()
        # also breaks on form feeds and other separators, which would shift
        # every line index used by _roxygen_before.
        self.lines = self.text.split('\n')
        self._brace_map = self._build_brace_map_safely()
        self._code_mask = self._build_code_mask()

    # ---------------- Public API ----------------

    def get_functions_and_classes(self) -> List[Tuple[str, Optional[str], int, int, Optional[str], List[str]]]:
        """Return every symbol found in the file, ordered by position.

        Each item is ``(name, parent, start_line, end_line, docstring, params)``
        with 1-based line numbers. The results of the plain-function, S3, R6
        and S4 parsers are concatenated, so a definition recognised by more
        than one parser appears once per parser.
        """
        items: List[RSymbol] = []
        items.extend(self._parse_functions())
        items.extend(self._parse_s3_methods())
        items.extend(self._parse_r6())
        items.extend(self._parse_s4())
        items.sort(key=lambda s: (s.start_line, s.end_line))
        return [(i.name, i.parent, i.start_line, i.end_line, i.docstring, i.params) for i in items]

    def get_imports(self) -> List[str]:
        """Return the sorted, de-duplicated package names referenced by the file.

        A package counts as referenced when it is passed to ``library()`` /
        ``require()`` or used as the left side of ``pkg::sym`` / ``pkg:::sym``.
        """
        pkgs = set(self.LIB_REQUIRE_RE.findall(self.text))
        pkgs.update(m.group('pkg') for m in self.NS_USE_RE.finditer(self.text))
        return sorted(pkgs)

    # ---------------- Parsers ----------------

    def _parse_functions(self) -> List[RSymbol]:
        """Collect every ``name <- function(...)`` definition in the file.

        Functions defined inside a braced body are additionally reported by
        ``_parse_nested_functions`` with the enclosing function as parent.
        """
        syms: List[RSymbol] = []
        for m in self.FUNC_DEF_HEAD_RE.finditer(self.text):
            if not self._is_code(m.start()):
                continue  # head sits in a comment or string literal
            head = self._function_head(self.text, m.end() - 1)
            if head is None or head[1] is None:
                continue  # unbalanced '(' or no body before end of file
            params, body_start = head
            braced = self.text[body_start] == '{'
            if braced:
                body_end = self._matching_brace_pos(body_start)
            else:
                # Expression body: use the line it starts on instead of
                # grabbing the next unrelated '{...}' block in the file.
                body_end = self._line_end(self.text, body_start)

            name = m.group('name')
            syms.append(RSymbol(
                name=name, parent=None,
                start_line=self._pos_to_line(body_start),
                end_line=self._pos_to_line(body_end),
                docstring=self._roxygen_before(m.start()),
                params=params,
            ))
            if braced:
                syms.extend(self._parse_nested_functions(body_start, body_end, parent=name))
        return syms

    def _parse_nested_functions(self, abs_start: int, abs_end: int, parent: str) -> List[RSymbol]:
        """Collect ``name <- function(...)`` definitions inside ``text[abs_start:abs_end+1]``.

        Every function found, at any nesting depth within the slice, is
        reported with *parent* as its parent.
        """
        sub = self.text[abs_start:abs_end + 1]
        syms: List[RSymbol] = []
        for m in self.FUNC_DEF_HEAD_RE.finditer(sub):
            head_abs = abs_start + m.start()
            if not self._is_code(head_abs):
                continue
            head = self._function_head(sub, m.end() - 1)
            if head is None or head[1] is None:
                continue
            params, body_rel = head
            end_rel = self._body_end_in_text(sub, body_rel)
            if end_rel is None:
                continue  # '{' never closed inside the slice
            syms.append(RSymbol(
                name=m.group('name'), parent=parent,
                start_line=self._pos_to_line(abs_start + body_rel),
                end_line=self._pos_to_line(abs_start + end_rel),
                # Anchor at the head, not at '{', so a multi-line signature
                # does not hide the roxygen block above the definition.
                docstring=self._roxygen_before(head_abs),
                params=params,
            ))
        return syms

    def _parse_s3_methods(self) -> List[RSymbol]:
        """Collect ``generic.class <- function(...)`` definitions.

        Any dotted function name matches; the text after the last dot is
        taken as the class and the rest as the generic (reported as parent).
        """
        syms: List[RSymbol] = []
        for m in self.S3_METHOD_HEAD_RE.finditer(self.text):
            if not self._is_code(m.start()):
                continue
            head = self._function_head(self.text, m.end() - 1)
            if head is None or head[1] is None:
                continue
            params, body_start = head
            if self.text[body_start] == '{':
                body_end = self._matching_brace_pos(body_start)
            else:
                body_end = self._line_end(self.text, body_start)

            generic = m.group('generic')
            syms.append(RSymbol(
                name=f"{generic}.{m.group('class')}", parent=generic,
                start_line=self._pos_to_line(body_start),
                end_line=self._pos_to_line(body_end),
                docstring=self._roxygen_before(m.start()),
                params=params,
            ))
        return syms

    def _parse_r6(self) -> List[RSymbol]:
        """Collect R6 classes (``x <- R6Class("Name", ...)``) and their methods."""
        syms: List[RSymbol] = []
        for m in self.R6_CLASS_RE.finditer(self.text):
            decl = m.start()
            if not self._is_code(decl):
                continue
            classname = m.group('classname')
            # The class extent is the R6Class(...) call itself. The pattern
            # guarantees a '(' between the start and the quoted class name.
            call_open = self.text.rfind('(', decl, m.start('classname'))
            call_close = self._matching_paren_pos_global(call_open)
            if call_close is None:
                continue
            syms.append(RSymbol(
                name=classname, parent=None,
                start_line=self._pos_to_line(decl),
                end_line=self._pos_to_line(call_close),
                docstring=self._roxygen_before(decl),
                params=[],
            ))
            # Methods within public/private/active lists
            class_text = self.text[decl:call_close + 1]
            for sect in ('public', 'private', 'active'):
                syms.extend(self._parse_r6_section_methods(class_text, decl, sect, classname))
        return syms

    def _parse_r6_section_methods(self, class_text: str, base: int, section: str, parent_class: str) -> List[RSymbol]:
        """Collect ``name = function(...)`` members of one ``section = list(...)``.

        *class_text* is the class declaration and *base* its offset in
        ``self.text``; offsets are needed to translate positions to lines.
        Methods are named ``"<parent_class>$<name>"``.
        """
        syms: List[RSymbol] = []
        for sec in re.finditer(rf'{section}\s*=\s*list\s*\(', class_text):
            if not self._is_code(base + sec.start()):
                continue
            lst_open = sec.end() - 1
            lst_close = self._matching_paren_pos_in_text(class_text, lst_open)
            if lst_close is None:
                continue
            list_text = class_text[lst_open:lst_close + 1]
            offset = base + lst_open  # absolute position of list_text[0]
            for m in self.R6_METHOD_HEAD_RE.finditer(list_text):
                head_abs = offset + m.start()
                if not self._is_code(head_abs):
                    continue
                head = self._function_head(list_text, m.end() - 1)
                if head is None or head[1] is None:
                    continue
                params, body_rel = head
                end_rel = self._body_end_in_text(list_text, body_rel)
                if end_rel is None:
                    continue
                syms.append(RSymbol(
                    name=f"{parent_class}${m.group('mname')}",
                    parent=parent_class,
                    start_line=self._pos_to_line(offset + body_rel),
                    end_line=self._pos_to_line(offset + end_rel),
                    docstring=self._roxygen_before(head_abs),
                    params=params,
                ))
        return syms

    def _parse_s4(self) -> List[RSymbol]:
        """Collect S4 classes (``setClass``) and inline methods (``setMethod``).

        Methods are named ``generic<Class>`` when a class can be read from a
        ``signature = ...`` argument, otherwise just ``generic``.
        """
        syms: List[RSymbol] = []
        for m in self.S4_CLASS_RE.finditer(self.text):
            decl = m.start()
            if not self._is_code(decl):
                continue
            call_open = self.text.rfind('(', decl, m.start('classname'))
            call_close = self._matching_paren_pos_global(call_open)
            syms.append(RSymbol(
                name=m.group('classname'), parent=None,
                start_line=self._pos_to_line(decl),
                # Fall back to the declaration line if the call never closes.
                end_line=self._pos_to_line(decl if call_close is None else call_close),
                docstring=self._roxygen_before(decl),
                params=[],
            ))

        # Manual search loop instead of finditer: when a match has to be
        # rejected, scanning resumes right after the rejected call so that a
        # following setMethod() swallowed by the lazy `.*?` is still seen.
        pos = 0
        while True:
            m = self.S4_METHOD_HEAD_RE.search(self.text, pos)
            if m is None:
                break
            pos = m.end()
            if not self._is_code(m.start()):
                pos = m.start() + 1
                continue
            open_paren = m.end() - 1
            call_open = self.text.index('(', m.start())
            call_close = self._matching_paren_pos_global(call_open)
            if call_close is not None and open_paren > call_close:
                # No inline function in this setMethod(...): the match ran
                # into a later, unrelated `function(`.
                pos = call_close + 1
                continue
            if not self._is_code(open_paren):
                continue

            head = self._function_head(self.text, open_paren)
            if head is None:
                continue
            params, body_start = head
            if body_start is None:
                # No body before end of file: report the head's own extent.
                start_pos, end_pos = m.start(), m.end()
            elif self.text[body_start] == '{':
                start_pos, end_pos = body_start, self._matching_brace_pos(body_start)
            else:
                start_pos, end_pos = body_start, self._line_end(self.text, body_start)

            generic = m.group('generic')
            sig_slice = self.text[m.start(): body_start if body_start is not None else m.end()]
            cm = self.S4_SIG_CLASS_RE.search(sig_slice)
            clazz = cm.group('classname') if cm and cm.group('classname') else (cm.group('classname2') if cm else None)
            name = f"{generic}{'<' + clazz + '>' if clazz else ''}"

            syms.append(RSymbol(
                name=name, parent=generic,
                start_line=self._pos_to_line(start_pos),
                end_line=self._pos_to_line(end_pos),
                docstring=self._roxygen_before(m.start()),
                params=params,
            ))
        return syms

    # ---------------- Utilities ----------------

    def _parse_params(self, arg_str: str) -> List[str]:
        """Split the text between a function's parentheses into parameter names.

        Commas inside brackets or quoted strings do not split. Default values
        (everything from the first ``=``) are dropped, ``...`` is kept, and
        ``#`` comments inside a multi-line signature are ignored.
        """
        params = []
        depth = 0
        token = []
        in_s: Optional[str] = None
        escape = False
        in_comment = False
        for ch in arg_str:
            if in_comment:
                # Comment text is not part of any parameter.
                if ch == '\n':
                    in_comment = False
                    token.append(ch)
                continue
            if in_s:
                token.append(ch)
                if escape:
                    escape = False
                elif ch == '\\':
                    escape = True
                elif ch == in_s:
                    in_s = None
                continue
            if ch == '#':
                in_comment = True
            elif ch in ('"', "'"):
                in_s = ch
                token.append(ch)
            elif ch in '([{':
                depth += 1
                token.append(ch)
            elif ch in ')]}':
                depth -= 1
                token.append(ch)
            elif ch == ',' and depth == 0:
                params.append(''.join(token).strip())
                token = []
            else:
                token.append(ch)
        if token:
            params.append(''.join(token).strip())

        cleaned = []
        for p in params:
            p = p.strip()
            if not p:
                continue
            if p == '...':
                cleaned.append('...')
                continue
            name = p.split('=')[0].strip()
            if name:
                cleaned.append(name)
        return cleaned

    def _roxygen_before(self, pos: int) -> Optional[str]:
        """Return the roxygen block (lines starting with ``#'``) directly above *pos*.

        Only an uninterrupted run of roxygen lines immediately preceding the
        line containing *pos* is used; a blank or any other line ends the
        block. The ``#'`` prefix is removed. Returns None when there is none.
        """
        line_idx = self._pos_to_line(pos) - 2  # 0-based index of the previous line
        if line_idx < 0:
            return None
        buf = []
        while line_idx >= 0:
            s = self.lines[line_idx].lstrip()
            if not s.startswith("#'"):
                break  # stop at the first non-roxygen line
            buf.append(s[2:].lstrip())
            line_idx -= 1
        if not buf:
            return None
        buf.reverse()
        return '\n'.join(buf).strip() or None

    # -------- Position / brace helpers (comment/string aware) --------

    @staticmethod
    def _iter_code_chars(text, start=0):
        """Yield ``(index, char)`` for characters of ``text[start:]`` that are code.

        Characters inside ``#`` comments and single- or double-quoted strings
        (with backslash escapes) are skipped, as are the quote characters and
        the ``#`` itself. Scanning assumes *start* is outside any string or
        comment.
        """
        quote = None
        escaped = False
        in_comment = False
        for i in range(start, len(text)):
            ch = text[i]
            if in_comment:
                if ch == '\n':
                    in_comment = False
                continue
            if quote is not None:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if ch == '#':
                in_comment = True
            elif ch == '"' or ch == "'":
                quote = ch
            else:
                yield i, ch

    def _matching_close(self, text: str, open_idx: int, opener: str, closer: str) -> Optional[int]:
        """Return the index of the *closer* balancing the *opener* at ``text[open_idx]``."""
        depth = 0
        for i, ch in self._iter_code_chars(text, open_idx):
            if ch == opener:
                depth += 1
            elif ch == closer:
                depth -= 1
                if depth == 0:
                    return i
        return None

    def _build_brace_map_safely(self):
        """
        Build a map of '{' -> matching '}' while ignoring braces inside:
          - comments starting with '#'
          - single- and double-quoted strings with escapes
        Unbalanced braces are left out of the map.
        """
        stack = []
        pairs = {}
        for i, ch in self._iter_code_chars(self.text):
            if ch == '{':
                stack.append(i)
            elif ch == '}' and stack:
                pairs[stack.pop()] = i
        return pairs

    def _build_code_mask(self):
        """Return a bytearray with 1 at every position of ``self.text`` that is code."""
        mask = bytearray(len(self.text))
        for i, _ in self._iter_code_chars(self.text):
            mask[i] = 1
        return mask

    def _is_code(self, pos: int) -> bool:
        """Tell whether ``self.text[pos]`` lies outside comments and strings."""
        mask = self._code_mask
        return 0 <= pos < len(mask) and mask[pos] == 1

    def _matching_brace_pos(self, open_brace_pos: int) -> int:
        """Return the '}' matching the '{' at *open_brace_pos* in ``self.text``.

        Falls back to the last index of the text when the brace is unmatched.
        """
        return self._brace_map.get(open_brace_pos, len(self.text) - 1)

    def _find_next_code_brace_after(self, start: int) -> Optional[int]:
        """Find next '{' at or after start, skipping ones in comments/strings."""
        for i, ch in self._iter_code_chars(self.text, start):
            if ch == '{':
                return i
        return None

    def _pos_to_line(self, pos: int) -> int:
        """Convert a character offset in ``self.text`` to a 1-based line number."""
        return self.text.count('\n', 0, max(0, pos)) + 1

    def _find_next_char_in_text(self, text: str, ch: str, start: int) -> Optional[int]:
        """Plain ``str.find`` that returns None instead of -1 (not comment-aware)."""
        idx = text.find(ch, start)
        return idx if idx != -1 else None

    @staticmethod
    def _skip_blank_and_comments(text: str, start: int) -> Optional[int]:
        """Return the index of the first character at or after *start* that is
        neither whitespace nor part of a ``#`` comment, or None at end of text."""
        i = start
        n = len(text)
        while i < n:
            ch = text[i]
            if ch == '#':
                newline = text.find('\n', i)
                if newline == -1:
                    return None
                i = newline + 1
            elif ch.isspace():
                i += 1
            else:
                return i
        return None

    @staticmethod
    def _line_end(text: str, pos: int) -> int:
        """Return the index of the last character on the line containing *pos*."""
        newline = text.find('\n', pos)
        return (len(text) if newline == -1 else newline) - 1

    def _function_head(self, text: str, open_idx: int) -> Optional[Tuple[List[str], Optional[int]]]:
        """Resolve the parameter list that opens at ``text[open_idx]``.

        Returns ``(params, body_start)`` where *body_start* is the index of
        the first body character ('{' for a braced body) or None when nothing
        follows the parameter list. Returns None when '(' is never closed.
        """
        close_idx = self._matching_paren_pos_in_text(text, open_idx)
        if close_idx is None:
            return None
        params = self._parse_params(text[open_idx + 1: close_idx])
        return params, self._skip_blank_and_comments(text, close_idx + 1)

    def _body_end_in_text(self, text: str, body_start: int) -> Optional[int]:
        """Return where the body starting at ``text[body_start]`` ends.

        A braced body ends at its matching '}' (None if unmatched); an
        expression body is approximated by the end of its first line.
        """
        if text[body_start] == '{':
            return self._matching_brace_pos_in_text(text, body_start)
        return self._line_end(text, body_start)

    # For nested parsing on a slice (already delimited correctly)
    def _matching_brace_pos_in_text(self, text: str, open_idx: int) -> Optional[int]:
        """Return the '}' matching the '{' at ``text[open_idx]``, or None."""
        return self._matching_close(text, open_idx, '{', '}')

    def _matching_paren_pos_in_text(self, text: str, open_idx: int) -> Optional[int]:
        """Return the ')' matching the '(' at ``text[open_idx]``, or None."""
        return self._matching_close(text, open_idx, '(', ')')

    def _matching_paren_pos_global(self, open_idx: int) -> Optional[int]:
        """Given an index of '(' in self.text, return the matching ')' index,
        ignoring parentheses inside strings/comments."""
        return self._matching_close(self.text, open_idx, '(', ')')
549
+