bioguider 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/agents/agent_utils.py +16 -10
- bioguider/agents/collection_observe_step.py +7 -2
- bioguider/agents/collection_task_utils.py +1 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +109 -0
- bioguider/agents/consistency_query_step.py +74 -0
- bioguider/agents/evaluation_task.py +0 -110
- bioguider/agents/evaluation_tutorial_task.py +156 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +114 -0
- bioguider/agents/evaluation_userguide_task.py +13 -43
- bioguider/agents/prompt_utils.py +15 -2
- bioguider/database/code_structure_db.py +20 -9
- bioguider/database/summarized_file_db.py +6 -3
- bioguider/managers/evaluation_manager.py +16 -2
- bioguider/rag/data_pipeline.py +1 -1
- bioguider/utils/code_structure_builder.py +15 -8
- bioguider/utils/constants.py +12 -12
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/{file_handler.py → python_file_handler.py} +1 -1
- bioguider/utils/r_file_handler.py +549 -0
- bioguider/utils/utils.py +34 -1
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/METADATA +1 -1
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/RECORD +27 -23
- bioguider/agents/consistency_collection_execute_step.py +0 -152
- bioguider/agents/consistency_collection_observe_step.py +0 -128
- bioguider/agents/consistency_collection_plan_step.py +0 -128
- bioguider/agents/consistency_collection_task.py +0 -109
- bioguider/agents/consistency_collection_task_utils.py +0 -137
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/LICENSE +0 -0
- {bioguider-0.2.20.dist-info → bioguider-0.2.22.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Union, Dict, Any, List
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
def extract_markdown_from_notebook(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
) -> str:
    """
    Extract the text of all markdown cells from a Jupyter notebook.

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, the extracted text is also written to this path
        (UTF-8); missing parent directories are created.

    Returns
    -------
    str
        The markdown cells' text in notebook order, separated by a newline.

    Raises
    ------
    FileNotFoundError
        If ``ipynb_path`` does not exist.
    ValueError
        If the file is not valid JSON or its top level is not a JSON object.
    """
    ipynb_path = Path(ipynb_path)
    if not ipynb_path.exists():
        raise FileNotFoundError(f"File {ipynb_path} does not exist")
    try:
        with ipynb_path.open("r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError as err:
        # Chain the parser error so the offending line/column is not lost.
        raise ValueError(f"File {ipynb_path} is not a valid JSON file") from err
    if not isinstance(nb, dict):
        raise ValueError(
            f"File {ipynb_path} is not a valid notebook: expected a JSON object at the top level"
        )

    def _to_text(src: Any) -> str:
        # nbformat stores a cell source either as one string or as a list of
        # lines that already end in "\n"; joining with "" avoids doubling
        # every line break. A missing source is treated as empty text.
        if isinstance(src, list):
            return "".join(src)
        return src or ""

    markdown_txts = [
        _to_text(cell.get("source"))
        for cell in nb.get("cells", [])
        if cell.get("cell_type") == "markdown"
    ]
    text = "\n".join(markdown_txts)

    if out_path is not None:
        out_path = Path(out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with out_path.open("w", encoding="utf-8") as f:
            f.write(text)
    return text
|
|
31
|
+
|
|
32
|
+
def strip_notebook_to_code_and_markdown(
    ipynb_path: Union[str, Path],
    out_path: Union[str, Path, None] = None,
    keep_top_metadata: bool = True,
) -> Dict[str, Any]:
    """
    Build a slimmed-down copy of a notebook containing only its code and
    markdown cells.

    In the result:
      - every cell type other than 'code' and 'markdown' (e.g. 'raw') is dropped
      - code cells get ``execution_count = None`` and an empty ``outputs`` list
      - markdown cells keep their ``attachments`` when present
      - each cell's ``source`` is flattened to a single string
      - top-level metadata is copied or replaced by ``{}``

    Parameters
    ----------
    ipynb_path : str | Path
        Path to the input .ipynb file.
    out_path : str | Path | None, default None
        If provided, the cleaned notebook is written there as JSON
        (parent directories are created as needed).
    keep_top_metadata : bool, default True
        If True, the notebook-level ``metadata`` is copied as-is;
        if False, it is replaced by an empty dict.

    Returns
    -------
    dict
        The cleaned notebook (nbformat v4-style dict).

    Raises
    ------
    FileNotFoundError
        If ``ipynb_path`` does not exist.
    ValueError
        If the file cannot be parsed as JSON.
    """
    source_file = Path(ipynb_path)
    if not source_file.exists():
        raise FileNotFoundError(f"File {source_file} does not exist")
    try:
        with source_file.open("r", encoding="utf-8") as handle:
            notebook = json.load(handle)
    except json.JSONDecodeError:
        raise ValueError(f"File {source_file} is not a valid JSON file")

    def _flatten(source) -> str:
        # A cell source is either one string or a list of line strings.
        return "".join(source) if isinstance(source, list) else (source or "")

    kept_cells: List[Dict[str, Any]] = []
    for cell in notebook.get("cells", []):
        kind = cell.get("cell_type")
        if kind != "markdown" and kind != "code":
            continue  # 'raw' and unknown cell types are discarded

        # Keys common to both kept cell kinds, in the order they are serialized.
        slim: Dict[str, Any] = {
            "cell_type": kind,
            "metadata": cell.get("metadata", {}),
            "source": _flatten(cell.get("source", "")),
        }
        if kind == "code":
            slim["execution_count"] = None  # forget the run counter
            slim["outputs"] = []  # discard every output
        elif "attachments" in cell:
            slim["attachments"] = cell["attachments"]
        kept_cells.append(slim)

    top_metadata = notebook.get("metadata", {}) if keep_top_metadata else {}
    cleaned: Dict[str, Any] = {
        "nbformat": notebook.get("nbformat", 4),
        "nbformat_minor": notebook.get("nbformat_minor", 5),
        "metadata": top_metadata,
        "cells": kept_cells,
    }

    if out_path is None:
        return cleaned

    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        json.dump(cleaned, handle, ensure_ascii=False, indent=1)
    return cleaned
|
|
117
|
+
|
|
@@ -0,0 +1,549 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
@dataclass
class RSymbol:
    """A function, method or class definition located in an R source file."""

    name: str  # reported name, e.g. "f", "print.foo", "Cls$method", "show<Cls>"
    parent: Optional[str]  # enclosing function, S3/S4 generic or R6 class; None at top level
    start_line: int  # 1-based line on which the body (or, for classes, the definition) starts
    end_line: int  # 1-based line on which the body / definition ends
    docstring: Optional[str]  # roxygen ("#'") block directly above the definition, if any
    params: List[str]  # formal argument names with default values stripped


class RFileHandler:
    """Heuristic symbol extractor for a single R source file.

    The file is read once in ``__init__``; definitions are then located with
    regular expressions and a small scanner that ignores text inside ``#``
    comments and single-/double-quoted strings. Supported constructs:

    - plain functions ``name <- function(...)`` including nested ones
    - S3-style methods ``generic.class <- function(...)``
    - R6 classes ``X <- R6Class("X", public = list(...), ...)`` and the
      ``name = function(...)`` members of their public/private/active lists
    - S4 ``setClass("X", ...)`` and ``setMethod("generic", ..., function(...))``

    This is not an R parser. Known limitations: definition heads that occur
    inside comments or strings are still matched, any dotted function name is
    also reported as an S3 method, and backtick-quoted names / raw strings
    are not understood.
    """

    # ---- definition heads: match only up to the "(" of ``function(`` ----
    FUNC_DEF_HEAD_RE = re.compile(
        r'(?P<name>[A-Za-z.][\w.]*)\s*<-\s*function\s*\(',
        re.MULTILINE,
    )

    S3_METHOD_HEAD_RE = re.compile(
        r'(?P<generic>[A-Za-z.][\w.]*)\.(?P<class>[A-Za-z.][\w.]*)\s*<-\s*function\s*\(',
        re.MULTILINE,
    )

    # R6 method head: "name = function("
    R6_METHOD_HEAD_RE = re.compile(
        r'(?P<mname>[A-Za-z.][\w.]*)\s*=\s*function\s*\(',
        re.MULTILINE,
    )

    # S4 method head inside setMethod(... function(
    # NOTE(review): the lazy ".*?" is not bounded by the setMethod(...) call,
    # so a setMethod without an inline function can latch onto a later,
    # unrelated "function(".
    S4_METHOD_HEAD_RE = re.compile(
        r'setMethod\s*\(\s*["\'](?P<generic>[^"\']+)["\']\s*,.*?function\s*\(',
        re.MULTILINE | re.DOTALL,
    )

    # ---- whole-signature patterns (not used by the parsers in this class;
    #      kept so the class attributes remain available) ----
    FUNC_DEF_RE = re.compile(
        # name <- function( ... ) {   with multi-line args allowed
        r'(?P<name>[A-Za-z.][\w.]*)\s*<-\s*function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE,
    )
    S3_METHOD_RE = re.compile(
        r'(?P<generic>[A-Za-z.][\w.]*)\.(?P<class>[A-Za-z.][\w.]*)\s*<-\s*function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE,
    )
    R6_CLASS_RE = re.compile(
        r'(?P<varname>[A-Za-z.][\w.]*)\s*<-\s*R6Class\s*\(\s*["\'](?P<classname>[^"\']+)["\']',
        re.MULTILINE | re.DOTALL,
    )
    R6_METHOD_RE = re.compile(
        r'(?P<mname>[A-Za-z.][\w.]*)\s*=\s*function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE,
    )
    S4_CLASS_RE = re.compile(
        r'setClass\s*\(\s*["\'](?P<classname>[^"\']+)["\']',
        re.MULTILINE,
    )
    S4_METHOD_RE = re.compile(
        r'setMethod\s*\(\s*["\'](?P<generic>[^"\']+)["\']\s*,.*?function\s*\((?P<args>[^)]*)\)\s*\{',
        re.MULTILINE | re.DOTALL,
    )
    S4_SIG_CLASS_RE = re.compile(
        r'signature\s*=\s*(?:list\s*\(|\()\s*(?:[^)]*class\s*=\s*["\'](?P<classname>[^"\']+)["\']|["\'](?P<classname2>[^"\']+)["\'])',
        re.MULTILINE,
    )
    # library(pkg) / require(pkg); the package name may optionally be quoted.
    LIB_REQUIRE_RE = re.compile(
        r'\b(?:library|require)\s*\(\s*["\']?([A-Za-z.][\w.]*)["\']?\s*\)',
        re.MULTILINE,
    )
    # pkg::sym or pkg:::sym (exactly two or three colons).
    NS_USE_RE = re.compile(
        r'(?P<pkg>[A-Za-z.][\w.]*):{2,3}(?P<sym>[A-Za-z.][\w.]*)',
        re.MULTILINE,
    )

    def __init__(self, file_path: str):
        """Read ``file_path`` (UTF-8, undecodable bytes ignored) and index its braces."""
        self.file_path = file_path
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            self.text = f.read()
        self.lines = self.text.splitlines()
        self._brace_map = self._build_brace_map_safely()

    # ---------------- Public API ----------------

    def get_functions_and_classes(self) -> List[Tuple[str, Optional[str], int, int, Optional[str], List[str]]]:
        """Return every symbol found, ordered by (start_line, end_line).

        Each entry is ``(name, parent, start_line, end_line, docstring, params)``
        with 1-based line numbers.
        """
        items: List[RSymbol] = []
        items.extend(self._parse_functions())
        items.extend(self._parse_s3_methods())
        items.extend(self._parse_r6())
        items.extend(self._parse_s4())
        items.sort(key=lambda s: (s.start_line, s.end_line))
        return [(i.name, i.parent, i.start_line, i.end_line, i.docstring, i.params) for i in items]

    def get_imports(self) -> List[str]:
        """Return the sorted package names referenced via library()/require() or ``pkg::sym``."""
        pkgs = set(self.LIB_REQUIRE_RE.findall(self.text))
        for m in self.NS_USE_RE.finditer(self.text):
            pkgs.add(m.group('pkg'))
        return sorted(pkgs)

    # ---------------- Parsers ----------------

    def _scan_function(self, text: str, open_paren: int) -> Optional[Tuple[List[str], int, int, bool]]:
        """Analyse one ``function(`` whose '(' sits at ``text[open_paren]``.

        Returns ``(params, body_start, body_end, braced)`` with indices relative
        to ``text``, or None when the argument list is unterminated or nothing
        follows it. The body brace must be the first code token after ')';
        otherwise the function has a brace-less expression body, which is
        reported as a single position (``body_start == body_end``) at the first
        token of that expression.
        """
        close_paren = self._match_closing(text, open_paren, '(', ')')
        if close_paren is None:
            return None
        params = self._parse_params(text[open_paren + 1:close_paren])
        body = self._first_code_token(text, close_paren + 1)
        if body is None:
            return None
        if text[body] != '{':
            return params, body, body, False
        body_end = self._match_closing(text, body, '{', '}')
        if body_end is None:
            body_end = len(text) - 1  # unterminated body: extend to end of text
        return params, body, body_end, True

    def _parse_functions(self) -> List[RSymbol]:
        """Collect ``name <- function(...)`` definitions, nested ones included.

        A definition that starts inside the braces of the previously recorded
        top-level function is skipped here because the nested pass reports it
        with its proper parent.
        """
        syms: List[RSymbol] = []
        span_open, span_close = -1, -1  # braces of the last top-level body
        for m in self.FUNC_DEF_HEAD_RE.finditer(self.text):
            if span_open < m.start() < span_close:
                continue
            scanned = self._scan_function(self.text, m.end() - 1)
            if scanned is None:
                continue
            params, body_open, body_close, braced = scanned
            name = m.group('name')
            syms.append(RSymbol(
                name=name, parent=None,
                start_line=self._pos_to_line(body_open),
                end_line=self._pos_to_line(body_close),
                docstring=self._roxygen_before(m.start()),
                params=params,
            ))
            if braced:
                syms.extend(self._parse_nested_functions(body_open, body_close, parent=name))
                span_open, span_close = body_open, body_close
        return syms

    def _parse_nested_functions(self, abs_start: int, abs_end: int, parent: str) -> List[RSymbol]:
        """Collect functions defined inside ``self.text[abs_start:abs_end + 1]``.

        Direct children get ``parent``; deeper levels are handled recursively so
        that each nested function is reported once, under its immediate parent.
        """
        sub = self.text[abs_start:abs_end + 1]
        syms: List[RSymbol] = []
        span_open, span_close = -1, -1  # braces (relative to sub) of the last child body
        for m in self.FUNC_DEF_HEAD_RE.finditer(sub):
            if span_open < m.start() < span_close:
                continue  # grandchild: already emitted by the recursive call
            scanned = self._scan_function(sub, m.end() - 1)
            if scanned is None:
                continue
            params, body_open_rel, body_close_rel, braced = scanned
            body_open = abs_start + body_open_rel
            body_close = abs_start + body_close_rel
            name = m.group('name')
            syms.append(RSymbol(
                name=name, parent=parent,
                start_line=self._pos_to_line(body_open),
                end_line=self._pos_to_line(body_close),
                # Anchor at the definition head so docs above a multi-line
                # signature are still found.
                docstring=self._roxygen_before(abs_start + m.start()),
                params=params,
            ))
            if braced:
                syms.extend(self._parse_nested_functions(body_open, body_close, parent=name))
                span_open, span_close = body_open_rel, body_close_rel
        return syms

    def _parse_s3_methods(self) -> List[RSymbol]:
        """Collect ``generic.class <- function(...)`` definitions.

        The split is purely lexical (at the last dot), so any dotted function
        name is reported here, in addition to the plain-function entry.
        """
        syms: List[RSymbol] = []
        for m in self.S3_METHOD_HEAD_RE.finditer(self.text):
            scanned = self._scan_function(self.text, m.end() - 1)
            if scanned is None:
                continue
            params, body_open, body_close, _braced = scanned
            generic = m.group('generic')
            syms.append(RSymbol(
                name=f"{generic}.{m.group('class')}", parent=generic,
                start_line=self._pos_to_line(body_open),
                end_line=self._pos_to_line(body_close),
                docstring=self._roxygen_before(m.start()),
                params=params,
            ))
        return syms

    def _parse_r6(self) -> List[RSymbol]:
        """Collect R6 classes and the methods in their public/private/active lists."""
        syms: List[RSymbol] = []
        for m in self.R6_CLASS_RE.finditer(self.text):
            classname = m.group('classname')
            # The class spans the whole R6Class( ... ) call. The first '(' in
            # the match is that call's paren because the variable name before
            # it cannot contain one.
            call_open = self.text.index('(', m.start(), m.end())
            call_close = self._matching_paren_pos_global(call_open)
            if call_close is None:
                call_close = len(self.text) - 1  # unterminated call: extend to end of file
            syms.append(RSymbol(
                name=classname, parent=None,
                start_line=self._pos_to_line(m.start()),
                end_line=self._pos_to_line(call_close),
                docstring=self._roxygen_before(m.start()),
                params=[],
            ))
            # Methods within public/private/active lists
            class_text = self.text[m.start():call_close + 1]
            for sect in ('public', 'private', 'active'):
                syms.extend(self._parse_r6_section_methods(class_text, m.start(), sect, classname))
        return syms

    def _parse_r6_section_methods(self, class_text: str, base: int, section: str, parent_class: str) -> List[RSymbol]:
        """Collect ``name = function(...)`` members of ``<section> = list(...)``.

        ``class_text`` is the text of one R6Class call and ``base`` its absolute
        offset in ``self.text``. ``name = function(`` occurrences inside an
        already recorded method body (e.g. ``FUN = function(x)`` arguments) are
        not reported as methods.
        """
        syms: List[RSymbol] = []
        for sec in re.finditer(rf'\b{section}\s*=\s*list\s*\(', class_text):
            lst_open = sec.end() - 1
            lst_close = self._matching_paren_pos_in_text(class_text, lst_open)
            if lst_close is None:
                continue
            list_text = class_text[lst_open:lst_close + 1]
            offset = base + lst_open  # absolute position of list_text[0]
            span_open, span_close = -1, -1  # braces of the last recorded method body
            for m in self.R6_METHOD_HEAD_RE.finditer(list_text):
                if span_open < m.start() < span_close:
                    continue
                scanned = self._scan_function(list_text, m.end() - 1)
                if scanned is None:
                    continue
                params, body_open_rel, body_close_rel, braced = scanned
                syms.append(RSymbol(
                    name=f"{parent_class}${m.group('mname')}",
                    parent=parent_class,
                    start_line=self._pos_to_line(offset + body_open_rel),
                    end_line=self._pos_to_line(offset + body_close_rel),
                    docstring=self._roxygen_before(offset + m.start()),
                    params=params,
                ))
                if braced:
                    span_open, span_close = body_open_rel, body_close_rel
        return syms

    def _parse_s4(self) -> List[RSymbol]:
        """Collect S4 ``setClass`` declarations and ``setMethod`` definitions.

        Classes are reported as a single line (that of the ``setClass`` call).
        Methods are named ``generic<Class>`` when a ``signature = ...`` class
        can be read from the call, else just ``generic``.
        """
        syms: List[RSymbol] = []
        for m in self.S4_CLASS_RE.finditer(self.text):
            decl_line = self._pos_to_line(m.start())
            syms.append(RSymbol(
                name=m.group('classname'), parent=None,
                start_line=decl_line,
                end_line=decl_line,
                docstring=self._roxygen_before(m.start()),
                params=[],
            ))
        for m in self.S4_METHOD_HEAD_RE.finditer(self.text):
            generic = m.group('generic')

            open_paren = m.end() - 1
            close_paren = self._matching_paren_pos_global(open_paren)
            if close_paren is None:
                continue
            args = self._parse_params(self.text[open_paren + 1:close_paren])

            # Only a '{' that directly follows the argument list is the body.
            body = self._first_code_token(self.text, close_paren + 1)
            if body is not None and self.text[body] == '{':
                block_open: Optional[int] = body
                block_close = self._matching_brace_pos(body)
            else:
                # Brace-less method: fall back to the extent of the matched head.
                block_open = None
                block_close = m.end()

            sig_slice = self.text[m.start():block_open if block_open is not None else m.end()]
            cm = self.S4_SIG_CLASS_RE.search(sig_slice)
            clazz = (cm.group('classname') or cm.group('classname2')) if cm else None
            name = f"{generic}<{clazz}>" if clazz else generic

            syms.append(RSymbol(
                name=name, parent=generic,
                start_line=self._pos_to_line(block_open if block_open is not None else m.start()),
                end_line=self._pos_to_line(block_close),
                docstring=self._roxygen_before(m.start()),
                params=args,
            ))

        return syms

    # ---------------- Utilities ----------------

    def _parse_params(self, arg_str: str) -> List[str]:
        """Split the text between a function's parentheses into parameter names.

        Commas inside brackets or strings do not split; ``#`` comments are
        dropped; a default value (everything from the first ``=``) is removed.
        ``...`` is kept as a parameter name.
        """
        pieces: List[str] = []
        current: List[str] = []
        depth = 0
        quote: Optional[str] = None
        escaped = False
        in_comment = False
        for ch in arg_str:
            if in_comment:
                if ch == '\n':
                    in_comment = False
                    current.append(ch)  # keep the line break as plain whitespace
                continue
            if quote is not None:
                current.append(ch)
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if ch == '#':
                in_comment = True
            elif ch in ('"', "'"):
                quote = ch
                current.append(ch)
            elif ch in '([{':
                depth += 1
                current.append(ch)
            elif ch in ')]}':
                depth -= 1
                current.append(ch)
            elif ch == ',' and depth == 0:
                pieces.append(''.join(current))
                current = []
            else:
                current.append(ch)
        pieces.append(''.join(current))

        names: List[str] = []
        for piece in pieces:
            name = piece.split('=')[0].strip()
            if name:
                names.append(name)
        return names

    def _roxygen_before(self, pos: int) -> Optional[str]:
        """Return the contiguous ``#'`` block directly above the line containing ``pos``.

        The ``#'`` prefix and following whitespace are stripped from each line.
        Returns None when the line immediately above is not a roxygen line.
        """
        line_idx = self._pos_to_line(pos) - 2  # 0-based index of the line above pos
        collected: List[str] = []
        while line_idx >= 0:
            stripped = self.lines[line_idx].lstrip()
            if not stripped.startswith("#'"):
                break  # stop at the first non-roxygen line (blank lines included)
            collected.append(stripped[2:].lstrip())
            line_idx -= 1
        if not collected:
            return None
        collected.reverse()
        return '\n'.join(collected).strip() or None

    # -------- Position / brace helpers (comment/string aware) --------

    @staticmethod
    def _iter_code_chars(text: str, start: int = 0):
        """Yield ``(index, char)`` for characters of ``text[start:]`` that are code.

        Characters inside ``#`` comments (up to the newline) and inside single-
        or double-quoted strings (backslash escapes honoured) are skipped, as
        are the delimiters themselves. Scanning assumes ``start`` is not inside
        a string or comment.
        """
        quote: Optional[str] = None
        escaped = False
        in_comment = False
        for i in range(start, len(text)):
            ch = text[i]
            if in_comment:
                if ch == '\n':
                    in_comment = False
                continue
            if quote is not None:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if ch == '#':
                in_comment = True
            elif ch == '"' or ch == "'":
                quote = ch
            else:
                yield i, ch

    @staticmethod
    def _first_code_token(text: str, start: int) -> Optional[int]:
        """Return the index of the first character at or after ``start`` that is
        neither whitespace nor part of a ``#`` comment, or None if there is none."""
        i, n = start, len(text)
        while i < n:
            ch = text[i]
            if ch == '#':
                newline = text.find('\n', i)
                if newline == -1:
                    return None
                i = newline + 1
            elif ch.isspace():
                i += 1
            else:
                return i
        return None

    def _match_closing(self, text: str, open_idx: int, open_ch: str, close_ch: str) -> Optional[int]:
        """Return the index of the ``close_ch`` that balances ``text[open_idx]``.

        Brackets inside strings/comments are ignored. Returns None when the
        bracket is never closed.
        """
        depth = 0
        for i, ch in self._iter_code_chars(text, open_idx):
            if ch == open_ch:
                depth += 1
            elif ch == close_ch:
                depth -= 1
                if depth == 0:
                    return i
        return None

    def _build_brace_map_safely(self):
        """
        Build a map of '{' -> matching '}' positions in ``self.text`` while
        ignoring braces inside:
          - comments starting with '#'
          - single- and double-quoted strings with escapes
        Unbalanced braces are left out of the map.
        """
        stack: List[int] = []
        pairs = {}
        for i, ch in self._iter_code_chars(self.text):
            if ch == '{':
                stack.append(i)
            elif ch == '}' and stack:
                pairs[stack.pop()] = i
        return pairs

    def _matching_brace_pos(self, open_brace_pos: int) -> int:
        """Return the '}' matching the '{' at ``open_brace_pos``; the last index
        of the file is used when the brace is unmatched."""
        return self._brace_map.get(open_brace_pos, len(self.text) - 1)

    def _find_next_code_brace_after(self, start: int) -> Optional[int]:
        """Find the next '{' at or after ``start`` in ``self.text``, skipping
        braces in comments/strings."""
        for i, ch in self._iter_code_chars(self.text, start):
            if ch == '{':
                return i
        return None

    def _pos_to_line(self, pos: int) -> int:
        """Convert a character offset in ``self.text`` to a 1-based line number."""
        return self.text.count('\n', 0, max(0, pos)) + 1

    def _find_next_char_in_text(self, text: str, ch: str, start: int) -> Optional[int]:
        """Plain ``str.find`` wrapper returning None instead of -1 (not comment/string aware)."""
        idx = text.find(ch, start)
        return idx if idx != -1 else None

    # For nested parsing on a slice (already delimited correctly)
    def _matching_brace_pos_in_text(self, text: str, open_idx: int) -> Optional[int]:
        """Return the '}' balancing the '{' at ``text[open_idx]``, or None."""
        return self._match_closing(text, open_idx, '{', '}')

    def _matching_paren_pos_in_text(self, text: str, open_idx: int) -> Optional[int]:
        """Return the ')' balancing the '(' at ``text[open_idx]``, or None."""
        return self._match_closing(text, open_idx, '(', ')')

    def _matching_paren_pos_global(self, open_idx: int) -> Optional[int]:
        """Given an index of '(' in self.text, return the matching ')' index,
        ignoring parentheses inside strings/comments."""
        return self._match_closing(self.text, open_idx, '(', ')')
|
|
549
|
+
|