jupyter-agent 2025.6.104__py3-none-any.whl → 2025.7.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jupyter_agent/bot_actions.py +270 -0
- jupyter_agent/bot_agents/__init__.py +0 -42
- jupyter_agent/bot_agents/base.py +89 -45
- jupyter_agent/bot_agents/master_planner.py +1 -0
- jupyter_agent/bot_agents/output_task_result.py +6 -7
- jupyter_agent/bot_agents/prepare_next_cell.py +52 -0
- jupyter_agent/bot_agents/request_user_supply.py +186 -0
- jupyter_agent/bot_agents/task_code_executor.py +3 -2
- jupyter_agent/bot_agents/task_planner_v3.py +16 -13
- jupyter_agent/bot_agents/task_reasoner.py +3 -2
- jupyter_agent/bot_agents/task_structrue_reasoner.py +22 -12
- jupyter_agent/bot_agents/task_structrue_summarier.py +22 -18
- jupyter_agent/bot_agents/task_summarier.py +3 -2
- jupyter_agent/bot_agents/task_verifier.py +2 -1
- jupyter_agent/bot_agents/task_verify_summarier.py +6 -6
- jupyter_agent/bot_chat.py +2 -2
- jupyter_agent/bot_contexts.py +37 -29
- jupyter_agent/bot_evaluation.py +262 -143
- jupyter_agent/bot_evaluators/__init__.py +0 -0
- jupyter_agent/bot_evaluators/base.py +42 -0
- jupyter_agent/bot_evaluators/dummy_flow.py +20 -0
- jupyter_agent/bot_evaluators/dummy_global.py +20 -0
- jupyter_agent/bot_evaluators/dummy_task.py +20 -0
- jupyter_agent/bot_evaluators/flow_global_planning.py +88 -0
- jupyter_agent/bot_evaluators/flow_task_executor.py +152 -0
- jupyter_agent/bot_flows/__init__.py +0 -4
- jupyter_agent/bot_flows/base.py +120 -41
- jupyter_agent/bot_flows/master_planner.py +15 -4
- jupyter_agent/bot_flows/task_executor_v3.py +57 -38
- jupyter_agent/bot_magics.py +119 -69
- jupyter_agent/bot_outputs.py +37 -43
- jupyter_agent/utils.py +20 -31
- {jupyter_agent-2025.6.104.dist-info → jupyter_agent-2025.7.100.dist-info}/METADATA +56 -4
- jupyter_agent-2025.7.100.dist-info/RECORD +41 -0
- jupyter_agent/bot_agents/task_planner_v1.py +0 -158
- jupyter_agent/bot_agents/task_planner_v2.py +0 -172
- jupyter_agent/bot_flows/task_executor_v1.py +0 -86
- jupyter_agent/bot_flows/task_executor_v2.py +0 -84
- jupyter_agent-2025.6.104.dist-info/RECORD +0 -35
- {jupyter_agent-2025.6.104.dist-info → jupyter_agent-2025.7.100.dist-info}/WHEEL +0 -0
- {jupyter_agent-2025.6.104.dist-info → jupyter_agent-2025.7.100.dist-info}/entry_points.txt +0 -0
- {jupyter_agent-2025.6.104.dist-info → jupyter_agent-2025.7.100.dist-info}/licenses/LICENSE +0 -0
- {jupyter_agent-2025.6.104.dist-info → jupyter_agent-2025.7.100.dist-info}/top_level.txt +0 -0
jupyter_agent/bot_evaluation.py
CHANGED
@@ -12,144 +12,271 @@ import argparse
 import nbformat
 
 from pathlib import Path
+from typing import Optional
+from enum import Enum
+from pydantic import BaseModel, Field
 from nbclient.client import NotebookClient
-from .
[29 removed lines (old 17-45) not captured in this extract]
-    is_global_finished = False
-
-    def save_notebook(**kwargs):
-        """Save the executed notebook to the specified output path."""
-        nonlocal is_global_finished
-
-        if kwargs:
-            cell_idx = kwargs.get("cell_index", 0)
-            cell_type = kwargs.get("cell", {}).get("cell_type")
-            cell_id = kwargs.get("cell", {}).get("id")
-            cell_exec_count = kwargs.get("cell", {}).get("execution_count")
-            cell_meta = kwargs.get("cell", {}).get("metadata", {})
-            cell_payloads = kwargs.get("execute_reply", {}).get("content", {}).get("payload", [])
-            cell_outputs = kwargs.get("cell", {}).get("outputs", [])
-            for payload in cell_payloads:
-                if payload.get("source") == "set_next_input" and payload.get("replace") is True:
-                    print(f"CELL[{cell_idx}] Replacing cell with set_next_input payload")
-                    nb.cells[cell_idx].source = payload.get("text", "")
-            cell_agent_data_timestamp = cell_meta.get("jupyter-agent-data-timestamp", 0)
-            output_agent_data_timestamp = cell_agent_data_timestamp
-            for output in cell_outputs:
-                if output["output_type"] == "display_data":
-                    output_meta = output.get("metadata", {})
-                    if (
-                        output_meta.get("jupyter-agent-data-store")
-                        and output_meta.get("jupyter-agent-data-timestamp", 0) > output_agent_data_timestamp
-                        and output_meta.get("jupyter-agent-data", {})
-                    ):
-                        print(f"CELL[{cell_idx}] Found jupyter-agent-data-store outputs, save it to cell metadata")
-                        output_agent_data_timestamp = output_meta.get("jupyter-agent-data-timestamp", 0)
-                        nb.cells[cell_idx].metadata["jupyter-agent-data-store"] = True
-                        nb.cells[cell_idx].metadata["jupyter-agent-data-timestamp"] = output_agent_data_timestamp
-                        if "jupyter-agent-data" not in nb.cells[cell_idx].metadata:
-                            nb.cells[cell_idx].metadata["jupyter-agent-data"] = {}
-                        nb.cells[cell_idx].metadata["jupyter-agent-data"].update(output_meta["jupyter-agent-data"])
-                    for record in output_meta.get("jupyter-agent-evaluation-records", []):
-                        record["notebook_name"] = output_path
-                        if record["eval_type"] == "NOTEBOOK":
-                            record["execution_duration"] = time.time() - start_time
-                            is_global_finished = True
-                            del nb.cells[cell_idx + 1 :]  # Remove all cells after the notebook cell
-                        print(
-                            f"CELL[{cell_idx}] Evaluating record: {record['eval_type']} "
-                            f"duration: {record['execution_duration']:.2f}s "
-                            f"success: {record['is_success']} "
-                            f"correct: {record['correct_score']:.2f}"
-                        )
-                        if evaluation_path:
-                            with open(evaluation_path, "a") as eval_file:
-                                eval_file.write(json.dumps(record) + "\n")
-            print(f"CELL[{cell_idx}] Saving executed {cell_type} cell - {cell_id}: {cell_exec_count}")
-        else:
-            print(f"Saving executed notebook to: {output_path}")
-        nbformat.write(nb, output_path)
-
-    # Add metadata to the notebook
-    nb.cells.insert(
-        0,
-        nbformat.v4.new_code_cell(
-            source=(
-                f"# Executed notebook: {input_path.name}\n"
-                f"# Output saved to: {output_path}\n\n"
-                f"__evaluation_ipynb_file__ = '{output_path}'\n"
-            ),
-            metadata={"tags": ["CTX_EXCLUDE"]},
-        ),
-    )
-    save_notebook()
-
-    # Configure nbclient to run the notebook
-    client = NotebookClient(
-        nb,
-        timeout=timeout,
-        startup_timeout=startup_timeout,
-        skip_cells_with_tag=skip_cells_with_tag,
-        allow_errors=allow_errors,
-        kernel_name=kernel_name,
-        resources={"metadata": {"path": input_path.parent.absolute()}},
-        on_cell_executed=save_notebook,
-    )
+from .bot_actions import ActionBase, ActionSetCellContent, SetCellContentParams, get_action_class
+
+
+class BaseEvaluationRecord(BaseModel):
+    timestamp: float = 0
+    notebook_name: str = ""
+    evaluator: str = ""
+    eval_type: str = "BASE"
+    cell_index: int = -1
+    execution_duration: float = 0.0
+    is_success: bool = False
+    correct_score: float = 0.0
+
+
+class StageEvaluationRecord(BaseEvaluationRecord):
+    eval_type: str = "STAGE"
+    flow: str = ""
+    stage: str = ""
+    agent: str = ""
+
+
+class FlowEvaluationRecord(BaseEvaluationRecord):
+    eval_type: str = "FLOW"
+    flow: str = ""
+    stage_count: int = 0
+    planning_score: float = 0.0
+    reasoning_score: float = 0.0
+    coding_score: float = 0.0
+    important_score: float = 0.0
+    user_supply_score: float = 0.0
 
[16 removed lines (old 127-142) not captured in this extract]
+
+class NotebookEvaluationRecord(BaseEvaluationRecord):
+    eval_type: str = "NOTEBOOK"
+    flow_count: int = 0
+    planning_score: float = 0.0
+    coding_score: float = 0.0
+    important_score: float = 0.0
+    user_supply_score: float = 0.0
+
+
+class NotebookRunner:
+
+    def __init__(
+        self,
+        input_path: str | Path,
+        output_path: str | Path = "",
+        evaluate_path: str | Path = "",
+        timeout: int = -1,
+        startup_timeout: int = 60,
+        allow_errors: bool = False,
+        skip_cells_with_tag: str = "skip-execution",
+        **kwargs,
+    ):
+        self.input_path = Path(input_path).with_suffix(".ipynb")
+        self.output_path = output_path
+        self.evaluate_path = evaluate_path
+        self.start_time = 0
+        self.is_global_finished = False
+
+        if not self.output_path:
+            self.output_path = self.input_path.parent.joinpath(f"{self.input_path.with_suffix('').name}_eval.ipynb")
+        if not self.evaluate_path:
+            self.evaluate_path = self.input_path.parent.joinpath(f"{self.input_path.with_suffix('').name}_eval.jsonl")
+        self.output_path = Path(self.output_path).absolute()
+        self.evaluate_path = Path(self.evaluate_path).absolute()
+
+        with self.input_path.open() as f:
+            print("Opening notebook:", input_path)
+            self.notebook = nbformat.read(f, as_version=4)
+
+        self.client = NotebookClient(
+            self.notebook,
+            timeout=timeout,
+            startup_timeout=startup_timeout,
+            skip_cells_with_tag=skip_cells_with_tag,
+            allow_errors=allow_errors,
+            resources={"metadata": {"path": self.input_path.parent.absolute()}},
+            on_notebook_start=self.on_notebook_start,
+            on_notebook_complete=self.on_notebook_complete,
+            on_cell_executed=self.on_cell_executed,
+            **kwargs,
         )
+
+    def save_evaluation_record(self, record: BaseEvaluationRecord):
+
+        if isinstance(record, FlowEvaluationRecord):
+            eval_source = record.flow + "-" + record.evaluator
+        elif isinstance(record, StageEvaluationRecord):
+            eval_source = record.flow + "-" + record.stage + "-" + record.agent + "-" + record.evaluator
+        else:
+            eval_source = record.evaluator
         print(
-            f"
+            f"CELL[{record.cell_index}] Evaluation: {record.eval_type}[{eval_source}] "
+            f"{'SUCCESS' if record.is_success else 'FAILURE'} "
             f"duration: {record.execution_duration:.2f}s "
-            f"success: {record.is_success} "
             f"correct: {record.correct_score:.2f}"
         )
-        if
-        with open(
-        eval_file.write(
+        if self.evaluate_path:
+            with open(self.evaluate_path, "a") as eval_file:
+                eval_file.write(record.model_dump_json() + "\n")
+
+    def handle_cell_payloads(self, cell_index, cell_payloads):
+        for payload in cell_payloads:
+            if payload.get("source") == "set_next_input" and payload.get("replace") is True:
+                print(f"CELL[{cell_index}] Replacing cell with set_next_input payload")
+                self.notebook.cells[cell_index].source = payload.get("text", "")
+
+    def handle_jupyter_agent_data(self, cell_index, cell_meta, cell_output_metas):
+        cell_agent_data_timestamp = cell_meta.get("jupyter-agent-data-timestamp", 0)
+        output_agent_data_timestamp = cell_agent_data_timestamp
+        for output_meta in cell_output_metas:
+            if (
+                output_meta.get("jupyter-agent-data-store")
+                and output_meta.get("jupyter-agent-data-timestamp", 0) > cell_agent_data_timestamp
+                and output_meta.get("jupyter-agent-data", {})
+            ):
+                print(f"CELL[{cell_index}] Found jupyter-agent-data-store outputs, save it to cell metadata")
+                output_agent_data_timestamp = max(
+                    output_agent_data_timestamp,
+                    output_meta.get("jupyter-agent-data-timestamp", 0),
+                )
+                self.notebook.cells[cell_index].metadata["jupyter-agent-data-store"] = True
+                self.notebook.cells[cell_index].metadata["jupyter-agent-data-timestamp"] = output_agent_data_timestamp
+                if "jupyter-agent-data" not in self.notebook.cells[cell_index].metadata:
+                    self.notebook.cells[cell_index].metadata["jupyter-agent-data"] = {}
+                self.notebook.cells[cell_index].metadata["jupyter-agent-data"].update(
+                    output_meta["jupyter-agent-data"]
+                )
+
+    def handle_evaluation_record(self, cell_index, cell_output_metas):
+        is_bot_cell = False
+        is_flow_completed = False
+        for output_meta in cell_output_metas:
+            for record in output_meta.get("jupyter-agent-evaluation-records", []):
+                is_bot_cell = True
+                if record["eval_type"] == "NOTEBOOK":
+                    record = NotebookEvaluationRecord(**record)
+                    record.timestamp = record.timestamp or time.time()
+                    record.notebook_name = str(self.output_path)
+                    record.execution_duration = time.time() - self.start_time
+                    self.is_global_finished = True
+                    is_flow_completed = True
+                    del self.notebook.cells[cell_index + 1 :]  # Remove all cells after the notebook cell
+                elif record["eval_type"] == "FLOW":
+                    record = FlowEvaluationRecord(**record)
+                    record.timestamp = record.timestamp or time.time()
+                    record.notebook_name = str(self.output_path)
+                    is_flow_completed = True
+                elif record["eval_type"] == "STAGE":
+                    record = StageEvaluationRecord(**record)
+                    record.timestamp = record.timestamp or time.time()
+                    record.notebook_name = str(self.output_path)
+                else:
+                    record = BaseEvaluationRecord(**record)
+                    record.timestamp = record.timestamp or time.time()
+                    record.notebook_name = str(self.output_path)
+                self.save_evaluation_record(record)
+        if is_bot_cell and not is_flow_completed:
+            self.save_evaluation_record(
+                FlowEvaluationRecord(
+                    timestamp=time.time(),
+                    notebook_name=str(self.output_path),
+                    evaluator="bot",
+                    eval_type="FLOW",
+                    cell_index=cell_index,
+                    is_success=False,
+                )
+            )
+
+    def handle_set_next_cell(self, cell_index, action):
+        if action.params.index == 0:
+            self.notebook.cells[cell_index].source = action.params.source
+            self.notebook.cells[cell_index].metadata.update(action.params.metadata)
+            self.notebook.cells[cell_index].metadata["tags"] = action.params.tags
+            print(f"CELL[{cell_index}] Replacing cell with set_next_cell action")
+            return cell_index
+        else:
+            metadata = dict(action.params.metadata)
+            metadata["tags"] = action.params.tags
+            if action.params.type == "code":
+                new_cell = nbformat.v4.new_code_cell(source=action.params.source, metadata=metadata)
+            elif action.params.type == "markdown":
+                new_cell = nbformat.v4.new_markdown_cell(source=action.params.source, metadata=metadata)
+            else:
+                raise ValueError(f"Unsupported cell type: {action.params.type}")
+            insert_idx = cell_index if action.params.index == -1 else cell_index + action.params.index
+            ret_idx = cell_index + 1 if action.params.index == -1 else cell_index
+            self.notebook.cells.insert(insert_idx, new_cell)
+            print(f"CELL[{cell_index}] Inserting cell at [{insert_idx}] with set_next_cell action")
+            return ret_idx
+
+    def handle_jupyter_agent_actions(self, cell_index, cell_meta, cell_output_metas):
+        cell_action_timestamp = cell_meta.get("jupyter-agent-action-timestamp", 0)
+        output_action_timestamp = cell_action_timestamp
+        for output_meta in cell_output_metas:
+            for action in output_meta.get("jupyter-agent-action-records", []):
+                action = get_action_class(action["action"])(**action)
+                if action.timestamp > cell_action_timestamp:
+                    output_action_timestamp = max(action.timestamp, output_action_timestamp)
+                    if isinstance(action, ActionSetCellContent):
+                        print(f"CELL[{cell_index}] Action: {action.action} - {action.source} - {action.timestamp}")
+                        cell_index = self.handle_set_next_cell(cell_index, action)
+        self.notebook.cells[cell_index].metadata["jupyter-agent-action-timestamp"] = output_action_timestamp
+
+    def on_cell_executed(self, cell_index, cell, execute_reply):
+        cell_id = cell.get("id")
+        cell_type = cell.get("cell_type")
+        cell_meta = cell.get("metadata", {})
+        cell_outputs = cell.get("outputs", [])
+        cell_payloads = execute_reply.get("content", {}).get("payload", [])
+        cell_output_metas = [
+            output["metadata"]
+            for output in cell_outputs
+            if output.get("output_type") == "display_data" and output.get("metadata")
+        ]
+        self.handle_cell_payloads(cell_index, cell_payloads)
+        self.handle_jupyter_agent_data(cell_index, cell_meta, cell_output_metas)
+        self.handle_evaluation_record(cell_index, cell_output_metas)
+        self.handle_jupyter_agent_actions(cell_index, cell_meta, cell_output_metas)
+        print(f"CELL[{cell_index}] Saving executed {cell_type} cell - {cell_id}")
+        nbformat.write(self.notebook, self.output_path)
+
+    def on_notebook_start(self, notebook):
+        print("Notebook execution started.")
+        self.start_time = time.time()
+        if not self.notebook.cells[0].source.startswith("# -*- Jupyter Agent Evaluation Notebook -*-"):
+            self.notebook.cells.insert(
+                0,
+                nbformat.v4.new_code_cell(
+                    source=(
+                        f"# -*- Jupyter Agent Evaluation Notebook -*-\n"
+                        f"# Executed notebook: {self.input_path}\n"
+                        f"# Output saved to: {self.output_path}\n\n"
+                        f"__evaluation_ipynb_file__ = '{self.output_path}'\n"
+                    ),
+                    metadata={"tags": ["CTX_EXCLUDE"]},
+                ),
+            )
+
+    def on_notebook_complete(self, notebook):
+        print("Notebook execution completed.")
+        # If the notebook did not finish globally, append an evaluation record
+        if not self.is_global_finished:
+            print("Notebook execution did not finish globally, appending evaluation records.")
+            self.save_evaluation_record(
+                NotebookEvaluationRecord(
+                    notebook_name=str(self.output_path),
+                    timestamp=time.time(),
+                    evaluator="bot",
+                    eval_type="NOTEBOOK",
+                    execution_duration=time.time() - self.start_time,
+                    is_success=False,
+                )
+            )
+        print(f"Saving executed notebook to: {self.output_path}")
+        nbformat.write(self.notebook, self.output_path)
+
+    def run(self):
+
+        self.client.execute()
 
 
 def main():
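For orientation, here is a minimal sketch of driving the new `NotebookRunner` directly, based only on the constructor and `run()` shown in the hunk above. The notebook path is a placeholder, and `kernel_name` is assumed to be forwarded to `NotebookClient` through `**kwargs`:

```python
from jupyter_agent.bot_evaluation import NotebookRunner

# Execute a notebook and collect evaluation records.
# If output_path/evaluate_path are omitted, the runner derives
# "analysis_eval.ipynb" and "analysis_eval.jsonl" next to the input.
runner = NotebookRunner(
    input_path="analysis.ipynb",  # placeholder notebook
    timeout=600,                  # per-cell timeout in seconds
    kernel_name="python3",        # forwarded to NotebookClient via **kwargs
)
runner.run()  # executes cells, writing the notebook and JSONL records as it goes
```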
@@ -159,14 +286,7 @@ def main():
         "-o", "--output_path", type=str, default="", help="Path to save the executed notebook (default: same as input)"
     )
     parser.add_argument(
-        "-
-    )
-    parser.add_argument(
-        "-e",
-        "--evaluation_path",
-        type=str,
-        default="",
-        help="Path to save evaluation records (default: no evaluation records saved)",
+        "-e", "--evaluate_path", type=str, default="", help="Path to save evaluate records (default: same as input)"
     )
     parser.add_argument(
         "--timeout", type=int, default=-1, help="Execution timeout in seconds (default: -1, no timeout)"
@@ -189,17 +309,16 @@ def main():
     parser.add_argument("input_path", type=str, help="Path to the input notebook file")
     args = parser.parse_args()
 
-
+    NotebookRunner(
         input_path=args.input_path,
         output_path=args.output_path,
-
+        evaluate_path=args.evaluate_path,
        timeout=args.timeout,
         startup_timeout=args.startup_timeout,
         allow_errors=args.allow_errors,
         kernel_name=args.kernel_name,
         skip_cells_with_tag=args.skip_cells_with_tag,
-
-    )
+    ).run()
 
 
 if __name__ == "__main__":
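The runner appends one JSON object per evaluation record to the `*_eval.jsonl` file via `record.model_dump_json()`. A small sketch of consuming that file, assuming the field names defined by the record models above and a placeholder file name:

```python
import json

# Summarize evaluation records produced by jupyter_agent.bot_evaluation.
with open("analysis_eval.jsonl") as f:  # placeholder path
    records = [json.loads(line) for line in f]

for rec in records:
    print(
        f"[{rec['eval_type']}] cell={rec['cell_index']} "
        f"success={rec['is_success']} correct={rec['correct_score']:.2f}"
    )

# Overall notebook outcome, present only if the run finished globally.
notebook_recs = [r for r in records if r["eval_type"] == "NOTEBOOK"]
if notebook_recs:
    print("total duration:", notebook_recs[-1]["execution_duration"])
```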
jupyter_agent/bot_evaluators/__init__.py
File without changes
jupyter_agent/bot_evaluators/base.py
ADDED
@@ -0,0 +1,42 @@
+"""
+Copyright (c) 2025 viewstar000
+
+This software is released under the MIT License.
+https://opensource.org/licenses/MIT
+"""
+
+import importlib
+
+from ..bot_outputs import _B
+from ..bot_agents.base import BaseChatAgent, AgentOutputFormat, AgentModelType, AgentFactory
+
+
+class BaseEvaluator(BaseChatAgent):
+    """
+    Base class for evaluators.
+    """
+
+    OUTPUT_FORMAT = AgentOutputFormat.JSON
+    MODEL_TYPE = AgentModelType.EVALUATING
+    DISPLAY_REPLY = False
+
+    def on_reply(self, reply):
+        _B(reply.model_dump_json(indent=2), title="Evaluator Reply", format="code", code_language="json")
+        return reply
+
+    def __call__(self, **kwargs):
+        # Ensure BaseChatAgent has a __call__ method, otherwise call a valid method
+        result = super().__call__(**kwargs) if hasattr(super(), "__call__") else None
+        if result is not None:
+            return result[-1]
+        raise NotImplementedError("BaseChatAgent does not implement __call__ method.")
+
+
+class EvaluatorFactory(AgentFactory):
+
+    def get_agent_class(self, agent_class):
+        if isinstance(agent_class, str):
+            bot_agents = importlib.import_module("..bot_evaluators", __package__)
+            agent_class = getattr(bot_agents, agent_class)
+        assert issubclass(agent_class, BaseEvaluator), "Unsupported agent class: {}".format(agent_class)
+        return agent_class
jupyter_agent/bot_evaluators/dummy_flow.py
ADDED
@@ -0,0 +1,20 @@
+"""
+Copyright (c) 2025 viewstar000
+
+This software is released under the MIT License.
+https://opensource.org/licenses/MIT
+"""
+
+import time
+
+from .base import BaseEvaluator
+from ..bot_evaluation import FlowEvaluationRecord
+
+
+class DummyFlowEvaluator(BaseEvaluator):
+
+    def __call__(self, **kwargs):
+        """
+        Dummy evaluator that does nothing and returns a dummy response.
+        """
+        return FlowEvaluationRecord(timestamp=time.time(), evaluator="dummy")
jupyter_agent/bot_evaluators/dummy_global.py
ADDED
@@ -0,0 +1,20 @@
+"""
+Copyright (c) 2025 viewstar000
+
+This software is released under the MIT License.
+https://opensource.org/licenses/MIT
+"""
+
+import time
+
+from .base import BaseEvaluator
+from ..bot_evaluation import NotebookEvaluationRecord
+
+
+class DummyGlobalEvaluator(BaseEvaluator):
+
+    def __call__(self, **kwargs):
+        """
+        Dummy evaluator that does nothing and returns a dummy response.
+        """
+        return NotebookEvaluationRecord(timestamp=time.time(), evaluator="dummy")
jupyter_agent/bot_evaluators/dummy_task.py
ADDED
@@ -0,0 +1,20 @@
+"""
+Copyright (c) 2025 viewstar000
+
+This software is released under the MIT License.
+https://opensource.org/licenses/MIT
+"""
+
+import time
+
+from .base import BaseEvaluator
+from ..bot_evaluation import StageEvaluationRecord
+
+
+class DummyTaskEvaluator(BaseEvaluator):
+
+    def __call__(self, **kwargs):
+        """
+        Dummy evaluator that does nothing and returns a dummy response.
+        """
+        return StageEvaluationRecord(timestamp=time.time(), evaluator="dummy")
jupyter_agent/bot_evaluators/flow_global_planning.py
ADDED
@@ -0,0 +1,88 @@
+"""
+Copyright (c) 2025 viewstar000
+
+This software is released under the MIT License.
+https://opensource.org/licenses/MIT
+"""
+
+import time
+
+from enum import Enum
+from typing import Optional, List
+from pydantic import BaseModel, Field
+from IPython.display import Markdown
+from .base import BaseEvaluator
+from ..bot_outputs import _D, _I, _W, _E, _F, _A, _O, _C, _M, _B
+from ..bot_evaluation import FlowEvaluationRecord
+
+
+FLOW_GLOBAL_PLANNING_EVAL_PROMPT = """\
+**角色定义**:
+
+你是一个任务规划评估专家,负责对任务规划的结果进行评估。
+
+**任务要求**:
+
+请你根据任务规划的结果,评估任务规划的质量和准确性,并给出相应的评分和反馈。
+
+{% include "TASK_OUTPUT_FORMAT" %}
+
+---
+
+**当前用户提交的任务目标**
+
+{{ task.source }}
+
+---
+
+**当前生成的全局任务规划**
+
+{{ task.result }}
+
+---
+
+请按要求给出当前任务规划的评估结果:
+"""
+
+
+class FlowGlobalPlanningEvalResult(BaseModel):
+    """
+    任务规划评估结果
+    """
+
+    is_correct: bool = Field(description="任务规划是否与用户目标一致", examples=[True, False])
+    quality_score: float = Field(
+        description="任务规划质量评分,任务规划是否符合用户目标要求,是否是完整、详细、准确的步骤说明,"
+        "是否存在逻辑错误、冗余、抽象不合理等情况,范围0-1,>=0.5表示符合要求,<0.5表示不符合要求",
+        examples=[0.8, 0.3],
+    )
+    feedback: Optional[str] = Field(default=None, description="评估反馈")
+
+
+class EvaluationResult(BaseModel):
+    """
+    任务规划评估结果
+    """
+
+    description: str = Field(description="评估任务描述", examples=["任务规划评估结果"])
+    properties: FlowGlobalPlanningEvalResult = Field(
+        description="评估任务具体结果",
+        examples=[FlowGlobalPlanningEvalResult(is_correct=True, quality_score=0.8, feedback="任务规划符合要求")],
+    )
+
+
+class FlowGlobalPlanningEvaluator(BaseEvaluator):
+    """
+    任务规划评估器
+    """
+
+    PROMPT = FLOW_GLOBAL_PLANNING_EVAL_PROMPT
+    OUTPUT_JSON_SCHEMA = EvaluationResult
+
+    def on_reply(self, reply):
+        reply = super().on_reply(reply)
+        return FlowEvaluationRecord(
+            timestamp=time.time(),
+            evaluator="flow_global_planning",
+            correct_score=reply.properties.quality_score,
+        )