jupyter-agent 2025.6.103__py3-none-any.whl → 2025.6.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. jupyter_agent/bot_actions.py +270 -0
  2. jupyter_agent/bot_agents/__init__.py +0 -42
  3. jupyter_agent/bot_agents/base.py +85 -45
  4. jupyter_agent/bot_agents/master_planner.py +2 -0
  5. jupyter_agent/bot_agents/output_task_result.py +6 -7
  6. jupyter_agent/bot_agents/request_user_supply.py +186 -0
  7. jupyter_agent/bot_agents/task_planner_v3.py +12 -13
  8. jupyter_agent/bot_agents/task_reasoner.py +2 -2
  9. jupyter_agent/bot_agents/task_structrue_reasoner.py +19 -12
  10. jupyter_agent/bot_agents/task_structrue_summarier.py +19 -18
  11. jupyter_agent/bot_agents/task_summarier.py +2 -2
  12. jupyter_agent/bot_agents/task_verifier.py +1 -1
  13. jupyter_agent/bot_agents/task_verify_summarier.py +5 -6
  14. jupyter_agent/bot_chat.py +2 -2
  15. jupyter_agent/bot_contexts.py +28 -23
  16. jupyter_agent/bot_evaluation.py +325 -0
  17. jupyter_agent/bot_evaluators/__init__.py +0 -0
  18. jupyter_agent/bot_evaluators/base.py +42 -0
  19. jupyter_agent/bot_evaluators/dummy_flow.py +20 -0
  20. jupyter_agent/bot_evaluators/dummy_global.py +20 -0
  21. jupyter_agent/bot_evaluators/dummy_task.py +20 -0
  22. jupyter_agent/bot_evaluators/flow_global_planning.py +88 -0
  23. jupyter_agent/bot_evaluators/flow_task_executor.py +152 -0
  24. jupyter_agent/bot_flows/__init__.py +0 -4
  25. jupyter_agent/bot_flows/base.py +114 -10
  26. jupyter_agent/bot_flows/master_planner.py +7 -2
  27. jupyter_agent/bot_flows/task_executor_v3.py +45 -20
  28. jupyter_agent/bot_magics.py +108 -53
  29. jupyter_agent/bot_outputs.py +56 -3
  30. jupyter_agent/utils.py +20 -31
  31. {jupyter_agent-2025.6.103.dist-info → jupyter_agent-2025.6.105.dist-info}/METADATA +39 -8
  32. jupyter_agent-2025.6.105.dist-info/RECORD +40 -0
  33. jupyter_agent-2025.6.105.dist-info/entry_points.txt +2 -0
  34. jupyter_agent/bot_agents/task_planner_v1.py +0 -158
  35. jupyter_agent/bot_agents/task_planner_v2.py +0 -172
  36. jupyter_agent/bot_flows/task_executor_v1.py +0 -86
  37. jupyter_agent/bot_flows/task_executor_v2.py +0 -84
  38. jupyter_agent-2025.6.103.dist-info/RECORD +0 -33
  39. {jupyter_agent-2025.6.103.dist-info → jupyter_agent-2025.6.105.dist-info}/WHEEL +0 -0
  40. {jupyter_agent-2025.6.103.dist-info → jupyter_agent-2025.6.105.dist-info}/licenses/LICENSE +0 -0
  41. {jupyter_agent-2025.6.103.dist-info → jupyter_agent-2025.6.105.dist-info}/top_level.txt +0 -0
jupyter_agent/bot_evaluation.py
@@ -0,0 +1,325 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import os
+ import time
+ import json
+ import argparse
+ import nbformat
+
+ from pathlib import Path
+ from typing import Optional
+ from enum import Enum
+ from pydantic import BaseModel, Field
+ from nbclient.client import NotebookClient
+ from .bot_actions import ActionBase, ActionSetCellContent, SetCellContentParams, get_action_class
+
+
+ class BaseEvaluationRecord(BaseModel):
+     timestamp: float = 0
+     notebook_name: str = ""
+     evaluator: str = ""
+     eval_type: str = "BASE"
+     cell_index: int = -1
+     execution_duration: float = 0.0
+     is_success: bool = False
+     correct_score: float = 0.0
+
+
+ class StageEvaluationRecord(BaseEvaluationRecord):
+     eval_type: str = "STAGE"
+     flow: str = ""
+     stage: str = ""
+     agent: str = ""
+
+
+ class FlowEvaluationRecord(BaseEvaluationRecord):
+     eval_type: str = "FLOW"
+     flow: str = ""
+     stage_count: int = 0
+     planning_score: float = 0.0
+     reasoning_score: float = 0.0
+     coding_score: float = 0.0
+     important_score: float = 0.0
+     user_supply_score: float = 0.0
+
+
+ class NotebookEvaluationRecord(BaseEvaluationRecord):
+     eval_type: str = "NOTEBOOK"
+     flow_count: int = 0
+     planning_score: float = 0.0
+     coding_score: float = 0.0
+     important_score: float = 0.0
+     user_supply_score: float = 0.0
+
+
+ class NotebookRunner:
+
+     def __init__(
+         self,
+         input_path: str | Path,
+         output_path: str | Path = "",
+         evaluate_path: str | Path = "",
+         timeout: int = -1,
+         startup_timeout: int = 60,
+         allow_errors: bool = False,
+         skip_cells_with_tag: str = "skip-execution",
+         **kwargs,
+     ):
+         self.input_path = Path(input_path).with_suffix(".ipynb")
+         self.output_path = output_path
+         self.evaluate_path = evaluate_path
+         self.start_time = 0
+         self.is_global_finished = False
+
+         if not self.output_path:
+             self.output_path = self.input_path.parent.joinpath(f"{self.input_path.with_suffix('').name}_eval.ipynb")
+         if not self.evaluate_path:
+             self.evaluate_path = self.input_path.parent.joinpath(f"{self.input_path.with_suffix('').name}_eval.jsonl")
+         self.output_path = Path(self.output_path).absolute()
+         self.evaluate_path = Path(self.evaluate_path).absolute()
+
+         with self.input_path.open() as f:
+             print("Opening notebook:", input_path)
+             self.notebook = nbformat.read(f, as_version=4)
+
+         self.client = NotebookClient(
+             self.notebook,
+             timeout=timeout,
+             startup_timeout=startup_timeout,
+             skip_cells_with_tag=skip_cells_with_tag,
+             allow_errors=allow_errors,
+             resources={"metadata": {"path": self.input_path.parent.absolute()}},
+             on_notebook_start=self.on_notebook_start,
+             on_notebook_complete=self.on_notebook_complete,
+             on_cell_executed=self.on_cell_executed,
+             **kwargs,
+         )
+
+     def save_evaluation_record(self, record: BaseEvaluationRecord):
+
+         if isinstance(record, FlowEvaluationRecord):
+             eval_source = record.flow + "-" + record.evaluator
+         elif isinstance(record, StageEvaluationRecord):
+             eval_source = record.flow + "-" + record.stage + "-" + record.agent + "-" + record.evaluator
+         else:
+             eval_source = record.evaluator
+         print(
+             f"CELL[{record.cell_index}] Evaluation: {record.eval_type}[{eval_source}] "
+             f"{'SUCCESS' if record.is_success else 'FAILURE'} "
+             f"duration: {record.execution_duration:.2f}s "
+             f"correct: {record.correct_score:.2f}"
+         )
+         if self.evaluate_path:
+             with open(self.evaluate_path, "a") as eval_file:
+                 eval_file.write(record.model_dump_json() + "\n")
+
+     def handle_cell_payloads(self, cell_index, cell_payloads):
+         for payload in cell_payloads:
+             if payload.get("source") == "set_next_input" and payload.get("replace") is True:
+                 print(f"CELL[{cell_index}] Replacing cell with set_next_input payload")
+                 self.notebook.cells[cell_index].source = payload.get("text", "")
+
+     def handle_jupyter_agent_data(self, cell_index, cell_meta, cell_output_metas):
+         cell_agent_data_timestamp = cell_meta.get("jupyter-agent-data-timestamp", 0)
+         output_agent_data_timestamp = cell_agent_data_timestamp
+         for output_meta in cell_output_metas:
+             if (
+                 output_meta.get("jupyter-agent-data-store")
+                 and output_meta.get("jupyter-agent-data-timestamp", 0) > cell_agent_data_timestamp
+                 and output_meta.get("jupyter-agent-data", {})
+             ):
+                 print(f"CELL[{cell_index}] Found jupyter-agent-data-store outputs, save it to cell metadata")
+                 output_agent_data_timestamp = max(
+                     output_agent_data_timestamp,
+                     output_meta.get("jupyter-agent-data-timestamp", 0),
+                 )
+                 self.notebook.cells[cell_index].metadata["jupyter-agent-data-store"] = True
+                 self.notebook.cells[cell_index].metadata["jupyter-agent-data-timestamp"] = output_agent_data_timestamp
+                 if "jupyter-agent-data" not in self.notebook.cells[cell_index].metadata:
+                     self.notebook.cells[cell_index].metadata["jupyter-agent-data"] = {}
+                 self.notebook.cells[cell_index].metadata["jupyter-agent-data"].update(
+                     output_meta["jupyter-agent-data"]
+                 )
+
+     def handle_evaluation_record(self, cell_index, cell_output_metas):
+         is_bot_cell = False
+         is_flow_completed = False
+         for output_meta in cell_output_metas:
+             for record in output_meta.get("jupyter-agent-evaluation-records", []):
+                 is_bot_cell = True
+                 if record["eval_type"] == "NOTEBOOK":
+                     record = NotebookEvaluationRecord(**record)
+                     record.timestamp = record.timestamp or time.time()
+                     record.notebook_name = str(self.output_path)
+                     record.execution_duration = time.time() - self.start_time
+                     self.is_global_finished = True
+                     is_flow_completed = True
+                     del self.notebook.cells[cell_index + 1 :]  # Remove all cells after the notebook cell
+                 elif record["eval_type"] == "FLOW":
+                     record = FlowEvaluationRecord(**record)
+                     record.timestamp = record.timestamp or time.time()
+                     record.notebook_name = str(self.output_path)
+                     is_flow_completed = True
+                 elif record["eval_type"] == "STAGE":
+                     record = StageEvaluationRecord(**record)
+                     record.timestamp = record.timestamp or time.time()
+                     record.notebook_name = str(self.output_path)
+                 else:
+                     record = BaseEvaluationRecord(**record)
+                     record.timestamp = record.timestamp or time.time()
+                     record.notebook_name = str(self.output_path)
+                 self.save_evaluation_record(record)
+         if is_bot_cell and not is_flow_completed:
+             self.save_evaluation_record(
+                 FlowEvaluationRecord(
+                     timestamp=time.time(),
+                     notebook_name=str(self.output_path),
+                     evaluator="bot",
+                     eval_type="FLOW",
+                     cell_index=cell_index,
+                     is_success=False,
+                 )
+             )
+
+     def handle_set_next_cell(self, cell_index, action):
+         if action.params.index == 0:
+             self.notebook.cells[cell_index].source = action.params.source
+             self.notebook.cells[cell_index].metadata.update(action.params.metadata)
+             self.notebook.cells[cell_index].metadata["tags"] = action.params.tags
+             print(f"CELL[{cell_index}] Replacing cell with set_next_cell action")
+             return cell_index
+         else:
+             metadata = dict(action.params.metadata)
+             metadata["tags"] = action.params.tags
+             if action.params.type == "code":
+                 new_cell = nbformat.v4.new_code_cell(source=action.params.source, metadata=metadata)
+             elif action.params.type == "markdown":
+                 new_cell = nbformat.v4.new_markdown_cell(source=action.params.source, metadata=metadata)
+             else:
+                 raise ValueError(f"Unsupported cell type: {action.params.type}")
+             insert_idx = cell_index if action.params.index == -1 else cell_index + action.params.index
+             ret_idx = cell_index + 1 if action.params.index == -1 else cell_index
+             self.notebook.cells.insert(insert_idx, new_cell)
+             print(f"CELL[{cell_index}] Inserting cell at [{insert_idx}] with set_next_cell action")
+             return ret_idx
+
+     def handle_jupyter_agent_actions(self, cell_index, cell_meta, cell_output_metas):
+         cell_action_timestamp = cell_meta.get("jupyter-agent-action-timestamp", 0)
+         output_action_timestamp = cell_action_timestamp
+         for output_meta in cell_output_metas:
+             for action in output_meta.get("jupyter-agent-action-records", []):
+                 action = get_action_class(action["action"])(**action)
+                 if action.timestamp > cell_action_timestamp:
+                     output_action_timestamp = max(action.timestamp, output_action_timestamp)
+                     if isinstance(action, ActionSetCellContent):
+                         print(f"CELL[{cell_index}] Action: {action.action} - {action.source} - {action.timestamp}")
+                         cell_index = self.handle_set_next_cell(cell_index, action)
+         self.notebook.cells[cell_index].metadata["jupyter-agent-action-timestamp"] = output_action_timestamp
+
+     def on_cell_executed(self, cell_index, cell, execute_reply):
+         cell_id = cell.get("id")
+         cell_type = cell.get("cell_type")
+         cell_meta = cell.get("metadata", {})
+         cell_outputs = cell.get("outputs", [])
+         cell_payloads = execute_reply.get("content", {}).get("payload", [])
+         cell_output_metas = [
+             output["metadata"]
+             for output in cell_outputs
+             if output.get("output_type") == "display_data" and output.get("metadata")
+         ]
+         self.handle_cell_payloads(cell_index, cell_payloads)
+         self.handle_jupyter_agent_data(cell_index, cell_meta, cell_output_metas)
+         self.handle_evaluation_record(cell_index, cell_output_metas)
+         self.handle_jupyter_agent_actions(cell_index, cell_meta, cell_output_metas)
+         print(f"CELL[{cell_index}] Saving executed {cell_type} cell - {cell_id}")
+         nbformat.write(self.notebook, self.output_path)
+
+     def on_notebook_start(self, notebook):
+         print("Notebook execution started.")
+         self.start_time = time.time()
+         if not self.notebook.cells[0].source.startswith("# -*- Jupyter Agent Evaluation Notebook -*-"):
+             self.notebook.cells.insert(
+                 0,
+                 nbformat.v4.new_code_cell(
+                     source=(
+                         f"# -*- Jupyter Agent Evaluation Notebook -*-\n"
+                         f"# Executed notebook: {self.input_path}\n"
+                         f"# Output saved to: {self.output_path}\n\n"
+                         f"__evaluation_ipynb_file__ = '{self.output_path}'\n"
+                     ),
+                     metadata={"tags": ["CTX_EXCLUDE"]},
+                 ),
+             )
+
+     def on_notebook_complete(self, notebook):
+         print("Notebook execution completed.")
+         # If the notebook did not finish globally, append an evaluation record
+         if not self.is_global_finished:
+             print("Notebook execution did not finish globally, appending evaluation records.")
+             self.save_evaluation_record(
+                 NotebookEvaluationRecord(
+                     notebook_name=str(self.output_path),
+                     timestamp=time.time(),
+                     evaluator="bot",
+                     eval_type="NOTEBOOK",
+                     execution_duration=time.time() - self.start_time,
+                     is_success=False,
+                 )
+             )
+         print(f"Saving executed notebook to: {self.output_path}")
+         nbformat.write(self.notebook, self.output_path)
+
+     def run(self):
+
+         self.client.execute()
+
+
+ def main():
+     """Main function to run the notebook execution."""
+     parser = argparse.ArgumentParser(description="Run a Jupyter notebook.")
+     parser.add_argument(
+         "-o", "--output_path", type=str, default="", help="Path to save the executed notebook (default: same as input)"
+     )
+     parser.add_argument(
+         "-e", "--evaluate_path", type=str, default="", help="Path to save evaluate records (default: same as input)"
+     )
+     parser.add_argument(
+         "--timeout", type=int, default=-1, help="Execution timeout in seconds (default: -1, no timeout)"
+     )
+     parser.add_argument(
+         "--startup_timeout", type=int, default=60, help="Kernel startup timeout in seconds (default: 60)"
+     )
+     parser.add_argument(
+         "--allow_errors", action="store_true", help="Allow errors in the notebook execution (default: False)"
+     )
+     parser.add_argument(
+         "--kernel_name", type=str, default="", help="Kernel name to use for execution (default: use notebook's kernel)"
+     )
+     parser.add_argument(
+         "--skip_cells_with_tag",
+         type=str,
+         default="skip-execution",
+         help="Tag to skip cells with (default: 'skip-execution')",
+     )
+     parser.add_argument("input_path", type=str, help="Path to the input notebook file")
+     args = parser.parse_args()
+
+     NotebookRunner(
+         input_path=args.input_path,
+         output_path=args.output_path,
+         evaluate_path=args.evaluate_path,
+         timeout=args.timeout,
+         startup_timeout=args.startup_timeout,
+         allow_errors=args.allow_errors,
+         kernel_name=args.kernel_name,
+         skip_cells_with_tag=args.skip_cells_with_tag,
+     ).run()
+
+
+ if __name__ == "__main__":
+     main()
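The new bot_evaluation.py module can be driven either programmatically through NotebookRunner or from the command line via main(). Below is a minimal usage sketch, not taken from the diff: the notebook file names are hypothetical, and the console-script name added to entry_points.txt (+2 lines) is not shown here, so the shell example falls back to running the module directly.

```python
# Hedged usage sketch for the new evaluation harness (file names are made up).
from jupyter_agent.bot_evaluation import NotebookRunner

runner = NotebookRunner(
    input_path="analysis.ipynb",          # executed copy defaults to analysis_eval.ipynb
    evaluate_path="analysis_eval.jsonl",   # JSONL stream of evaluation records
    timeout=-1,                            # no per-cell timeout
)
runner.run()

# Equivalent shell invocation, since the module guards main() with __main__:
#   python -m jupyter_agent.bot_evaluation -o analysis_eval.ipynb analysis.ipynb
```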
jupyter_agent/bot_evaluators/__init__.py (empty file, no content shown)
jupyter_agent/bot_evaluators/base.py
@@ -0,0 +1,42 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import importlib
+
+ from ..bot_outputs import _B
+ from ..bot_agents.base import BaseChatAgent, AgentOutputFormat, AgentModelType, AgentFactory
+
+
+ class BaseEvaluator(BaseChatAgent):
+     """
+     Base class for evaluators.
+     """
+
+     OUTPUT_FORMAT = AgentOutputFormat.JSON
+     MODEL_TYPE = AgentModelType.REASONING
+     DISPLAY_REPLY = False
+
+     def on_reply(self, reply):
+         _B(reply.model_dump_json(indent=2), title="Evaluator Reply", format="code", code_language="json")
+         return reply
+
+     def __call__(self, **kwargs):
+         # Ensure BaseChatAgent has a __call__ method, otherwise call a valid method
+         result = super().__call__(**kwargs) if hasattr(super(), "__call__") else None
+         if result is not None:
+             return result[-1]
+         raise NotImplementedError("BaseChatAgent does not implement __call__ method.")
+
+
+ class EvaluatorFactory(AgentFactory):
+
+     def get_agent_class(self, agent_class):
+         if isinstance(agent_class, str):
+             bot_agents = importlib.import_module("..bot_evaluators", __package__)
+             agent_class = getattr(bot_agents, agent_class)
+         assert issubclass(agent_class, BaseEvaluator), "Unsupported agent class: {}".format(agent_class)
+         return agent_class
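BaseEvaluator fixes the output format to JSON, targets the reasoning model type, and suppresses reply display; concrete evaluators then only need a PROMPT and an OUTPUT_JSON_SCHEMA, as the evaluators added later in this release do. A hedged sketch of a hypothetical subclass following that pattern (not part of the package):

```python
# Hypothetical evaluator built on the new BaseEvaluator; the PROMPT text and
# result schema below are illustrative, not taken from the diff.
from pydantic import BaseModel, Field
from jupyter_agent.bot_evaluators.base import BaseEvaluator

class MyEvalResult(BaseModel):
    is_correct: bool = Field(description="whether the output matches the task goal")
    correct_score: float = Field(description="score in the range 0-1")

class MyEvaluator(BaseEvaluator):
    PROMPT = "Score the task output against the task goal."  # hypothetical prompt
    OUTPUT_JSON_SCHEMA = MyEvalResult
```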
jupyter_agent/bot_evaluators/dummy_flow.py
@@ -0,0 +1,20 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import time
+
+ from .base import BaseEvaluator
+ from ..bot_evaluation import FlowEvaluationRecord
+
+
+ class DummyFlowEvaluator(BaseEvaluator):
+
+     def __call__(self, **kwargs):
+         """
+         Dummy evaluator that does nothing and returns a dummy response.
+         """
+         return FlowEvaluationRecord(timestamp=time.time(), evaluator="dummy")
jupyter_agent/bot_evaluators/dummy_global.py
@@ -0,0 +1,20 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import time
+
+ from .base import BaseEvaluator
+ from ..bot_evaluation import NotebookEvaluationRecord
+
+
+ class DummyGlobalEvaluator(BaseEvaluator):
+
+     def __call__(self, **kwargs):
+         """
+         Dummy evaluator that does nothing and returns a dummy response.
+         """
+         return NotebookEvaluationRecord(timestamp=time.time(), evaluator="dummy")
jupyter_agent/bot_evaluators/dummy_task.py
@@ -0,0 +1,20 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import time
+
+ from .base import BaseEvaluator
+ from ..bot_evaluation import StageEvaluationRecord
+
+
+ class DummyTaskEvaluator(BaseEvaluator):
+
+     def __call__(self, **kwargs):
+         """
+         Dummy evaluator that does nothing and returns a dummy response.
+         """
+         return StageEvaluationRecord(timestamp=time.time(), evaluator="dummy")
jupyter_agent/bot_evaluators/flow_global_planning.py
@@ -0,0 +1,88 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import time
+
+ from enum import Enum
+ from typing import Optional, List
+ from pydantic import BaseModel, Field
+ from IPython.display import Markdown
+ from .base import BaseEvaluator
+ from ..bot_outputs import _D, _I, _W, _E, _F, _A, _O, _C, _M, _B
+ from ..bot_evaluation import FlowEvaluationRecord
+
+
+ FLOW_GLOBAL_PLANNING_EVAL_PROMPT = """\
+ **角色定义**:
+
+ 你是一个任务规划评估专家,负责对任务规划的结果进行评估。
+
+ **任务要求**:
+
+ 请你根据任务规划的结果,评估任务规划的质量和准确性,并给出相应的评分和反馈。
+
+ {% include "TASK_OUTPUT_FORMAT" %}
+
+ ---
+
+ **当前用户提交的任务目标**
+
+ {{ task.source }}
+
+ ---
+
+ **当前生成的全局任务规划**
+
+ {{ task.result }}
+
+ ---
+
+ 请按要求给出当前任务规划的评估结果:
+ """
+
+
+ class FlowGlobalPlanningEvalResult(BaseModel):
+     """
+     任务规划评估结果
+     """
+
+     is_correct: bool = Field(description="任务规划是否与用户目标一致", examples=[True, False])
+     quality_score: float = Field(
+         description="任务规划质量评分,任务规划是否符合用户目标要求,是否是完整、详细、准确的步骤说明,"
+         "是否存在逻辑错误、冗余、抽象不合理等情况,范围0-1,>=0.5表示符合要求,<0.5表示不符合要求",
+         examples=[0.8, 0.3],
+     )
+     feedback: Optional[str] = Field(default=None, description="评估反馈")
+
+
+ class EvaluationResult(BaseModel):
+     """
+     任务规划评估结果
+     """
+
+     description: str = Field(description="评估任务描述", examples=["任务规划评估结果"])
+     properties: FlowGlobalPlanningEvalResult = Field(
+         description="评估任务具体结果",
+         examples=[FlowGlobalPlanningEvalResult(is_correct=True, quality_score=0.8, feedback="任务规划符合要求")],
+     )
+
+
+ class FlowGlobalPlanningEvaluator(BaseEvaluator):
+     """
+     任务规划评估器
+     """
+
+     PROMPT = FLOW_GLOBAL_PLANNING_EVAL_PROMPT
+     OUTPUT_JSON_SCHEMA = EvaluationResult
+
+     def on_reply(self, reply):
+         reply = super().on_reply(reply)
+         return FlowEvaluationRecord(
+             timestamp=time.time(),
+             evaluator="flow_global_planning",
+             correct_score=reply.properties.quality_score,
+         )
jupyter_agent/bot_evaluators/flow_task_executor.py
@@ -0,0 +1,152 @@
+ """
+ Copyright (c) 2025 viewstar000
+
+ This software is released under the MIT License.
+ https://opensource.org/licenses/MIT
+ """
+
+ import time
+
+ from enum import Enum
+ from typing import Optional, List
+ from pydantic import BaseModel, Field
+ from IPython.display import Markdown
+ from .base import BaseEvaluator
+ from ..bot_outputs import _D, _I, _W, _E, _F, _A, _O, _C, _M, _B
+ from ..bot_evaluation import FlowEvaluationRecord
+
+
+ FLOW_TASK_EXEC_EVAL_PROMPT = """\
+ **角色定义**:
+
+ 你是一个任务规划评估专家,负责对任务规划的结果进行评估。
+
+ **任务要求**:
+
+ 请你根据任务规划的结果,评估任务规划的质量和准确性,并给出相应的评分和反馈。
+
+ {% include "TASK_OUTPUT_FORMAT" %}
+
+ ---
+
+ {% include "TASK_CONTEXTS" %}
+
+ ---
+
+ {% include "CODE_CONTEXTS" %}
+
+ ---
+
+ **当前子任务规划信息**:
+
+ ### 当前子任务规划目标:
+
+ {{ task.subject }}
+
+ {% if task.coding_prompt %}
+ ### 当前子任务代码需求:
+
+ {{ task.coding_prompt }}
+
+ ### 当前子任务生成的代码:
+
+ ```python
+ {{ task.source }}
+ ```
+
+ ### 当前代码执行的输出与结果:
+
+ {{ task.output }}
+ {% endif %}
+
+ ### 当前子任务总结要求:
+
+ {{ task.summary_prompt }}
+
+
+ ### 当前子任务输出的分析总结后的最终结果:
+
+ ```markdown
+ {{ task.result }}
+ ```
+
+ {% if task.important_infos %}
+ ### 当前子任务输出的重要信息:
+
+ ```json
+ {{ task.important_infos | json }}
+ ```
+ {% endif %}
+
+ {% if task.request_below_supply_infos %}
+ ### 当前子任务输出的请求用户补充确认的信息:
+
+ ```json
+ {{ task.request_below_supply_infos | json }}
+ ```
+ {% endif %}
+
+ ---
+
+ 请按要求给出当前子任务规划的评估结果:
+ """
+
+
+ class FlowTaskExecEvalResult(BaseModel):
+     """
+     任务规划评估结果
+     """
+
+     is_correct: bool = Field(description="最终结果是否符合当前子任务的目标", examples=[True, False])
+     correct_score: float = Field(
+         description="最终结果符合当前子任务目标的分数,范围0-1,>=0.5表示符合目标,<0.5表示不符合目标",
+         examples=[0.95, 0.3],
+     )
+     planning_score: float = Field(
+         description="当前子任务的目标规划、代码生成、总结是否符合全局目标规划要求,范围0-1,>=0.5表示符合要求,<0.5表示不符合要求",
+         examples=[0.85, 0.25],
+     )
+     reasoning_score: float = Field(
+         description="当前子任务的推理过程是否合理,是否存在逻辑错误,是否存在与前置子任务相冲突的情况,"
+         "范围0-1,>=0.5表示合理、正确、无冲突,<0.5表示不合理",
+         examples=[0.9, 0.4],
+     )
+     coding_score: float = Field(
+         description="代码生成的质量评分,代码逻辑是否符合规划要求,是否存在逻辑错误,是否存在冗余、抽象不合理等情况,"
+         "范围0-1,>=0.5表示代码质量较高,<0.5表示代码质量较低",
+         examples=[0.75, 0.2],
+     )
+     important_info_score: float = Field(
+         description="重要信息分数,当前子任务的规划、代码生成、总结是否充分考虑了前置任务生成的重要信息,"
+         "以及当前子任务的重要信息是否完整、准确、无误导、无冲突,"
+         "范围0-1,>=0.5表示重要信息完整、准确,<0.5表示重要信息不完整或不准确",
+         examples=[0.9, 0.4],
+     )
+     user_supply_info_score: float = Field(
+         description="用户补充信息分数,当前子任务的规划、代码生成、总结是否充分考虑了用户补充的信息,"
+         "范围0-1,>=0.5表示充分考虑,<0.5表示未充分考虑",
+         examples=[0.8, 0.3],
+     )
+     feedback: Optional[str] = Field(default=None, description="评估反馈")
+
+
+ class FlowTaskExecEvaluator(BaseEvaluator):
+     """
+     任务规划评估器
+     """
+
+     PROMPT = FLOW_TASK_EXEC_EVAL_PROMPT
+     OUTPUT_JSON_SCHEMA = FlowTaskExecEvalResult
+
+     def on_reply(self, reply):
+         reply = super().on_reply(reply)
+         return FlowEvaluationRecord(
+             timestamp=time.time(),
+             evaluator="flow_task_executor",
+             correct_score=reply.correct_score,
+             planning_score=reply.planning_score,
+             reasoning_score=reply.reasoning_score,
+             coding_score=reply.coding_score,
+             important_score=reply.important_info_score,
+             user_supply_score=reply.user_supply_info_score,
+         )
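FlowTaskExecEvaluator maps the model's per-dimension scores into a FlowEvaluationRecord, which is the same pydantic model that NotebookRunner.save_evaluation_record() serializes into the `_eval.jsonl` stream. A hedged illustration of that hand-off, with made-up score values:

```python
# Illustrative only: what on_reply() returns and the JSONL line the runner appends.
import time
from jupyter_agent.bot_evaluation import FlowEvaluationRecord

record = FlowEvaluationRecord(
    timestamp=time.time(),
    evaluator="flow_task_executor",
    correct_score=0.95,
    planning_score=0.85,
    reasoning_score=0.9,
    coding_score=0.75,
    important_score=0.9,
    user_supply_score=0.8,
)
print(record.model_dump_json())  # one line per record in <notebook>_eval.jsonl
```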
jupyter_agent/bot_flows/__init__.py
@@ -7,14 +7,10 @@ https://opensource.org/licenses/MIT
 
  from .base import BaseTaskFlow
  from .master_planner import MasterPlannerFlow
- from .task_executor_v1 import TaskExecutorFlowV1
- from .task_executor_v2 import TaskExecutorFlowV2
  from .task_executor_v3 import TaskExecutorFlowV3
 
  __all__ = [
      "BaseTaskFlow",
      "MasterPlannerFlow",
-     "TaskExecutorFlowV1",
-     "TaskExecutorFlowV2",
      "TaskExecutorFlowV3",
  ]
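Since task_executor_v1.py and task_executor_v2.py are deleted in this release and their names are dropped from `__all__`, only the v3 executor flow remains importable. A short migration sketch:

```python
# Against 2025.6.105 only the v3 flow is exported:
from jupyter_agent.bot_flows import TaskExecutorFlowV3  # still available

# These imports now raise ImportError:
# from jupyter_agent.bot_flows import TaskExecutorFlowV1
# from jupyter_agent.bot_flows import TaskExecutorFlowV2
```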