MenuPilot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- menupilot/__init__.py +3 -0
- menupilot/__main__.py +4 -0
- menupilot/agent/__init__.py +0 -0
- menupilot/agent/agent_loop.py +414 -0
- menupilot/agent/matching_engine.py +974 -0
- menupilot/agent/option_expander.py +490 -0
- menupilot/agent/orchestration.py +570 -0
- menupilot/agent/rule_engine.py +509 -0
- menupilot/agent/sandbox.py +216 -0
- menupilot/agent/schema_analyzer.py +1026 -0
- menupilot/agent/template_preprocessor.py +293 -0
- menupilot/agent/token_classifier.py +816 -0
- menupilot/agent/tools.py +365 -0
- menupilot/agent/workflow.py +1072 -0
- menupilot/cli/human_review.py +191 -0
- menupilot/cli/repl.py +821 -0
- menupilot/config.py +113 -0
- menupilot/data/__init__.py +0 -0
- menupilot/data/canonical_schema.py +135 -0
- menupilot/data/mapping_rules.yaml +387 -0
- menupilot/data/memory.py +674 -0
- menupilot/data/token_dict.py +275 -0
- menupilot/excel_io/__init__.py +0 -0
- menupilot/excel_io/excel_reader.py +552 -0
- menupilot/excel_io/excel_writer.py +413 -0
- menupilot/main.py +322 -0
- menupilot/wizard.py +86 -0
- menupilot-0.1.0.dist-info/METADATA +397 -0
- menupilot-0.1.0.dist-info/RECORD +33 -0
- menupilot-0.1.0.dist-info/WHEEL +5 -0
- menupilot-0.1.0.dist-info/entry_points.txt +2 -0
- menupilot-0.1.0.dist-info/licenses/LICENSE +21 -0
- menupilot-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1072 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LangGraph 工作流定义 — 编排完整 POS 模板映射管线。
|
|
3
|
+
|
|
4
|
+
节点顺序:
|
|
5
|
+
load_data → preprocess → analyze_schema → classify_tokens → normalize → validate → match → write_output
|
|
6
|
+
|
|
7
|
+
每个节点读取上一个节点的输出,写入本节点的结果。任一步骤失败可捕获并进入错误处理。
|
|
8
|
+
兼容无 langgraph 安装环境,提供 run_pipeline() 作为纯顺序回退方案。
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import Any, Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from typing import NotRequired # Python >= 3.11
|
|
15
|
+
except ImportError:
|
|
16
|
+
from typing_extensions import NotRequired # Python < 3.11
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from menupilot.agent.matching_engine import generate_console_summary, generate_report as me_generate_report
|
|
21
|
+
from menupilot.agent.matching_engine import match
|
|
22
|
+
from menupilot.agent.rule_engine import (
|
|
23
|
+
check_row_completeness,
|
|
24
|
+
master_to_canonical,
|
|
25
|
+
template_to_canonical,
|
|
26
|
+
validate_tokens,
|
|
27
|
+
)
|
|
28
|
+
from menupilot.agent.schema_analyzer import analyze_from_dataframe
|
|
29
|
+
from menupilot.agent.token_classifier import classify_from_dataframe, reset_cache as tc_reset_cache
|
|
30
|
+
from menupilot.excel_io.excel_reader import read_master, read_template
|
|
31
|
+
from menupilot.excel_io.excel_writer import write_result
|
|
32
|
+
|
|
33
|
+
# ── 工作流状态键 ────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from typing import TypedDict # Python >= 3.8
|
|
37
|
+
except ImportError:
|
|
38
|
+
from typing_extensions import TypedDict
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class PipelineState(TypedDict, total=False):
    """Shared state dictionary handed from one pipeline step to the next.

    ``total=False`` makes every key optional, so a step only needs to write
    the keys it changes.

    Keys that ``make_pipeline_state`` initialises to ``None`` are annotated
    ``Optional`` so the declared types match the values actually stored.
    """

    # -- Input parameters --
    master_path: str
    template_path: str
    output_path: str
    report_path: str
    target_col: str
    master_sheet: int
    template_sheet: int

    # -- Intermediate data (None until the producing step has run) --
    master_df: Optional["pd.DataFrame"]
    template_df: Optional["pd.DataFrame"]
    template_type: str
    chowbus_rows: Optional[List[Dict[str, Any]]]
    schema_result: Optional[Dict[str, Any]]
    token_results: Optional[List[Dict[str, Any]]]
    master_canonical: Optional[List[Dict[str, Any]]]
    template_canonical: Optional[List[Dict[str, Any]]]
    validated_tokens: Optional[List[Dict[str, Any]]]
    match_results: Optional[List[Dict[str, Any]]]
    report: str
    console_summary: str

    # -- Human review (user intervention) --
    low_conf_rows: List[Dict[str, Any]]
    human_review_result: Dict[str, Any]

    # -- Error information (None means "no error so far") --
    error: Optional[str]
    error_step: Optional[str]

    # -- Statistics --
    api_call_count: int
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def make_pipeline_state(
    master_path: str = "",
    template_path: str = "",
    output_path: str = "",
    report_path: str = "",
    target_col: str = "配料",
    master_sheet: int = 0,
    template_sheet: int = 0,
) -> PipelineState:
    """Create a PipelineState populated with default values.

    Args:
        master_path: Path of the master-data Excel file.
        template_path: Path of the POS template Excel file.
        output_path: Path of the output Excel file.
        report_path: Path of the validation report.  When empty (or None) it
            is derived from ``output_path`` by replacing the file extension
            with ``_report.txt``; it stays empty if ``output_path`` is empty
            too.
        target_col: Name of the target column.
        master_sheet: Sheet index of the master workbook.
        template_sheet: Sheet index of the template workbook.

    Returns:
        The initialised PipelineState.
    """
    import os

    # An explicit report_path always wins.  Deriving via splitext (instead of
    # ``str.replace(".xlsx", ...)``) guarantees the derived report path never
    # equals the output path, even for outputs that do not end in ".xlsx".
    if not report_path and output_path:
        stem, _ext = os.path.splitext(output_path)
        report_path = f"{stem}_report.txt"

    return PipelineState(
        master_path=master_path,
        template_path=template_path,
        output_path=output_path,
        report_path=report_path or "",
        target_col=target_col,
        master_sheet=master_sheet,
        template_sheet=template_sheet,
        # Intermediate data
        master_df=None,
        template_df=None,
        template_type="standard",
        chowbus_rows=None,
        schema_result=None,
        token_results=None,
        master_canonical=None,
        template_canonical=None,
        validated_tokens=None,
        match_results=None,
        report="",
        console_summary="",
        # Error information
        error=None,
        error_step=None,
        # Statistics
        api_call_count=0,
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ── 管线节点 ────────────────────────────────────────────────────
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def step_load_data(state: PipelineState) -> PipelineState:
    """Step 1: read the master and template workbooks and detect the template type.

    Does nothing when an earlier step already recorded an error.  Any
    exception raised while loading is stored in ``state["error"]`` with
    ``state["error_step"] = "load_data"`` instead of propagating.
    """
    if state.get("error") is not None:
        return state

    try:
        from menupilot.agent.template_preprocessor import detect_template_type, collect_chowbus_rows

        state["master_df"] = read_master(state["master_path"], sheet_name=state["master_sheet"])

        # The template is first read raw (header=None) so its type can be detected.
        from menupilot.excel_io.excel_reader import read_template_raw

        raw_template = read_template_raw(
            state["template_path"], sheet_name=state["template_sheet"]
        )
        state["template_type"] = detect_template_type(raw_template)

        if state["template_type"] != "chowbus":
            # standard template: read it the regular way
            state["template_df"] = read_template(
                state["template_path"], sheet_name=state["template_sheet"]
            )
        else:
            # chowbus template: collect the scattered fields; the Schema
            # Analyzer is skipped and no standard template_df is used.
            state["chowbus_rows"] = collect_chowbus_rows(raw_template)
            state["template_df"] = None
            # The default target column is switched to the fixed chowbus one.
            if state["target_col"] == "配料":
                state["target_col"] = "sop_code"
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "load_data"

    return state
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def step_preprocess(state: PipelineState) -> PipelineState:
    """Step 1.5: chowbus pre-processing (token classification -> canonical rows).

    For every row in ``state["chowbus_rows"]`` the ``composite_info`` string is
    classified into tokens, the tokens are mapped onto canonical fields and
    the result is stored in ``state["template_canonical"]``.  The master data
    is normalised into ``state["master_canonical"]`` as well.

    The step is a no-op for non-chowbus templates and when an earlier step
    already failed.  Exceptions are recorded in ``state["error"]`` with
    ``state["error_step"] = "preprocess"``.
    """
    if state.get("error") is not None or state["template_type"] != "chowbus":
        return state

    try:
        from menupilot.agent.token_classifier import (
            classify_single,
            set_prompt_hook as tc_set_prompt_hook,
        )
        from menupilot.data.canonical_schema import CANONICAL_FIELDS

        rows = state["chowbus_rows"]

        # Install a silent hook so UNKNOWN tokens do not trigger a prompt.
        tc_set_prompt_hook(lambda word, ctx, llm: {"action": "skip"})
        try:
            # Classify the composite_info of every row.
            for row in rows:
                composite_str = row.get("composite_info", "")
                result = classify_single(composite_str) if composite_str else {}
                row["_tokens"] = result.get("tokens", [])
                row["_missing"] = result.get("missing", [])
        finally:
            # Always remove the silent hook, even when classification fails,
            # so later classifier calls are not silenced by accident.
            tc_set_prompt_hook(None)

        # Token type -> canonical field (built once, not once per token).
        type_map = {
            "茶底": "tea_base",
            "奶底": "milk_base",
            "糖度": "sugar",
            "温度": "temperature",
            "规格": "size",
        }

        canonical_rows = []
        for row in rows:
            cr = {f: None for f in CANONICAL_FIELDS}
            cr["product_name"] = str(row.get("product_name", "") or "").strip()
            for token in row.get("_tokens", []):
                cfield = type_map.get(token.get("type", ""))
                if cfield and cfield in cr:
                    cr[cfield] = token.get("value", "")
            # Values the token dictionary did not recognise stay None; the
            # original author notes the matching engine handles such
            # partially filled rows as wildcards.
            canonical_rows.append(cr)

        state["template_canonical"] = canonical_rows

        # Master-data normalisation reuses the module-level rule-engine import.
        state["master_canonical"] = master_to_canonical(state["master_df"])

    except Exception as e:
        state["error"] = str(e)
        state["error_step"] = "preprocess"
    return state
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def step_analyze_schema(state: PipelineState) -> PipelineState:
    """Step 2: run the Schema Analyzer on the template DataFrame.

    Skipped when an error is already recorded or the template is of type
    "chowbus".  Failures are captured in ``state["error"]`` with
    ``state["error_step"] = "analyze_schema"``.
    """
    already_failed = state.get("error") is not None
    if already_failed or state["template_type"] == "chowbus":
        return state

    try:
        analysis = analyze_from_dataframe(state["template_df"])
        state["schema_result"] = analysis
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "analyze_schema"
    return state
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def step_classify_tokens(state: PipelineState) -> PipelineState:
    """Step 3: classify the tokens of the template's composite column.

    When the schema result names a composite column that exists in the
    template DataFrame, that column is classified.  Otherwise every template
    row receives an empty token result.  Skipped on a prior error and for
    chowbus templates; failures are stored with
    ``state["error_step"] = "classify_tokens"``.
    """
    if state.get("error") is not None:
        return state
    if state["template_type"] == "chowbus":
        return state

    try:
        composite_col = state["schema_result"].get("composite_col")
        has_composite = bool(composite_col) and composite_col in state["template_df"].columns
        if not has_composite:
            # No composite column: one empty token result per template row.
            row_count = len(state["template_df"])
            state["token_results"] = [
                {"tokens": [], "missing": []} for _ in range(row_count)
            ]
        else:
            state["token_results"] = classify_from_dataframe(
                state["template_df"], composite_col
            )
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "classify_tokens"
    return state
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def step_normalize(state: PipelineState) -> PipelineState:
    """Step 4: normalise master data and template rows to the canonical schema.

    Skipped on a prior error and for chowbus templates (their canonical rows
    are produced by the preprocess step).  Failures are stored with
    ``state["error_step"] = "normalize"``.
    """
    if state.get("error") is not None:
        return state
    if state["template_type"] == "chowbus":
        # chowbus rows were already normalised during preprocess
        return state

    try:
        schema = state["schema_result"]
        field_mapping = schema.get("field_mapping", {})
        composite_col = schema.get("composite_col", "")

        state["master_canonical"] = master_to_canonical(state["master_df"])
        state["template_canonical"] = template_to_canonical(
            state["template_df"],
            field_mapping,
            composite_col,
            state["token_results"],
        )
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "normalize"
    return state
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def step_validate(state: PipelineState) -> PipelineState:
    """Step 5: validate tokens and flag canonical rows with missing dimensions.

    Stores the validated tokens in ``state["validated_tokens"]`` and attaches
    a ``"_completeness_issues"`` entry to every canonical template row for
    which ``check_row_completeness`` reports something.  Skipped on a prior
    error and for chowbus templates; failures are stored with
    ``state["error_step"] = "validate"``.
    """
    if state.get("error") is not None:
        return state
    if state["template_type"] == "chowbus":
        # chowbus templates take no part in this validation step
        return state

    try:
        state["validated_tokens"] = validate_tokens(state["token_results"])

        # Mark incomplete rows directly on the canonical row dicts.
        for canonical_row in state["template_canonical"]:
            issues = check_row_completeness(canonical_row)
            if issues:
                canonical_row["_completeness_issues"] = issues
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "validate"
    return state
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def step_match(state: PipelineState) -> PipelineState:
    """Step 6: match canonical template rows against canonical master rows.

    Before matching, the product names are checked for unintended rewrites.
    After matching, the rows whose confidence is "LOW_CONFIDENCE" are copied
    into ``state["low_conf_rows"]`` for a possible human review.  Failures are
    stored with ``state["error_step"] = "match"``.
    """
    if state.get("error") is not None:
        return state

    try:
        # Guard: product names must not have been rewritten by earlier stages.
        _assert_product_name_integrity(state)

        results = match(state["template_canonical"], state["master_canonical"])
        state["match_results"] = results

        # Collect the low-confidence rows for human_review.
        state["low_conf_rows"] = [
            row for row in state["match_results"]
            if row.get("confidence") == "LOW_CONFIDENCE"
        ]
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "match"
    return state
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _assert_product_name_integrity(state: PipelineState) -> None:
|
|
335
|
+
"""验证模板商品名称从原始读取到匹配前保持一致。
|
|
336
|
+
|
|
337
|
+
对比 state["template_df"](原始 Excel 读取值)与
|
|
338
|
+
state["template_canonical"](经 Schema Analyzer → Token Classifier → Rule Engine 处理后)
|
|
339
|
+
中的 product_name 字段。任何不一致都立即报错,防止 LLM 静默改写数据。
|
|
340
|
+
"""
|
|
341
|
+
# chowbus 类型跳过(template_df 为 None,产品名来自预处理层)
|
|
342
|
+
if state["template_type"] == "chowbus" or state["template_df"] is None:
|
|
343
|
+
return
|
|
344
|
+
|
|
345
|
+
# 找到模板中映射为 product_name 的列
|
|
346
|
+
fm = state["schema_result"].get("field_mapping", {})
|
|
347
|
+
src_col = None
|
|
348
|
+
for tcol, cfield in fm.items():
|
|
349
|
+
if cfield == "product_name":
|
|
350
|
+
src_col = tcol
|
|
351
|
+
break
|
|
352
|
+
|
|
353
|
+
if src_col is None or src_col not in state["template_df"].columns:
|
|
354
|
+
return # 无法验证,跳过
|
|
355
|
+
|
|
356
|
+
raw_names = state["template_df"][src_col].astype(str).str.strip().tolist()
|
|
357
|
+
canonical_names = [
|
|
358
|
+
str(r.get("product_name", "")).strip()
|
|
359
|
+
for r in state["template_canonical"]
|
|
360
|
+
]
|
|
361
|
+
|
|
362
|
+
mismatches = []
|
|
363
|
+
for i, (raw, canonical) in enumerate(zip(raw_names, canonical_names)):
|
|
364
|
+
if raw != canonical:
|
|
365
|
+
mismatches.append(
|
|
366
|
+
f" 行 {i+1}: 原始='{raw}' -> 改写为='{canonical}'"
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
if mismatches:
|
|
370
|
+
raise ValueError(
|
|
371
|
+
f"商品名称在管线中被意外改写!{len(mismatches)} 行不一致:\n"
|
|
372
|
+
+ "\n".join(mismatches[:10])
|
|
373
|
+
+ ("\n ..." if len(mismatches) > 10 else "")
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def step_write_output(state: PipelineState) -> PipelineState:
    """Step 7: write the result workbook and the validation report.

    Builds a two-column DataFrame (target column plus match confidence) from
    ``state["match_results"]``, writes it via ``write_result``, then generates
    the full report and the console summary and saves the report as UTF-8
    text at ``state["report_path"]``.  Skipped when an error is already
    recorded; failures are stored with ``state["error_step"] = "write_output"``.
    """
    if state.get("error") is not None:
        return state

    try:
        matches = state["match_results"]
        sop_column = [entry.get("sop", "") for entry in matches]
        confidence_column = [entry.get("confidence", "") for entry in matches]

        # Result frame: one row per match result.
        result_df = pd.DataFrame({
            state["target_col"]: sop_column,
            "匹配置信度": confidence_column,
        })

        # chowbus templates get an explicit data start row of 3.
        start_row = 3 if state["template_type"] == "chowbus" else None
        write_result(
            state["template_path"],
            state["output_path"],
            result_df,
            target_col=state["target_col"],
            header_row=1,
            data_start_row=start_row,
        )

        # The report file holds the full log; the console gets a summary.
        state["report"] = me_generate_report(state["match_results"])
        state["console_summary"] = generate_console_summary(
            state["match_results"], report_path=state["report_path"]
        )

        from pathlib import Path

        report_file = Path(state["report_path"])
        report_file.write_text(state["report"], encoding="utf-8")
    except Exception as exc:
        state["error"] = str(exc)
        state["error_step"] = "write_output"
    return state
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
# ── 路由函数 ────────────────────────────────────────────────────
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def route_after_load(state: PipelineState) -> str:
    """Pick the node that follows ``load_data``.

    Returns:
        "write_output" when an error is recorded, "preprocess" for a chowbus
        template, otherwise "analyze_schema".
    """
    if state.get("error") is not None:
        return "write_output"
    is_chowbus = state.get("template_type") == "chowbus"
    return "preprocess" if is_chowbus else "analyze_schema"
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def route_after_match(state: PipelineState) -> str:
    """Pick the node that follows ``match``.

    Returns:
        "write_output" when an error is recorded or no low-confidence rows
        exist, "human_review" when ``state["low_conf_rows"]`` is non-empty.
    """
    if state.get("error") is not None:
        return "write_output"
    needs_review = bool(state.get("low_conf_rows", []))
    return "human_review" if needs_review else "write_output"
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
# ── Human Review 节点 ────────────────────────────────────────────
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def _compute_master_fingerprint(path: str) -> str:
|
|
451
|
+
"""计算主数据文件的内容指纹(MD5 前 8 位)。"""
|
|
452
|
+
if not path:
|
|
453
|
+
return ""
|
|
454
|
+
import hashlib
|
|
455
|
+
with open(path, "rb") as f:
|
|
456
|
+
return hashlib.md5(f.read()).hexdigest()[:8]
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def step_human_review(state: PipelineState) -> PipelineState:
    """Human-review node: apply the user's review decisions to the matches.

    The node expects ``state["human_review_result"]`` to have been injected
    from outside (the original design notes describe an ``interrupt_before``
    pause followed by ``update_state``).  Without a result, or without
    decisions, the state is returned untouched.

    Each decision is a dict with ``row_index`` and ``action``:

    - ``accept`` / ``manual``: the row's ``sop`` is set to the decision's
      ``sop``, its confidence becomes ``"HIGH"`` and the mapping is written
      to long-term memory.
    - ``permanent_skip``: ``"__SKIP__"`` is written to long-term memory; the
      match row itself is left unchanged.
    - ``skip`` (or anything else): nothing is written, the row is unchanged.

    Decisions whose ``row_index`` is outside ``match_results`` are ignored.
    Afterwards ``human_review_result`` is cleared so the decisions cannot be
    applied twice.
    """
    review_result = state.get("human_review_result", {})
    if not review_result:
        return state

    decisions = review_result.get("decisions", [])
    if not decisions:
        return state

    from menupilot.data.memory import add_confirmed_mapping, build_confirmed_key

    match_results = state.get("match_results", [])
    template_canonical = state.get("template_canonical", [])
    master_fingerprint = None

    def remember(row_index: int, value: str) -> None:
        """Persist ``value`` for the canonical row at ``row_index``, if it exists."""
        nonlocal master_fingerprint
        if row_index >= len(template_canonical):
            return
        if master_fingerprint is None:
            # Hash the master file lazily and only once: it is needed solely
            # when a decision actually writes to long-term memory.
            master_fingerprint = _compute_master_fingerprint(
                str(state.get("master_path", ""))
            )
        key = build_confirmed_key(master_fingerprint, template_canonical[row_index])
        add_confirmed_mapping(key, value)

    for decision in decisions:
        idx = decision["row_index"]
        action = decision["action"]

        if idx < 0 or idx >= len(match_results):
            continue

        if action in ("accept", "manual"):
            sop = decision.get("sop", "")
            match_results[idx]["sop"] = sop
            match_results[idx]["confidence"] = "HIGH"
            remember(idx, sop)
        elif action == "permanent_skip":
            remember(idx, "__SKIP__")
        # action == "skip": no memory write, row keeps its confidence

    state["match_results"] = match_results
    # Clear the review result so it is not applied a second time.
    state["human_review_result"] = {}
    return state
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# ── DataFrame 序列化适配器 ────────────────────────────────────────
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
class _DataFrameSerde:
|
|
520
|
+
"""包装默认 serde,处理 DataFrame 的 msgpack 序列化。
|
|
521
|
+
|
|
522
|
+
将 DataFrame 转为 records 格式序列化,反序列化时还原。
|
|
523
|
+
保证 checkpointer 可以在 interrupt 后正确恢复状态。
|
|
524
|
+
"""
|
|
525
|
+
|
|
526
|
+
def __init__(self):
|
|
527
|
+
from langgraph.checkpoint.serde.jsonplus import JsonPlusSerializer
|
|
528
|
+
self._inner = JsonPlusSerializer()
|
|
529
|
+
|
|
530
|
+
def dumps_typed(self, obj):
|
|
531
|
+
import pandas as pd
|
|
532
|
+
if isinstance(obj, pd.DataFrame):
|
|
533
|
+
wrapped = {"__dataframe__": obj.to_dict(orient="records")}
|
|
534
|
+
# 也保存列名(空 DataFrame 的 records 为空列表)
|
|
535
|
+
wrapped["__columns__"] = list(obj.columns)
|
|
536
|
+
return self._inner.dumps_typed(wrapped)
|
|
537
|
+
return self._inner.dumps_typed(obj)
|
|
538
|
+
|
|
539
|
+
def loads_typed(self, data):
|
|
540
|
+
import pandas as pd
|
|
541
|
+
result = self._inner.loads_typed(data)
|
|
542
|
+
if isinstance(result, dict) and "__dataframe__" in result:
|
|
543
|
+
return pd.DataFrame(
|
|
544
|
+
result["__dataframe__"],
|
|
545
|
+
columns=result.get("__columns__"),
|
|
546
|
+
)
|
|
547
|
+
return result
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
# ── LangGraph 工作流 ─────────────────────────────────────────────
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def build_graph():
    """Build and compile the LangGraph StateGraph for the pipeline.

    The graph starts at ``load_data``.  From there it branches to
    ``preprocess`` (chowbus), ``analyze_schema`` (standard) or straight to
    ``write_output`` (error).  After ``match`` it branches to
    ``human_review`` or ``write_output``.  The compiled app uses an in-memory
    checkpointer with the DataFrame-aware serializer; no interrupt is
    configured at compile time.

    Returns:
        The compiled graph application.

    Raises:
        ImportError: langgraph is not installed.
    """
    from langgraph.checkpoint.memory import MemorySaver
    from langgraph.graph import END, StateGraph

    graph = StateGraph(PipelineState)

    # Register every node under its step name.
    node_table = (
        ("load_data", step_load_data),
        ("preprocess", step_preprocess),
        ("analyze_schema", step_analyze_schema),
        ("classify_tokens", step_classify_tokens),
        ("normalize", step_normalize),
        ("validate", step_validate),
        ("match", step_match),
        ("human_review", step_human_review),
        ("write_output", step_write_output),
    )
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("load_data")

    # Conditional edge: after load_data, split by template type / error state.
    graph.add_conditional_edges(
        "load_data",
        route_after_load,
        {
            "preprocess": "preprocess",
            "analyze_schema": "analyze_schema",
            "write_output": "write_output",
        },
    )

    # chowbus goes preprocess -> match; standard walks the full chain.
    for source, target in (
        ("preprocess", "match"),
        ("analyze_schema", "classify_tokens"),
        ("classify_tokens", "normalize"),
        ("normalize", "validate"),
        ("validate", "match"),
    ):
        graph.add_edge(source, target)

    # Conditional edge: after match, split on low-confidence rows / error state.
    graph.add_conditional_edges(
        "match",
        route_after_match,
        {
            "human_review": "human_review",
            "write_output": "write_output",
        },
    )

    # human_review feeds write_output, which ends the run.
    for source, target in (("human_review", "write_output"), ("write_output", END)):
        graph.add_edge(source, target)

    # Compile with an in-memory checkpointer that can serialize DataFrames.
    saver = MemorySaver(serde=_DataFrameSerde())
    return graph.compile(checkpointer=saver)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
# ── 公开 API ────────────────────────────────────────────────────
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def _total_api_call_count() -> int:
    """Return the summed API call counters of the schema analyzer and token classifier."""
    from menupilot.agent.schema_analyzer import get_api_call_count as _sa_count
    from menupilot.agent.token_classifier import get_api_call_count as _tc_count

    return _sa_count() + _tc_count()


def run_pipeline(
    master_path: str,
    template_path: str,
    output_path: str,
    report_path: Optional[str] = None,
    target_col: str = "配料",
    master_sheet: int = 0,
    template_sheet: int = 0,
    use_langgraph: bool = True,
) -> PipelineState:
    """Run the complete POS template mapping pipeline.

    This is the main entry point of the workflow.  By default
    (``use_langgraph=True``) the steps are orchestrated by LangGraph; pass
    ``use_langgraph=False`` for a plain sequential run that does not need
    langgraph.

    Args:
        master_path: Path of the master-data Excel file.
        template_path: Path of the POS template Excel file.
        output_path: Path of the output Excel file.
        report_path: Path of the validation report; when omitted it is
            derived from ``output_path`` by ``make_pipeline_state``.
        target_col: Name of the column to fill, default "配料".
        master_sheet: Zero-based sheet index of the master workbook.
        template_sheet: Zero-based sheet index of the template workbook.
        use_langgraph: Whether to orchestrate with LangGraph (requires
            langgraph to be installed).

    Returns:
        The final pipeline state with all intermediate data and results.
        The run failed if ``state.get("error") is not None``; the failing
        step is then named in ``state["error_step"]``.

    Raises:
        ImportError: ``use_langgraph=True`` but langgraph is not installed.
    """
    state = make_pipeline_state(
        master_path=master_path,
        template_path=template_path,
        output_path=output_path,
        report_path=report_path or "",
        target_col=target_col,
        master_sheet=master_sheet,
        template_sheet=template_sheet,
    )

    # Reset the API call counters and the per-session new-token tracking.
    from menupilot.agent.schema_analyzer import reset_api_call_count as _sa_reset
    from menupilot.agent.token_classifier import reset_api_call_count as _tc_reset
    from menupilot.data.memory import reset_new_tokens as _reset_new_tokens

    _sa_reset()
    _tc_reset()
    _reset_new_tokens()

    if use_langgraph:
        import uuid

        app = build_graph()
        config = {"configurable": {"thread_id": str(uuid.uuid4())}}
        result = app.invoke(state, config)

        # Human review is disabled: the run ends after the report is written.
        # (The interrupt/resume code is kept on purpose; uncomment to restore.)
        #
        # snapshot = app.get_state(config)
        # if snapshot.next and "human_review" in snapshot.next:
        #     snap_state = snapshot.values
        #     low_conf = snap_state.get("low_conf_rows", [])
        #     master_fp = _compute_master_fingerprint(
        #         str(snap_state.get("master_path", ""))
        #     )
        #     from menupilot.cli.human_review import run_review
        #     review_result = run_review(low_conf, master_fp)
        #     app.update_state(config, {"human_review_result": review_result})
        #     result = app.invoke(None, config)

        result["api_call_count"] = _total_api_call_count()
        return result

    # Plain sequential execution.  Every step skips itself once an error is
    # recorded, so breaking early only avoids the pointless calls.
    for step_fn in (
        step_load_data,
        step_preprocess,
        step_analyze_schema,
        step_classify_tokens,
        step_normalize,
        step_validate,
        step_match,
    ):
        step_fn(state)
        if state.get("error") is not None:
            break

    # Human review is disabled: the run ends after the report is written.
    # (The code is kept on purpose; uncomment to restore.)
    #
    # if state.get("error") is None:
    #     low_conf_rows = [
    #         r for r in state.get("match_results", [])
    #         if r.get("confidence") == "LOW_CONFIDENCE"
    #     ]
    #     if low_conf_rows:
    #         from menupilot.cli.human_review import run_review
    #         master_fp = _compute_master_fingerprint(
    #             str(state.get("master_path", ""))
    #         )
    #         review_result = run_review(low_conf_rows, master_fp)
    #         state["human_review_result"] = review_result
    #         step_human_review(state)

    # Output.  step_write_output returns immediately when an error is
    # recorded, which mirrors the graph routing errors to write_output.
    step_write_output(state)

    state["api_call_count"] = _total_api_call_count()
    return state
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
# ── 自测 ────────────────────────────────────────────────────────
|
|
744
|
+
|
|
745
|
+
if __name__ == "__main__":
    # Self-test entry point: drives the pipeline against small temporary Excel
    # fixtures, prints one PASS/FAIL line per check and a final tally.
    import importlib
    import os
    import shutil
    import tempfile

    # Set before reloading the config module; presumably config reads this
    # flag at import time (verify against menupilot/config.py).
    os.environ["USE_MOCK_LLM"] = "1"

    from menupilot import config as cfg

    # Reload the package-qualified module (the same object used as `cfg` below)
    # instead of a top-level "config" module that the package does not ship.
    importlib.reload(cfg)

    passed = 0
    failed = 0

    def check(condition, msg):
        """Count one check result and print a PASS/FAIL line for it."""
        global passed, failed
        if condition:
            passed += 1
            print(f" PASS {msg}")
        else:
            failed += 1
            print(f" FAIL {msg}")

    def write_standard_fixtures():
        """Write the baseline master/template workbooks used by sections 1 and 8."""
        pd.DataFrame({
            "品名": ["浅浅清茶", "浅浅清茶", "黑糖波波牛乳", "珍珠奶茶"],
            "杯型": ["中杯", "中杯", "大杯", "中杯"],
            "奶底": ["牛奶", "牛奶", "", "椰乳"],
            "做法": ["少冰", "去冰", "正常冰", "热"],
            "糖": ["七分糖", "标准糖", "标准糖", "无糖"],
            "SOP": [
                "T240、B30/80、S4",
                "T265、B30/105、S5",
                "T200、B50/100、S5",
                "T180、B40/80、S2",
            ],
        }).to_excel(master_path, index=False)

        pd.DataFrame({
            "菜品名称": ["浅浅清茶", "浅浅清茶", "黑糖波波牛乳"],
            "规格": ["中杯", "中杯", "大杯"],
            "口味做法组合": [
                "牛奶, 少冰, 七分糖",
                "牛奶, 去冰, 标准糖",
                "正常冰, 标准糖",
            ],
            "配料": ["", "", ""],
        }).to_excel(template_path, index=False)

    print("=== Workflow 自测（Mock LLM 模式）===\n")

    tmpdir = tempfile.mkdtemp()
    master_path = os.path.join(tmpdir, "master.xlsx")
    template_path = os.path.join(tmpdir, "template.xlsx")
    output_path = os.path.join(tmpdir, "output.xlsx")

    # Snapshot the mock schema response so it can be restored after the run.
    original_mock_schema = dict(cfg.MOCK_SCHEMA_RESPONSE)
    tc_reset_cache()

    # Prepare the master-data sheet and the template sheet used by the tests.
    write_standard_fixtures()

    try:
        # ── 1. Full pipeline run ──
        print("1. 完整管线 run_pipeline()")
        state = run_pipeline(master_path, template_path, output_path)

        # Inner quotes differ from the outer ones so the f-strings also parse
        # on interpreters older than Python 3.12.
        check(state.get("error") is None, f"管线无错误（错误: {state.get('error')}）")
        check(state["master_df"] is not None, "master_df 已加载")
        check(state["template_df"] is not None, "template_df 已加载")
        check(state["schema_result"] is not None, "schema_result 已生成")
        check(state["token_results"] is not None, "token_results 已生成")
        check(state["master_canonical"] is not None, "master_canonical 已转换")
        check(state["template_canonical"] is not None, "template_canonical 已转换")
        check(state["validated_tokens"] is not None, "validated_tokens 已验证")
        check(state["match_results"] is not None, "match_results 已生成")
        check(
            len(state["match_results"]) == 3,
            f"3 条匹配结果（实际 {len(state['match_results'])}）",
        )
        print()

        # ── 2. Match result verification ──
        print("2. 匹配结果验证")
        check(
            state["match_results"][0]["confidence"] == "HIGH",
            f"第 1 行 HIGH（实际 {state['match_results'][0]['confidence']}）",
        )
        check(
            state["match_results"][1]["confidence"] == "HIGH",
            f"第 2 行 HIGH（实际 {state['match_results'][1]['confidence']}）",
        )
        check(
            state["match_results"][2]["product_score"] >= 90,
            f"第 3 行商品名分数 ≥ 90（实际 {state['match_results'][2]['product_score']}）",
        )
        print()

        # ── 3. Output file verification ──
        print("3. 输出文件验证")
        check(os.path.exists(output_path), "输出 Excel 文件已生成")
        report_path = output_path.replace(".xlsx", "_report.txt")
        check(os.path.exists(report_path), "校验报告已生成")

        # Read the output workbook back and verify its contents.
        df_out = pd.read_excel(output_path)
        check("配料" in df_out.columns, "输出包含 '配料' 列")
        check("匹配置信度" in df_out.columns, "输出包含 '匹配置信度' 列")
        check(
            df_out.iloc[0]["配料"] == "T240、B30/80、S4",
            f"第 1 行 SOP 正确（实际 {df_out.iloc[0]['配料']}）",
        )

        # Read the report; the context manager closes the handle so the file
        # can be removed during cleanup.
        with open(report_path, encoding="utf-8") as report_file:
            report_text = report_file.read()
        check("本次映射完成" in report_text, "报告包含标题")
        print()

        # ── 4. Exact selection among multiple candidates ──
        print("4. 多候选精确属性选择")
        tc_reset_cache()

        # Three master rows share one product name but differ in attributes;
        # the row whose attributes match the template should be selected.
        pd.DataFrame({
            "品名": ["测试茶", "测试茶", "测试茶"],
            "杯型": ["大杯", "中杯", "小杯"],
            "奶底": ["牛奶", "燕麦奶", ""],
            "做法": ["正常冰", "少冰", "去冰"],
            "糖": ["全糖", "七分糖", "三分糖"],
            "SOP": ["SOP-A", "SOP-B", "SOP-C"],
        }).to_excel(master_path, index=False)

        pd.DataFrame({
            "菜品名称": ["测试茶"],
            "规格": ["中杯"],
            "口味做法组合": ["燕麦奶, 少冰, 七分糖"],
            "配料": [""],
        }).to_excel(template_path, index=False)

        state2 = run_pipeline(master_path, template_path, output_path)
        check(state2.get("error") is None, "二次运行无错误")
        check(len(state2["match_results"]) == 1, "1 条匹配")
        check(
            state2["match_results"][0]["sop"] == "SOP-B",
            f"选中 SOP-B（中杯/燕麦奶/少冰/七分糖）（实际 {state2['match_results'][0]['sop']}）",
        )
        check(state2["match_results"][0]["confidence"] == "HIGH", "置信度 HIGH")
        print()

        # ── 5. Error handling: missing file ──
        print("5. 错误处理：文件不存在")
        state_err = run_pipeline(
            "不存在的文件.xlsx", template_path, output_path
        )
        check(state_err.get("error") is not None, "文件不存在 → has_error=True")
        check(state_err["error_step"] == "load_data", "错误发生在 load_data 步骤")
        print()

        # ── 6. Report content verification (uses the report from section 1) ──
        print("6. 报告内容")
        check(
            "高置信匹配" in state["report"],
            "报告包含高置信匹配统计",
        )
        check(
            "需要确认" in state["report"] or "高置信匹配" in state["report"],
            "报告包含置信度分级",
        )
        print()

        # ── 7. Unmatched product → LOW_CONFIDENCE ──
        print("7. 无匹配商品 → LOW_CONFIDENCE")
        tc_reset_cache()
        pd.DataFrame({
            "品名": ["产品A"],
            "杯型": ["中杯"],
            "奶底": [""],
            "做法": ["正常冰"],
            "糖": ["标准糖"],
            "SOP": ["SOP-X"],
        }).to_excel(master_path, index=False)

        pd.DataFrame({
            "菜品名称": ["完全不存在的商品"],
            "规格": ["中杯"],
            "口味做法组合": ["正常冰, 标准糖"],
            "配料": [""],
        }).to_excel(template_path, index=False)

        state3 = run_pipeline(master_path, template_path, output_path)
        check(state3.get("error") is None, "无错误")
        check(len(state3["match_results"]) == 1, "1 条结果")
        check(
            state3["match_results"][0]["confidence"] == "LOW_CONFIDENCE",
            f"无匹配 → LOW_CONFIDENCE（实际 {state3['match_results'][0]['confidence']}）",
        )
        check(
            state3["match_results"][0]["match_type"] == "best_guess",
            f"匹配类型 best_guess（实际 {state3['match_results'][0]['match_type']}）",
        )
        print()

        # ── 8. LangGraph path and sequential path produce the same results ──
        print("8. LangGraph 路径和顺序路径结果一致")
        # Rewrite the original baseline fixtures.
        write_standard_fixtures()
        tc_reset_cache()

        # Sequential execution.
        state_seq = run_pipeline(master_path, template_path, output_path, use_langgraph=False)
        check(state_seq.get("error") is None, "顺序执行无错误")
        seq_sops = [r.get("sop", "") for r in state_seq["match_results"]]

        # LangGraph execution.
        state_lg = run_pipeline(master_path, template_path, output_path, use_langgraph=True)
        check(state_lg.get("error") is None, "LangGraph 执行无错误")

        if state_lg.get("error") is None:
            lg_sops = [r.get("sop", "") for r in state_lg["match_results"]]
            lg_confs = [r.get("confidence", "") for r in state_lg["match_results"]]
            seq_confs = [r.get("confidence", "") for r in state_seq["match_results"]]
            check(seq_sops == lg_sops, f"SOP 结果一致（顺序={len(seq_sops)}, LG={len(lg_sops)}）")
            check(seq_confs == lg_confs, "置信度结果一致")
            check(len(state_seq["match_results"]) == len(state_lg["match_results"]),
                  f"匹配行数一致（{len(state_seq['match_results'])}）")
        print()

        # ── 9. Routing function unit tests ──
        print("9. 路由函数单元测试")
        check(route_after_load({"error": "test error"}) == "write_output",
              "有 error → write_output")
        check(route_after_load({"template_type": "chowbus"}) == "preprocess",
              "chowbus → preprocess")
        check(route_after_load({"template_type": "standard"}) == "analyze_schema",
              "standard → analyze_schema")
        check(route_after_load({}) == "analyze_schema",
              "无 template_type 默认 → analyze_schema")

        check(route_after_match({"error": "test"}) == "write_output",
              "match 有 error → write_output")
        check(route_after_match({"low_conf_rows": [{"confidence": "LOW_CONFIDENCE"}]}) == "human_review",
              "有低置信度行 → human_review")
        check(route_after_match({"low_conf_rows": []}) == "write_output",
              "无低置信度行 → write_output")
        check(route_after_match({}) == "write_output",
              "low_conf_rows 不存在 → write_output")
        print()

        # ── 10. step_human_review unit tests ──
        print("10. step_human_review 单元测试")
        # 10a: empty review result → expected to be a no-op.
        s10a = {"human_review_result": {}, "match_results": [{"sop": "OLD", "confidence": "LOW_CONFIDENCE"}]}
        step_human_review(s10a)
        check(s10a["match_results"][0]["sop"] == "OLD", "空 review → 不变")

        # 10b: "accept" is expected to upgrade the row to HIGH.
        s10b = {
            "human_review_result": {"decisions": [{"row_index": 0, "action": "accept", "sop": "NEW"}]},
            "match_results": [{"sop": "OLD", "confidence": "LOW_CONFIDENCE"}],
            "template_canonical": [{"product_name": "测试"}],
            "master_path": master_path,
        }
        step_human_review(s10b)
        check(s10b["match_results"][0]["confidence"] == "HIGH", "accept → HIGH")
        check(s10b["match_results"][0]["sop"] == "NEW", "accept → sop=NEW")

        # 10c: "manual" carries a hand-entered SOP.
        s10c = {
            "human_review_result": {"decisions": [{"row_index": 0, "action": "manual", "sop": "手动SOP"}]},
            "match_results": [{"sop": "OLD", "confidence": "LOW_CONFIDENCE"}],
            "template_canonical": [{"product_name": "测试"}],
            "master_path": master_path,
        }
        step_human_review(s10c)
        check(s10c["match_results"][0]["confidence"] == "HIGH", "manual → HIGH")
        check(s10c["match_results"][0]["sop"] == "手动SOP", "manual → sop=手动SOP")

        # 10d: "skip" is expected to leave the row at LOW_CONFIDENCE.
        s10d = {
            "human_review_result": {"decisions": [{"row_index": 0, "action": "skip"}]},
            "match_results": [{"sop": "OLD", "confidence": "LOW_CONFIDENCE"}],
            "template_canonical": [],
            "master_path": "",
        }
        step_human_review(s10d)
        check(s10d["match_results"][0]["confidence"] == "LOW_CONFIDENCE", "skip → 保持 LOW_CONFIDENCE")

        # 10e: "permanent_skip" (original note: writes __SKIP__; only the
        # unchanged confidence is asserted here).
        s10e = {
            "human_review_result": {"decisions": [{"row_index": 0, "action": "permanent_skip"}]},
            "match_results": [{"sop": "OLD", "confidence": "LOW_CONFIDENCE"}],
            "template_canonical": [{"product_name": "永久跳过商品"}],
            "master_path": master_path,
        }
        step_human_review(s10e)
        check(s10e["match_results"][0]["confidence"] == "LOW_CONFIDENCE",
              "permanent_skip → 保持 LOW_CONFIDENCE（标记在 memory 中）")
        print()

    finally:
        # Remove the whole temp directory, tolerating files not listed
        # explicitly, so cleanup cannot mask the test outcome.
        shutil.rmtree(tmpdir, ignore_errors=True)

        # Restore the mock settings even when a section above raised.
        cfg.MOCK_SCHEMA_RESPONSE = original_mock_schema
        tc_reset_cache()

    print(f"=== 结果: {passed} passed, {failed} failed ===")
|