MenuPilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,509 @@
1
+ """
2
+ Rule Engine — 字段标准化、Token 验证、奶底通配逻辑、Canonical Schema 转换。
3
+ 纯规则逻辑,不调用 LLM。位于 Schema Analyzer / Token Classifier 之后,Matching Engine 之前。
4
+ """
5
+
6
+ import math
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import pandas as pd
10
+
11
+ from menupilot.data.token_dict import lookup, is_known, normalize_token, UNKNOWN_TOKEN
12
+
13
+ # ── Canonical Schema 字段 ─────────────────────────────────────
14
+
15
# Names of every field in a canonical-schema row.
CANONICAL_FIELDS = ["product_name", "size", "milk_base", "temperature", "sugar", "tea_base"]

# Master-table Chinese column name -> canonical field name.
# (Master-table headers are fixed, so no LLM recognition is needed for them.)
MASTER_COLUMN_MAP = {
    "品名": "product_name",
    "杯型": "size",
    "奶底": "milk_base",
    "做法": "temperature",
    "糖": "sugar",
}

# Wildcard-capable dimensions: when empty in the master data they may match any value.
WILDCARD_DIMENSIONS = {"milk_base", "tea_base"}

# Required dimensions: must carry a value when matching; a missing column is an error.
REQUIRED_DIMENSIONS = {"size", "temperature", "sugar"}

# Token type name (Chinese, as emitted by the token classifier) -> canonical field name.
TOKEN_TYPE_TO_FIELD = {
    "温度": "temperature",
    "糖度": "sugar",
    "奶底": "milk_base",
    "规格": "size",
    "茶底": "tea_base",
}
40
+
41
+
42
+ def _empty(val) -> bool:
43
+ """判断值是否为空(NaN / None / 空字符串 / 纯空白)。"""
44
+ if val is None:
45
+ return True
46
+ if isinstance(val, float) and math.isnan(val):
47
+ return True
48
+ if isinstance(val, str) and val.strip() == "":
49
+ return True
50
+ return False
51
+
52
+
53
+ # ── 主数据标准化 ──────────────────────────────────────────────
54
+
55
def master_to_canonical(master_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Convert the master-data table into a list of canonical-schema rows.

    Master-table headers are fixed Chinese names and are translated through
    ``MASTER_COLUMN_MAP``.  Each returned dict contains every key of
    ``CANONICAL_FIELDS`` plus ``"sop"``.

    Column handling:

    * A missing column for a wildcard dimension (``WILDCARD_DIMENSIONS``) is
      not an error: an ``[INFO]`` line is printed and the field is ``None``
      on every row.
    * A missing column for a required dimension (``REQUIRED_DIMENSIONS``)
      raises ``ValueError``.

    Cell handling:

    * An empty cell (as judged by ``_empty``) always becomes ``None``, for
      wildcard and non-wildcard fields alike, so that a later ``is None``
      completeness check can see it.
    * A non-empty cell is stringified, stripped and passed through
      ``normalize_token``.

    The SOP value is taken from the first existing column among
    ``"SOP"``, ``"配料"``, ``"SOP 代码"``, ``"代码"``; it is stripped but not
    normalised, and is ``None`` when no such column exists or the cell is
    empty.

    Args:
        master_df: Master-data DataFrame.

    Returns:
        One dict per input row, in row order.

    Raises:
        ValueError: If a column backing a required dimension is absent.
    """
    existing_cols = set(master_df.columns)

    # Announce wildcard dimensions whose column is absent.  This pass runs
    # before the required-column check so the INFO lines are still emitted
    # when the call ends up raising.
    for cn_col, en_col in MASTER_COLUMN_MAP.items():
        if en_col in WILDCARD_DIMENSIONS and cn_col not in existing_cols:
            print(f"[INFO] 主数据表未检测到「{cn_col}」列,该维度将作为通配符处理")

    # Fail fast when a required dimension has no backing column.
    for cn_col, en_col in MASTER_COLUMN_MAP.items():
        if en_col in REQUIRED_DIMENSIONS and cn_col not in existing_cols:
            raise ValueError(
                f"主数据表缺少必要列「{cn_col}」(对应维度: {en_col}),"
                "请检查主数据表是否完整"
            )

    # The SOP column does not depend on the row, so resolve it once.
    sop_col = next(
        (c for c in ("SOP", "配料", "SOP 代码", "代码") if c in existing_cols),
        None,
    )

    rows = []
    for _, row in master_df.iterrows():
        cr = {f: None for f in CANONICAL_FIELDS}

        for cn_col, en_col in MASTER_COLUMN_MAP.items():
            # An absent column behaves exactly like an empty cell.
            val = row.get(cn_col) if cn_col in existing_cols else None
            # Empty cells are stored as None rather than as the raw NaN / ""
            # so downstream "is None" checks detect them.
            cr[en_col] = None if _empty(val) else normalize_token(str(val).strip())

        # SOP is the target value referenced directly at match time.
        sop_val = row.get(sop_col) if sop_col is not None else None
        cr["sop"] = None if _empty(sop_val) else str(sop_val).strip()

        rows.append(cr)
    return rows
118
+
119
+
120
+ # ── 模板标准化 ────────────────────────────────────────────────
121
+
122
def template_to_canonical(
    template_df: pd.DataFrame,
    field_mapping: Dict[str, str],
    composite_col: str,
    token_results: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Convert the template table into a list of canonical-schema rows.

    Each row is built in two steps:

    1. Direct column mapping: for every ``field_mapping`` entry whose source
       column exists in ``template_df``, a non-empty cell is stringified,
       stripped, normalised with ``normalize_token`` and stored under the
       mapped canonical field.
    2. Token injection: the classifier result at the same position in
       ``token_results`` is applied.  Each token whose ``type`` maps (via
       ``TOKEN_TYPE_TO_FIELD``) to a canonical field overwrites that field
       with its normalised value.  Tokens with a missing or blank value are
       skipped so they cannot wipe out a value obtained in step 1.

    Every returned row carries a ``"_missing_dimensions"`` list: the
    classifier's ``"missing"`` entry when one is available, otherwise ``[]``
    (including rows that have no corresponding ``token_results`` entry).

    Args:
        template_df: Template DataFrame.
        field_mapping: Template column name -> canonical field name.
        composite_col: Name of the composite column (e.g. "口味做法组合").
            Accepted for interface compatibility; this function does not
            read it, because the composite text arrives already classified
            in ``token_results``.
        token_results: Per-row classifier output, aligned by position with
            the rows of ``template_df``.  Each item looks like
            ``{"tokens": [{"value": "红茶", "type": "茶底"}, ...],
            "missing": ["奶底"]}``.

    Returns:
        One dict per template row, in row order.
    """
    # Only mappings whose source column exists can contribute; resolve once.
    direct_cols = [
        (tcol, cfield)
        for tcol, cfield in field_mapping.items()
        if tcol in template_df.columns
    ]

    rows = []
    for i, (_, trow) in enumerate(template_df.iterrows()):
        cr = {f: None for f in CANONICAL_FIELDS}
        # Always present so consumers never hit a KeyError on rows that have
        # no classifier result.
        cr["_missing_dimensions"] = []

        # Step 1: direct column mapping.
        for tcol, cfield in direct_cols:
            val = trow[tcol]
            if not _empty(val):
                cr[cfield] = normalize_token(str(val).strip())

        # Step 2: inject classified tokens from the composite column.
        if i < len(token_results):
            tr = token_results[i]
            for tok in tr.get("tokens") or []:
                cfield = TOKEN_TYPE_TO_FIELD.get(tok.get("type", ""))
                token_val = tok.get("value")
                # Unknown token types and empty values must not touch the row:
                # str(None) would otherwise inject the literal "None".
                if cfield not in CANONICAL_FIELDS or _empty(token_val):
                    continue
                cr[cfield] = normalize_token(str(token_val).strip())

            missing = tr.get("missing", [])
            cr["_missing_dimensions"] = [] if missing is None else missing

        rows.append(cr)
    return rows
172
+
173
+
174
+ # ── Token 验证 ────────────────────────────────────────────────
175
+
176
def validate_tokens(token_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Check classifier tokens against the token dictionary and annotate them.

    The input is not modified; new dicts are built for every result and
    every token.  Each token keeps its original keys and gains or overrides:

    * ``value``         -- the value after ``str()``, ``strip()`` and
                           ``normalize_token``;
    * ``raw_value``     -- the stripped value before normalisation, kept
                           for auditing;
    * ``verified_type`` -- what ``lookup`` returns for the cleaned value
                           (``UNKNOWN_TOKEN`` when the dictionary has no entry);
    * ``is_known``      -- whether ``verified_type`` differs from
                           ``UNKNOWN_TOKEN``;
    * ``type_conflict`` -- True when the token is known but the dictionary
                           type differs from the classifier's ``type``.

    Args:
        token_results: Raw token-classifier output, one dict per row.

    Returns:
        A new list mirroring ``token_results`` with annotated tokens.
    """

    def _annotate(token: Dict[str, Any]) -> Dict[str, Any]:
        # Normalise first so suffixed values can still hit the dictionary.
        original = str(token.get("value", "")).strip()
        normalized = normalize_token(original)
        claimed_type = token.get("type", "")
        dict_type = lookup(normalized)
        known = dict_type != UNKNOWN_TOKEN

        annotated = dict(token)
        annotated["value"] = normalized
        annotated["raw_value"] = original
        annotated["verified_type"] = dict_type
        annotated["is_known"] = known
        # A conflict only makes sense when the dictionary has an opinion.
        annotated["type_conflict"] = known and dict_type != claimed_type
        return annotated

    return [
        {**result, "tokens": [_annotate(t) for t in result.get("tokens", [])]}
        for result in token_results
    ]
211
+
212
+
213
def check_row_completeness(canonical_row: Dict[str, Any]) -> List[str]:
    """List the required dimensions that have no usable value in a row.

    A dimension from ``REQUIRED_DIMENSIONS`` counts as missing when its key
    is absent or when ``_empty`` reports the value as missing (``None``, NaN,
    or a blank string).  Checking with ``_empty`` rather than ``is None``
    catches NaN / blank cells that were carried over from the source table.

    Args:
        canonical_row: A single canonical-schema row dict.

    Returns:
        Names of the missing required dimensions, sorted alphabetically so
        the result is stable between runs (iterating the set directly would
        make the order depend on string hash randomisation).  Empty when the
        row is complete.
    """
    return [
        dim
        for dim in sorted(REQUIRED_DIMENSIONS)
        if _empty(canonical_row.get(dim))
    ]
225
+
226
+
227
+ # ── 自测 ──────────────────────────────────────────────────────
228
+
229
# Self-test: runs only when this module is executed as a script. Each section
# builds small in-memory inputs, calls one function of this module and records
# PASS/FAIL per assertion; a summary line is printed at the end.
if __name__ == "__main__":
    passed = 0
    failed = 0

    def check(condition, msg):
        """Record one assertion: bump the matching counter and print PASS/FAIL with ``msg``."""
        global passed, failed
        if condition:
            passed += 1
            print(f" PASS {msg}")
        else:
            failed += 1
            print(f" FAIL {msg}")

    print("=== Rule Engine 自测 ===\n")

    # ── 1. master_to_canonical: normal rows ──
    print("1. master_to_canonical(正常行)")
    master_df = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "奶底": "牛奶", "做法": "少冰", "糖": "七分糖", "SOP": "T240、B30/80、S4"},
        {"品名": "浅浅清茶", "杯型": "中杯", "奶底": "牛奶", "做法": "去冰", "糖": "标准糖", "SOP": "T265、B30/105、S5"},
    ])
    m_rows = master_to_canonical(master_df)
    check(len(m_rows) == 2, f"2 行 → {len(m_rows)} 行")
    check(m_rows[0]["product_name"] == "浅浅清茶", "品名 → product_name")
    check(m_rows[0]["size"] == "中杯", "杯型 → size")
    check(m_rows[0]["milk_base"] == "牛奶", "奶底 → milk_base")
    check(m_rows[0]["temperature"] == "少冰", "做法 → temperature")
    check(m_rows[0]["sugar"] == "七分糖", "糖 → sugar")
    check(m_rows[0]["tea_base"] is None, "茶底 为 None(主数据无此字段)")
    check(m_rows[0]["sop"] == "T240、B30/80、S4", "SOP 保留")
    print()

    # ── 2. master_to_canonical: empty milk base (wildcard) ──
    print("2. master_to_canonical(奶底为空 → 通配符)")
    master_empty_milk = pd.DataFrame([
        {"品名": "黑糖波波", "杯型": "大杯", "奶底": "", "做法": "正常冰", "糖": "全糖"},
    ])
    m2 = master_to_canonical(master_empty_milk)
    check(m2[0]["milk_base"] is None, "奶底空字符串 → None(通配)")
    check(m2[0]["product_name"] == "黑糖波波", "品名正常")
    print()

    # ── 3. master_to_canonical: NaN handling ──
    print("3. master_to_canonical(NaN 处理)")
    master_nan = pd.DataFrame([
        {"品名": "测试", "杯型": "大杯", "奶底": float("nan"), "做法": "热", "糖": "无糖"},
    ])
    m3 = master_to_canonical(master_nan)
    check(m3[0]["milk_base"] is None, "奶底 NaN → None(通配)")
    check(m3[0]["temperature"] == "热", "做法正常")
    print()

    # ── 3b. master_to_canonical: suffix cleaning ──
    print("3b. master_to_canonical(suffix 清洗)")
    master_suffix = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "奶底": "牛奶", "做法": "正常冰|推荐", "糖": "七分糖|推荐", "SOP": "T240"},
        {"品名": "珍珠奶茶", "杯型": "大杯/新", "奶底": "椰乳", "做法": "热", "糖": "标准糖|推荐", "SOP": "T180"},
    ])
    ms = master_to_canonical(master_suffix)
    check(ms[0]["sugar"] == "七分糖", f"'七分糖|推荐' → '七分糖'(实际 {ms[0]['sugar']})")
    check(ms[0]["temperature"] == "正常冰", f"'正常冰|推荐' → '正常冰'(实际 {ms[0]['temperature']})")
    check(ms[1]["size"] == "大杯", f"'大杯/新' → '大杯'(实际 {ms[1]['size']})")
    check(ms[1]["sugar"] == "标准糖", f"'标准糖|推荐' → '标准糖'(实际 {ms[1]['sugar']})")
    # Values without a suffix must be left untouched.
    check(ms[0]["milk_base"] == "牛奶", "无后缀 milk_base 保持不变")
    check(ms[0]["product_name"] == "浅浅清茶", "无后缀 product_name 保持不变")
    print()

    # ── 3c. master_to_canonical: milk-base column absent → automatic wildcard ──
    print("3c. master_to_canonical(缺奶底列 → 自动通配)")
    master_no_milk = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "做法": "少冰", "糖": "七分糖", "SOP": "T240"},
        {"品名": "黑糖波波", "杯型": "大杯", "做法": "正常冰", "糖": "全糖", "SOP": "T180"},
    ])
    m_no_milk = master_to_canonical(master_no_milk)
    check(len(m_no_milk) == 2, f"2 行 → {len(m_no_milk)} 行")
    check(m_no_milk[0]["milk_base"] is None, "缺奶底列 → 奶底全行 None(通配)")
    check(m_no_milk[1]["milk_base"] is None, "第2行奶底也为 None")
    check(m_no_milk[0]["product_name"] == "浅浅清茶", "品名正常")
    check(m_no_milk[0]["size"] == "中杯", "杯型正常")
    check(m_no_milk[0]["temperature"] == "少冰", "做法正常")
    check(m_no_milk[0]["sugar"] == "七分糖", "糖正常")
    check(m_no_milk[0]["sop"] == "T240", "SOP 正常")
    print()

    # ── 3d. master_to_canonical: tea-base column absent → automatic wildcard ──
    print("3d. master_to_canonical(缺茶底列 → 自动通配)")
    master_no_tea = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "奶底": "牛奶", "做法": "少冰", "糖": "七分糖"},
    ])
    m_no_tea = master_to_canonical(master_no_tea)
    check(m_no_tea[0]["tea_base"] is None, "缺茶底列 → 茶底全行 None(通配)")
    check(m_no_tea[0]["milk_base"] == "牛奶", "奶底正常")
    check(m_no_tea[0]["product_name"] == "浅浅清茶", "品名正常")
    print()

    # ── 3e. master_to_canonical: both milk-base and tea-base columns absent → automatic wildcard ──
    print("3e. master_to_canonical(同时缺奶底+茶底列 → 自动通配)")
    master_no_both = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "做法": "少冰", "糖": "七分糖"},
    ])
    m_no_both = master_to_canonical(master_no_both)
    check(m_no_both[0]["milk_base"] is None, "奶底 None(通配)")
    check(m_no_both[0]["tea_base"] is None, "茶底 None(通配)")
    check(m_no_both[0]["product_name"] == "浅浅清茶", "品名正常")
    check(m_no_both[0]["size"] == "中杯", "杯型正常")
    print()

    # ── 3f. master_to_canonical: required column (做法 / temperature) absent → error ──
    print("3f. master_to_canonical(缺必要维度列 → 报错)")
    master_no_temp = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "奶底": "牛奶", "糖": "七分糖"},
    ])
    error_raised = False
    try:
        master_to_canonical(master_no_temp)
    except ValueError as e:
        error_raised = True
        # The message checks live inside the handler because ``e`` is only bound here.
        check("做法" in str(e), f"报错信息包含「做法」(实际: {e})")
        check("temperature" in str(e), f"报错信息包含 temperature(实际: {e})")
    check(error_raised, "缺做法列应抛出 ValueError")

    # Sugar column absent.
    master_no_sugar = pd.DataFrame([
        {"品名": "浅浅清茶", "杯型": "中杯", "奶底": "牛奶", "做法": "少冰"},
    ])
    error_raised2 = False
    try:
        master_to_canonical(master_no_sugar)
    except ValueError as e:
        error_raised2 = True
        check("糖" in str(e), f"报错信息包含「糖」(实际: {e})")
    check(error_raised2, "缺糖列应抛出 ValueError")

    # Cup-size column absent.
    master_no_size = pd.DataFrame([
        {"品名": "浅浅清茶", "奶底": "牛奶", "做法": "少冰", "糖": "七分糖"},
    ])
    error_raised3 = False
    try:
        master_to_canonical(master_no_size)
    except ValueError as e:
        error_raised3 = True
        check("杯型" in str(e), f"报错信息包含「杯型」(实际: {e})")
    check(error_raised3, "缺杯型列应抛出 ValueError")
    print()

    # ── 4. template_to_canonical ──
    print("4. template_to_canonical(字段映射 + Token 注入)")
    template_df = pd.DataFrame([
        {"菜品名称": "五黄高纤慢养瓶", "规格": "五角瓶", "口味做法组合": "红茶, 十二分糖, 温热", "配料": ""},
        {"菜品名称": "五黄高纤慢养瓶", "规格": "五角瓶", "口味做法组合": "燕麦奶, 正常冰, 七分糖", "配料": ""},
    ])
    field_mapping = {
        "菜品名称": "product_name",
        "规格": "size",
    }
    token_results = [
        {
            "tokens": [
                {"value": "红茶", "type": "茶底"},
                {"value": "十二分糖", "type": "糖度"},
                {"value": "温热", "type": "温度"},
            ],
            "missing": ["奶底"],
        },
        {
            "tokens": [
                {"value": "燕麦奶", "type": "奶底"},
                {"value": "正常冰", "type": "温度"},
                {"value": "七分糖", "type": "糖度"},
            ],
            "missing": ["茶底"],
        },
    ]
    t_rows = template_to_canonical(template_df, field_mapping, "口味做法组合", token_results)
    check(len(t_rows) == 2, f"2 行 → {len(t_rows)} 行")

    # Row 1: direct mapping + token injection.
    check(t_rows[0]["product_name"] == "五黄高纤慢养瓶", "product_name 来自直接映射")
    check(t_rows[0]["size"] == "五角瓶", "size 来自直接映射")
    check(t_rows[0]["tea_base"] == "红茶", "tea_base 来自 Token 注入")
    check(t_rows[0]["sugar"] == "十二分糖", "sugar 来自 Token 注入")
    check(t_rows[0]["temperature"] == "温热", "temperature 来自 Token 注入")
    check(t_rows[0]["milk_base"] is None, "milk_base 缺失 → None")
    check("奶底" in t_rows[0]["_missing_dimensions"], "missing 记录: 奶底")

    # Row 2: tea base missing.
    check(t_rows[1]["product_name"] == "五黄高纤慢养瓶", "第2行 product_name 正确")
    check(t_rows[1]["milk_base"] == "燕麦奶", "第2行 milk_base 来自 Token")
    check(t_rows[1]["tea_base"] is None, "第2行 tea_base 缺失 → None")
    check("茶底" in t_rows[1]["_missing_dimensions"], "missing 记录: 茶底")
    print()

    # ── 4b. template_to_canonical: suffix cleaning ──
    print("4b. template_to_canonical(直接映射 + Token 注入 suffix 清洗)")
    template_suffix_df = pd.DataFrame([
        {"菜品名称": "五黄高纤慢养瓶", "规格": "五角瓶/新", "口味做法组合": "红茶, 十二分糖|推荐, 温热", "配料": ""},
    ])
    # Token values carrying a suffix (simulates the LLM occasionally returning one).
    token_results_suffix = [
        {
            "tokens": [
                {"value": "红茶", "type": "茶底"},
                {"value": "十二分糖|推荐", "type": "糖度"},
                {"value": "温热", "type": "温度"},
            ],
            "missing": ["奶底"],
        },
    ]
    ts = template_to_canonical(template_suffix_df, field_mapping, "口味做法组合", token_results_suffix)
    check(ts[0]["size"] == "五角瓶", f"直接映射 '五角瓶/新' → '五角瓶'(实际 {ts[0]['size']})")
    check(ts[0]["sugar"] == "十二分糖", f"Token 注入 '十二分糖|推荐' → '十二分糖'(实际 {ts[0]['sugar']})")
    check(ts[0]["tea_base"] == "红茶", "无后缀 Token 不受影响")
    check(ts[0]["temperature"] == "温热", "无后缀 Token 不受影响")
    print()

    # ── 5. validate_tokens ──
    print("5. validate_tokens(Token 验证)")
    sample_results = [
        {"tokens": [
            {"value": "红茶", "type": "茶底"},
            {"value": "珍珠", "type": "配料"},  # not in the dictionary
            {"value": "温热", "type": "温度"},
        ]},
    ]
    v = validate_tokens(sample_results)
    tokens_v = v[0]["tokens"]
    check(len(tokens_v) == 3, "3 个 token 全部保留")
    check(tokens_v[0]["is_known"] is True, "'红茶' is_known=True")
    check(tokens_v[0]["verified_type"] == "茶底", "'红茶' verified_type=茶底")
    check(tokens_v[1]["is_known"] is False, "'珍珠' is_known=False")
    check(tokens_v[1]["verified_type"] == "UNKNOWN_TOKEN", "'珍珠' verified_type=UNKNOWN_TOKEN")
    check(tokens_v[2]["is_known"] is True, "'温热' is_known=True")

    # type_conflict detection.
    conflict_result = [
        {"tokens": [{"value": "牛奶", "type": "糖度"}]},  # LLM labels 牛奶 as 糖度; the dictionary is expected to say 奶底
    ]
    vc = validate_tokens(conflict_result)
    check(vc[0]["tokens"][0]["type_conflict"] is True, "牛奶被 LLM 标为糖度 → type_conflict=True")
    print()

    # ── 5b. validate_tokens: validation after suffix cleaning ──
    print("5b. validate_tokens(suffix 清洗后验证)")
    suffix_tokens = [
        {"tokens": [
            {"value": "七分糖|推荐", "type": "糖度"},
            {"value": "珍珠|推荐", "type": "配料"},  # not in the dictionary, still unknown once the suffix is removed
            {"value": "正常冰/新", "type": "温度"},
        ]},
    ]
    vs = validate_tokens(suffix_tokens)
    # '七分糖|推荐' → normalize → '七分糖' → lookup → '糖度'
    check(vs[0]["tokens"][0]["is_known"] is True, "'七分糖|推荐' 清洗后 is_known=True")
    check(vs[0]["tokens"][0]["value"] == "七分糖", "cleaned value='七分糖'")
    check(vs[0]["tokens"][0]["raw_value"] == "七分糖|推荐", "raw_value 保留原始值")
    # '珍珠|推荐' → normalize finds no match → '珍珠|推荐' → lookup → UNKNOWN_TOKEN
    check(vs[0]["tokens"][1]["is_known"] is False, "'珍珠|推荐' 清洗失败 is_known=False")
    check(vs[0]["tokens"][1]["verified_type"] == "UNKNOWN_TOKEN", "unknown 标注 UNKNOWN_TOKEN")
    # '正常冰/新' → normalize → '正常冰' → lookup → '温度'
    check(vs[0]["tokens"][2]["is_known"] is True, "'正常冰/新' 清洗后 is_known=True")
    check(vs[0]["tokens"][2]["value"] == "正常冰", "cleaned value='正常冰'")
    print()

    # ── 6. check_row_completeness ──
    print("6. check_row_completeness(必要维度检查)")
    complete_row = {"size": "中杯", "temperature": "少冰", "sugar": "七分糖"}
    check(check_row_completeness(complete_row) == [], "完整行 → 空列表")

    missing_temp = {"size": "大杯", "temperature": None, "sugar": "全糖"}
    missing_list = check_row_completeness(missing_temp)
    check("temperature" in missing_list, "缺 temperature 被检测到")

    missing_all = {"size": None, "temperature": None, "sugar": None}
    ma = check_row_completeness(missing_all)
    check(len(ma) == 3, "缺全部 3 项被检测到")
    print()

    # ── Summary ──
    print(f"=== 结果: {passed} passed, {failed} failed ===")