MenuPilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1026 @@
1
+ """
2
+ Schema Analyzer — LLM 模板字段语义分析。
3
+ 仅在初始化阶段调用一次,输出字段映射配置供 Rule Engine 使用。
4
+ 不参与逐行匹配,结果可缓存复用。
5
+ """
6
+
7
import hashlib
import json
import re
from typing import Any, Callable, Dict, List, Optional

from menupilot import config
from menupilot.data.canonical_schema import CANONICAL_FIELDS
from menupilot.data.memory import get_template_rule as mem_get_template_rule
from menupilot.data.memory import save_template_rule as mem_save_template_rule
16
+
17
+ # ── Prompt 模板 ──────────────────────────────────────────────────
18
+
19
+ SYSTEM_PROMPT = """\
20
+ You are a schema analysis expert for POS (Point of Sale) templates in the food & beverage industry.
21
+ Given the column names and sample data of a POS template spreadsheet, identify the semantic meaning
22
+ of each column and produce a field mapping configuration.
23
+
24
+ ## Canonical Fields (internal standard schema)
25
+ - product_name: 商品/菜品名称 (product/dish name)
26
+ - size: 规格/杯型 (size/cup size, e.g. 大杯/中杯/小杯/五角瓶)
27
+ - milk_base: 奶底 (milk base, e.g. 牛奶/燕麦奶/厚乳/椰乳)
28
+ - temperature: 温度/做法 (temperature/preparation, e.g. 正常冰/少冰/去冰/热/温热)
29
+ - sugar: 糖度 (sugar level, e.g. 全糖/七分糖/五分糖/无糖)
30
+ - tea_base: 茶底 (tea base, e.g. 红茶/绿茶/乌龙茶)
31
+
32
+ ## Rules
33
+ 1. Map each template column to the MOST appropriate canonical field above.
34
+ - Only include columns that semantically match a canonical field; a column named "序号" or "备注" is NOT product_name.
35
+ - A single template column might cover multiple dimensions (composite column) — do NOT map it directly; mark it as composite_col instead.
36
+ 2. Identify the "composite column" — a column whose cells contain comma-separated combinations
37
+ of multiple dimensions (e.g. "红茶, 十二分糖, 温热" contains tea_base + sugar + temperature).
38
+ If no such column exists, set composite_col to null.
39
+ 3. Identify the "target column" — the column that needs to be filled with SOP data
40
+ (usually empty cells, named something like 配料/做法/SOP/备注等). If unclear, set to null.
41
+ 4. Identify irrelevant columns that should be ignored (e.g. 序号, 备注/remarks, 图片, etc.).
42
+
43
+ ## Output Format
44
+ Return ONLY a single JSON object. No markdown fences, no extra text:
45
+ {"field_mapping": {"template_col": "canonical_field", ...}, "composite_col": "col_name_or_null", "target_col": "col_name_or_null", "irrelevant_cols": ["col1", ...]}"""
46
+
47
+ USER_PROMPT_TEMPLATE = """\
48
+ ## Template Columns (with per-column sample values, NOT full rows)
49
+ {columns}
50
+
51
+ ## Per-Column Sample Values
52
+ {sample_data}
53
+
54
+ Analyze the template schema and return the JSON field mapping configuration.
55
+ IMPORTANT: Use the column NAMES and sample values ONLY to understand semantic types.
56
+ Do NOT modify or rewrite any sample values — they are read-only references."""
57
+
58
+ # ── API 调用计数器 ─────────────────────────────────────────────
59
+
60
# Running total of LLM requests attempted by this module in the current
# process; _call_llm() bumps it right before sending each request.
_api_call_count: int = 0


def get_api_call_count() -> int:
    """Return the number of LLM API calls the Schema Analyzer has made."""
    return _api_call_count


def reset_api_call_count() -> None:
    """Reset the API call counter to zero (intended for tests)."""
    global _api_call_count
    _api_call_count = 0
72
+
73
+
74
+ # ── 缓存 ────────────────────────────────────────────────────────
75
+
76
+ _cache: Dict[str, Dict[str, Any]] = {}
77
+
78
+
79
+ def _cache_key(columns: List[str]) -> str:
80
+ """基于列名列表生成进程内缓存键(SHA256)。"""
81
+ return hashlib.sha256(",".join(sorted(columns)).encode()).hexdigest()
82
+
83
+
84
+ def _template_fingerprint(columns: List[str]) -> str:
85
+ """基于列名列表生成模板指纹(MD5),用于持久化缓存键。
86
+
87
+ 对模板所有列名排序后用逗号拼接,取 MD5 摘要。
88
+ 同一模板的列名无论顺序如何,指纹一致。
89
+ """
90
+ return hashlib.md5(",".join(sorted(columns)).encode()).hexdigest()
91
+
92
+
93
+ def reset_cache() -> None:
94
+ """清空缓存(用于测试)。"""
95
+ _cache.clear()
96
+
97
+
98
+ # ── LLM 客户端 ──────────────────────────────────────────────────
99
+
100
def _get_client():
    """Build a DeepSeek API client through the OpenAI-compatible SDK.

    The ``openai`` package is imported inside the function rather than at
    module import time, so code paths that never build a client (for example
    mock mode) do not need the package installed.
    """
    from openai import OpenAI  # deferred import: only needed for real calls

    connection_settings = {
        "api_key": config.DEEPSEEK_API_KEY,
        "base_url": config.DEEPSEEK_BASE_URL,
        "timeout": config.LLM_TIMEOUT_SECONDS,
    }
    return OpenAI(**connection_settings)
109
+
110
+
111
def _is_blank_sample(value: Any) -> bool:
    """Return True for cell values that carry no sample information.

    ``None`` and float NaN (what pandas uses for empty cells) would otherwise
    be stringified to "None" / "nan" and presented to the LLM as real values.
    """
    if value is None:
        return True
    # NaN is the only float value that does not compare equal to itself.
    return isinstance(value, float) and value != value


def _format_column_samples(
    columns: List[str],
    sample_data: List[Dict[str, Any]],
    limit: int = 3,
) -> List[str]:
    """Build one prompt line per column listing up to ``limit`` distinct sample values."""
    lines = []
    for col in columns:
        distinct: List[str] = []
        for row in sample_data:
            cell = row.get(col, "")
            if _is_blank_sample(cell):
                continue
            text = str(cell).strip()
            if text and text not in distinct:
                distinct.append(text)
                if len(distinct) >= limit:
                    break
        if distinct:
            lines.append(f" - {col}: e.g. {', '.join(distinct)}")
        else:
            lines.append(f" - {col}: (empty)")
    return lines


def _call_llm(columns: List[str], sample_data: List[Dict[str, Any]]) -> str:
    """Ask the DeepSeek LLM to analyze the template schema.

    The LLM only receives the column names and, per column, up to three
    de-duplicated sample values. Full rows are never sent, so row-level
    associations are not exposed and the model has no rows to rewrite.
    Empty cells (``None`` / NaN / blank strings) are not counted as samples;
    a column with no usable sample is reported as "(empty)".

    Args:
        columns: Template column names.
        sample_data: First N rows of the template, used only to extract
            per-column sample values.

    Returns:
        The raw response text from the LLM ("" when the message has no content).
    """
    global _api_call_count

    client = _get_client()

    # Per-column samples only; no cross-column row context is preserved.
    column_samples = _format_column_samples(columns, sample_data)

    # Counted once the client exists and before the request is sent, so a
    # request that fails in flight is still counted as an attempted call.
    _api_call_count += 1

    user_prompt = USER_PROMPT_TEMPLATE.format(
        columns="\n".join(f" - {c}" for c in columns),
        sample_data="\n".join(column_samples),
    )

    response = client.chat.completions.create(
        model=config.DEEPSEEK_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        temperature=config.LLM_TEMPERATURE,
        max_tokens=config.LLM_MAX_TOKENS,
    )

    return response.choices[0].message.content or ""
161
+
162
+
163
+ # ── 列覆盖率检测 ──────────────────────────────────────────────
164
+
165
+
166
+ def _get_unmapped_columns(
167
+ all_cols: List[str],
168
+ field_mapping: Dict[str, str],
169
+ composite_col: Optional[str],
170
+ target_col: Optional[str],
171
+ irrelevant_cols: List[str],
172
+ ) -> List[str]:
173
+ """计算未被覆盖的列名列表。
174
+
175
+ 已覆盖 = field_mapping 的 key ∪ {composite_col} ∪ {target_col} ∪ irrelevant_cols
176
+ 返回不在已覆盖集合中的列名。
177
+ """
178
+ covered = set(field_mapping.keys())
179
+ if composite_col:
180
+ covered.add(composite_col)
181
+ if target_col:
182
+ covered.add(target_col)
183
+ covered.update(irrelevant_cols)
184
+ return [c for c in all_cols if c not in covered]
185
+
186
+
187
+ # ── 响应解析与验证 ──────────────────────────────────────────────
188
+
189
+
190
+ def _extract_json(text: str) -> str:
191
+ """从 LLM 响应中提取 JSON 字符串。
192
+
193
+ 处理以下格式:
194
+ - 纯 JSON: {"field_mapping": ...}
195
+ - Markdown 代码块: ```json ... ```
196
+ - 无语言标注代码块: ``` ... ```
197
+ """
198
+ text = text.strip()
199
+ # 尝试匹配 ```json ... ``` 或 ``` ... ```
200
+ m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
201
+ if m:
202
+ return m.group(1).strip()
203
+ return text
204
+
205
+
206
+ def _validate_response(data: dict, columns: List[str]) -> None:
207
+ """验证 LLM 返回的 Schema 分析结果。
208
+
209
+ Args:
210
+ data: 解析后的 JSON dict。
211
+ columns: 原始模板列名列表。
212
+
213
+ Raises:
214
+ ValueError: 必填字段缺失或字段值不合法。
215
+ """
216
+ # 必填顶级字段
217
+ if "field_mapping" not in data:
218
+ raise ValueError("LLM 响应缺少 'field_mapping' 字段")
219
+ if not isinstance(data["field_mapping"], dict):
220
+ raise ValueError("'field_mapping' 必须是 dict")
221
+
222
+ # field_mapping 值必须是合法的 canonical 字段
223
+ for tcol, cfield in data["field_mapping"].items():
224
+ if cfield not in CANONICAL_FIELDS:
225
+ raise ValueError(
226
+ f"非法 canonical 字段 '{cfield}'(模板列 '{tcol}'),"
227
+ f"合法值: {CANONICAL_FIELDS}"
228
+ )
229
+ if tcol not in columns:
230
+ raise ValueError(
231
+ f"field_mapping 中的模板列 '{tcol}' 不在实际列名中: {columns}"
232
+ )
233
+
234
+ # composite_col 如果非空,必须是实际列名
235
+ composite = data.get("composite_col")
236
+ if composite is not None and composite not in columns:
237
+ raise ValueError(
238
+ f"composite_col '{composite}' 不在实际列名中: {columns}"
239
+ )
240
+
241
+ # target_col 如果非空,必须是实际列名
242
+ target = data.get("target_col")
243
+ if target is not None and target not in columns:
244
+ raise ValueError(
245
+ f"target_col '{target}' 不在实际列名中: {columns}"
246
+ )
247
+
248
+ # irrelevant_cols 必须是列表且值在 columns 中
249
+ irrelevant = data.get("irrelevant_cols", [])
250
+ if not isinstance(irrelevant, list):
251
+ raise ValueError("'irrelevant_cols' 必须是 list")
252
+ for col in irrelevant:
253
+ if col not in columns:
254
+ raise ValueError(
255
+ f"irrelevant_cols 中的 '{col}' 不在实际列名中: {columns}"
256
+ )
257
+
258
+ # composite_col 不应该同时出现在 field_mapping 中
259
+ if composite and composite in data["field_mapping"]:
260
+ raise ValueError(
261
+ f"composite_col '{composite}' 不应同时出现在 field_mapping 中"
262
+ )
263
+
264
+
265
+ # ── 公开 API ────────────────────────────────────────────────────
266
+
267
+
268
def analyze(
    columns: List[str],
    sample_data: Optional[List[Dict[str, Any]]] = None,
    use_cache: bool = True,
) -> Dict[str, Any]:
    """Analyze a template schema and produce the field mapping configuration.

    Processing stages:
      1. Pre-LLM: each column is looked up with ``get_column_alias``. The
         aliases "ignore", "composite_col", "sop" and any canonical field name
         are applied directly; everything else is treated as unknown.
      2. LLM: only the unknown columns (and their sample values) are sent to
         the LLM, or answered from ``config.MOCK_SCHEMA_RESPONSE`` in mock
         mode. The response is used only if it passes validation; a rejected
         or failed response contributes nothing to the result.
      3. Merge: alias results take precedence, the LLM fills in the rest.
      4. Coverage: columns not accounted for are reported as unrecognized.

    Caching: lookups go in-process cache -> persisted template rule -> LLM
    (skipped when ``use_cache`` is False). The result is always stored in the
    in-process cache; it is persisted under the template fingerprint only
    when ``use_cache`` is True and no column is left unrecognized, so an
    incomplete result is never what a later run skips the LLM for.

    Args:
        columns: Template column names (expected to be stripped already).
        sample_data: First N template rows, used as semantic context.
        use_cache: Whether cached results may be returned. Defaults to True.

    Returns:
        {
            "field_mapping": {"菜品名称": "product_name", "规格": "size", ...},
            "composite_col": "口味做法组合",  # or None
            "target_col": "配料",             # or None
            "irrelevant_cols": [],            # columns to ignore
            "unrecognized_cols": [],          # columns needing manual review
        }

    Raises:
        ValueError: If ``columns`` is empty.
    """
    if not columns:
        raise ValueError("模板列名列表不能为空")

    sample_data = sample_data or []

    # ── Cache tiers: in-process → persisted memory → LLM ──
    key = _cache_key(columns)
    if use_cache and key in _cache:
        return _cache[key]

    fingerprint = _template_fingerprint(columns)
    if use_cache:
        cached_rule = mem_get_template_rule(fingerprint)
        if cached_rule is not None:
            _cache[key] = cached_rule
            print(f"[Schema] 缓存命中:模板指纹 {fingerprint[:12]}...(跳过 LLM)")
            return cached_rule

    # ── Stage 1: pre-LLM — apply known column aliases ──
    from menupilot.data.memory import get_column_alias

    known_fm: Dict[str, str] = {}
    known_irrelevant: List[str] = []
    known_composite: Optional[str] = None
    known_target: Optional[str] = None
    unknown_cols: List[str] = []

    for col in columns:
        alias = get_column_alias(col)
        if alias is None:
            unknown_cols.append(col)
        elif alias == "ignore":
            known_irrelevant.append(col)
        elif alias == "composite_col":
            known_composite = col
        elif alias == "sop":
            known_target = col
        elif alias in CANONICAL_FIELDS:
            known_fm[col] = alias
        else:
            # Unrecognized alias value: let the LLM decide.
            unknown_cols.append(col)

    # ── Stage 2: LLM — only for the unknown columns ──
    llm_fm: Dict[str, str] = {}
    llm_composite: Optional[str] = None
    llm_target: Optional[str] = None
    llm_irrelevant: List[str] = []
    llm_error: Optional[str] = None

    if unknown_cols:
        mock_suffix = "(Mock 模式)" if config.USE_MOCK_LLM else ""
        alias_count = len(columns) - len(unknown_cols)
        if alias_count > 0:
            print(
                f"[Schema] 列别名命中 {alias_count} 列,"
                f"剩余 {len(unknown_cols)} 列送 LLM...{mock_suffix}"
            )
        else:
            print(f"[Schema] 新模板,调用 LLM 分析...{mock_suffix}")

        raw: Optional[str] = None
        if config.USE_MOCK_LLM:
            raw = json.dumps(config.MOCK_SCHEMA_RESPONSE, ensure_ascii=False)
        else:
            try:
                # Only the unknown columns' sample values are passed on.
                unknown_sample = [
                    {c: row.get(c, "") for c in unknown_cols}
                    for row in sample_data
                ]
                raw = _call_llm(unknown_cols, unknown_sample)
            except Exception as e:
                # Best effort: an unavailable LLM degrades to manual review.
                llm_error = f"LLM 调用失败: {e}"

        if raw is not None:
            try:
                llm_data = json.loads(_extract_json(raw))
                if not isinstance(llm_data, dict):
                    raise ValueError("LLM 响应必须是 JSON 对象")
                llm_data.setdefault("composite_col", None)
                llm_data.setdefault("target_col", None)
                # Treat a missing or explicit-null list as "no irrelevant columns".
                if llm_data.get("irrelevant_cols") is None:
                    llm_data["irrelevant_cols"] = []

                # Keep only mappings for columns the LLM was asked about.
                # A non-dict value is passed through so validation rejects it.
                proposed_fm = llm_data.get("field_mapping", {})
                if isinstance(proposed_fm, dict):
                    proposed_fm = {
                        k: v for k, v in proposed_fm.items()
                        if k in unknown_cols
                    }
                _validate_response(
                    {**llm_data, "field_mapping": proposed_fm},
                    unknown_cols,
                )
            except (json.JSONDecodeError, ValueError, TypeError) as e:
                # TypeError covers malformed value types in the payload.
                llm_error = str(e)
            else:
                # Adopt the LLM output only after validation succeeded, so a
                # rejected response cannot leak illegal mappings downstream.
                llm_fm = proposed_fm
                llm_composite = llm_data.get("composite_col")
                llm_target = llm_data.get("target_col")
                llm_irrelevant = llm_data.get("irrelevant_cols", [])
    else:
        print("[Schema] 所有列已在列别名中,跳过 LLM")

    # ── Stage 3: merge — known aliases first, LLM fills the gaps ──
    fm = dict(known_fm)
    fm.update(llm_fm)

    composite_col = known_composite or llm_composite
    target_col = known_target or llm_target
    # Order-preserving de-duplication.
    irrelevant_cols = list(dict.fromkeys(known_irrelevant + llm_irrelevant))

    # ── Stage 4: detect columns nothing accounts for ──
    unrecognized = _get_unmapped_columns(
        columns, fm, composite_col, target_col, irrelevant_cols
    )

    # LLM failure → every column that was delegated to it is unrecognized.
    if llm_error and unknown_cols:
        unrecognized = sorted(set(unrecognized + unknown_cols))
        if not config.USE_MOCK_LLM:
            print(f"[Schema] {llm_error},{len(unrecognized)} 列待手动确认")

    result = {
        "field_mapping": fm,
        "composite_col": composite_col,
        "target_col": target_col,
        "irrelevant_cols": irrelevant_cols,
        "unrecognized_cols": unrecognized,
    }

    # ── Caching ──
    _cache[key] = result

    # Persist only fully resolved results.
    if use_cache and not unrecognized:
        mem_save_template_rule(fingerprint, result)

    return result
449
+
450
+
451
+ # ── 主数据列推断 ────────────────────────────────────────────────
452
+
453
+ MASTER_INFERENCE_SYSTEM = """\
454
+ You are a data schema matching expert. Given candidate columns from a master spreadsheet and a list of required but unfound canonical fields, determine which candidate column matches which canonical field.
455
+
456
+ ## Canonical Fields (for reference)
457
+ - product_name: 商品/菜品名称
458
+ - size: 规格/杯型
459
+ - milk_base: 奶底
460
+ - temperature: 温度/做法 (ice level, e.g. 正常冰/少冰/去冰/热/温热)
461
+ - sugar: 糖度 (sugar level, e.g. 全糖/七分糖/五分糖/无糖)
462
+ - tea_base: 茶底
463
+
464
+ ## Output Format
465
+ Return ONLY a JSON object. Each candidate column name is a key. Value is an object with:
466
+ - field: canonical field name (or null if cannot match)
467
+ - confidence: "high" (sure) or "low" (guess)
468
+ - reason: brief one-line explanation in Chinese
469
+
470
+ Example:
471
+ {"温度": {"field": "temperature", "confidence": "high", "reason": "样例值为标准冰温描述"}}"""
472
+
473
+ MASTER_INFERENCE_USER = """\
474
+ ## Required Canonical Fields (missing from spreadsheet)
475
+ {missing_fields}
476
+
477
+ ## Candidate Columns (with up to 5 sample values each)
478
+ {candidate_samples}
479
+
480
+ Match each candidate column to one of the required fields above. If a column clearly doesn't match any field, set field=null and confidence="low". Only match to fields listed in "Required Canonical Fields"."""
481
+
482
+ # LLM 推断 hook(测试用)
483
# Test seam: when not None, infer_master_columns() delegates to this callable
# instead of contacting the LLM.
_inference_hook: Optional[Callable[..., Any]] = None


def set_inference_hook(hook: Optional[Callable[..., Any]]) -> None:
    """Install a custom master-column inference callback (intended for tests).

    The callback is invoked as ``hook(candidate_cols, sample_data,
    missing_fields)`` and its return value is returned unchanged by
    ``infer_master_columns``. Pass ``None`` to restore the default behavior.
    """
    global _inference_hook
    _inference_hook = hook
490
+
491
+
492
def infer_master_columns(
    candidate_cols: List[str],
    sample_data: dict,
    missing_fields: List[str],
) -> Dict[str, Dict[str, str]]:
    """Use the LLM to match candidate columns to missing canonical fields.

    Args:
        candidate_cols: Candidate column names (not yet covered by the
            existing validation).
        sample_data: ``{col_name: [sample_value, ...]}`` sample values per
            column.
        missing_fields: Canonical field names that were not found
            (e.g. ``["temperature"]``).

    Returns:
        ``{col_name: {"field": <canonical field or None>,
        "confidence": "high" | "low", "reason": "..."}}`` with one entry per
        candidate column. ``field`` is only ever one of ``missing_fields``.
        Only "high" entries are meant to be auto-mapped; "low" entries and
        ``field=None`` should go through interactive confirmation. Returns
        ``{}`` when either ``candidate_cols`` or ``missing_fields`` is empty.
        When the LLM call fails or its reply is not a JSON object, every
        column is returned with ``field=None`` and low confidence.
    """
    if not candidate_cols or not missing_fields:
        return {}

    # ── Test mode: an injected hook takes precedence ──
    if _inference_hook is not None:
        return _inference_hook(candidate_cols, sample_data, missing_fields)

    def _all_low(reason: str) -> Dict[str, Dict[str, Any]]:
        """Fallback result: no match for any candidate column."""
        return {
            col: {"field": None, "confidence": "low", "reason": reason}
            for col in candidate_cols
        }

    # ── Candidate columns with their sample values ──
    candidate_lines = []
    for col in candidate_cols:
        vals = sample_data.get(col, [])
        val_str = ", ".join(str(v) for v in vals) if vals else "(空)"
        candidate_lines.append(f" Column「{col}」: {val_str}")

    # ── Descriptions of the missing fields ──
    field_descriptions = {
        "product_name": "product_name: 商品/菜品名称",
        "size": "size: 规格/杯型",
        "milk_base": "milk_base: 奶底(如 牛奶/燕麦奶/厚乳/椰乳)",
        "temperature": "temperature: 温度/做法(如 正常冰/少冰/去冰/热/温热)",
        "sugar": "sugar: 糖度(如 全糖/七分糖/五分糖/无糖)",
        "tea_base": "tea_base: 茶底(如 红茶/绿茶/乌龙茶)",
    }
    missing_lines = [
        f" - {field_descriptions.get(f, f)}"
        for f in missing_fields
    ]

    # ── Call the LLM ──
    user_prompt = MASTER_INFERENCE_USER.format(
        missing_fields="\n".join(missing_lines),
        candidate_samples="\n".join(candidate_lines),
    )

    try:
        client = _get_client()
        response = client.chat.completions.create(
            model=config.DEEPSEEK_MODEL,
            messages=[
                {"role": "system", "content": MASTER_INFERENCE_SYSTEM},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            max_tokens=512,
        )
        raw = response.choices[0].message.content or "{}"
    except Exception:
        # Deliberate best effort: LLM unavailable → everything low confidence.
        return _all_low("LLM 调用失败")

    # ── Parse ──
    try:
        data = json.loads(_extract_json(raw))
    except json.JSONDecodeError:
        return _all_low("LLM 返回无法解析")
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a list): there is nothing to
        # look columns up in, so treat it like an unparsable reply.
        return _all_low("LLM 返回无法解析")

    # ── Normalize the output format ──
    result: Dict[str, Dict[str, Any]] = {}
    for col in candidate_cols:
        entry = data.get(col)
        if not isinstance(entry, dict):
            result[col] = {"field": None, "confidence": "low", "reason": "LLM 未返回此列"}
            continue

        field = entry.get("field")
        reason = entry.get("reason", "")
        if field in missing_fields:
            # Anything other than an explicit "high" is downgraded to "low".
            confidence = "high" if entry.get("confidence", "low") == "high" else "low"
            result[col] = {"field": field, "confidence": confidence, "reason": reason}
        else:
            result[col] = {"field": None, "confidence": "low",
                           "reason": reason or "LLM 未给出有效匹配"}

    return result
592
+
593
+
594
def analyze_from_dataframe(
    df: "pd.DataFrame",
    sample_rows: int = 3,
) -> Dict[str, Any]:
    """Convenience wrapper: analyze the schema of a template DataFrame.

    Args:
        df: The template DataFrame.
        sample_rows: Number of leading rows handed to ``analyze`` as samples.

    Returns:
        Same structure as ``analyze()``.
    """
    leading_records = df.head(sample_rows).to_dict(orient="records")
    return analyze(list(df.columns), leading_records)
610
+
611
+
612
+ # ── 自测 ────────────────────────────────────────────────────────
613
+
614
# Self-test entry point. Runs the analyzer end to end in mock mode.
#
# The self-test resets the persisted memory several times, so the real
# memory file is backed up BEFORE the first analyze() call and restored in a
# ``finally`` block; a crash halfway through therefore cannot leave the user's
# memory wiped or polluted, and the mock response / inference hook are always
# put back.
if __name__ == "__main__":
    import importlib
    import os
    import shutil as _shutil

    import pandas as pd

    # Force mock mode; config must be reloaded so USE_MOCK_LLM is re-read.
    # This module reads config attributes dynamically, so it does not need to
    # be re-imported itself.
    os.environ["USE_MOCK_LLM"] = "1"
    importlib.reload(config)

    from menupilot.data.memory import add_column_alias
    from menupilot.data.memory import get_template_rule as mem_get_rule
    from menupilot.data.memory import reset_memory

    passed = 0
    failed = 0

    def check(condition, msg):
        """Record and print one PASS/FAIL line."""
        global passed, failed
        if condition:
            passed += 1
            print(f" PASS {msg}")
        else:
            failed += 1
            print(f" FAIL {msg}")

    print("=== Schema Analyzer 自测(Mock 模式)===\n")

    # ── Back up the real memory.json before anything touches it ──
    _mem_path = os.path.expanduser("~/.menupilot/memory.json")
    _mem_backup = None
    if os.path.exists(_mem_path):
        _mem_backup = _mem_path + ".self_test_backup"
        _shutil.copy(_mem_path, _mem_backup)

    original_mock = config.MOCK_SCHEMA_RESPONSE

    try:
        # Start from a clean slate so results do not depend on user data.
        reset_memory()
        reset_cache()

        # ── Test data: a realistic 4-column template ──
        template_df = pd.DataFrame({
            "菜品名称": ["五黄高纤慢养瓶", "五黄高纤慢养瓶"],
            "规格": ["五角瓶", "五角瓶"],
            "口味做法组合": ["红茶, 十二分糖, 温热", "红茶, 十二分糖, 正常冰"],
            "配料": ["", ""],
        })

        columns = list(template_df.columns)
        sample_data = template_df.head(3).to_dict(orient="records")

        # ── 1. Basic analysis ──
        print("1. analyze 基本分析(Mock)")
        result = analyze(columns, sample_data)
        check(isinstance(result, dict), "返回 dict")
        check("field_mapping" in result, "包含 field_mapping")
        check("composite_col" in result, "包含 composite_col")
        check("target_col" in result, "包含 target_col")
        check("irrelevant_cols" in result, "包含 irrelevant_cols")
        print()

        # ── 2. field_mapping contents ──
        print("2. field_mapping 内容验证")
        fm = result["field_mapping"]
        check(fm.get("菜品名称") == "product_name", "菜品名称 → product_name")
        check(fm.get("规格") == "size", "规格 → size")
        # The composite column must not appear in field_mapping.
        check("口味做法组合" not in fm, "复合列不在 field_mapping 中")
        print()

        # ── 3. composite_col / target_col ──
        print("3. composite_col / target_col")
        check(result["composite_col"] == "口味做法组合", "composite_col = 口味做法组合")
        check(result["target_col"] == "配料", "target_col = 配料")
        print()

        # ── 4. irrelevant_cols ──
        print("4. irrelevant_cols")
        check(isinstance(result["irrelevant_cols"], list), "irrelevant_cols 是 list")
        check(len(result["irrelevant_cols"]) == 0, "本次无无关列")
        print()

        # ── 5. Cache ──
        print("5. 缓存测试")
        reset_cache()
        result1 = analyze(columns, sample_data)
        result2 = analyze(columns, sample_data)  # second call should hit the cache
        check(result1 == result2, "相同 columns 命中缓存(结果一致)")

        # Different columns must not hit the cache.
        result3 = analyze(columns + ["额外列"], [])
        check(result3 is not result1, "不同 columns 不命中缓存")
        reset_cache()
        print()

        # ── 6. analyze_from_dataframe ──
        print("6. analyze_from_dataframe 便捷方法")
        result_df = analyze_from_dataframe(template_df)
        check(result_df["composite_col"] == "口味做法组合", "DataFrame 输入正常")
        print()

        # ── 7. Empty columns must raise ──
        print("7. 空 columns 异常处理")
        try:
            analyze([])
            check(False, "空 columns 应抛异常")
        except ValueError as e:
            check("不能为空" in str(e), f"ValueError: {e}")
        print()

        # ── 8. Template with irrelevant columns ──
        print("8. 含无关列的模板分析")
        df_with_extra = pd.DataFrame({
            "序号": [1, 2],
            "商品名": ["珍珠奶茶", "椰果奶茶"],
            "杯型": ["大杯", "中杯"],
            "配料": ["", ""],
            "备注": ["", ""],
        })
        # Override the mock response to exercise irrelevant_cols.
        config.MOCK_SCHEMA_RESPONSE = {
            "field_mapping": {"商品名": "product_name", "杯型": "size"},
            "composite_col": None,
            "target_col": "配料",
            "irrelevant_cols": ["序号", "备注"],
        }
        result_extra = analyze_from_dataframe(df_with_extra)
        check("序号" in result_extra["irrelevant_cols"], "序号 被标为无关列")
        check("备注" in result_extra["irrelevant_cols"], "备注 被标为无关列")
        check(result_extra["composite_col"] is None, "无复合列 → None")
        config.MOCK_SCHEMA_RESPONSE = original_mock
        print()

        # ── 9. Template without a composite column ──
        print("9. 无复合列的模板")
        config.MOCK_SCHEMA_RESPONSE = {
            "field_mapping": {
                "品名": "product_name",
                "杯型": "size",
                "奶底": "milk_base",
                "温度": "temperature",
                "糖度": "sugar",
            },
            "composite_col": None,
            "target_col": "SOP",
            "irrelevant_cols": [],
        }
        result_no_composite = analyze(
            ["品名", "杯型", "奶底", "温度", "糖度", "SOP"],
            [{"品名": "测试", "杯型": "中杯", "奶底": "牛奶", "温度": "少冰", "糖度": "七分糖", "SOP": ""}],
        )
        check(result_no_composite["composite_col"] is None, "无复合列 → composite_col=None")
        check(len(result_no_composite["field_mapping"]) == 5, "5 个字段被映射")
        check(result_no_composite["target_col"] == "SOP", "target_col = SOP")
        check(result_no_composite["irrelevant_cols"] == [], "无无关列")
        config.MOCK_SCHEMA_RESPONSE = original_mock
        print()

        # ── 10. Fingerprint cache: first run (mock call + write to memory) ──
        print("10. 指纹持久化缓存 — 第一次运行(Mock 调用 + 写入记忆)")
        reset_cache()
        reset_memory()

        columns_a = ["菜品名称", "规格", "口味做法组合", "配料"]
        # Expected fingerprint for this template.
        expected_fp = _template_fingerprint(columns_a)

        # First run: memory miss, goes through the mock flow.
        result_a1 = analyze(columns_a, [], use_cache=True)
        check(isinstance(result_a1, dict), "第一次返回 dict")
        check(result_a1.get("composite_col") == "口味做法组合", "内容正确")

        # The result must now be in memory.
        cached_a = mem_get_rule(expected_fp)
        check(cached_a is not None, "第一次运行后记忆中有缓存")
        check(cached_a.get("composite_col") == "口味做法组合", "记忆缓存内容正确")
        print()

        # ── 11. Second run of the same template → cache hit, no mock call ──
        print("11. 第二次运行同一模板 → 缓存命中(免 LLM)")
        # Change the mock response: if the mock flow ran, the result would
        # differ, which proves whether the cache was really hit.
        config.MOCK_SCHEMA_RESPONSE = {
            "field_mapping": {"X": "product_name"},  # a different response
            "composite_col": None,
            "target_col": None,
            "irrelevant_cols": [],
        }

        # Clear the in-process cache so only the persisted memory can answer.
        reset_cache()
        result_a2 = analyze(columns_a, [], use_cache=True)

        # Must match the first run, not the altered mock.
        check(result_a2.get("composite_col") == "口味做法组合",
              "第二次命中记忆缓存 → composite_col 与第一次一致")
        check(result_a2.get("field_mapping") == result_a1.get("field_mapping"),
              "field_mapping 与第一次完全一致")
        config.MOCK_SCHEMA_RESPONSE = original_mock
        print()

        # ── 12. Column names changed → different fingerprint → new call ──
        print("12. 模板列名变化 → 不同指纹 → 重新调用")
        # The mock must cover the new "备注" column, otherwise
        # unrecognized_cols is non-empty and nothing is persisted.
        config.MOCK_SCHEMA_RESPONSE = {
            "field_mapping": {"菜品名称": "product_name", "规格": "size"},
            "composite_col": "口味做法组合",
            "target_col": "配料",
            "irrelevant_cols": ["备注"],
        }
        columns_b = ["菜品名称", "规格", "口味做法组合", "配料", "备注"]
        fp_b = _template_fingerprint(columns_b)
        check(fp_b != expected_fp, f"不同列名 → 不同指纹 ({fp_b[:12]}... ≠ {expected_fp[:12]}...)")

        cached_b_before = mem_get_rule(fp_b)
        check(cached_b_before is None, "新模板指纹初始无缓存")

        result_b = analyze(columns_b, [], use_cache=True)
        check(isinstance(result_b, dict), "新模板分析成功")
        check(result_b["unrecognized_cols"] == [], "全识别 → unrecognized_cols 为空")

        cached_b_after = mem_get_rule(fp_b)
        check(cached_b_after is not None, "新模板结果已写入记忆")
        config.MOCK_SCHEMA_RESPONSE = original_mock
        print()

        # ── 13. Memory cleared manually → behaves like a first run ──
        print("13. 手动清空记忆 → 退化为首次运行")
        reset_memory()
        check(mem_get_rule(expected_fp) is None, "清空后原指纹无缓存")

        reset_cache()
        result_a3 = analyze(columns_a, [], use_cache=True)
        check(isinstance(result_a3, dict), "清空记忆后仍正常运行(Mock 兜底)")
        # The result is written again after the reset.
        cached_a3 = mem_get_rule(expected_fp)
        check(cached_a3 is not None, "清空后再运行 → 重新写入记忆")
        check(cached_a3.get("composite_col") == "口味做法组合", "重新写入内容正确")
        print()

        # ── 14. _template_fingerprint determinism ──
        print("14. _template_fingerprint 确定性验证")
        cols_unsorted = ["配料", "规格", "口味做法组合", "菜品名称"]
        fp_unsorted = _template_fingerprint(cols_unsorted)
        check(fp_unsorted == expected_fp, f"列名顺序不影响指纹 ({fp_unsorted[:12]}... = {expected_fp[:12]}...)")
        print()

        # ── 15. _get_unmapped_columns ──
        print("15. _get_unmapped_columns 覆盖率检测")
        check(_get_unmapped_columns(
            ["A", "B", "C"], {"A": "product_name"}, None, None, []
        ) == ["B", "C"], "仅 A 被覆盖 → B, C 未识别")
        check(_get_unmapped_columns(
            ["A", "B", "C"], {"A": "product_name"}, "B", None, ["C"]
        ) == [], "A(fm) + B(composite) + C(irrelevant) → 全覆盖")
        check(_get_unmapped_columns(
            [], {}, None, None, []
        ) == [], "空列名 → 空未识别")
        print()

        # ── 16. Pre-LLM column alias injection ──
        print("16. pre-LLM 列别名注入(column_aliases 记忆)")
        reset_memory()
        reset_cache()

        # Preset column aliases.
        add_column_alias("菜品名称", "product_name")
        add_column_alias("备注", "ignore")

        # The mock response only covers the unknown columns the LLM sees.
        config.MOCK_SCHEMA_RESPONSE = {
            "field_mapping": {"规格": "size"},
            "composite_col": "口味做法组合",
            "target_col": None,
            "irrelevant_cols": [],
        }

        result16 = analyze(
            ["菜品名称", "规格", "口味做法组合", "备注", "配料"],
            [{"菜品名称": "测试", "规格": "中杯", "口味做法组合": "牛奶,少冰",
              "备注": "备注内容", "配料": ""}],
        )
        # "菜品名称" is covered by an alias → not sent to the LLM.
        # "备注" → ignore → irrelevant_cols.
        # The remaining ["规格", "口味做法组合", "配料"] go to the LLM
        # (the mock covers the first two).
        check(result16["field_mapping"].get("菜品名称") == "product_name",
              "别名注入: 菜品名称 → product_name")
        check(result16["field_mapping"].get("规格") == "size",
              "LLM 补充: 规格 → size")
        check(result16["composite_col"] == "口味做法组合",
              "LLM 补充: composite_col")
        check("备注" in result16["irrelevant_cols"],
              "别名注入: 备注 → ignore → irrelevant_cols")
        # "配料" is covered neither by an alias nor by the LLM → unrecognized.
        check("配料" in result16["unrecognized_cols"],
              f"配料 未被识别(实际 unrecognized: {result16['unrecognized_cols']})")
        check(result16["unrecognized_cols"] == ["配料"],
              "仅 配料 未识别")
        print()

        # ── 17. Incomplete results are not persisted ──
        print("17. 不完整结果(unrecognized_cols 非空)→ 不写入磁盘指纹缓存")
        fp17 = _template_fingerprint(["菜品名称", "规格", "口味做法组合", "备注", "配料"])
        cached17 = mem_get_rule(fp17)
        check(cached17 is None,
              f"存在未识别列时不写入缓存(实际: {cached17 is not None})")

        # A fully resolved result must be persisted — cover every column
        # with column aliases first.
        reset_memory()
        reset_cache()
        for col, field in [
            ("菜品名称", "product_name"),
            ("规格", "size"),
            ("口味做法组合", "composite_col"),
            ("备注", "ignore"),
            ("配料", "sop"),
        ]:
            add_column_alias(col, field)
        config.MOCK_SCHEMA_RESPONSE = {}
        result17 = analyze(
            ["菜品名称", "规格", "口味做法组合", "备注", "配料"],
            [{"菜品名称": "测试", "规格": "中杯", "口味做法组合": "牛奶,少冰",
              "备注": "备注内容", "配料": ""}],
        )
        check(result17["unrecognized_cols"] == [],
              f"列别名全覆盖 → unrecognized 为空(实际: {result17['unrecognized_cols']})")
        check(result17["target_col"] == "配料",
              "别名 '配料' → sop → target_col")
        check(result17["irrelevant_cols"] == ["备注"],
              "别名 '备注' → ignore → irrelevant_cols")
        cached17b = mem_get_rule(fp17)
        check(cached17b is not None,
              "全识别后写入缓存")
        print()

        # ── 18. unrecognized_cols in the return value ──
        print("18. unrecognized_cols 返回值验证")
        reset_memory()
        reset_cache()
        config.MOCK_SCHEMA_RESPONSE = {
            "field_mapping": {"A": "product_name"},
            "composite_col": None,
            "target_col": None,
            "irrelevant_cols": [],
        }
        result18 = analyze(["A", "B", "C"], [{"A": "x", "B": "y", "C": "z"}])
        check("unrecognized_cols" in result18, "返回值含 unrecognized_cols")
        check(isinstance(result18["unrecognized_cols"], list),
              "unrecognized_cols 是 list")
        check(set(result18["unrecognized_cols"]) == {"B", "C"},
              f"B, C 未识别(实际 {result18['unrecognized_cols']})")
        print()

        # ── 19. infer_master_columns (mock hook mode) ──
        print("19. infer_master_columns(LLM 主数据列推断)")

        def mock_inference(candidate_cols, sample_data, missing_fields):
            """Mock LLM: 温度 → temperature (high), Unnamed → null (low)."""
            result = {}
            for col in candidate_cols:
                if "温度" in col or "做法" in col:
                    result[col] = {"field": "temperature", "confidence": "high",
                                   "reason": "样例值为冰温描述"}
                elif "Unnamed" in col:
                    result[col] = {"field": None, "confidence": "low",
                                   "reason": "列数据为空"}
                else:
                    result[col] = {"field": None, "confidence": "low",
                                   "reason": "无法判断"}
            return result

        set_inference_hook(mock_inference)

        sample = {"温度": ["少冰", "去冰", "正常冰", "热", "温热"],
                  "Unnamed: 2": ["", ""],
                  "代码": ["T240", "T265"]}

        result19 = infer_master_columns(
            ["温度", "Unnamed: 2", "代码"],
            sample,
            ["temperature"],
        )
        check(result19["温度"]["confidence"] == "high",
              f"温度→high(实际 {result19['温度']['confidence']})")
        check(result19["温度"]["field"] == "temperature",
              f"温度→temperature(实际 {result19['温度']['field']})")
        check(result19["Unnamed: 2"]["confidence"] == "low",
              f"Unnamed→low(实际 {result19['Unnamed: 2']['confidence']})")
        check(result19["Unnamed: 2"]["field"] is None,
              "Unnamed: 2 → field=None")
        check(result19["代码"]["confidence"] == "low",
              "无法判断 → low")
        # Empty inputs.
        check(infer_master_columns([], {}, []) == {}, "空候选/空缺失 → 空结果")
        check(infer_master_columns(["A"], {"A": ["x"]}, []) == {}, "空缺失字段 → 空结果")
        print()
    finally:
        # ── Cleanup: always undo every global change made above ──
        set_inference_hook(None)
        config.MOCK_SCHEMA_RESPONSE = original_mock
        reset_memory()

        # ── Restore the real memory.json ──
        if _mem_backup:
            from menupilot.data.memory import reload as _mem_reload
            _shutil.move(_mem_backup, _mem_path)
            _mem_reload()

    # ── Summary ──
    print(f"=== 结果: {passed} passed, {failed} failed ===")