MenuPilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
1
+ """
2
+ 模板预处理层 — 识别模板类型并标准化行结构。
3
+
4
+ 在 Excel 读取后、Schema Analyzer 之前执行。
5
+ - standard 类型:透明通过,不影响现有流程
6
+ - chowbus 类型:收集散列字段,输出标准行结构
7
+ """
8
+
9
+ import re
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import pandas as pd
13
+
14
+
15
def contains_chinese(value: Any) -> bool:
    """Report whether ``value`` contains at least one Chinese character.

    Args:
        value: Any cell value (``None``, a number, a string, ...).

    Returns:
        ``True`` when the stripped string form of ``value`` holds at least
        one character in the range 一-鿿. ``None``, ``int``/``float``
        instances and blank strings always yield ``False``.
    """
    # Missing cells and plain numbers can never carry Chinese text.
    if value is None or isinstance(value, (int, float)):
        return False

    text = str(value).strip()
    if text:
        return re.search(r"[一-鿿]", text) is not None
    return False
32
+
33
+
34
def _clean_value(value: Any) -> str:
    """Return the plain-text part of a cell, dropping any ``|`` suffix marker.

    Cells may carry a trailing marker such as ``|推荐`` or ``|必选``; only the
    text before the first ``|`` is kept.

    Args:
        value: Raw cell value.

    Returns:
        The whitespace-stripped text before the first ``|``, or ``""`` when
        ``value`` is ``None``.
    """
    if value is None:
        return ""
    # partition() yields the whole string as the head when no "|" is present.
    head, _sep, _marker = str(value).partition("|")
    return head.strip()
50
+
51
+
52
def detect_template_type(df: pd.DataFrame) -> str:
    """Classify a template sheet as ``"chowbus"`` or ``"standard"``.

    A sheet is reported as ``"chowbus"`` only when every check passes:
      1. It has at least three rows.
      2. The first row holds at least three identifier-style English names
         (letters, digits and underscores, not starting with a digit).
      3. The second row contains Chinese text (the field annotations).
      4. The first row has a cell equal to ``item_cn``.
      5. The first row has at least one ``customization{N}_id`` cell.

    Args:
        df: Template DataFrame whose first two rows are the English header
            and its Chinese annotations (i.e. read with ``header=None``).

    Returns:
        ``"chowbus"`` when all checks pass, otherwise ``"standard"``.
    """
    row_count, col_count = df.shape
    if row_count < 3:
        return "standard"

    def row_as_text(row_idx: int) -> List[str]:
        # Missing cells become "" so they never match any pattern below.
        cells = (df.iloc[row_idx, col] for col in range(col_count))
        return [str(cell) if pd.notna(cell) else "" for cell in cells]

    header = row_as_text(0)
    annotations = row_as_text(1)

    # Check: at least three identifier-style English header names.
    identifier = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
    english_count = sum(1 for name in header if identifier.match(name))
    if english_count < 3:
        return "standard"

    # Check: the annotation row contains Chinese characters somewhere.
    if not contains_chinese("".join(annotations)):
        return "standard"

    # Check: an item_cn column exists.
    if "item_cn" not in header:
        return "standard"

    # Check: at least one customization{N}_id column exists.
    if not any(re.match(r"^customization\d+_id$", name) for name in header):
        return "standard"

    return "chowbus"
94
+
95
+
96
def collect_chowbus_rows(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Flatten the scattered option columns of a chowbus sheet into rows.

    For every data row (rows 0 and 1 are the English header and its Chinese
    annotations and are skipped):
      1. The ``item_cn`` cell becomes ``product_name``; rows where it is
         missing or blank are skipped.
      2. Every column to the right of ``item_cn`` is scanned; missing cells
         are ignored and ``|...`` suffix markers are removed.
      3. Cleaned values that contain Chinese characters are joined with
         ``", "`` into ``composite_info``.

    Args:
        df: chowbus template DataFrame whose first row is the English header
            (i.e. read with ``header=None``).

    Returns:
        A list such as
        ``[{"product_name": "...", "composite_info": "红茶, 燕麦奶, 正常冰"}, ...]``.
        ``composite_info`` is ``""`` when no Chinese value was found.

    Raises:
        ValueError: If the first row has no ``item_cn`` cell, including the
            case where the sheet has no rows at all.
    """
    n_rows, n_cols = df.shape

    # A sheet without rows has no header to inspect; treat it as "no item_cn
    # column" instead of letting iloc fail with an opaque IndexError.
    header: List[str] = []
    if n_rows:
        header = [
            str(df.iloc[0, c]) if pd.notna(df.iloc[0, c]) else ""
            for c in range(n_cols)
        ]

    try:
        item_cn_col = header.index("item_cn")  # first match wins
    except ValueError:
        raise ValueError("chowbus 模板缺少 item_cn 列") from None

    rows: List[Dict[str, Any]] = []
    for row_idx in range(2, n_rows):  # skip English header + Chinese annotations
        raw_name = df.iloc[row_idx, item_cn_col]
        if pd.isna(raw_name):
            continue
        product_name = str(raw_name).strip()
        if not product_name:
            continue

        # Scan only the columns to the right of item_cn for Chinese values.
        chinese_values = []
        for c in range(item_cn_col + 1, n_cols):
            val = df.iloc[row_idx, c]
            if pd.isna(val):
                continue
            cleaned = _clean_value(val)
            if contains_chinese(cleaned):
                chinese_values.append(cleaned)

        rows.append({
            "product_name": product_name,
            "composite_info": ", ".join(chinese_values),
        })

    return rows
147
+
148
+
149
+ # ── 自测 ────────────────────────────────────────────────────────
150
+
151
if __name__ == "__main__":
    # Lightweight self-test: runs a series of checks, prints PASS/FAIL per
    # check, prints a summary, and exits with status 1 if any check failed.
    passed = 0
    failed = 0

    def check(condition, msg):
        """Record one check result and print a PASS/FAIL line for it."""
        global passed, failed
        if condition:
            passed += 1
            print(f" PASS {msg}")
        else:
            failed += 1
            print(f" FAIL {msg}")

    print("=== Template Preprocessor 自测 ===\n")

    # ── 1. contains_chinese ──
    print("1. contains_chinese()")
    check(contains_chinese("红茶"), "红茶 → True")
    check(contains_chinese("正常冰"), "正常冰 → True")
    check(not contains_chinese("abc123"), "abc123 → False")
    check(not contains_chinese(None), "None → False")
    check(not contains_chinese(123), "数字 123 → False")
    check(not contains_chinese(""), "空字符串 → False")
    check(not contains_chinese("cpid_250164"), "cpid_250164 → False")
    check(contains_chinese("五角瓶|推荐"), "五角瓶|推荐 → True")
    print()

    # ── 2. _clean_value ──
    print("2. _clean_value()")
    check(_clean_value("五角瓶|推荐") == "五角瓶", "去除|推荐后缀")
    check(_clean_value("正常冰") == "正常冰", "正常值不变")
    check(_clean_value(None) == "", "None → 空字符串")
    print()

    # ── 3. detect_template_type ──
    print("3. detect_template_type()")
    # Standard template (single header row, held as column labels).
    df_standard = pd.DataFrame({
        "菜品名称": ["浅浅清茶"],
        "规格": ["中杯"],
        "口味做法组合": ["牛奶, 少冰, 七分糖"],
        "配料": [""],
    })
    check(detect_template_type(df_standard) == "standard", "标准模板 → standard")

    # chowbus template (two header rows stored as data rows).
    df_chowbus = pd.DataFrame([
        ["terminal_en", "terminal_cn", "item_cn", "customization1_id", "customization1_option_id"],
        ["// 英文", "// 中文", "// 商品", "// 定制ID", "// 定制选项"],
        ["POS", "POS", "五黄高纤慢养瓶", "cpid_250164", None],
    ])
    check(detect_template_type(df_chowbus) == "chowbus", "chowbus 模板 → chowbus")

    # No customization{N}_id column.
    df_no_cust = pd.DataFrame([
        ["terminal_en", "terminal_cn", "item_cn"],
        ["// 英文", "// 中文", "// 商品"],
        ["POS", "POS", "测试商品"],
    ])
    check(detect_template_type(df_no_cust) == "standard", "缺少 customization → standard")

    # No item_cn column.
    df_no_item = pd.DataFrame([
        ["terminal_en", "terminal_cn", "customization1_id"],
        ["// 英文", "// 中文", "// 定制"],
        ["POS", "POS", "cpid_123"],
    ])
    check(detect_template_type(df_no_item) == "standard", "缺少 item_cn → standard")
    print()

    # ── 4. collect_chowbus_rows ──
    print("4. collect_chowbus_rows()")
    df_data = pd.DataFrame([
        ["terminal_en", "item_cn", "customization1_id", "customization1_option_id",
         "customization2_id", "customization2_option_id", "other_en"],
        ["// 英文", "// 商品名", "// 尺寸ID", "// 尺寸选项",
         "// 温度ID", "// 温度选项", "// 其他"],
        ["POS", "五黄高纤慢养瓶", "id_123", None, "id_456", "正常冰|推荐", "abc"],
        ["POS", "五黄高纤慢养瓶", "id_123", "五角瓶", "id_456", "少冰", "def"],
        ["POS", None, "id_789", "中杯", "id_012", "去冰", "ghi"],  # empty item_cn → skipped
        ["POS", "珍珠奶茶", "id_789", "大杯", "id_012", "热", "ghi"],
    ])

    rows = collect_chowbus_rows(df_data)
    check(len(rows) == 3, f"收集 3 行(实际 {len(rows)})")
    check(rows[0]["product_name"] == "五黄高纤慢养瓶", "第1行 product_name 正确")
    check("正常冰" in rows[0]["composite_info"], "第1行 composite_info 含 正常冰")
    check("五角瓶" in rows[1]["composite_info"], "第2行 composite_info 含 五角瓶")
    check(rows[2]["product_name"] == "珍珠奶茶", "第3行 product_name 正确")
    print()

    # ── 5. Chinese values left of item_cn are not collected ──
    print("5. item_cn 之前的中文值不被收集")
    df_before = pd.DataFrame([
        ["menu_cn", "item_cn", "customization1_option_id"],
        ["// 菜单名", "// 商品", "// 尺寸"],
        ["超级菜单", "测试商品", "大杯"],
    ])
    rows_before = collect_chowbus_rows(df_before)
    check(len(rows_before) == 1, "1 行")
    check("超级菜单" not in rows_before[0]["composite_info"],
          "item_cn 之前的 '超级菜单' 未被收集")
    check("大杯" in rows_before[0]["composite_info"],
          "item_cn 之后的 '大杯' 被收集")
    print()

    # ── 6. Purely numeric / purely English values are skipped ──
    print("6. 纯数字/纯英文值被跳过")
    df_skip = pd.DataFrame([
        ["item_cn", "custom1_opt", "custom2_opt", "custom3_opt", "custom4_opt", "custom5_opt"],
        ["// 商品", "// 尺寸", "// 温度", "// 糖度", "// 奶底", "// 选项ID"],
        ["测试商品", "大杯", "cpid_250164", "正常冰", "12345", "燕麦奶"],
    ])
    rows_skip = collect_chowbus_rows(df_skip)
    check(len(rows_skip) == 1, "1 行")
    info = rows_skip[0]["composite_info"]
    check("大杯" in info, "中文 '大杯' 被收集")
    check("正常冰" in info, "中文 '正常冰' 被收集")
    check("燕麦奶" in info, "中文 '燕麦奶' 被收集")
    check("cpid_250164" not in info, "纯英文 'cpid_250164' 被跳过")
    check("12345" not in info, "纯数字 '12345' 被跳过")
    check("测试商品" not in info, "item_cn 列本身不被收集")
    print()

    # ── 7. Rows with an empty item_cn are skipped ──
    print("7. item_cn 为空的行被跳过")
    df_empty = pd.DataFrame([
        ["item_cn", "custom1_opt"],
        ["// 商品", "// 尺寸"],
        [None, "大杯"],
        ["", "中杯"],
        ["有效商品", "小杯"],
    ])
    rows_empty = collect_chowbus_rows(df_empty)
    check(len(rows_empty) == 1, f"仅 1 行有效(实际 {len(rows_empty)})")
    check(rows_empty[0]["product_name"] == "有效商品", "有效商品行被收集")
    print()

    # ── Summary ──
    print(f"=== 结果: {passed} passed, {failed} failed ===")
    # Signal failure to the shell / CI instead of always exiting with 0.
    if failed:
        raise SystemExit(1)