taiwan-invoice-skill 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Taiwan Invoice Skill - BM25 Search Engine
4
+ 基於 UIUX Pro Max 架構,針對電子發票數據優化
5
+
6
+ 無外部依賴,純 Python 實現 BM25 搜索算法
7
+ """
8
+
9
+ import csv
10
+ import math
11
+ import re
12
+ import os
13
+ from typing import List, Dict, Any, Optional, Tuple
14
+
15
# Locate the data directory: a "data" folder that sits next to the directory
# containing this script (i.e. <parent of script dir>/data).
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
DATA_DIR = os.path.join(os.path.split(SCRIPT_DIR)[0], 'data')
18
+
19
# CSV configuration: for each search domain, the CSV file to read, the columns
# whose text is indexed for searching, and the columns returned in results.
CSV_CONFIG = {
    'provider': {
        'file': 'providers.csv',
        'search_cols': [
            'provider',
            'display_name',
            'auth_method',
            'features',
        ],
        'output_cols': [
            'provider',
            'display_name',
            'auth_method',
            'encryption',
            'test_merchant_id',
            'features',
        ],
    },
    'operation': {
        'file': 'operations.csv',
        'search_cols': [
            'operation',
            'operation_zh',
            'notes',
        ],
        'output_cols': [
            'operation',
            'operation_zh',
            'ecpay_b2c_endpoint',
            'smilepay_endpoint',
            'amego_endpoint',
            'required_fields',
            'notes',
        ],
    },
    'error': {
        'file': 'error-codes.csv',
        'search_cols': [
            'provider',
            'code',
            'message_zh',
            'message_en',
            'category',
            'solution',
        ],
        'output_cols': [
            'provider',
            'code',
            'message_zh',
            'category',
            'solution',
        ],
    },
    'field': {
        'file': 'field-mappings.csv',
        'search_cols': [
            'field_name',
            'description',
            'ecpay_name',
            'smilepay_name',
            'amego_name',
            'notes',
        ],
        'output_cols': [
            'field_name',
            'description',
            'ecpay_name',
            'smilepay_name',
            'amego_name',
            'type',
            'required_b2c',
            'required_b2b',
        ],
    },
    'tax': {
        'file': 'tax-rules.csv',
        'search_cols': [
            'invoice_type',
            'tax_type',
            'notes',
        ],
        'output_cols': [
            'invoice_type',
            'tax_type',
            'tax_rate',
            'sales_amount_formula',
            'tax_amount_formula',
            'example_total',
            'example_sales',
            'example_tax',
        ],
    },
    'troubleshoot': {
        'file': 'troubleshooting.csv',
        'search_cols': [
            'issue',
            'symptom',
            'cause',
            'solution',
            'provider',
            'category',
        ],
        'output_cols': [
            'issue',
            'symptom',
            'cause',
            'solution',
            'provider',
            'severity',
        ],
    },
    'reasoning': {
        'file': 'reasoning.csv',
        'search_cols': [
            'scenario',
            'recommended_provider',
            'reason',
            'decision_rules',
            'use_cases',
        ],
        'output_cols': [
            'scenario',
            'recommended_provider',
            'confidence',
            'reason',
            'anti_patterns',
            'use_cases',
        ],
    },
}
57
+
58
# Keywords used to auto-detect which domain a query belongs to. Each keyword
# is matched as a lowercase substring of the query by detect_domain().
DOMAIN_KEYWORDS = {
    'provider': [
        'ecpay', '綠界', 'smilepay', '速買配', 'amego', '光貿',
        'provider', '加值中心', '服務商',
    ],
    'operation': [
        'issue', 'void', 'allowance', '開立', '作廢', '折讓',
        '列印', 'print', 'query', '查詢', 'endpoint', 'api',
    ],
    'error': [
        'error', 'code', '錯誤', '代碼', '失敗', 'fail',
        '-', '10000', '1001', '2001',
    ],
    'field': [
        'field', 'param', '欄位', '參數', 'mapping', '映射',
        'merchantid', 'orderid', 'buyername',
    ],
    'tax': [
        'tax', 'b2c', 'b2b', '稅', '應稅', '免稅',
        '零稅率', 'salesamount', 'taxamount', '計算',
    ],
    'troubleshoot': [
        '問題', 'issue', 'error', 'fix', '解決', '失敗',
        '空白', 'troubleshoot', '踩坑',
    ],
    'reasoning': [
        '推薦', 'recommend', '選擇', 'choose', '適合', 'suitable',
        '場景', 'scenario', '決策', 'decision',
    ],
}
68
+
69
+
70
# Splits a whitespace-delimited word into maximal runs that are either entirely
# CJK unified ideographs (U+4E00..U+9FFF) or entirely something else.
_SCRIPT_RUN_RE = re.compile(r'[\u4e00-\u9fff]+|[^\u4e00-\u9fff]+')


def tokenize(text: str) -> List[str]:
    """
    Split text into a list of search tokens, handling mixed Chinese/English.

    The text is lowercased and every character that is not a word character,
    whitespace or a hyphen is replaced by a space. Each whitespace-separated
    word is then split at the boundary between CJK ideographs and other
    characters:

    * non-CJK pieces (e.g. ``ecpay``, ``error-code``, ``10000``) are kept
      whole;
    * a CJK run of length 1 is kept as a single-character token;
    * a longer CJK run is emitted as overlapping two-character bigrams.

    Chinese is written without spaces, so whitespace splitting alone turns a
    whole phrase into one token and a query could only match a document that
    contains the exact same phrase. Bigrams let a query such as ``作廢`` match
    a document containing ``發票作廢失敗`` while still matching everything the
    whole-phrase comparison matched.

    Args:
        text: Raw text; ``None`` or an empty string yields an empty list.

    Returns:
        The tokens in document order (duplicates are preserved so that term
        frequencies can be counted by the caller).
    """
    if not text:
        return []

    # Replace punctuation with spaces; keep word characters, CJK and hyphens.
    cleaned = re.sub(r'[^\w\u4e00-\u9fff\s-]', ' ', text.lower())

    tokens: List[str] = []
    for word in cleaned.split():
        for piece in _SCRIPT_RUN_RE.findall(word):
            is_cjk = '\u4e00' <= piece[0] <= '\u9fff'
            if is_cjk and len(piece) > 1:
                # Overlapping bigrams: "發票作廢" -> 發票, 票作, 作廢
                tokens.extend(piece[i:i + 2] for i in range(len(piece) - 1))
            else:
                tokens.append(piece)
    return tokens
84
+
85
+
86
def compute_idf(documents: List[List[str]]) -> Dict[str, float]:
    """
    Compute the inverse document frequency (IDF) of every term in a corpus.

    Args:
        documents: The corpus, one token list per document.

    Returns:
        A mapping from term to ``log((N - n + 0.5) / (n + 0.5) + 1)``, where
        ``N`` is the number of documents and ``n`` the number of documents
        containing the term. An empty corpus yields an empty mapping.
    """
    total_docs = len(documents)
    if not total_docs:
        return {}

    # Count, for each term, how many documents contain it at least once.
    doc_freq: Dict[str, int] = {}
    for tokens in documents:
        for term in set(tokens):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {
        term: math.log((total_docs - count + 0.5) / (count + 0.5) + 1)
        for term, count in doc_freq.items()
    }
105
+
106
+
107
def bm25_score(query_tokens: List[str], doc_tokens: List[str],
               idf: Dict[str, float], avg_dl: float,
               k1: float = 1.5, b: float = 0.75) -> float:
    """
    Compute the BM25 relevance score of one document for a query.

    Args:
        query_tokens: Tokens of the query; a repeated token contributes once
            per occurrence.
        doc_tokens: Tokens of the document being scored.
        idf: Term -> IDF weight; terms missing from the mapping weigh 0.
        avg_dl: Average document length of the corpus. When it is not
            positive, length normalisation is skipped.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation strength.

    Returns:
        The summed BM25 contribution of every query token found in the
        document, or 0.0 when either token list is empty.
    """
    if not (doc_tokens and query_tokens):
        return 0.0

    # Term frequencies within this document.
    term_counts: Dict[str, int] = {}
    for tok in doc_tokens:
        term_counts[tok] = term_counts.get(tok, 0) + 1

    # The length-normalisation part of the denominator is the same for every term.
    if avg_dl > 0:
        length_penalty = k1 * (1 - b + b * len(doc_tokens) / avg_dl)
    else:
        length_penalty = k1

    total = 0.0
    for term in query_tokens:
        count = term_counts.get(term)
        if count is None:
            continue
        # BM25: idf * tf * (k1 + 1) / (tf + k1 * length normalisation)
        total += idf.get(term, 0) * (count * (k1 + 1) / (count + length_penalty))

    return total
137
+
138
+
139
def _load_csv(filepath: str) -> List[Dict[str, str]]:
    """
    Load a CSV file into a list of row dictionaries keyed by the header row.

    Args:
        filepath: Path of the CSV file.

    Returns:
        One dict per data row, in file order. A path that does not exist
        yields an empty list instead of raising.
    """
    if not os.path.exists(filepath):
        return []

    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM (written by
    # Excel and similar tools); with plain 'utf-8' the BOM would stay glued to
    # the first header name and that column could never be looked up.
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are passed through unchanged.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        return list(csv.DictReader(f))
152
+
153
+
154
def _search_csv(query: str, domain: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """
    Run a BM25 search over the CSV file configured for one domain.

    Each CSV row is one document whose text is the space-joined content of the
    domain's ``search_cols``. Rows scoring above zero are ranked by score,
    highest first; rows with equal scores keep their file order.

    Args:
        query: Free-text search query.
        domain: Key of ``CSV_CONFIG`` selecting the file and columns.
        max_results: Maximum number of rows to return; values <= 0 return
            an empty list.

    Returns:
        Up to ``max_results`` dicts holding the domain's ``output_cols`` plus
        ``'_score'`` (the BM25 score rounded to 4 decimals). Empty when the
        domain is unknown, the file is missing or empty, the query has no
        tokens, or nothing matches.
    """
    config = CSV_CONFIG.get(domain)
    if config is None or max_results <= 0:
        return []

    # A query without tokens cannot match anything; skip reading the file.
    query_tokens = tokenize(query)
    if not query_tokens:
        return []

    rows = _load_csv(os.path.join(DATA_DIR, config['file']))
    if not rows:
        return []

    # Build one token list per row. csv.DictReader fills fields missing from a
    # short row with None; `or ''` keeps the literal text "None" out of the index.
    search_cols = config['search_cols']
    documents = [
        tokenize(' '.join(str(row.get(col) or '') for col in search_cols))
        for row in rows
    ]

    idf = compute_idf(documents)
    avg_dl = sum(len(doc) for doc in documents) / len(documents)

    scored = []
    for row, doc_tokens in zip(rows, documents):
        score = bm25_score(query_tokens, doc_tokens, idf, avg_dl)
        if score > 0:
            scored.append((score, row))

    # Rank on the exact score; rounding is only applied to the reported value
    # so that near-equal scores are not turned into artificial ties.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    results = []
    for score, row in scored[:max_results]:
        result = {col: row.get(col, '') for col in config['output_cols']}
        result['_score'] = round(score, 4)
        results.append(result)
    return results
193
+
194
+
195
def detect_domain(query: str) -> str:
    """
    Guess which search domain a query belongs to.

    Every keyword of ``DOMAIN_KEYWORDS`` that occurs as a case-insensitive
    substring of the query scores one point for its domain.

    Args:
        query: Free-text search query.

    Returns:
        The domain with the most keyword hits (the first such domain in
        ``DOMAIN_KEYWORDS`` order on a tie), or ``'troubleshoot'`` when no
        keyword matches at all.
    """
    haystack = query.lower()

    hits = {
        name: sum(1 for word in words if word.lower() in haystack)
        for name, words in DOMAIN_KEYWORDS.items()
    }

    winner = max(hits, key=hits.get)

    # No keyword matched anywhere: fall back to the troubleshooting domain.
    return winner if hits[winner] != 0 else 'troubleshoot'
216
+
217
+
218
def search(query: str, domain: Optional[str] = None, max_results: int = 5) -> List[Dict[str, Any]]:
    """
    Main search entry point.

    Args:
        query: Free-text search query.
        domain: Domain to search (provider, operation, error, field, tax,
            troubleshoot, reasoning). When omitted or empty, the domain is
            auto-detected from the query.
        max_results: Maximum number of results to return.

    Returns:
        The list of search results for the chosen domain.
    """
    target_domain = domain or detect_domain(query)
    return _search_csv(query, target_domain, max_results)
235
+
236
+
237
def search_all(query: str, max_per_domain: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """
    Search every configured domain with the same query.

    Args:
        query: Free-text search query.
        max_per_domain: Maximum number of results kept per domain.

    Returns:
        Search results grouped by domain name; domains without any match are
        left out.
    """
    per_domain = (
        (name, _search_csv(query, name, max_per_domain))
        for name in CSV_CONFIG
    )
    return {name: hits for name, hits in per_domain if hits}
254
+
255
+
256
def get_available_domains() -> List[str]:
    """
    Return the names of all searchable domains, in configuration order.
    """
    return [name for name in CSV_CONFIG]
261
+
262
+
263
def get_domain_info(domain: str) -> Optional[Dict[str, Any]]:
    """
    Describe the configuration of one search domain.

    Args:
        domain: Key of ``CSV_CONFIG``.

    Returns:
        ``None`` for an unknown domain; otherwise a dict with the domain name,
        its CSV file name, its search and output columns, and the number of
        records currently loaded from that file.
    """
    settings = CSV_CONFIG.get(domain)
    if settings is None:
        return None

    csv_path = os.path.join(DATA_DIR, settings['file'])
    record_count = len(_load_csv(csv_path))

    info: Dict[str, Any] = {'domain': domain}
    info['file'] = settings['file']
    info['search_cols'] = settings['search_cols']
    info['output_cols'] = settings['output_cols']
    info['total_records'] = record_count
    return info
281
+
282
+
283
# Command-line smoke test: python core.py <query> [domain]
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python core.py <query> [domain]")
        print("\nAvailable domains:", ', '.join(get_available_domains()))
        sys.exit(1)

    query = cli_args[0]
    domain = cli_args[1] if len(cli_args) > 1 else None

    print(f"Query: {query}")
    if not domain:
        # No usable domain given: show which one auto-detection picks.
        print(f"Auto-detected domain: {detect_domain(query)}")
    else:
        print(f"Domain: {domain}")

    print()

    results = search(query, domain)
    for rank, hit in enumerate(results, 1):
        print(f"[{rank}] Score: {hit.get('_score', 0)}")
        for column, content in hit.items():
            # Skip the score itself and any empty column.
            if column == '_score' or not content:
                continue
            print(f" {column}: {content}")
        print()