taiwan-invoice-skill 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,304 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Taiwan Invoice Skill - BM25 Search Engine
4
+ 基於 UIUX Pro Max 架構,針對電子發票數據優化
5
+
6
+ 無外部依賴,純 Python 實現 BM25 搜索算法
7
+ """
8
+
9
+ import csv
10
+ import math
11
+ import re
12
+ import os
13
+ from typing import List, Dict, Any, Optional, Tuple
14
+
15
# Locate the data directory: the "data" folder that sits beside this script's own folder.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
DATA_DIR = os.path.join(os.path.split(SCRIPT_DIR)[0], 'data')
18
+
19
# CSV configuration: for every search domain, the backing file, the columns
# that are indexed for searching, and the columns returned in each result.
CSV_CONFIG = {
    'provider': {
        'file': 'providers.csv',
        'search_cols': [
            'provider',
            'display_name',
            'auth_method',
            'features',
        ],
        'output_cols': [
            'provider',
            'display_name',
            'auth_method',
            'encryption',
            'test_merchant_id',
            'features',
        ],
    },
    'operation': {
        'file': 'operations.csv',
        'search_cols': [
            'operation',
            'operation_zh',
            'notes',
        ],
        'output_cols': [
            'operation',
            'operation_zh',
            'ecpay_b2c_endpoint',
            'smilepay_endpoint',
            'amego_endpoint',
            'required_fields',
            'notes',
        ],
    },
    'error': {
        'file': 'error-codes.csv',
        'search_cols': [
            'provider',
            'code',
            'message_zh',
            'message_en',
            'category',
            'solution',
        ],
        'output_cols': [
            'provider',
            'code',
            'message_zh',
            'category',
            'solution',
        ],
    },
    'field': {
        'file': 'field-mappings.csv',
        'search_cols': [
            'field_name',
            'description',
            'ecpay_name',
            'smilepay_name',
            'amego_name',
            'notes',
        ],
        'output_cols': [
            'field_name',
            'description',
            'ecpay_name',
            'smilepay_name',
            'amego_name',
            'type',
            'required_b2c',
            'required_b2b',
        ],
    },
    'tax': {
        'file': 'tax-rules.csv',
        'search_cols': [
            'invoice_type',
            'tax_type',
            'notes',
        ],
        'output_cols': [
            'invoice_type',
            'tax_type',
            'tax_rate',
            'sales_amount_formula',
            'tax_amount_formula',
            'example_total',
            'example_sales',
            'example_tax',
        ],
    },
    'troubleshoot': {
        'file': 'troubleshooting.csv',
        'search_cols': [
            'issue',
            'symptom',
            'cause',
            'solution',
            'provider',
            'category',
        ],
        'output_cols': [
            'issue',
            'symptom',
            'cause',
            'solution',
            'provider',
            'severity',
        ],
    },
}
52
+
53
# Keywords used to auto-detect which domain a query belongs to.
DOMAIN_KEYWORDS = {
    'provider': [
        'ecpay', '綠界', 'smilepay', '速買配', 'amego', '光貿',
        'provider', '加值中心', '服務商',
    ],
    'operation': [
        'issue', 'void', 'allowance', '開立', '作廢', '折讓',
        '列印', 'print', 'query', '查詢', 'endpoint', 'api',
    ],
    'error': [
        'error', 'code', '錯誤', '代碼', '失敗', 'fail',
        '-', '10000', '1001', '2001',
    ],
    'field': [
        'field', 'param', '欄位', '參數', 'mapping', '映射',
        'merchantid', 'orderid', 'buyername',
    ],
    'tax': [
        'tax', 'b2c', 'b2b', '稅', '應稅', '免稅',
        '零稅率', 'salesamount', 'taxamount', '計算',
    ],
    'troubleshoot': [
        '問題', 'issue', 'error', 'fix', '解決', '失敗',
        '空白', 'troubleshoot', '踩坑',
    ],
}
62
+
63
+
64
def tokenize(text: str) -> List[str]:
    """
    Split text into lowercase search tokens, handling mixed Chinese/English input.

    Punctuation is replaced by spaces (word characters, CJK ideographs,
    whitespace and hyphens are kept), then the text is split on whitespace.
    Because Chinese is written without spaces, every CJK ideograph in the
    U+4E00..U+9FFF range becomes its own token, so a query such as "發票"
    can match a document containing "開立發票". Runs of non-CJK characters
    (e.g. "ecpay", "error-code", "-10000") stay together as single tokens.

    Args:
        text: Raw text; may be empty or None.

    Returns:
        The tokens in order of appearance; an empty list for empty input.
    """
    if not text:
        return []

    # Drop punctuation but keep hyphens, so codes like "-10000" survive intact.
    cleaned = re.sub(r'[^\w\u4e00-\u9fff\s-]', ' ', text.lower())

    tokens: List[str] = []
    for chunk in cleaned.split():
        # Each ideograph is a token; any other run of characters is one token.
        tokens.extend(re.findall(r'[\u4e00-\u9fff]|[^\u4e00-\u9fff]+', chunk))
    return tokens
78
+
79
+
80
def compute_idf(documents: List[List[str]]) -> Dict[str, float]:
    """
    Build the inverse-document-frequency table for a tokenized corpus.

    Each term's weight is log((N - df + 0.5) / (df + 0.5) + 1), where N is
    the number of documents and df is the number of documents containing
    the term at least once.

    Args:
        documents: One token list per document.

    Returns:
        A mapping from term to IDF weight; empty when there are no documents.
    """
    total_docs = len(documents)
    if not total_docs:
        return {}

    # Count each term once per document, however often it repeats inside it.
    doc_freq: Dict[str, int] = {}
    for tokens in documents:
        for term in set(tokens):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {
        term: math.log((total_docs - count + 0.5) / (count + 0.5) + 1)
        for term, count in doc_freq.items()
    }
99
+
100
+
101
def bm25_score(query_tokens: List[str], doc_tokens: List[str],
               idf: Dict[str, float], avg_dl: float,
               k1: float = 1.5, b: float = 0.75) -> float:
    """
    Score one document against a query with the BM25 ranking formula.

    Args:
        query_tokens: Tokens of the query; a repeated token contributes once
            per occurrence.
        doc_tokens: Tokens of the document being scored.
        idf: Term-to-IDF table; terms missing from it weigh 0.
        avg_dl: Average document length of the corpus. When it is not
            positive, length normalisation is skipped.
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalisation.

    Returns:
        The summed BM25 contribution of every query token found in the
        document; 0.0 when either token list is empty.
    """
    if not (doc_tokens and query_tokens):
        return 0.0

    # Term frequencies inside this document.
    counts: Dict[str, int] = {}
    for tok in doc_tokens:
        counts[tok] = counts.get(tok, 0) + 1

    # The length-dependent part of the denominator is the same for every term.
    if avg_dl > 0:
        length_penalty = k1 * (1 - b + b * len(doc_tokens) / avg_dl)
    else:
        length_penalty = k1

    total = 0.0
    for term in query_tokens:
        occurrences = counts.get(term)
        if occurrences is None:
            continue
        weight = idf.get(term, 0)
        total += weight * ((occurrences * (k1 + 1)) / (occurrences + length_penalty))

    return total
131
+
132
+
133
+ def _load_csv(filepath: str) -> List[Dict[str, str]]:
134
+ """
135
+ 載入 CSV 檔案
136
+ """
137
+ if not os.path.exists(filepath):
138
+ return []
139
+
140
+ rows = []
141
+ with open(filepath, 'r', encoding='utf-8') as f:
142
+ reader = csv.DictReader(f)
143
+ for row in reader:
144
+ rows.append(row)
145
+ return rows
146
+
147
+
148
def _search_csv(query: str, domain: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """
    Run a BM25 search over the CSV file that backs one domain.

    Every row becomes a document built from the domain's search columns.
    Rows scoring above zero are ranked by descending score and returned
    with only the domain's output columns plus a '_score' entry.

    Args:
        query: Free-text search query.
        domain: A key of CSV_CONFIG; unknown domains yield no results.
        max_results: Maximum number of rows to return. Zero or a negative
            number yields no results; None returns every match.

    Returns:
        Matching rows as dicts, best match first; '_score' is the BM25
        score rounded to 4 decimals.
    """
    if domain not in CSV_CONFIG:
        return []
    # A negative limit would otherwise slice rows off the tail of the ranking.
    if max_results is not None and max_results <= 0:
        return []

    # Nothing can match an empty query, so skip reading the file at all.
    query_tokens = tokenize(query)
    if not query_tokens:
        return []

    config = CSV_CONFIG[domain]
    rows = _load_csv(os.path.join(DATA_DIR, config['file']))
    if not rows:
        return []

    # Short CSV rows yield None for missing cells; treat those as empty text
    # so they do not turn into a literal "none" token.
    documents = [
        tokenize(' '.join(str(row.get(col) or '') for col in config['search_cols']))
        for row in rows
    ]

    idf = compute_idf(documents)
    avg_dl = sum(len(doc) for doc in documents) / len(documents)

    ranked = []
    for row, doc_tokens in zip(rows, documents):
        score = bm25_score(query_tokens, doc_tokens, idf, avg_dl)
        if score > 0:
            ranked.append((score, row))

    # Rank on the full-precision score; rounding is only for presentation.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    results = []
    for score, row in ranked[:max_results]:
        result = {col: row.get(col, '') for col in config['output_cols']}
        result['_score'] = round(score, 4)
        results.append(result)
    return results
187
+
188
+
189
def detect_domain(query: str) -> str:
    """
    Guess which search domain a query belongs to.

    Each domain gets one point for every one of its keywords that occurs
    as a case-insensitive substring of the query. The domain with the most
    points wins; on a tie the domain listed first in DOMAIN_KEYWORDS wins.

    Args:
        query: Free-text search query.

    Returns:
        The best-matching domain name, or 'troubleshoot' when no keyword
        matches at all.
    """
    needle = query.lower()

    hits = {
        name: sum(1 for keyword in keywords if keyword.lower() in needle)
        for name, keywords in DOMAIN_KEYWORDS.items()
    }

    top = max(hits, key=hits.get)
    # Without a single keyword hit, fall back to the troubleshooting domain.
    return top if hits[top] else 'troubleshoot'
210
+
211
+
212
def search(query: str, domain: Optional[str] = None, max_results: int = 5) -> List[Dict[str, Any]]:
    """
    Main search entry point.

    Args:
        query: Free-text search query.
        domain: Domain to search (provider, operation, error, field, tax,
            troubleshoot). When omitted or empty, the domain is detected
            from the query.
        max_results: Maximum number of results to return.

    Returns:
        The list of search results for the chosen domain.
    """
    return _search_csv(query, domain or detect_domain(query), max_results)
229
+
230
+
231
def search_all(query: str, max_per_domain: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """
    Search every configured domain with the same query.

    Args:
        query: Free-text search query.
        max_per_domain: Maximum number of results kept per domain.

    Returns:
        A mapping from domain name to its results; domains without any
        match are left out.
    """
    per_domain = (
        (name, _search_csv(query, name, max_per_domain))
        for name in CSV_CONFIG
    )
    return {name: found for name, found in per_domain if found}
248
+
249
+
250
def get_available_domains() -> List[str]:
    """
    Return the names of all searchable domains, in configuration order.
    """
    return [name for name in CSV_CONFIG]
255
+
256
+
257
def get_domain_info(domain: str) -> Optional[Dict[str, Any]]:
    """
    Describe one domain's configuration.

    Args:
        domain: A key of CSV_CONFIG.

    Returns:
        A dict with the domain name, its CSV file name, its search and
        output columns, and the number of records currently loaded from
        the file; None when the domain is unknown.
    """
    try:
        config = CSV_CONFIG[domain]
    except KeyError:
        return None

    record_count = len(_load_csv(os.path.join(DATA_DIR, config['file'])))

    info = {'domain': domain}
    for key in ('file', 'search_cols', 'output_cols'):
        info[key] = config[key]
    info['total_records'] = record_count
    return info
275
+
276
+
277
# Command-line smoke test: python core.py <query> [domain]
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python core.py <query> [domain]")
        print("\nAvailable domains:", ', '.join(get_available_domains()))
        sys.exit(1)

    query = cli_args[0]
    domain = cli_args[1] if len(cli_args) > 1 else None

    print(f"Query: {query}")
    if not domain:
        # Show which domain the auto-detection picks for this query.
        detected = detect_domain(query)
        print(f"Auto-detected domain: {detected}")
    else:
        print(f"Domain: {domain}")

    print()

    results = search(query, domain)
    for rank, result in enumerate(results, 1):
        print(f"[{rank}] Score: {result.get('_score', 0)}")
        for key, value in result.items():
            # Skip the score (already printed) and any empty fields.
            if key == '_score' or not value:
                continue
            print(f" {key}: {value}")
        print()