taiwan-invoice-skill 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -0
- package/assets/taiwan-invoice/SKILL.md +485 -0
- package/assets/taiwan-invoice/data/error-codes.csv +41 -0
- package/assets/taiwan-invoice/data/field-mappings.csv +27 -0
- package/assets/taiwan-invoice/data/operations.csv +11 -0
- package/assets/taiwan-invoice/data/providers.csv +4 -0
- package/assets/taiwan-invoice/data/reasoning.csv +32 -0
- package/assets/taiwan-invoice/data/tax-rules.csv +9 -0
- package/assets/taiwan-invoice/data/troubleshooting.csv +17 -0
- package/assets/taiwan-invoice/scripts/__pycache__/core.cpython-312.pyc +0 -0
- package/assets/taiwan-invoice/scripts/core.py +310 -0
- package/assets/taiwan-invoice/scripts/generate-invoice-service.py +642 -128
- package/assets/taiwan-invoice/scripts/persist.py +330 -0
- package/assets/taiwan-invoice/scripts/recommend.py +373 -0
- package/assets/taiwan-invoice/scripts/search.py +273 -0
- package/dist/index.js +5 -0
- package/package.json +1 -1
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Taiwan Invoice Skill - BM25 Search Engine
|
|
4
|
+
基於 UIUX Pro Max 架構,針對電子發票數據優化
|
|
5
|
+
|
|
6
|
+
無外部依賴,純 Python 實現 BM25 搜索算法
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import csv
|
|
10
|
+
import math
|
|
11
|
+
import re
|
|
12
|
+
import os
|
|
13
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
14
|
+
|
|
15
|
+
# Locate the data directory: a `data` folder that lives next to the directory
# containing this script (i.e. <parent of SCRIPT_DIR>/data).
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(os.path.dirname(SCRIPT_DIR), 'data')
|
|
18
|
+
|
|
19
|
+
# Per-domain CSV settings.
#   file        - CSV file name, resolved relative to DATA_DIR
#   search_cols - columns whose text is concatenated and indexed for BM25
#   output_cols - columns copied into each search result
CSV_CONFIG = {
    'provider': {
        'file': 'providers.csv',
        'search_cols': [
            'provider', 'display_name', 'auth_method', 'features',
        ],
        'output_cols': [
            'provider', 'display_name', 'auth_method', 'encryption',
            'test_merchant_id', 'features',
        ],
    },
    'operation': {
        'file': 'operations.csv',
        'search_cols': [
            'operation', 'operation_zh', 'notes',
        ],
        'output_cols': [
            'operation', 'operation_zh', 'ecpay_b2c_endpoint',
            'smilepay_endpoint', 'amego_endpoint', 'required_fields', 'notes',
        ],
    },
    'error': {
        'file': 'error-codes.csv',
        'search_cols': [
            'provider', 'code', 'message_zh', 'message_en', 'category',
            'solution',
        ],
        'output_cols': [
            'provider', 'code', 'message_zh', 'category', 'solution',
        ],
    },
    'field': {
        'file': 'field-mappings.csv',
        'search_cols': [
            'field_name', 'description', 'ecpay_name', 'smilepay_name',
            'amego_name', 'notes',
        ],
        'output_cols': [
            'field_name', 'description', 'ecpay_name', 'smilepay_name',
            'amego_name', 'type', 'required_b2c', 'required_b2b',
        ],
    },
    'tax': {
        'file': 'tax-rules.csv',
        'search_cols': [
            'invoice_type', 'tax_type', 'notes',
        ],
        'output_cols': [
            'invoice_type', 'tax_type', 'tax_rate', 'sales_amount_formula',
            'tax_amount_formula', 'example_total', 'example_sales',
            'example_tax',
        ],
    },
    'troubleshoot': {
        'file': 'troubleshooting.csv',
        'search_cols': [
            'issue', 'symptom', 'cause', 'solution', 'provider', 'category',
        ],
        'output_cols': [
            'issue', 'symptom', 'cause', 'solution', 'provider', 'severity',
        ],
    },
    'reasoning': {
        'file': 'reasoning.csv',
        'search_cols': [
            'scenario', 'recommended_provider', 'reason', 'decision_rules',
            'use_cases',
        ],
        'output_cols': [
            'scenario', 'recommended_provider', 'confidence', 'reason',
            'anti_patterns', 'use_cases',
        ],
    },
}
|
|
57
|
+
|
|
58
|
+
# Keywords used to auto-detect which domain a query belongs to.
# detect_domain() counts, per domain, how many of these occur as substrings
# of the lower-cased query.
DOMAIN_KEYWORDS = {
    'provider': [
        'ecpay', '綠界', 'smilepay', '速買配', 'amego', '光貿',
        'provider', '加值中心', '服務商',
    ],
    'operation': [
        'issue', 'void', 'allowance', '開立', '作廢', '折讓',
        '列印', 'print', 'query', '查詢', 'endpoint', 'api',
    ],
    'error': [
        'error', 'code', '錯誤', '代碼', '失敗', 'fail',
        '-', '10000', '1001', '2001',
    ],
    'field': [
        'field', 'param', '欄位', '參數', 'mapping', '映射',
        'merchantid', 'orderid', 'buyername',
    ],
    'tax': [
        'tax', 'b2c', 'b2b', '稅', '應稅', '免稅', '零稅率',
        'salesamount', 'taxamount', '計算',
    ],
    'troubleshoot': [
        '問題', 'issue', 'error', 'fix', '解決', '失敗',
        '空白', 'troubleshoot', '踩坑',
    ],
    'reasoning': [
        '推薦', 'recommend', '選擇', 'choose', '適合', 'suitable',
        '場景', 'scenario', '決策', 'decision',
    ],
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def tokenize(text: str) -> List[str]:
    """Split mixed Chinese/English text into lowercase search tokens.

    The text is lower-cased, every character that is not a word character,
    whitespace or ``-`` is replaced by a space, and the result is split on
    whitespace.  Each whitespace-delimited chunk is always emitted as a
    token.  In addition, a chunk that contains CJK ideographs
    (U+4E00..U+9FFF) is broken down further: every ideograph becomes its own
    token and each run of non-CJK characters inside the chunk becomes one
    token.  Without that step an unspaced Chinese phrase is a single opaque
    token and can only ever match an identical phrase.

    Args:
        text: Raw text to tokenize; may be empty or ``None``.

    Returns:
        The list of tokens in order of appearance (empty for empty input).
    """
    if not text:
        return []

    # Punctuation (ASCII and full-width) becomes a separator; '-' is kept so
    # that tokens such as negative codes ("-10000") stay intact.
    cleaned = re.sub(r'[^\w\u4e00-\u9fff\s-]', ' ', text.lower())

    tokens: List[str] = []
    for chunk in cleaned.split():
        # Keep the whole chunk so exact-phrase matches still work.
        tokens.append(chunk)
        pieces = re.findall(r'[\u4e00-\u9fff]|[^\u4e00-\u9fff]+', chunk)
        # A single piece equals the chunk itself; adding it would duplicate.
        if len(pieces) > 1:
            tokens.extend(pieces)
    return tokens
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def compute_idf(documents: List[List[str]]) -> Dict[str, float]:
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        documents: Tokenized documents, one list of tokens per document.

    Returns:
        Mapping of term -> ``log((N - df + 0.5) / (df + 0.5) + 1)`` where
        ``N`` is the number of documents and ``df`` the number of documents
        containing the term.  Empty when there are no documents.
    """
    total_docs = len(documents)
    if not total_docs:
        return {}

    # Count each term at most once per document.
    doc_freq: Dict[str, int] = {}
    for tokens in documents:
        for term in set(tokens):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {
        term: math.log((total_docs - count + 0.5) / (count + 0.5) + 1)
        for term, count in doc_freq.items()
    }
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def bm25_score(query_tokens: List[str], doc_tokens: List[str],
               idf: Dict[str, float], avg_dl: float,
               k1: float = 1.5, b: float = 0.75) -> float:
    """Score one document against a query with the BM25 formula.

    Args:
        query_tokens: Tokens of the query; repeated tokens contribute once
            per occurrence.
        doc_tokens: Tokens of the document being scored.
        idf: Term -> IDF weight; terms missing from it weigh 0.
        avg_dl: Average document length of the corpus.  When it is not
            positive, length normalisation is skipped.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength.

    Returns:
        The accumulated score; ``0.0`` if either token list is empty or no
        query term occurs in the document.
    """
    if not (doc_tokens and query_tokens):
        return 0.0

    # Term frequencies within this document.
    term_counts: Dict[str, int] = {}
    for tok in doc_tokens:
        term_counts[tok] = term_counts.get(tok, 0) + 1

    n_tokens = len(doc_tokens)
    total = 0.0
    for term in query_tokens:
        count = term_counts.get(term)
        if count is None:
            continue

        if avg_dl > 0:
            damping = count + k1 * (1 - b + b * n_tokens / avg_dl)
        else:
            damping = count + k1
        total += idf.get(term, 0) * (count * (k1 + 1) / damping)

    return total
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _load_csv(filepath: str) -> List[Dict[str, str]]:
|
|
140
|
+
"""
|
|
141
|
+
載入 CSV 檔案
|
|
142
|
+
"""
|
|
143
|
+
if not os.path.exists(filepath):
|
|
144
|
+
return []
|
|
145
|
+
|
|
146
|
+
rows = []
|
|
147
|
+
with open(filepath, 'r', encoding='utf-8') as f:
|
|
148
|
+
reader = csv.DictReader(f)
|
|
149
|
+
for row in reader:
|
|
150
|
+
rows.append(row)
|
|
151
|
+
return rows
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _search_csv(query: str, domain: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """Run a BM25 search over the CSV file configured for one domain.

    Each CSV row is one document, built by joining the domain's
    ``search_cols`` values with spaces and tokenizing the result.

    Args:
        query: Free-text search query.
        domain: Key of ``CSV_CONFIG`` selecting the file and columns.
        max_results: Maximum number of results to return.

    Returns:
        Rows with a positive score, best first.  Each result holds the
        domain's ``output_cols`` plus ``_score`` (rounded to 4 decimals).
        Empty when the domain is unknown, the file has no rows, or nothing
        matches.
    """
    config = CSV_CONFIG.get(domain)
    if config is None:
        return []

    rows = _load_csv(os.path.join(DATA_DIR, config['file']))
    if not rows:
        return []

    query_tokens = tokenize(query)
    if not query_tokens:
        # Nothing can score above zero without query terms.
        return []

    # csv.DictReader fills the missing cells of a short row with None;
    # `or ''` keeps that from being indexed as the literal word "None".
    documents = [
        tokenize(' '.join(str(row.get(col) or '') for col in config['search_cols']))
        for row in rows
    ]

    idf = compute_idf(documents)
    # `rows` is non-empty here, so the division is safe.
    avg_dl = sum(len(doc) for doc in documents) / len(documents)

    scored_results = []
    for row, doc_tokens in zip(rows, documents):
        score = bm25_score(query_tokens, doc_tokens, idf, avg_dl)
        if score > 0:
            result = {col: row.get(col) or '' for col in config['output_cols']}
            result['_score'] = round(score, 4)
            scored_results.append(result)

    # Stable sort: rows with equal scores keep their file order.
    scored_results.sort(key=lambda item: item['_score'], reverse=True)

    return scored_results[:max_results]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def detect_domain(query: str) -> str:
    """Guess which search domain a query belongs to.

    For every domain in ``DOMAIN_KEYWORDS`` the number of its keywords that
    occur as substrings of the lower-cased query is counted; the domain with
    the highest count wins (the first one listed wins ties).

    Args:
        query: Free-text search query.

    Returns:
        The best-matching domain name, or ``'troubleshoot'`` when no keyword
        matches at all.
    """
    lowered = query.lower()

    hits = {
        name: sum(1 for keyword in keywords if keyword.lower() in lowered)
        for name, keywords in DOMAIN_KEYWORDS.items()
    }

    winner = max(hits, key=hits.get)
    # A zero top score means nothing matched: fall back to troubleshooting.
    return winner if hits[winner] else 'troubleshoot'
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def search(query: str, domain: Optional[str] = None, max_results: int = 5) -> List[Dict[str, Any]]:
    """Main search entry point.

    Args:
        query: Search query.
        domain: Domain to search (a key of ``CSV_CONFIG``: provider,
            operation, error, field, tax, troubleshoot, reasoning).
            When omitted or empty it is auto-detected from the query.
        max_results: Maximum number of results.

    Returns:
        List of search results.
    """
    target = domain or detect_domain(query)
    return _search_csv(query, target, max_results)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def search_all(query: str, max_per_domain: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """Search every configured domain.

    Args:
        query: Search query.
        max_per_domain: Maximum number of results per domain.

    Returns:
        Search results grouped by domain; domains without any hit are
        left out.
    """
    per_domain = (
        (name, _search_csv(query, name, max_per_domain))
        for name in CSV_CONFIG
    )
    return {name: hits for name, hits in per_domain if hits}
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def get_available_domains() -> List[str]:
    """Return the names of all searchable domains (the keys of ``CSV_CONFIG``)."""
    return list(CSV_CONFIG)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_domain_info(domain: str) -> Optional[Dict[str, Any]]:
    """Describe the configuration of one domain.

    Args:
        domain: Domain name (a key of ``CSV_CONFIG``).

    Returns:
        A dict with the domain name, its CSV file name, search and output
        columns, and the number of rows currently in the file; ``None`` for
        an unknown domain.
    """
    config = CSV_CONFIG.get(domain)
    if config is None:
        return None

    record_count = len(_load_csv(os.path.join(DATA_DIR, config['file'])))

    return {
        'domain': domain,
        'file': config['file'],
        'search_cols': config['search_cols'],
        'output_cols': config['output_cols'],
        'total_records': record_count,
    }
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# Command-line smoke test: python core.py <query> [domain]
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python core.py <query> [domain]")
        print("\nAvailable domains:", ', '.join(get_available_domains()))
        sys.exit(1)

    query = cli_args[0]
    domain = cli_args[1] if len(cli_args) > 1 else None

    print(f"Query: {query}")
    if not domain:
        # No (or empty) domain argument: show what auto-detection picks.
        detected = detect_domain(query)
        print(f"Auto-detected domain: {detected}")
    else:
        print(f"Domain: {domain}")

    print()

    for rank, hit in enumerate(search(query, domain), 1):
        print(f"[{rank}] Score: {hit.get('_score', 0)}")
        for key, value in hit.items():
            # The score is already in the heading; empty fields are skipped.
            if key == '_score' or not value:
                continue
            print(f"    {key}: {value}")
        print()
|