claude-code-hwp-mcp 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +409 -0
- package/dist/hwp-bridge.d.ts +67 -0
- package/dist/hwp-bridge.js +320 -0
- package/dist/hwpx-engine.d.ts +39 -0
- package/dist/hwpx-engine.js +187 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +54 -0
- package/dist/prompts/hwp-prompts.d.ts +2 -0
- package/dist/prompts/hwp-prompts.js +368 -0
- package/dist/resources/document-resources.d.ts +3 -0
- package/dist/resources/document-resources.js +109 -0
- package/dist/server.d.ts +12 -0
- package/dist/server.js +29 -0
- package/dist/tools/analysis-tools.d.ts +4 -0
- package/dist/tools/analysis-tools.js +414 -0
- package/dist/tools/composite-tools.d.ts +3 -0
- package/dist/tools/composite-tools.js +664 -0
- package/dist/tools/document-tools.d.ts +3 -0
- package/dist/tools/document-tools.js +264 -0
- package/dist/tools/editing-tools.d.ts +4 -0
- package/dist/tools/editing-tools.js +916 -0
- package/package.json +31 -0
- package/python/__pycache__/hwp_analyzer.cpython-313.pyc +0 -0
- package/python/__pycache__/hwp_editor.cpython-313.pyc +0 -0
- package/python/__pycache__/hwp_service.cpython-313.pyc +0 -0
- package/python/__pycache__/privacy_scanner.cpython-313.pyc +0 -0
- package/python/__pycache__/ref_reader.cpython-313.pyc +0 -0
- package/python/__pycache__/test_integration.cpython-313.pyc +0 -0
- package/python/hwp_analyzer.py +544 -0
- package/python/hwp_editor.py +933 -0
- package/python/hwp_service.py +1291 -0
- package/python/privacy_scanner.py +115 -0
- package/python/ref_reader.py +115 -0
- package/python/requirements.txt +2 -0
package/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "claude-code-hwp-mcp",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "MCP server for HWP (한글) document automation via pyhwpx COM API. 85+ tools for document editing, analysis, and AI-powered filling.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"hwp-mcp": "dist/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist/",
|
|
12
|
+
"python/"
|
|
13
|
+
],
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsc",
|
|
16
|
+
"prepublishOnly": "npm run build",
|
|
17
|
+
"start": "node dist/index.js",
|
|
18
|
+
"dev": "tsx src/index.ts"
|
|
19
|
+
},
|
|
20
|
+
"dependencies": {
|
|
21
|
+
"@modelcontextprotocol/sdk": "^1.12.0",
|
|
22
|
+
"@xmldom/xmldom": "^0.8.11",
|
|
23
|
+
"jszip": "^3.10.1",
|
|
24
|
+
"zod": "^3.23.0"
|
|
25
|
+
},
|
|
26
|
+
"devDependencies": {
|
|
27
|
+
"@types/node": "^22.0.0",
|
|
28
|
+
"tsx": "^4.0.0",
|
|
29
|
+
"typescript": "~5.7.0"
|
|
30
|
+
}
|
|
31
|
+
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,544 @@
|
|
|
1
|
+
"""HWP Document Analyzer - Extract structure and content from HWP documents.
|
|
2
|
+
Uses pyhwpx Hwp() only. Raw win32com is forbidden.
|
|
3
|
+
"""
|
|
4
|
+
import sys
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
MAX_TABLES = 50  # Upper bound on tables scanned per document (guards against repeated tables, e.g. bankbook-copy pages)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# -- Whitespace normalization --
def _normalize(text):
    """Return *text* with every whitespace character removed, for comparison.

    Everything the regex whitespace class matches (spaces, tabs, newlines,
    NBSP, ...) is dropped, so strings that differ only in spacing compare
    equal afterwards.
    """
    # Splitting on whitespace runs and gluing the pieces back together leaves
    # exactly the non-whitespace characters, in order.
    return "".join(re.split(r"\s+", text))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# -- Label alias dictionary --
# key: canonical label name, value: list of synonyms (all written without
# whitespace, i.e. already in normalized form)
_LABEL_ALIASES = {
    "기업명": ["기업이름", "회사명", "상호명", "상호", "법인명", "업체명", "회사이름"],
    "사업자등록번호": ["사업자번호", "사업자등록NO", "사업자No"],
    "법인등록번호": ["법인번호", "법인등록No", "법인No"],
    "사업장주소": ["주소", "소재지", "본점소재지", "사업장소재지", "회사주소", "기업주소"],
    "대표자성명": ["대표자", "대표자명", "대표이사", "대표자이름", "대표이사명", "성명"],
    "대표전화번호": ["대표전화", "전화번호", "연락처", "대표번호", "전화", "TEL"],
    "홈페이지URL": ["홈페이지", "웹사이트", "URL", "홈페이지주소", "웹주소"],
    "이메일": ["이메일주소", "EMAIL", "E-MAIL"],
    "팩스번호": ["팩스", "FAX", "FAX번호"],
    "설립일": ["설립일자", "설립년월일", "법인설립일"],
    "업종": ["업종명", "주업종"],
    "업태": ["업태명", "주업태"],
    "종업원수": ["직원수", "임직원수", "종업원"],
    "자본금": ["납입자본금", "자본금액"],
    "매출액": ["연매출", "연매출액", "매출"],
}

# Reverse lookup table: normalized synonym -> normalized canonical name.
# The canonical name is registered first so it resolves to itself, followed
# by each of its synonyms (a later entry overwrites an earlier duplicate key).
_ALIAS_LOOKUP = {}
for canonical, aliases in _LABEL_ALIASES.items():
    norm_canonical = _normalize(canonical)
    for alias in (canonical, *aliases):
        _ALIAS_LOOKUP[_normalize(alias)] = norm_canonical
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _canonical_label(label):
    """Normalize *label* and map it to its canonical name.

    Lookup order against ``_ALIAS_LOOKUP``:

    1. the upper-cased normalized label,
    2. the normalized label as written,
    3. a case-insensitive comparison against every known key.

    Step 3 exists because some alias keys are mixed-case (for example
    "사업자No"); such keys are unreachable through the upper-cased lookup and
    previously only matched when the caller used the exact same casing.

    Returns:
        The canonical (normalized) name when an alias is known, otherwise
        the normalized input unchanged.
    """
    norm = _normalize(label)

    # Fast paths: direct dictionary hits (same precedence as before).
    for key in (norm.upper(), norm):
        if key in _ALIAS_LOOKUP:
            return _ALIAS_LOOKUP[key]

    # Slow path: the table is small, so a linear case-insensitive scan is
    # cheap and avoids depending on how the table keys were cased.
    folded = norm.casefold()
    for key, canonical_name in _ALIAS_LOOKUP.items():
        if key.casefold() == folded:
            return canonical_name

    return norm
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _match_label(cell_text, search_label):
    """Decide whether a cell's text and a search label mean the same thing.

    Returns:
        A ``(is_match, is_exact, ratio)`` tuple.

        - is_match: whether the two texts match at all
        - is_exact: True when they are equal after whitespace removal, or
          when both resolve to the same canonical alias
        - ratio: 1.0 for exact matches; for partial (containment) matches,
          the length of the contained text divided by the containing text;
          0.0 when there is no match or either side is empty
    """
    no_match = (False, False, 0.0)
    exact = (True, True, 1.0)

    cell = _normalize(cell_text)
    wanted = _normalize(search_label)
    if not (cell and wanted):
        return no_match

    # 1) Equal once whitespace is ignored, or
    # 2) both sides map to the same canonical name via the alias table.
    if cell == wanted or _canonical_label(cell_text) == _canonical_label(search_label):
        return exact

    # 3) Partial match: one normalized string contains the other.
    if wanted in cell:
        inner, outer = wanted, cell
    elif cell in wanted:
        inner, outer = cell, wanted
    else:
        return no_match
    return True, False, len(inner) / len(outer)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _extract_tables(hwp, result):
    """Append one summary dict per table to ``result["tables"]`` (max MAX_TABLES)."""
    table_idx = 0
    while table_idx < MAX_TABLES:
        try:
            hwp.get_into_nth_table(table_idx)
            df = hwp.table_to_df()
            result["tables"].append({
                "index": table_idx,
                "rows": len(df) + 1,  # +1 for header
                # Count columns even when the table has only a header row, so
                # "cols" always agrees with len("headers").
                "cols": len(df.columns),
                "headers": [str(c) for c in df.columns],
                "data": df.values.tolist(),
            })
        except Exception:
            # An exception here is how the scan detects that there is no
            # table at this index (or that it could not be converted).
            break
        finally:
            # Always try to leave the table, including when the conversion
            # failed after the table had been entered.
            try:
                hwp.Cancel()
            except Exception:
                pass
        table_idx += 1

    if table_idx >= MAX_TABLES:
        # Set whenever the cap is reached, even if the document happens to
        # contain exactly MAX_TABLES tables.
        result["tables_truncated"] = True
        print(f"[WARN] Table scan capped at {MAX_TABLES}", file=sys.stderr)


def _extract_fields(hwp, result):
    """Append ``{"name", "value"}`` dicts for every field to ``result["fields"]``."""
    field_list = hwp.GetFieldList()
    if not field_list:
        return
    # Field names arrive in one string separated by "\x02"; a string without
    # the separator yields a single-element list.
    for raw_name in field_list.split("\x02"):
        name = raw_name.strip()
        if not name:
            continue
        try:
            value = hwp.GetFieldText(name) or ""
        except Exception:
            value = ""  # best effort: keep the field, with an empty value
        result["fields"].append({"name": name, "value": value})


def _extract_text(hwp, result):
    """Fill ``result["full_text"]`` (<= 15,000 chars) and ``result["text_preview"]``."""
    scan_started = False
    try:
        hwp.InitScan(0x0077)  # option mask as used by the original code
        scan_started = True
        text_parts = []
        total_len = 0
        count = 0
        # Two guards bound the loop: collected characters and chunk count.
        while total_len < 15000 and count < 5000:
            try:
                state, text = hwp.GetText()
                if state <= 0:
                    break
                # Original author's note: state 1 = normal text,
                # 2 = text inside a table, etc.
                if text and text.strip():
                    text_parts.append(text.strip())
                    total_len += len(text)
                count += 1
            except Exception:
                break

        # Store the text before releasing the scan so that a failing
        # ReleaseScan cannot discard what was already collected.
        full = "\n".join(text_parts)
        result["full_text"] = full[:15000]
        result["text_preview"] = full[:500]
    except Exception as e:
        print(f"[WARN] Text extraction failed: {e}", file=sys.stderr)
    finally:
        # Guarantee ReleaseScan if InitScan was called.
        if scan_started:
            try:
                hwp.ReleaseScan()
            except Exception as e:
                print(f"[WARN] ReleaseScan failed: {e}", file=sys.stderr)


def analyze_document(hwp, file_path, already_open=False):
    """Analyze an HWP document: pages, tables, fields, text.

    Args:
        hwp: Automation object providing ``open``, ``MovePos``, ``PageCount``,
            ``get_into_nth_table``, ``table_to_df``, ``Cancel``,
            ``GetFieldList``, ``GetFieldText``, ``InitScan``, ``GetText`` and
            ``ReleaseScan`` (the module docstring names pyhwpx ``Hwp``).
        file_path: Path to the document; converted to an absolute path.
        already_open: Kept for interface compatibility but not consulted.
            The document is always opened so that it is the active one.

    Returns:
        dict with keys ``file_path``, ``file_name``, ``file_format``
        ("HWPX" for a ``.hwpx`` suffix, otherwise "HWP"), ``pages``,
        ``tables``, ``fields``, ``text_preview`` (first 500 chars) and
        ``full_text`` (first 15,000 chars). ``tables_truncated`` is added
        when the table scan reached ``MAX_TABLES``.

    Raises:
        Whatever ``hwp.open`` raises. Every later step is best-effort:
        failures are reported on stderr and leave that part of the result at
        its default value.
    """
    file_path = os.path.abspath(file_path)
    # Always open the document to guarantee it is active (if it is already
    # open, that document receives focus).
    hwp.open(file_path)
    # Move the cursor to the beginning of the document.
    try:
        hwp.MovePos(2)  # movePOS_START: start of document
    except Exception:
        pass

    result = {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_format": "HWPX" if file_path.lower().endswith(".hwpx") else "HWP",
        "pages": 0,
        "tables": [],
        "fields": [],
        "text_preview": "",
        "full_text": "",
    }

    # Page count
    try:
        result["pages"] = hwp.PageCount
    except Exception as e:
        print(f"[WARN] PageCount failed: {e}", file=sys.stderr)

    # Tables (with data for AI context, max MAX_TABLES)
    try:
        _extract_tables(hwp, result)
    except Exception as e:
        print(f"[WARN] Table extraction failed: {e}", file=sys.stderr)

    # Fields
    try:
        _extract_fields(hwp, result)
    except Exception as e:
        print(f"[WARN] Field extraction failed: {e}", file=sys.stderr)

    # Full text (up to 15,000 chars for AI context); handles its own errors.
    _extract_text(hwp, result)

    return result
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _read_cell_text(hwp):
    """Return the stripped text of the current cell, or "" if reading fails."""
    try:
        # Select everything in the cell and export the selection as text.
        hwp.HAction.Run("SelectAll")
        return hwp.GetTextFile("TEXT", "saveblock").strip()
    except Exception:
        return ""
    finally:
        # Drop the selection again whether or not the read succeeded.
        try:
            hwp.HAction.Run("Cancel")
        except Exception:
            pass


def map_table_cells(hwp, table_idx, max_cells=200):
    """Walk a table cell by cell and record what is found at each step.

    Starting from the position reached by ``get_into_nth_table(table_idx)``,
    the function records the current cell and then calls ``TableRightCell``
    to advance, for at most *max_cells* steps. The walk stops early when the
    reported position no longer changes or when any call raises.

    Returns:
        On success a dict with ``table_index``, ``total_cells`` and
        ``cell_map``; every ``cell_map`` entry holds ``tab`` (step index),
        ``text`` (cell text, first 100 characters) and ``pos`` (first three
        values of ``GetPos()`` as a list, or None).
        If the table cannot be entered: ``{"error": ..., "cell_map": []}``.
    """
    try:
        hwp.get_into_nth_table(table_idx)
    except Exception as e:
        return {"error": f"Cannot enter table {table_idx}: {e}", "cell_map": []}

    visited = []
    last_pos = None

    for step in range(max_cells):
        try:
            raw = hwp.GetPos()
            here = (raw[0], raw[1], raw[2]) if raw else None

            # Same position as the previous step: the cursor did not move,
            # so there is nothing further to visit.
            if step > 0 and here == last_pos:
                break

            content = _read_cell_text(hwp)
            visited.append({
                "tab": step,
                "text": content[:100],  # Truncate long text
                "pos": list(here) if here else None,
            })

            last_pos = here
            hwp.TableRightCell()
        except Exception:
            break

    try:
        hwp.Cancel()
    except Exception:
        pass

    return {
        "table_index": table_idx,
        "total_cells": len(visited),
        "cell_map": visited,
    }
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _group_cells_into_rows(cell_map):
    """Group a flat cell map into rows.

    Row boundary rule: a cell starts a new row when its list id (the first
    element of ``pos``) is less than or equal to the previous cell's list id.
    Cells without ``pos`` are treated as having list id -1.
    (Original author's note: when a merged cell is revisited, the list id
    goes back to an earlier value.)

    Returns:
        A list of rows, each a non-empty list of the original cell dicts in
        their original order.
    """
    grouped = []
    last_id = -1

    for cell in cell_map:
        cell_id = cell["pos"][0] if cell.get("pos") else -1
        at_boundary = cell_id <= last_id
        # The very first cell always opens a row; afterwards only a boundary does.
        if at_boundary or not grouped:
            grouped.append([])
        grouped[-1].append(cell)
        last_id = cell_id

    return grouped
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _find_label_column(rows, label):
    """Locate the cell holding *label*; return ``(col_index, row_index, is_partial)``.

    Matching uses whitespace normalization plus the alias dictionary (via
    ``_match_label``). The first exact match in row-major order wins.
    Without an exact match, and only for labels of at least two normalized
    characters, the best partial match is returned, where the match ratio is
    weighted so that rows nearer the top are preferred.
    Returns ``(None, None, False)`` when nothing matches.
    """
    best = None  # (score, col_idx, row_idx) of the strongest partial hit

    for row_idx, row in enumerate(rows):
        # Weight drops 5% per row and never falls below 0.1.
        row_weight = max(0.1, 1.0 - row_idx * 0.05)
        for col_idx, cell in enumerate(row):
            matched, exact, ratio = _match_label(cell["text"], label)
            if not matched:
                continue
            if exact:
                return col_idx, row_idx, False
            score = ratio * row_weight
            if ratio > 0 and score > (best[0] if best else 0):
                best = (score, col_idx, row_idx)

    # Partial matches are only trusted for labels of two or more characters.
    if len(_normalize(label)) < 2 or best is None:
        return None, None, False
    return best[1], best[2], True
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _find_label_row(rows, row_label):
    """Locate the row holding *row_label*; return ``(row_index, is_partial, matched_text)``.

    Matching uses whitespace normalization plus the alias dictionary (via
    ``_match_label``). The first exact match in row-major order wins.
    Without one, and only for labels of at least two normalized characters,
    the partial match with the highest ratio is used.
    ``matched_text`` is the stripped text of the matching cell.
    Returns ``(None, False, "")`` when nothing matches.
    """
    best = None  # (ratio, row_idx, stripped cell text) of the best partial hit

    for row_idx, row in enumerate(rows):
        for cell in row:
            matched, exact, ratio = _match_label(cell["text"], row_label)
            if not matched:
                continue
            if exact:
                return row_idx, False, cell["text"].strip()
            if ratio > 0 and ratio > (best[0] if best else 0):
                best = (ratio, row_idx, cell["text"].strip())

    # Partial matches are only trusted for labels of two or more characters.
    if len(_normalize(row_label)) < 2 or best is None:
        return None, False, ""
    return best[1], True, best[2]
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _find_cell_position_in_rows(rows, flat_idx):
    """Convert a flat cell index into ``(row_idx, col_idx_in_row)``.

    Cells are counted row by row, left to right, starting at 0.
    Returns ``(None, None)`` when no cell has the index *flat_idx*.
    """
    coordinates = (
        (row_idx, col_idx)
        for row_idx, row in enumerate(rows)
        for col_idx, _ in enumerate(row)
    )
    for running_idx, coordinate in enumerate(coordinates):
        if running_idx == flat_idx:
            return coordinate
    return None, None
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _find_cell_in_flat(cell_map, label):
    """Match *label* against a flat cell map; exact first, partial as fallback.

    Whitespace normalization and the alias dictionary are applied through
    ``_match_label``.

    Returns:
        ``(matched_idx, is_partial)``: the index into *cell_map* of the first
        exact match, or else of the partial match with the highest ratio
        (only for labels of at least two normalized characters);
        ``(None, False)`` when nothing matches or *label* is empty.
    """
    if not label:
        # Second line of defence: callers check this too, but keep the guard.
        return None, False

    best = None  # (ratio, index) of the best partial hit

    for idx, cell in enumerate(cell_map):
        matched, exact, ratio = _match_label(cell["text"], label)
        if not matched:
            continue
        if exact:
            return idx, False
        if ratio > 0 and ratio > (best[0] if best else 0):
            best = (ratio, idx)

    # Partial matches are only trusted for labels of two or more characters.
    if len(_normalize(label)) < 2 or best is None:
        return None, False
    return best[1], True
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _clean_label(value):
    """Return *value* as a stripped string; falsy values (None, "") become ""."""
    return str(value).strip() if value else ""


def _make_entry(tab, text, matched_label, partial):
    """Build one resolved entry; ``partial_match`` is present only when True."""
    entry = {"tab": tab, "text": text, "matched_label": matched_label}
    if partial:
        entry["partial_match"] = True
    return entry


def _resolve_cross_match(rows, cell_map, label, row_label, text):
    """Resolve the cell at column header *label* x row label *row_label*.

    Returns ``(entry, error)``; exactly one of the two is None.
    """
    if len(rows) <= 1:
        return None, (
            f"라벨 '{label}'+'{row_label}': 행 경계를 감지할 수 없습니다. "
            "tab 인덱스를 직접 지정하세요."
        )

    # Short sample of cell texts to make "not found" errors actionable.
    all_texts = [c["text"][:20] for c in cell_map[:10]]

    col_idx, header_row_idx, col_partial = _find_label_column(rows, label)
    if col_idx is None:
        return None, (
            f"열 라벨 '{label}'을(를) 표에서 찾을 수 없습니다. "
            f"표 내 셀: {all_texts}"
        )

    target_row_idx, row_partial, row_matched_text = _find_label_row(rows, row_label)
    if target_row_idx is None:
        return None, (
            f"행 라벨 '{row_label}'을(를) 표에서 찾을 수 없습니다. "
            f"표 내 셀: {all_texts}"
        )

    if col_partial:
        matched_cell = rows[header_row_idx][col_idx]
        print(f"[WARN] 열 라벨 '{label}' partial match: '{matched_cell['text'].strip()}'", file=sys.stderr)
    if row_partial:
        print(f"[WARN] 행 라벨 '{row_label}' partial match: '{row_matched_text}'", file=sys.stderr)

    target_row = rows[target_row_idx]
    if col_idx >= len(target_row):
        return None, (
            f"라벨 '{label}'+'{row_label}': 열 인덱스({col_idx})가 "
            f"해당 행의 셀 수({len(target_row)})를 초과합니다."
        )

    target = target_row[col_idx]
    return _make_entry(
        target["tab"], text, f"{label}×{row_label}", col_partial or row_partial
    ), None


def _resolve_below(rows, cell_map, label, text):
    """Resolve the cell in the same column of the row after the label's row.

    Returns ``(entry, error)``; exactly one of the two is None.
    """
    matched_idx, is_partial = _find_cell_in_flat(cell_map, label)
    if matched_idx is None:
        return None, (
            f"라벨 '{label}'을(를) 표에서 찾을 수 없습니다. "
            f"표 내 셀: {[c['text'][:20] for c in cell_map[:10]]}"
        )
    if is_partial:
        print(
            f"[WARN] below 라벨 '{label}' partial match: "
            f"'{cell_map[matched_idx]['text'].strip()}'",
            file=sys.stderr,
        )

    if len(rows) <= 1:
        return None, (
            f"라벨 '{label}' (direction=below): 행 경계를 감지할 수 없어 "
            "정확한 아래 셀을 찾을 수 없습니다. tab 인덱스를 직접 지정하세요."
        )

    # Row-group based lookup: same column index, next row.
    label_row_idx, col_idx = _find_cell_position_in_rows(rows, matched_idx)
    if label_row_idx is None:
        return None, f"라벨 '{label}'의 행 위치를 결정할 수 없습니다."
    if label_row_idx + 1 >= len(rows):
        return None, f"라벨 '{label}'의 아래 행이 없습니다."

    next_row = rows[label_row_idx + 1]
    if col_idx >= len(next_row):
        return None, (
            f"라벨 '{label}': 아래 행의 셀 수({len(next_row)})가 "
            f"열 인덱스({col_idx})보다 적습니다."
        )

    return _make_entry(next_row[col_idx]["tab"], text, label, is_partial), None


def _resolve_right(cell_map, label, text):
    """Resolve the cell that follows the label cell in traversal order (tab+1).

    Returns ``(entry, error)``; exactly one of the two is None.
    """
    matched_idx, is_partial = _find_cell_in_flat(cell_map, label)
    if matched_idx is None:
        return None, (
            f"라벨 '{label}'을(를) 표에서 찾을 수 없습니다. "
            f"표 내 셀: {[c['text'][:20] for c in cell_map[:10]]}"
        )
    if is_partial:
        print(
            f"[WARN] right 라벨 '{label}' partial match: "
            f"'{cell_map[matched_idx]['text'].strip()}'",
            file=sys.stderr,
        )

    target_idx = matched_idx + 1
    if target_idx >= len(cell_map):
        return None, f"라벨 '{label}'의 오른쪽 셀이 없습니다 (표 범위 밖)."

    return _make_entry(cell_map[target_idx]["tab"], text, label, is_partial), None


def resolve_labels_to_tabs(hwp, table_idx, labels):
    """Find the tab index of each target cell from its label text.

    Args:
        hwp: Automation object handed on to ``map_table_cells``.
        table_idx: Index of the table to inspect.
        labels: Iterable of dicts such as
            ``{"label": "계약금액", "text": "값", "direction": "right"|"below",
            "row_label": "전체기간" (optional)}``. ``None`` is treated as empty.

    Logic:
        1. Collect the full cell map with ``map_table_cells()``.
        2. With ``row_label``: 2D cross match (column header x row label).
        3. With ``direction == "below"``: same column in the next row group.
        4. Otherwise ("right"): the cell after the label (tab+1).

    Returns:
        ``{"resolved": [...], "errors": [...]}``. Each resolved entry has
        ``tab``, ``text`` and ``matched_label``, plus ``partial_match: True``
        when a label matched only partially. Items that cannot be resolved
        add a message to ``errors`` instead of raising; a missing or null
        ``label`` is reported as an empty label.
    """
    cell_data = map_table_cells(hwp, table_idx)
    cell_map = cell_data.get("cell_map", [])

    if not cell_map:
        errors = ["표에서 셀을 찾을 수 없습니다."]
        # Surface the underlying reason (e.g. the table could not be
        # entered) instead of silently dropping it.
        if cell_data.get("error"):
            errors.append(str(cell_data["error"]))
        return {"resolved": [], "errors": errors}

    rows = _group_cells_into_rows(cell_map)
    resolved = []
    errors = []

    for item in labels or []:
        # A null or missing label must hit the empty-label error below
        # rather than crash on ``None.strip()``.
        label = _clean_label(item.get("label"))
        row_label = _clean_label(item.get("row_label"))
        text = item.get("text", "")
        direction = item.get("direction", "right")

        if not label:
            errors.append("빈 라벨이 전달되었습니다.")
            continue

        if row_label:
            entry, error = _resolve_cross_match(rows, cell_map, label, row_label, text)
        elif direction == "below":
            entry, error = _resolve_below(rows, cell_map, label, text)
        else:
            entry, error = _resolve_right(cell_map, label, text)

        if error is not None:
            errors.append(error)
        else:
            resolved.append(entry)

    return {"resolved": resolved, "errors": errors}
|