claude-code-hwp-mcp 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +409 -0
- package/dist/hwp-bridge.d.ts +67 -0
- package/dist/hwp-bridge.js +320 -0
- package/dist/hwpx-engine.d.ts +39 -0
- package/dist/hwpx-engine.js +187 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +54 -0
- package/dist/prompts/hwp-prompts.d.ts +2 -0
- package/dist/prompts/hwp-prompts.js +368 -0
- package/dist/resources/document-resources.d.ts +3 -0
- package/dist/resources/document-resources.js +109 -0
- package/dist/server.d.ts +12 -0
- package/dist/server.js +29 -0
- package/dist/tools/analysis-tools.d.ts +4 -0
- package/dist/tools/analysis-tools.js +414 -0
- package/dist/tools/composite-tools.d.ts +3 -0
- package/dist/tools/composite-tools.js +664 -0
- package/dist/tools/document-tools.d.ts +3 -0
- package/dist/tools/document-tools.js +264 -0
- package/dist/tools/editing-tools.d.ts +4 -0
- package/dist/tools/editing-tools.js +916 -0
- package/package.json +31 -0
- package/python/__pycache__/hwp_analyzer.cpython-313.pyc +0 -0
- package/python/__pycache__/hwp_editor.cpython-313.pyc +0 -0
- package/python/__pycache__/hwp_service.cpython-313.pyc +0 -0
- package/python/__pycache__/privacy_scanner.cpython-313.pyc +0 -0
- package/python/__pycache__/ref_reader.cpython-313.pyc +0 -0
- package/python/__pycache__/test_integration.cpython-313.pyc +0 -0
- package/python/hwp_analyzer.py +544 -0
- package/python/hwp_editor.py +933 -0
- package/python/hwp_service.py +1291 -0
- package/python/privacy_scanner.py +115 -0
- package/python/ref_reader.py +115 -0
- package/python/requirements.txt +2 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Privacy scanner — detect sensitive personal data in document text via regex.

Detected categories:
- Korean resident registration number / RRN (risk: critical)
- Phone number (risk: high)
- Email address (risk: medium)
- Bank account number (risk: high)
- Passport number (risk: high)

Pattern order matters: RRN -> phone -> email -> account -> passport.
Spans already claimed by an earlier pattern are skipped by later patterns,
so e.g. the account-number regex cannot re-report a phone number.
"""
import re


# Match order doubles as priority: the first pattern to claim a span wins,
# and any later match overlapping a claimed span is discarded.
_PATTERNS = [
    {
        "type": "주민등록번호",
        "pattern": r"\b(\d{6})\s*[-–]\s*([1-4]\d{6})\b",
        "risk": "critical",
        "mask": lambda m: f"{m.group(1)}-{m.group(2)[0]}******",
    },
    {
        "type": "전화번호",
        "pattern": r"\b(0\d{1,2})[-.\s]?(\d{3,4})[-.\s]?(\d{4})\b",
        "risk": "high",
        "mask": lambda m: f"{m.group(1)}-****-{m.group(3)}",
    },
    {
        "type": "이메일",
        "pattern": r"\b([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b",
        "risk": "medium",
        "mask": lambda m: f"{m.group(1)[:2]}***@{m.group(2)}",
    },
    {
        "type": "계좌번호",
        "pattern": r"\b(\d{3,6})[-](\d{2,6})[-](\d{4,8})\b",
        "risk": "high",
        "mask": lambda m: f"{m.group(1)}-****-{m.group(3)[-2:]}",
    },
    {
        "type": "여권번호",
        "pattern": r"\b([A-Z]{1,2})(\d{7,8})\b",
        "risk": "high",
        "mask": lambda m: m.group(1) + "*" * len(m.group(2)),
    },
]


def _ranges_overlap(start1, end1, start2, end2):
    """Return True when the half-open ranges [start1, end1) and [start2, end2) intersect."""
    return not (end1 <= start2 or end2 <= start1)


def scan_privacy(text):
    """Scan *text* for personal data and return a summary report.

    Patterns are applied in priority order; any match overlapping a span
    already claimed by an earlier pattern is discarded as a likely
    false positive.

    Returns: {
        "found": bool,
        "total_findings": int,
        "findings": [{type, value, masked_value, risk, position}, ...],
        "risk_summary": {critical: N, high: N, medium: N, low: N},
        "recommendation": str,
    }
    """
    if not isinstance(text, str) or not text:
        return {
            "found": False,
            "total_findings": 0,
            "findings": [],
            "risk_summary": {"critical": 0, "high": 0, "medium": 0, "low": 0},
            "recommendation": "검사할 텍스트가 없습니다.",
        }

    findings = []
    summary = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    claimed = []  # (start, end) spans already attributed to a finding

    for spec in _PATTERNS:
        for match in re.finditer(spec["pattern"], text):
            begin, stop = match.span()

            # Skip anything that collides with a higher-priority match.
            if any(_ranges_overlap(begin, stop, s, e) for s, e in claimed):
                continue

            claimed.append((begin, stop))
            redacted = spec["mask"](match)
            findings.append({
                "type": spec["type"],
                "value": redacted,  # expose only the masked form, never the raw value
                "masked_value": redacted,
                "risk": spec["risk"],
                "position": begin,
            })
            summary[spec["risk"]] += 1

    # Report findings in document order.
    findings.sort(key=lambda item: item["position"])

    if summary["critical"]:
        recommendation = "주민등록번호가 포함되어 있습니다. 즉시 마스킹 처리가 필요합니다."
    elif summary["high"]:
        recommendation = "민감 개인정보가 포함되어 있습니다. 마스킹을 권장합니다."
    elif findings:
        recommendation = "개인정보가 일부 포함되어 있습니다. 확인이 필요합니다."
    else:
        recommendation = "개인정보가 감지되지 않았습니다."

    return {
        "found": bool(findings),
        "total_findings": len(findings),
        "findings": findings,
        "risk_summary": summary,
        "recommendation": recommendation,
    }
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""참고자료 텍스트 추출기.
|
|
2
|
+
지원: .txt, .csv, .xlsx, .json, .md
|
|
3
|
+
HWP/HWPX는 hwp_analyzer.analyze_document 사용 (이 모듈에서는 다루지 않음)
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def read_reference(file_path, max_chars=30000):
    """Extract text/tabular content from a reference file, dispatching on extension.

    Supported: .txt/.md/.log (plain text), .csv (rows), .xlsx/.xls (sheets),
    .json (parsed object).

    Args:
        file_path: path to the reference file; made absolute before use.
        max_chars: soft cap on the amount of extracted text.

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError: for unsupported file extensions.
    """
    file_path = os.path.abspath(file_path)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    if ext in ('.txt', '.md', '.log'):
        return _read_text(file_path, max_chars)
    elif ext == '.csv':
        return _read_csv(file_path, max_chars)
    elif ext in ('.xlsx', '.xls'):
        # NOTE(review): _read_excel is openpyxl-based and openpyxl cannot read
        # the legacy binary .xls format — an .xls file will fail downstream.
        # Kept accepting .xls to preserve the existing interface; confirm
        # whether .xls support should be dropped or implemented separately.
        return _read_excel(file_path, max_chars)
    elif ext == '.json':
        return _read_json(file_path, max_chars)
    else:
        # Bug fix: the supported-format list previously omitted '.log',
        # which the dispatcher above does accept.
        raise ValueError(f"지원하지 않는 파일 형식: {ext}. 지원: .txt, .md, .log, .csv, .xlsx, .json")
|
+
|
|
29
|
+
def _read_text(path, max_chars):
|
|
30
|
+
with open(path, 'r', encoding='utf-8', errors='replace') as f:
|
|
31
|
+
content = f.read(max_chars)
|
|
32
|
+
return {
|
|
33
|
+
"format": "text",
|
|
34
|
+
"file_name": os.path.basename(path),
|
|
35
|
+
"content": content,
|
|
36
|
+
"char_count": len(content),
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _read_csv(path, max_chars):
|
|
41
|
+
import csv
|
|
42
|
+
rows = []
|
|
43
|
+
total_chars = 0
|
|
44
|
+
with open(path, 'r', encoding='utf-8', errors='replace') as f:
|
|
45
|
+
reader = csv.reader(f)
|
|
46
|
+
for row in reader:
|
|
47
|
+
row_text = ','.join(row)
|
|
48
|
+
total_chars += len(row_text)
|
|
49
|
+
if total_chars > max_chars:
|
|
50
|
+
break
|
|
51
|
+
rows.append(row)
|
|
52
|
+
|
|
53
|
+
headers = rows[0] if rows else []
|
|
54
|
+
data = rows[1:] if len(rows) > 1 else []
|
|
55
|
+
return {
|
|
56
|
+
"format": "csv",
|
|
57
|
+
"file_name": os.path.basename(path),
|
|
58
|
+
"headers": headers,
|
|
59
|
+
"data": data,
|
|
60
|
+
"row_count": len(data),
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _read_excel(path, max_chars):
    """Extract cell values from every sheet of an .xlsx workbook.

    Each sheet's character budget is *max_chars*; a row that would exceed it is
    dropped together with all following rows of that sheet. The first kept row
    of a sheet is treated as its header.

    Raises:
        ImportError: if openpyxl is not installed.
    """
    try:
        import openpyxl
    except ImportError:
        raise ImportError("openpyxl이 필요합니다. pip install openpyxl")

    workbook = None
    try:
        # read_only streams rows; data_only yields computed values, not formulas.
        workbook = openpyxl.load_workbook(path, read_only=True, data_only=True)
        extracted = []

        for name in workbook.sheetnames:
            grid = []
            used = 0
            for raw in workbook[name].iter_rows(values_only=True):
                cells = ["" if value is None else str(value) for value in raw]
                used += sum(len(cell) for cell in cells)
                if used > max_chars:
                    break
                grid.append(cells)

            extracted.append({
                "sheet_name": name,
                "headers": grid[0] if grid else [],
                "data": grid[1:],
                "row_count": max(len(grid) - 1, 0),
            })

        return {
            "format": "excel",
            "file_name": os.path.basename(path),
            "sheets": extracted,
            "sheet_count": len(extracted),
        }
    finally:
        if workbook:
            workbook.close()
+
|
|
106
|
+
def _read_json(path, max_chars):
|
|
107
|
+
with open(path, 'r', encoding='utf-8', errors='replace') as f:
|
|
108
|
+
content = f.read(max_chars)
|
|
109
|
+
|
|
110
|
+
data = json.loads(content)
|
|
111
|
+
return {
|
|
112
|
+
"format": "json",
|
|
113
|
+
"file_name": os.path.basename(path),
|
|
114
|
+
"data": data,
|
|
115
|
+
}
|