skdconv 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skdconv/__init__.py ADDED
@@ -0,0 +1,105 @@
1
+ """skdconv — 한국 공문서를 마크다운으로 변환하는 파서 라이브러리."""
2
+
3
+ from .types import (
4
+ IRBlock,
5
+ IRCell,
6
+ IRTable,
7
+ CellContext,
8
+ ParseResult,
9
+ ParseSuccess,
10
+ ParseFailure,
11
+ ParseOptions,
12
+ DocumentMetadata,
13
+ ParseWarning,
14
+ OutlineItem,
15
+ ExtractedImage,
16
+ FileType,
17
+ InternalParseResult,
18
+ )
19
+ from .utils import KordocError, VERSION
20
+ from .detect import detect_format, detect_zip_format, detect_ole2_format
21
+
22
+ __version__ = VERSION
23
+
24
+
25
def parse(data: bytes, options: "ParseOptions | None" = None) -> ParseResult:
    """Detect the format of *data*, run the matching parser and wrap the outcome.

    Args:
        data: Raw bytes of the document.
        options: Optional parser options, passed through unchanged to the
            format-specific parser.

    Returns:
        A ``ParseSuccess`` carrying the markdown, blocks, metadata, outline,
        warnings and images produced by the parser, or a ``ParseFailure``
        when the input is empty, the format has no parser, or the parser
        raised. Exceptions are never propagated to the caller.
    """
    from .utils import classify_error

    if not data:
        return ParseFailure(file_type="unknown", error="입력 데이터가 비어 있습니다", code="EMPTY_INPUT")

    fmt = detect_format(data)
    # The magic-byte check cannot tell container formats apart, so look
    # inside ZIP / OLE2 containers. An unrecognised container keeps the
    # coarse guess.
    if fmt == "hwpx":
        zip_kind = detect_zip_format(data)
        if zip_kind in ("xlsx", "docx", "hwpx"):
            fmt = zip_kind
    elif fmt == "hwp":
        ole_kind = detect_ole2_format(data)
        if ole_kind in ("hwp", "xls"):
            fmt = ole_kind

    try:
        # Parsers are imported lazily (and inside the try) so that a missing
        # optional dependency surfaces as a ParseFailure, not an ImportError.
        if fmt == "hwpx":
            from .hwpx.parser import parse_hwpx_document as run_parser
        elif fmt == "hwp":
            from .hwp5.parser import parse_hwp5_document as run_parser
        elif fmt == "hwp3":
            from .hwp3.parser import parse_hwp3_document as run_parser
        elif fmt == "hwpml":
            from .hwpml.parser import parse_hwpml_document as run_parser
        elif fmt == "xlsx":
            from .xlsx.parser import parse_xlsx_document as run_parser
        elif fmt == "xls":
            from .xls.parser import parse_xls_document as run_parser
        elif fmt == "docx":
            from .docx.parser import parse_docx_document as run_parser
        else:
            return ParseFailure(file_type=fmt, error=f"지원하지 않는 포맷: {fmt}", code="UNSUPPORTED_FORMAT")

        result: InternalParseResult = run_parser(data, options)
        meta = result.metadata
        page_count = meta.page_count if meta else None
        return ParseSuccess(
            file_type=fmt,
            markdown=result.markdown,
            blocks=result.blocks,
            page_count=page_count,
            metadata=meta,
            outline=result.outline,
            warnings=result.warnings,
            images=result.images,
        )
    except Exception as exc:
        return ParseFailure(file_type=fmt, error=str(exc), code=classify_error(exc))
83
+
84
+
85
+ __all__ = [
86
+ "IRBlock",
87
+ "IRCell",
88
+ "IRTable",
89
+ "CellContext",
90
+ "ParseResult",
91
+ "ParseSuccess",
92
+ "ParseFailure",
93
+ "ParseOptions",
94
+ "DocumentMetadata",
95
+ "ParseWarning",
96
+ "OutlineItem",
97
+ "ExtractedImage",
98
+ "FileType",
99
+ "KordocError",
100
+ "VERSION",
101
+ "detect_format",
102
+ "detect_zip_format",
103
+ "detect_ole2_format",
104
+ "parse",
105
+ ]
skdconv/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .cli import main
2
+
3
+ main()
skdconv/cli.py ADDED
@@ -0,0 +1,152 @@
1
+ """skdconv CLI — 문서를 마크다운으로 변환."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ from .utils import VERSION
13
+
14
+
15
def _parse_fields(fields_str: str) -> dict[str, str]:
    """Parse a ``key=value,key2=value2`` list or a JSON object string.

    A comma separates two fields only when it is followed by something that
    looks like the next key: optional whitespace, a Korean or Latin letter,
    then further non-comma characters up to an ``=``. Any other comma is
    kept as part of the current value, so ``a=hello,world,b=2`` yields
    ``{"a": "hello,world", "b": "2"}``.

    Args:
        fields_str: Either a JSON document starting with ``{`` or a
            comma-separated list of ``key=value`` pairs.

    Returns:
        Mapping of stripped keys to stripped values. For JSON input the
        decoded value is returned as-is. Pairs without ``=`` or with an
        empty key are ignored.

    Raises:
        json.JSONDecodeError: If the input starts with ``{`` but is not
            valid JSON.
    """
    import re

    s = fields_str.strip()
    if s.startswith("{"):
        return json.loads(s)

    result: dict[str, str] = {}
    # Split only on commas that introduce a new "key=" so that commas inside
    # a value do not truncate it.
    pairs = re.split(r",(?=\s*[가-힣A-Za-z][^,=]*=)", s)
    for pair in pairs:
        key, sep, value = pair.partition("=")
        # partition() leaves key == "" for a leading "=", matching the
        # original "idx > 0" guard.
        if sep and key:
            result[key.strip()] = value.strip()
    return result
29
+
30
+
31
def _cmd_parse(args: argparse.Namespace) -> int:
    """Convert every file in ``args.files`` and write out the results.

    Each file is read fully into memory, parsed with ``parse`` and rendered
    as Markdown or JSON. The output goes to ``args.output`` (only when
    exactly one input file was given), into ``args.out_dir``, or to stdout.
    Progress and error messages are written to stderr.

    Args:
        args: Parsed CLI namespace. Reads ``files``, ``format``, ``output``,
            ``out_dir``, ``pages``, ``no_header_footer`` and ``silent``.

    Returns:
        ``0`` when every file was converted; ``1`` when the format is
        invalid or at least one file was skipped, failed to parse, or
        raised an exception.
    """
    from . import parse
    from .detect import detect_format
    from .types import ParseOptions

    valid_formats = {"markdown", "json"}
    if args.format not in valid_formats:
        sys.stderr.write(f"[skdconv] 지원하지 않는 형식: {args.format} (markdown 또는 json)\n")
        return 1

    max_bytes = 500 * 1024 * 1024  # files above this size are skipped
    exit_code = 0
    files = args.files
    for fi, file_path in enumerate(files):
        abs_path = os.path.realpath(file_path)
        file_name = os.path.basename(abs_path)
        file_prefix = f"[{fi + 1}/{len(files)}] " if len(files) > 1 else ""

        try:
            file_size = os.path.getsize(abs_path)
            if file_size > max_bytes:
                sys.stderr.write(
                    f"\n[skdconv] SKIP: {file_name} — 파일이 너무 큽니다 "
                    f"({file_size / 1024 / 1024:.1f}MB)\n"
                )
                exit_code = 1
                continue

            with open(abs_path, "rb") as f:
                data = f.read()

            fmt = detect_format(data)
            progress = f"[skdconv] {file_prefix}{file_name} ({fmt}) ..."
            if not args.silent:
                sys.stderr.write(progress)

            opts = ParseOptions(file_path=abs_path)
            if args.pages:
                opts.pages = args.pages
            if args.no_header_footer:
                opts.remove_header_footer = True

            result = parse(data, opts)

            if not result.success:
                # In silent mode the progress line was never printed, so
                # emit it here; otherwise the failure would not say which
                # file it belongs to.
                if args.silent:
                    sys.stderr.write(progress)
                sys.stderr.write(" FAIL\n")
                sys.stderr.write(f" → {result.error}\n")
                exit_code = 1
                continue

            if not args.silent:
                sys.stderr.write(" OK\n")

            if args.format == "json":
                output = json.dumps(
                    {
                        "file_type": result.file_type,
                        "markdown": result.markdown,
                        "page_count": result.page_count,
                        "warnings": [
                            {"message": w.message, "code": w.code, "page": w.page}
                            for w in (result.warnings or [])
                        ],
                    },
                    ensure_ascii=False,
                    indent=2,
                )
            else:
                output = result.markdown

            # --output is honoured only for a single input file; with
            # several inputs the result goes to --out-dir or stdout.
            if args.output and len(files) == 1:
                Path(args.output).write_text(output, encoding="utf-8")
                if not args.silent:
                    sys.stderr.write(f" → {args.output}\n")
            elif args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                ext = ".json" if args.format == "json" else ".md"
                stem = os.path.splitext(file_name)[0]
                out_path = os.path.join(args.out_dir, stem + ext)
                Path(out_path).write_text(output, encoding="utf-8")
                if not args.silent:
                    sys.stderr.write(f" → {out_path}\n")
            else:
                sys.stdout.write(output + "\n")

        except Exception as err:
            # Top-level boundary: report the file and carry on with the
            # remaining inputs.
            sys.stderr.write(f"\n[skdconv] ERROR: {file_name} — {err}\n")
            exit_code = 1

    return exit_code
119
+
120
+
121
def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: parse the arguments and exit with a status.

    Args:
        argv: Argument list without the program name. ``None`` means
            ``sys.argv[1:]``.

    Raises:
        SystemExit: Always. The code is the result of ``_cmd_parse`` when
            files were given, otherwise ``0`` after the help text is printed.
    """
    cli_args = sys.argv[1:] if argv is None else argv

    parser = argparse.ArgumentParser(
        prog="skdconv",
        description="모두 파싱해버리겠다 - HWP, HWPX, XLSX, XLS, DOCX -> Markdown",
    )
    add = parser.add_argument
    add("--version", action="version", version=f"skdconv {VERSION}")
    add("files", nargs="*", metavar="FILE", help="변환할 파일 경로")
    add("-o", "--output", metavar="PATH", help="출력 파일 경로 (단일 파일 시)")
    add("-d", "--out-dir", metavar="DIR", help="출력 디렉토리 (다중 파일 시)")
    add("-p", "--pages", metavar="RANGE", help="페이지/섹션 범위 (예: 1-3, 1,3,5)")
    add("--format", choices=["markdown", "json"], default="markdown", help="출력 형식")
    add("--no-header-footer", action="store_true", help="PDF 머리글/바닥글 자동 제거")
    add("--silent", action="store_true", help="진행 메시지 숨기기")

    args = parser.parse_args(cli_args)

    # Without any input file there is nothing to convert: show usage.
    if not args.files:
        parser.print_help()
        sys.exit(0)
    sys.exit(_cmd_parse(args))
149
+
150
+
151
+ if __name__ == "__main__":
152
+ main()
skdconv/detect.py ADDED
@@ -0,0 +1,96 @@
1
+ """매직 바이트 기반 파일 포맷 감지"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import zipfile
7
+ from typing import Literal
8
+
9
+ from .types import FileType
10
+
11
+ _HWP3_PREFIX = b"HWP Document File V3.00"
12
+
13
+
14
def is_zip_file(data: bytes) -> bool:
    """Return True when *data* begins with the ``PK\\x03\\x04`` signature."""
    # A slice shorter than four bytes can never equal the 4-byte signature,
    # so no separate length check is required.
    return data[:4] == b"PK\x03\x04"
16
+
17
+
18
def is_hwpx_file(data: bytes) -> bool:
    """Return True when *data* is a ZIP container.

    Only the ZIP signature is checked, so every ZIP-based format passes
    this test, not just HWPX.
    """
    looks_like_zip = is_zip_file(data)
    return looks_like_zip
20
+
21
+
22
def is_old_hwp_file(data: bytes) -> bool:
    """Return True when *data* begins with the bytes ``D0 CF 11 E0``."""
    # A slice shorter than four bytes can never equal the 4-byte signature.
    return data[:4] == b"\xd0\xcf\x11\xe0"
24
+
25
+
26
def is_hwp3_file(data: bytes) -> bool:
    """Return True when *data* begins with the HWP 3.0 header text."""
    prefix_len = len(_HWP3_PREFIX)
    # A shorter slice cannot equal the full prefix, so truncated input
    # yields False without a separate length check.
    return data[:prefix_len] == _HWP3_PREFIX
28
+
29
+
30
def is_pdf_file(data: bytes) -> bool:
    """Return True when *data* begins with the ``%PDF`` signature."""
    # A slice shorter than four bytes can never equal the 4-byte signature.
    return data[:4] == b"%PDF"
32
+
33
+
34
def is_hwpml_file(data: bytes) -> bool:
    """Return True when *data* looks like an HWPML (XML) document.

    The first 512 bytes are decoded as UTF-8 with undecodable bytes
    replaced. After an optional byte-order mark and leading whitespace the
    text must start with ``<?xml`` and contain ``<HWPML``.
    """
    head = data[:512].decode("utf-8", errors="replace")
    # A UTF-8 BOM decodes to U+FEFF, which str.lstrip() does not treat as
    # whitespace, so it has to be removed explicitly. The escape is spelled
    # out because the literal character is invisible in source.
    head = head.lstrip("\ufeff").lstrip()
    return head.startswith("<?xml") and "<HWPML" in head
37
+
38
+
39
def detect_format(data: bytes) -> FileType:
    """Guess the file format from the leading bytes of *data*.

    Every ZIP container is reported as ``"hwpx"`` and every file with the
    ``D0 CF 11 E0`` signature as ``"hwp"``; use ``detect_zip_format`` /
    ``detect_ole2_format`` to tell the container formats apart. Input
    shorter than four bytes, or matching no probe, gives ``"unknown"``.
    """
    if len(data) < 4:
        return "unknown"

    # Probes run in this order and the first match wins.
    probes = (
        (is_hwp3_file, "hwp3"),
        (is_zip_file, "hwpx"),
        (is_old_hwp_file, "hwp"),
        (is_pdf_file, "pdf"),
        (is_hwpml_file, "hwpml"),
    )
    for matches, label in probes:
        if matches(data):
            return label
    return "unknown"
54
+
55
+
56
def detect_ole2_format(data: bytes) -> Literal["hwp", "xls", "unknown"]:
    """Tell HWP 5.x and XLS apart by the top-level entries of an OLE2 file.

    Both formats share the OLE2 container, so the first path component of
    each entry is inspected. Any failure — including ``olefile`` not being
    importable or the data not being a valid container — gives
    ``"unknown"``.
    """
    try:
        import olefile  # type: ignore

        with olefile.OleFileIO(io.BytesIO(data)) as ole:
            roots = {path[0] for path in ole.listdir()}
            # The spreadsheet check comes first, as in the original order.
            if roots & {"Workbook", "Book"}:
                return "xls"
            is_hwp = (
                "FileHeader" in roots
                or "DocInfo" in roots
                or any(name.startswith("Section") for name in roots)
            )
            return "hwp" if is_hwp else "unknown"
    except Exception:
        return "unknown"
75
+
76
+
77
def detect_zip_format(data: bytes) -> Literal["hwpx", "xlsx", "docx", "unknown"]:
    """Tell HWPX, XLSX and DOCX apart by the entries of a ZIP archive.

    All three formats are ZIP containers, so the member names decide:
    ``xl/workbook.xml`` gives XLSX, ``word/document.xml`` gives DOCX, and
    ``Contents/content.hpf`` or any ``Contents/`` member gives HWPX. A bare
    ``mimetype`` member is not enough on its own, because other ZIP-based
    formats (EPUB, OpenDocument) carry one too; it counts as HWPX only
    when its content mentions ``hwp``.

    Returns:
        The detected format, or ``"unknown"`` when nothing matches or the
        archive cannot be read.
    """
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as zf:
            names = zf.namelist()
            name_set = set(names)
            if "xl/workbook.xml" in name_set:
                return "xlsx"
            if "word/document.xml" in name_set:
                return "docx"
            if "Contents/content.hpf" in name_set:
                return "hwpx"
            if any(n.startswith("Contents/") for n in names):
                return "hwpx"
            if "mimetype" in name_set:
                # Read only a short prefix: the declaration is one line.
                with zf.open("mimetype") as fh:
                    declared = fh.read(256)
                if b"hwp" in declared.lower():
                    return "hwpx"
            return "unknown"
    except Exception:
        # Best effort: a damaged or non-ZIP input is simply "unknown".
        return "unknown"
File without changes
@@ -0,0 +1,119 @@
1
+ """DOCX OMML (Office Math ML) → LaTeX 변환 — 간략 구현."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import lxml.etree as ET
6
+
7
+
8
def _tag(el: ET._Element) -> str:
    """Return the local name of *el*, i.e. its tag without ``{namespace}``.

    Nodes whose ``tag`` is not a string (lxml uses a callable for comments
    and processing instructions) yield ``""``, so they match no element
    name instead of raising ``TypeError``.
    """
    t = el.tag
    if not isinstance(t, str):
        return ""
    return t.split("}", 1)[1] if "}" in t else t
11
+
12
+
13
def _kids(parent: ET._Element, name: str) -> list[ET._Element]:
    """Return the direct children of *parent* whose local tag name is *name*."""
    matches = []
    for node in parent:
        if _tag(node) == name:
            matches.append(node)
    return matches
15
+
16
+
17
def _first_kid(parent: ET._Element, name: str) -> ET._Element | None:
    """Return the first direct child of *parent* named *name*, or None."""
    return next((node for node in parent if _tag(node) == name), None)
22
+
23
+
24
def _run_to_latex(r: ET._Element) -> str:
    """Concatenate the text of every ``t`` child of the run *r*."""
    pieces = [node.text or "" for node in _kids(r, "t")]
    return "".join(pieces)
26
+
27
+
28
# Names of common mathematical functions and operators.
# NOTE(review): nothing in the visible part of this module reads this set —
# presumably it is meant for rendering function names as LaTeX commands;
# confirm before relying on or removing it.
_FUNC_NAMES = {
    "sin", "cos", "tan", "cot", "sec", "csc",
    "sinh", "cosh", "tanh", "coth",
    "arcsin", "arccos", "arctan",
    "log", "ln", "lg", "exp",
    "det", "dim", "gcd", "inf", "sup", "lim", "max", "min",
    "Pr", "arg",
}
36
+
37
+
38
# LaTeX commands for the operator characters stored in <naryPr>/<chr val="…">.
# A character that is not listed is emitted verbatim.
_NARY_LATEX = {
    "∑": "\\sum",
    "∏": "\\prod",
    "∐": "\\coprod",
    "∫": "\\int",
    "∬": "\\iint",
    "∭": "\\iiint",
    "∮": "\\oint",
    "⋃": "\\bigcup",
    "⋂": "\\bigcap",
    "⋁": "\\bigvee",
    "⋀": "\\bigwedge",
}

# Delimiter characters that need a LaTeX spelling after \left / \right.
# Control words carry a trailing space so the following token is not glued
# onto the command name. An empty value means "no visible delimiter".
_DELIM_LATEX = {
    "": ".",
    "{": "\\{",
    "}": "\\}",
    "‖": "\\|",
    "⌊": "\\lfloor ",
    "⌋": "\\rfloor ",
    "⌈": "\\lceil ",
    "⌉": "\\rceil ",
    "⟨": "\\langle ",
    "⟩": "\\rangle ",
    "〈": "\\langle ",
    "〉": "\\rangle ",
}


def _inner_latex(el: ET._Element | None) -> str:
    """Render every child of *el* and concatenate; ``""`` when *el* is None."""
    if el is None:
        return ""
    return "".join(_el_to_latex(child) for child in el)


def _prop_val(el: ET._Element, props_name: str, prop_name: str, default: str) -> str:
    """Return the ``val`` attribute of ``<props_name>/<prop_name>`` under *el*.

    The attribute is matched by local name so the namespace prefix does not
    matter. *default* is returned when the property element or its ``val``
    attribute is missing.
    """
    props = _first_kid(el, props_name)
    prop = _first_kid(props, prop_name) if props is not None else None
    if prop is None:
        return default
    for key, value in prop.attrib.items():
        if key.rsplit("}", 1)[-1] == "val":
            return value
    return default


def _el_to_latex(el: ET._Element) -> str:
    """Convert one OMML element, recursively, into a LaTeX fragment.

    Handles runs, fractions, radicals, sub/superscripts, n-ary operators,
    delimiters and matrices. Every other element is rendered as the
    concatenation of its children.
    """
    tag = _tag(el)

    if tag == "r":
        return _run_to_latex(el)

    if tag == "f":  # fraction
        num = _inner_latex(_first_kid(el, "num"))
        den = _inner_latex(_first_kid(el, "den"))
        return f"\\frac{{{num}}}{{{den}}}"

    if tag == "rad":  # radical
        body = _inner_latex(_first_kid(el, "e"))
        degree = _inner_latex(_first_kid(el, "deg"))
        if degree:
            return f"\\sqrt[{degree}]{{{body}}}"
        return f"\\sqrt{{{body}}}"

    if tag == "sSup":  # superscript
        base = _inner_latex(_first_kid(el, "e"))
        sup = _inner_latex(_first_kid(el, "sup"))
        return f"{{{base}}}^{{{sup}}}"

    if tag == "sSub":  # subscript
        base = _inner_latex(_first_kid(el, "e"))
        sub = _inner_latex(_first_kid(el, "sub"))
        return f"{{{base}}}_{{{sub}}}"

    if tag == "sSubSup":  # subscript and superscript
        base = _inner_latex(_first_kid(el, "e"))
        sub = _inner_latex(_first_kid(el, "sub"))
        sup = _inner_latex(_first_kid(el, "sup"))
        return f"{{{base}}}_{{{sub}}}^{{{sup}}}"

    if tag == "nary":  # n-ary operator (∑, ∏, ∫, …)
        # OMML omits <chr> for the integral sign, which is the default.
        symbol = _prop_val(el, "naryPr", "chr", "∫")
        operator = _NARY_LATEX.get(symbol, symbol)
        lower = _inner_latex(_first_kid(el, "sub"))
        upper = _inner_latex(_first_kid(el, "sup"))
        limits = (f"_{{{lower}}}" if lower else "") + (f"^{{{upper}}}" if upper else "")
        body = _inner_latex(_first_kid(el, "e"))
        return f"{operator}{limits} {body}"

    if tag == "d":  # delimiters
        opening = _prop_val(el, "dPr", "begChr", "(")
        closing = _prop_val(el, "dPr", "endChr", ")")
        left = _DELIM_LATEX.get(opening, opening)
        right = _DELIM_LATEX.get(closing, closing)
        # Several <e> children are still joined with a space; a custom
        # separator character (sepChr) is not rendered.
        content = " ".join(_inner_latex(e) for e in _kids(el, "e"))
        return f"\\left{left}{content}\\right{right}"

    if tag == "m":  # matrix
        rows = [
            " & ".join(_inner_latex(e) for e in _kids(mr, "e"))
            for mr in _kids(el, "mr")
        ]
        return "\\begin{matrix}" + " \\\\ ".join(rows) + "\\end{matrix}"

    # oMath, oMathPara and everything else: recurse into the children.
    return _inner_latex(el)
107
+
108
+
109
def omml_element_to_latex(el: ET._Element) -> str:
    """Convert an ``<m:oMath>`` or ``<m:oMathPara>`` element to LaTeX.

    Returns the stripped LaTeX string, or ``""`` when the conversion raises
    for any reason.
    """
    try:
        latex = _el_to_latex(el)
        return latex.strip()
    except Exception:
        # Best effort: an equation that cannot be converted becomes empty.
        return ""
115
+
116
+
117
def is_display_math(el: ET._Element) -> bool:
    """Return True when *el* is an ``oMathPara``, i.e. display math (``$$...$$``)."""
    local_name = _tag(el)
    return local_name == "oMathPara"