litparser 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: litparser
3
+ Version: 0.5.0
4
+ Summary: Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱
5
+ Home-page: https://github.com/ironwung/litparser
6
+ Author: ironwung
7
+ Author-email: ironwung <ironwung@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/ironwung/litparser
10
+ Project-URL: Documentation, https://github.com/ironwung/litparser#readme
11
+ Project-URL: Repository, https://github.com/ironwung/litparser
12
+ Keywords: pdf,parser,docx,pptx,hwpx,document,text-extraction,lightweight
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Text Processing
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0; extra == "dev"
29
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
30
+ Dynamic: author
31
+ Dynamic: home-page
32
+ Dynamic: requires-python
33
+
34
+ # LitParser
35
+
36
+ **Lit**eweight Document **Parser** - 순수 Python으로 구현된 문서 파서.
37
+
38
+ **외부 라이브러리 없이** PDF, DOCX, PPTX, HWPX 파일을 파싱합니다.
39
+
40
+ ## 특징
41
+
42
+ - ✅ **Zero Dependencies** - 표준 라이브러리만 사용
43
+ - ✅ **다양한 포맷** - PDF, DOCX, PPTX, HWPX, TXT, MD
44
+ - ✅ **텍스트 추출** - 위치 정보 포함
45
+ - ✅ **테이블 감지** - 마크다운 변환
46
+ - ✅ **이미지 추출** - PNG, JPEG, JP2
47
+ - ✅ **출력 포맷** - Markdown, JSON
48
+
49
+ ## 설치
50
+
51
+ ```bash
52
+ pip install litparser
53
+ ```
54
+
55
+ 소스에서:
56
+ ```bash
57
+ pip install -e .
58
+ ```
59
+
60
+ ## CLI
61
+
62
+ ```bash
63
+ # 텍스트 추출
64
+ litparser document.pdf
65
+ litparser report.docx
66
+
67
+ # 마크다운 변환
68
+ litparser document.pdf --markdown
69
+ litparser document.pdf --md -o result.md
70
+
71
+ # JSON 변환
72
+ litparser document.pdf --json
73
+ litparser document.pdf --json --include-images
74
+
75
+ # 테이블
76
+ litparser document.pdf --tables
77
+
78
+ # 분석
79
+ litparser document.pdf --analyze
80
+ ```
81
+
82
+ ## Python API
83
+
84
+ ```python
85
+ from litparser import parse_pdf, extract_text, extract_tables
86
+
87
+ # PDF 파싱
88
+ doc = parse_pdf('document.pdf')
89
+
90
+ # 텍스트
91
+ text = extract_text(doc, page_num=0)
92
+
93
+ # 테이블
94
+ tables = extract_tables(doc, page_num=0)
95
+ for t in tables:
96
+     print(t.to_markdown())
97
+
98
+ # 마크다운/JSON 변환
99
+ from litparser.output_formatter import pdf_to_output, to_markdown, to_json
100
+
101
+ output = pdf_to_output(doc)
102
+ md = to_markdown(output)
103
+ json_str = to_json(output)
104
+ ```
105
+
106
+ ## 지원 포맷
107
+
108
+ | 포맷 | 확장자 | 텍스트 | 테이블 | 이미지 |
109
+ |------|--------|--------|--------|--------|
110
+ | PDF | .pdf | ✅ | ✅ | ✅ |
111
+ | Word | .docx | ✅ | ✅ | ✅ |
112
+ | PowerPoint | .pptx | ✅ | ✅ | ✅ |
113
+ | 한글 | .hwpx | ✅ | ✅ | ✅ |
114
+ | 텍스트 | .txt, .md | ✅ | - | - |
115
+
116
+ ## 라이선스
117
+
118
+ MIT License
@@ -0,0 +1,85 @@
1
+ # LitParser
2
+
3
+ **Lit**eweight Document **Parser** - 순수 Python으로 구현된 문서 파서.
4
+
5
+ **외부 라이브러리 없이** PDF, DOCX, PPTX, HWPX 파일을 파싱합니다.
6
+
7
+ ## 특징
8
+
9
+ - ✅ **Zero Dependencies** - 표준 라이브러리만 사용
10
+ - ✅ **다양한 포맷** - PDF, DOCX, PPTX, HWPX, TXT, MD
11
+ - ✅ **텍스트 추출** - 위치 정보 포함
12
+ - ✅ **테이블 감지** - 마크다운 변환
13
+ - ✅ **이미지 추출** - PNG, JPEG, JP2
14
+ - ✅ **출력 포맷** - Markdown, JSON
15
+
16
+ ## 설치
17
+
18
+ ```bash
19
+ pip install litparser
20
+ ```
21
+
22
+ 소스에서:
23
+ ```bash
24
+ pip install -e .
25
+ ```
26
+
27
+ ## CLI
28
+
29
+ ```bash
30
+ # 텍스트 추출
31
+ litparser document.pdf
32
+ litparser report.docx
33
+
34
+ # 마크다운 변환
35
+ litparser document.pdf --markdown
36
+ litparser document.pdf --md -o result.md
37
+
38
+ # JSON 변환
39
+ litparser document.pdf --json
40
+ litparser document.pdf --json --include-images
41
+
42
+ # 테이블
43
+ litparser document.pdf --tables
44
+
45
+ # 분석
46
+ litparser document.pdf --analyze
47
+ ```
48
+
49
+ ## Python API
50
+
51
+ ```python
52
+ from litparser import parse_pdf, extract_text, extract_tables
53
+
54
+ # PDF 파싱
55
+ doc = parse_pdf('document.pdf')
56
+
57
+ # 텍스트
58
+ text = extract_text(doc, page_num=0)
59
+
60
+ # 테이블
61
+ tables = extract_tables(doc, page_num=0)
62
+ for t in tables:
63
+     print(t.to_markdown())
64
+
65
+ # 마크다운/JSON 변환
66
+ from litparser.output_formatter import pdf_to_output, to_markdown, to_json
67
+
68
+ output = pdf_to_output(doc)
69
+ md = to_markdown(output)
70
+ json_str = to_json(output)
71
+ ```
72
+
73
+ ## 지원 포맷
74
+
75
+ | 포맷 | 확장자 | 텍스트 | 테이블 | 이미지 |
76
+ |------|--------|--------|--------|--------|
77
+ | PDF | .pdf | ✅ | ✅ | ✅ |
78
+ | Word | .docx | ✅ | ✅ | ✅ |
79
+ | PowerPoint | .pptx | ✅ | ✅ | ✅ |
80
+ | 한글 | .hwpx | ✅ | ✅ | ✅ |
81
+ | 텍스트 | .txt, .md | ✅ | - | - |
82
+
83
+ ## 라이선스
84
+
85
+ MIT License
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: litparser
3
+ Version: 0.5.0
4
+ Summary: Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱
5
+ Home-page: https://github.com/ironwung/litparser
6
+ Author: ironwung
7
+ Author-email: ironwung <ironwung@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/ironwung/litparser
10
+ Project-URL: Documentation, https://github.com/ironwung/litparser#readme
11
+ Project-URL: Repository, https://github.com/ironwung/litparser
12
+ Keywords: pdf,parser,docx,pptx,hwpx,document,text-extraction,lightweight
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Text Processing
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0; extra == "dev"
29
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
30
+ Dynamic: author
31
+ Dynamic: home-page
32
+ Dynamic: requires-python
33
+
34
+ # LitParser
35
+
36
+ **Lit**eweight Document **Parser** - 순수 Python으로 구현된 문서 파서.
37
+
38
+ **외부 라이브러리 없이** PDF, DOCX, PPTX, HWPX 파일을 파싱합니다.
39
+
40
+ ## 특징
41
+
42
+ - ✅ **Zero Dependencies** - 표준 라이브러리만 사용
43
+ - ✅ **다양한 포맷** - PDF, DOCX, PPTX, HWPX, TXT, MD
44
+ - ✅ **텍스트 추출** - 위치 정보 포함
45
+ - ✅ **테이블 감지** - 마크다운 변환
46
+ - ✅ **이미지 추출** - PNG, JPEG, JP2
47
+ - ✅ **출력 포맷** - Markdown, JSON
48
+
49
+ ## 설치
50
+
51
+ ```bash
52
+ pip install litparser
53
+ ```
54
+
55
+ 소스에서:
56
+ ```bash
57
+ pip install -e .
58
+ ```
59
+
60
+ ## CLI
61
+
62
+ ```bash
63
+ # 텍스트 추출
64
+ litparser document.pdf
65
+ litparser report.docx
66
+
67
+ # 마크다운 변환
68
+ litparser document.pdf --markdown
69
+ litparser document.pdf --md -o result.md
70
+
71
+ # JSON 변환
72
+ litparser document.pdf --json
73
+ litparser document.pdf --json --include-images
74
+
75
+ # 테이블
76
+ litparser document.pdf --tables
77
+
78
+ # 분석
79
+ litparser document.pdf --analyze
80
+ ```
81
+
82
+ ## Python API
83
+
84
+ ```python
85
+ from litparser import parse_pdf, extract_text, extract_tables
86
+
87
+ # PDF 파싱
88
+ doc = parse_pdf('document.pdf')
89
+
90
+ # 텍스트
91
+ text = extract_text(doc, page_num=0)
92
+
93
+ # 테이블
94
+ tables = extract_tables(doc, page_num=0)
95
+ for t in tables:
96
+     print(t.to_markdown())
97
+
98
+ # 마크다운/JSON 변환
99
+ from litparser.output_formatter import pdf_to_output, to_markdown, to_json
100
+
101
+ output = pdf_to_output(doc)
102
+ md = to_markdown(output)
103
+ json_str = to_json(output)
104
+ ```
105
+
106
+ ## 지원 포맷
107
+
108
+ | 포맷 | 확장자 | 텍스트 | 테이블 | 이미지 |
109
+ |------|--------|--------|--------|--------|
110
+ | PDF | .pdf | ✅ | ✅ | ✅ |
111
+ | Word | .docx | ✅ | ✅ | ✅ |
112
+ | PowerPoint | .pptx | ✅ | ✅ | ✅ |
113
+ | 한글 | .hwpx | ✅ | ✅ | ✅ |
114
+ | 텍스트 | .txt, .md | ✅ | - | - |
115
+
116
+ ## 라이선스
117
+
118
+ MIT License
@@ -0,0 +1,9 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ litparser.egg-info/PKG-INFO
5
+ litparser.egg-info/SOURCES.txt
6
+ litparser.egg-info/dependency_links.txt
7
+ litparser.egg-info/entry_points.txt
8
+ litparser.egg-info/requires.txt
9
+ litparser.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ litparser = litparser.__main__:main
@@ -0,0 +1,4 @@
1
+
2
+ [dev]
3
+ pytest>=7.0
4
+ pytest-cov>=4.0
@@ -0,0 +1,51 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "litparser"
7
+ version = "0.5.0"
8
+ description = "Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ authors = [
12
+ {name = "ironwung", email = "ironwung@gmail.com"}
13
+ ]
14
+ keywords = ["pdf", "parser", "docx", "pptx", "hwpx", "document", "text-extraction", "lightweight"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: OS Independent",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Text Processing",
27
+ "Topic :: Software Development :: Libraries :: Python Modules",
28
+ ]
29
+ requires-python = ">=3.8"
30
+ dependencies = []
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=7.0",
35
+ "pytest-cov>=4.0",
36
+ ]
37
+
38
+ [project.urls]
39
+ Homepage = "https://github.com/ironwung/litparser"
40
+ Documentation = "https://github.com/ironwung/litparser#readme"
41
+ Repository = "https://github.com/ironwung/litparser"
42
+
43
+ [project.scripts]
44
+ litparser = "litparser.__main__:main"
45
+
46
+ [tool.setuptools.packages.find]
47
+ where = ["."]
48
+ include = ["litparser*"]
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,51 @@
1
+ """
2
+ LitParser - Lightweight Document Parser
3
+ pip install -e . 또는 python setup.py install
4
+ """
5
+
6
+ from setuptools import setup, find_packages
7
+ from pathlib import Path
8
+
9
+ readme = Path(__file__).parent / "README.md"
10
+ long_description = readme.read_text(encoding='utf-8') if readme.exists() else ""
11
+
12
+ setup(
13
+ name="litparser",
14
+ version="0.5.0",
15
+ description="Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱",
16
+ long_description=long_description,
17
+ long_description_content_type="text/markdown",
18
+ author="Your Name",
19
+ author_email="your@email.com",
20
+ url="https://github.com/yourusername/litparser",
21
+ license="MIT",
22
+
23
+ packages=find_packages(include=['litparser', 'litparser.*']),
24
+ package_data={
25
+ 'litparser': ['*.md'],
26
+ },
27
+
28
+ python_requires=">=3.8",
29
+ install_requires=[],
30
+
31
+ extras_require={
32
+ 'dev': ['pytest>=7.0', 'pytest-cov>=4.0'],
33
+ },
34
+
35
+ entry_points={
36
+ 'console_scripts': [
37
+ 'litparser=litparser.__main__:main',
38
+ ],
39
+ },
40
+
41
+ classifiers=[
42
+ "Development Status :: 4 - Beta",
43
+ "Intended Audience :: Developers",
44
+ "License :: OSI Approved :: MIT License",
45
+ "Operating System :: OS Independent",
46
+ "Programming Language :: Python :: 3",
47
+ "Topic :: Text Processing",
48
+ ],
49
+
50
+ keywords="pdf parser docx pptx hwpx document lightweight",
51
+ )